diff --git "a/CompeteSMoE/competesmoe_versions/Full_competesmoev21/trainer_state.json" "b/CompeteSMoE/competesmoe_versions/Full_competesmoev21/trainer_state.json" new file mode 100644--- /dev/null +++ "b/CompeteSMoE/competesmoe_versions/Full_competesmoev21/trainer_state.json" @@ -0,0 +1,249523 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05028445, + "auxiliary_loss_mlp": 0.02215396, + "balance_loss_clip": 2.43573999, + "balance_loss_mlp": 1.76983953, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 55.0007689153118, + "language_loss": 2.85272503, + "learning_rate": 0.0, + "loss": 1.94613922, + "num_input_tokens_seen": 19155, + "step": 1, + "time_per_iteration": 14.6953706741333 + }, + { + "auxiliary_loss_clip": 0.03379929, + "auxiliary_loss_mlp": 0.01460768, + "balance_loss_clip": 1.62770474, + "balance_loss_mlp": 1.19068694, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 36.21369939567931, + "language_loss": 1.83011317, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87852001, + "num_input_tokens_seen": 36175, + "step": 2, + "time_per_iteration": 2.4755165576934814 + }, + { + "auxiliary_loss_clip": 0.03319962, + "auxiliary_loss_mlp": 0.01441938, + "balance_loss_clip": 1.62580693, + "balance_loss_mlp": 1.18959546, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 32.71534780746326, + "language_loss": 1.57499588, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62261486, + "num_input_tokens_seen": 54870, + "step": 3, + "time_per_iteration": 2.429921865463257 + }, + { + "auxiliary_loss_clip": 0.03362487, + "auxiliary_loss_mlp": 0.01452239, + "balance_loss_clip": 1.62410331, + "balance_loss_mlp": 1.15640903, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.139974107101715, + "language_loss": 1.67653668, + "learning_rate": 8.925686513863519e-07, + "loss": 1.724684, + "num_input_tokens_seen": 74575, + "step": 4, + "time_per_iteration": 2.5677058696746826 + }, + { + "auxiliary_loss_clip": 0.03402039, + "auxiliary_loss_mlp": 0.01504593, + "balance_loss_clip": 1.6250596, + "balance_loss_mlp": 1.21677387, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.19314693935947, + "language_loss": 1.91152883, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.96059501, + "num_input_tokens_seen": 92580, + "step": 5, + "time_per_iteration": 2.7236881256103516 + }, + { + "auxiliary_loss_clip": 0.03371567, + "auxiliary_loss_mlp": 0.01516591, + "balance_loss_clip": 1.61563373, + "balance_loss_mlp": 1.22171497, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.79752291712156, + "language_loss": 1.61086583, + "learning_rate": 1.153628246576487e-06, + "loss": 1.65974748, + "num_input_tokens_seen": 109705, + "step": 6, + "time_per_iteration": 2.7299628257751465 + }, + { + "auxiliary_loss_clip": 0.03354501, + "auxiliary_loss_mlp": 0.01487324, + "balance_loss_clip": 1.61587119, + "balance_loss_mlp": 1.20389163, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 25.002730852649435, + "language_loss": 1.53405786, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58247614, + "num_input_tokens_seen": 129425, + "step": 7, + "time_per_iteration": 2.7463953495025635 + }, + { + "auxiliary_loss_clip": 0.03321391, + "auxiliary_loss_mlp": 0.01442803, + "balance_loss_clip": 1.61226654, + "balance_loss_mlp": 1.16528416, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.607545338794147, + "language_loss": 1.43749309, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48513496, + "num_input_tokens_seen": 149210, + "step": 8, + "time_per_iteration": 2.7866902351379395 + }, + { + "auxiliary_loss_clip": 0.03369369, + "auxiliary_loss_mlp": 0.01498114, + "balance_loss_clip": 1.61160421, + "balance_loss_mlp": 1.21372795, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 71.01780320086927, + "language_loss": 1.49897873, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54765356, + "num_input_tokens_seen": 169055, + "step": 9, + "time_per_iteration": 2.7751379013061523 + }, + { + "auxiliary_loss_clip": 0.0330958, + "auxiliary_loss_mlp": 0.01474859, + "balance_loss_clip": 1.6153338, + "balance_loss_mlp": 1.20592284, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.230106739376225, + "language_loss": 1.44756222, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49540663, + "num_input_tokens_seen": 188045, + "step": 10, + "time_per_iteration": 2.68770432472229 + }, + { + "auxiliary_loss_clip": 0.03366141, + "auxiliary_loss_mlp": 0.01493413, + "balance_loss_clip": 1.6211977, + "balance_loss_mlp": 1.21951771, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.65159540626604, + "language_loss": 1.45394874, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.5025444, + "num_input_tokens_seen": 207035, + "step": 11, + "time_per_iteration": 2.667656421661377 + }, + { + "auxiliary_loss_clip": 0.03294416, + "auxiliary_loss_mlp": 0.01451949, + "balance_loss_clip": 1.60798573, + "balance_loss_mlp": 1.17557406, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 18.695296112122374, + "language_loss": 1.45259953, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.50006318, + "num_input_tokens_seen": 223225, + "step": 12, + "time_per_iteration": 2.7024025917053223 + }, + { + "auxiliary_loss_clip": 0.03336224, + "auxiliary_loss_mlp": 0.01408365, + "balance_loss_clip": 1.61989343, + "balance_loss_mlp": 1.14934683, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 14.075216949325474, + "language_loss": 1.29353356, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.34097958, + "num_input_tokens_seen": 242570, + "step": 13, + "time_per_iteration": 2.7084388732910156 + }, + { + "auxiliary_loss_clip": 0.03290794, + "auxiliary_loss_mlp": 0.01471898, + "balance_loss_clip": 1.61285377, + "balance_loss_mlp": 1.20334339, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.850888394264952, + "language_loss": 1.2096591, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25728607, + "num_input_tokens_seen": 261215, + "step": 14, + "time_per_iteration": 2.667006015777588 + }, + { + "auxiliary_loss_clip": 0.03275878, + "auxiliary_loss_mlp": 0.0143239, + "balance_loss_clip": 1.6181798, + "balance_loss_mlp": 1.16898489, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 14.910512461050104, + "language_loss": 1.12998271, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.17706537, + "num_input_tokens_seen": 280035, + "step": 15, + "time_per_iteration": 4.154406309127808 + }, + { + "auxiliary_loss_clip": 0.03241476, + "auxiliary_loss_mlp": 0.01412075, + "balance_loss_clip": 1.60297179, + "balance_loss_mlp": 1.16240263, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.507189979377554, + "language_loss": 1.1118933, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15842879, + "num_input_tokens_seen": 300265, + "step": 16, + "time_per_iteration": 4.129991769790649 + }, + { + "auxiliary_loss_clip": 0.03228652, + "auxiliary_loss_mlp": 0.01418673, + "balance_loss_clip": 1.60950506, + "balance_loss_mlp": 1.17834735, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 4.703126954084794, + "language_loss": 1.12791896, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17439222, + "num_input_tokens_seen": 317375, + "step": 17, + "time_per_iteration": 2.643577814102173 + }, + { + "auxiliary_loss_clip": 0.0316523, + "auxiliary_loss_mlp": 0.01381873, + "balance_loss_clip": 1.60745263, + "balance_loss_mlp": 1.1499393, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 4.210276973546229, + "language_loss": 1.08280706, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12827802, + "num_input_tokens_seen": 337975, + "step": 18, + "time_per_iteration": 2.686873435974121 + }, + { + "auxiliary_loss_clip": 0.03191023, + "auxiliary_loss_mlp": 0.01399878, + "balance_loss_clip": 1.60620987, + "balance_loss_mlp": 1.13456583, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 5.6844132181873706, + "language_loss": 1.02718735, + "learning_rate": 1.89578346593066e-06, + "loss": 1.07309651, + "num_input_tokens_seen": 356635, + "step": 19, + "time_per_iteration": 2.646482229232788 + }, + { + "auxiliary_loss_clip": 0.0313424, + "auxiliary_loss_mlp": 0.01342309, + "balance_loss_clip": 1.60809088, + "balance_loss_mlp": 1.12172449, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 3.8666625595729234, + "language_loss": 1.16783154, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21259713, + "num_input_tokens_seen": 375625, + "step": 20, + "time_per_iteration": 2.7233519554138184 + }, + { + "auxiliary_loss_clip": 0.03122459, + "auxiliary_loss_mlp": 0.01379383, + "balance_loss_clip": 1.58902466, + "balance_loss_mlp": 1.12990153, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 4.496949259977923, + "language_loss": 1.06348491, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.10850334, + "num_input_tokens_seen": 394350, + "step": 21, + "time_per_iteration": 2.6852407455444336 + }, + { + "auxiliary_loss_clip": 0.03027258, + "auxiliary_loss_mlp": 0.01384844, + "balance_loss_clip": 1.57162118, + "balance_loss_mlp": 1.14928687, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 4.063335269739779, + "language_loss": 1.06536341, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10948443, + "num_input_tokens_seen": 413255, + "step": 22, + "time_per_iteration": 2.6957693099975586 + }, + { + "auxiliary_loss_clip": 0.02976226, + "auxiliary_loss_mlp": 0.01337113, + "balance_loss_clip": 1.57302022, + "balance_loss_mlp": 1.12549293, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 4.0043070186919945, + "language_loss": 0.91960108, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96273446, + "num_input_tokens_seen": 433065, + "step": 23, + "time_per_iteration": 2.7151856422424316 + }, + { + "auxiliary_loss_clip": 0.02942104, + "auxiliary_loss_mlp": 0.01363138, + "balance_loss_clip": 1.56537771, + "balance_loss_mlp": 1.14169502, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.8984264488227947, + "language_loss": 1.08421791, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12727034, + "num_input_tokens_seen": 451175, + "step": 24, + "time_per_iteration": 2.647780179977417 + }, + { + "auxiliary_loss_clip": 0.02831438, + "auxiliary_loss_mlp": 0.01330443, + "balance_loss_clip": 1.55820298, + "balance_loss_mlp": 1.11882234, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 4.086800214849867, + "language_loss": 1.01468349, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05630231, + "num_input_tokens_seen": 468775, + "step": 25, + "time_per_iteration": 2.6759719848632812 + }, + { + "auxiliary_loss_clip": 0.02825556, + "auxiliary_loss_mlp": 0.01311203, + "balance_loss_clip": 1.56096947, + "balance_loss_mlp": 1.1007272, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.9939950823146444, + "language_loss": 1.06775463, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10912228, + "num_input_tokens_seen": 488530, + "step": 26, + "time_per_iteration": 2.669372081756592 + }, + { + "auxiliary_loss_clip": 0.02768238, + "auxiliary_loss_mlp": 0.01325317, + "balance_loss_clip": 1.55126643, + "balance_loss_mlp": 1.12437797, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 2.6367761940866243, + "language_loss": 0.95648253, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99741799, + "num_input_tokens_seen": 510495, + "step": 27, + "time_per_iteration": 2.723604917526245 + }, + { + "auxiliary_loss_clip": 0.02746536, + "auxiliary_loss_mlp": 0.01312074, + "balance_loss_clip": 1.55639291, + "balance_loss_mlp": 1.13011312, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.51170184201261, + "language_loss": 1.06443524, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.10502148, + "num_input_tokens_seen": 528605, + "step": 28, + "time_per_iteration": 2.710132598876953 + }, + { + "auxiliary_loss_clip": 0.02710518, + "auxiliary_loss_mlp": 0.01320961, + "balance_loss_clip": 1.5401603, + "balance_loss_mlp": 1.13337302, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.442918982822478, + "language_loss": 1.02692401, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06723869, + "num_input_tokens_seen": 548515, + "step": 29, + "time_per_iteration": 2.7579588890075684 + }, + { + "auxiliary_loss_clip": 0.02700055, + "auxiliary_loss_mlp": 0.01313743, + "balance_loss_clip": 1.53621042, + "balance_loss_mlp": 1.12691784, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.457351653427521, + "language_loss": 1.19303858, + "learning_rate": 2.189868360711334e-06, + "loss": 1.23317659, + "num_input_tokens_seen": 564025, + "step": 30, + "time_per_iteration": 2.7372548580169678 + }, + { + "auxiliary_loss_clip": 0.02619048, + "auxiliary_loss_mlp": 0.01338699, + "balance_loss_clip": 1.52283287, + "balance_loss_mlp": 1.1572144, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 2.9388233477720576, + "language_loss": 1.02500653, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.0645839, + "num_input_tokens_seen": 583345, + "step": 31, + "time_per_iteration": 2.7278308868408203 + }, + { + "auxiliary_loss_clip": 0.02589437, + "auxiliary_loss_mlp": 0.01332604, + "balance_loss_clip": 1.52444232, + "balance_loss_mlp": 1.15274119, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 3.0802985300117496, + "language_loss": 0.95587057, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99509096, + "num_input_tokens_seen": 600010, + "step": 32, + "time_per_iteration": 2.6885178089141846 + }, + { + "auxiliary_loss_clip": 0.02570478, + "auxiliary_loss_mlp": 0.013027, + "balance_loss_clip": 1.51956296, + "balance_loss_mlp": 1.13571191, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.076006431182188, + "language_loss": 0.95174849, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99048024, + "num_input_tokens_seen": 616295, + "step": 33, + "time_per_iteration": 2.630277395248413 + }, + { + "auxiliary_loss_clip": 0.02427026, + "auxiliary_loss_mlp": 0.01303763, + "balance_loss_clip": 1.48692524, + "balance_loss_mlp": 1.14640713, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 2.19908778643991, + "language_loss": 0.91466326, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95197117, + "num_input_tokens_seen": 637640, + "step": 34, + "time_per_iteration": 2.7644619941711426 + }, + { + "auxiliary_loss_clip": 0.0237835, + "auxiliary_loss_mlp": 0.01269303, + "balance_loss_clip": 1.45181537, + "balance_loss_mlp": 1.11518967, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.726513935839375, + "language_loss": 0.76664925, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80312586, + "num_input_tokens_seen": 659710, + "step": 35, + "time_per_iteration": 2.8921875953674316 + }, + { + "auxiliary_loss_clip": 0.02349098, + "auxiliary_loss_mlp": 0.01274919, + "balance_loss_clip": 1.4645474, + "balance_loss_mlp": 1.13139129, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 4.391738830652312, + "language_loss": 0.88711071, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92335081, + "num_input_tokens_seen": 679670, + "step": 36, + "time_per_iteration": 2.7051918506622314 + }, + { + "auxiliary_loss_clip": 0.02291043, + "auxiliary_loss_mlp": 0.01334067, + "balance_loss_clip": 1.4514792, + "balance_loss_mlp": 1.18796396, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 3.317240939905439, + "language_loss": 0.92799407, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.9642452, + "num_input_tokens_seen": 700170, + "step": 37, + "time_per_iteration": 2.736189842224121 + }, + { + "auxiliary_loss_clip": 0.02251115, + "auxiliary_loss_mlp": 0.01277599, + "balance_loss_clip": 1.44627595, + "balance_loss_mlp": 1.15610075, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 7.254900237501925, + "language_loss": 1.04009855, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07538557, + "num_input_tokens_seen": 718545, + "step": 38, + "time_per_iteration": 2.717092275619507 + }, + { + "auxiliary_loss_clip": 0.02221873, + "auxiliary_loss_mlp": 0.01258488, + "balance_loss_clip": 1.44061971, + "balance_loss_mlp": 1.13555932, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.3980310601176247, + "language_loss": 0.8540687, + "learning_rate": 2.358792165262154e-06, + "loss": 0.88887233, + "num_input_tokens_seen": 739865, + "step": 39, + "time_per_iteration": 2.729814291000366 + }, + { + "auxiliary_loss_clip": 0.02196512, + "auxiliary_loss_mlp": 0.01250654, + "balance_loss_clip": 1.43028092, + "balance_loss_mlp": 1.12200332, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.6927796083967332, + "language_loss": 0.90215278, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93662447, + "num_input_tokens_seen": 755770, + "step": 40, + "time_per_iteration": 2.652189016342163 + }, + { + "auxiliary_loss_clip": 0.02148083, + "auxiliary_loss_mlp": 0.01277126, + "balance_loss_clip": 1.42066431, + "balance_loss_mlp": 1.16330528, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 4.20323719666801, + "language_loss": 0.93329579, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96754795, + "num_input_tokens_seen": 773440, + "step": 41, + "time_per_iteration": 2.6637909412384033 + }, + { + "auxiliary_loss_clip": 0.02113187, + "auxiliary_loss_mlp": 0.01258577, + "balance_loss_clip": 1.41261578, + "balance_loss_mlp": 1.15424466, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 2.842144062106217, + "language_loss": 0.97722417, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.01094186, + "num_input_tokens_seen": 790455, + "step": 42, + "time_per_iteration": 2.6459341049194336 + }, + { + "auxiliary_loss_clip": 0.02075838, + "auxiliary_loss_mlp": 0.01304844, + "balance_loss_clip": 1.41428459, + "balance_loss_mlp": 1.19660187, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.4960241772092204, + "language_loss": 0.97440422, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.00821102, + "num_input_tokens_seen": 810645, + "step": 43, + "time_per_iteration": 2.7594943046569824 + }, + { + "auxiliary_loss_clip": 0.02095906, + "auxiliary_loss_mlp": 0.01318647, + "balance_loss_clip": 1.41281855, + "balance_loss_mlp": 1.20506418, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.0273665833954135, + "language_loss": 0.9364934, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.97063887, + "num_input_tokens_seen": 827470, + "step": 44, + "time_per_iteration": 2.692615270614624 + }, + { + "auxiliary_loss_clip": 0.02060208, + "auxiliary_loss_mlp": 0.01275303, + "balance_loss_clip": 1.41113913, + "balance_loss_mlp": 1.17612088, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 2.361400658571967, + "language_loss": 0.98666775, + "learning_rate": 2.450927955901469e-06, + "loss": 1.02002287, + "num_input_tokens_seen": 847285, + "step": 45, + "time_per_iteration": 2.7195322513580322 + }, + { + "auxiliary_loss_clip": 0.02033795, + "auxiliary_loss_mlp": 0.01232111, + "balance_loss_clip": 1.3965255, + "balance_loss_mlp": 1.1438005, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 2.195102108830285, + "language_loss": 1.02617097, + "learning_rate": 2.465079122983384e-06, + "loss": 1.05883002, + "num_input_tokens_seen": 867545, + "step": 46, + "time_per_iteration": 2.707385540008545 + }, + { + "auxiliary_loss_clip": 0.01998348, + "auxiliary_loss_mlp": 0.01274711, + "balance_loss_clip": 1.38709247, + "balance_loss_mlp": 1.18301487, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.7655537814074775, + "language_loss": 0.88142788, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91415852, + "num_input_tokens_seen": 889915, + "step": 47, + "time_per_iteration": 2.8250505924224854 + }, + { + "auxiliary_loss_clip": 0.01959817, + "auxiliary_loss_mlp": 0.01257752, + "balance_loss_clip": 1.3761456, + "balance_loss_mlp": 1.17049098, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 3.1940090818211533, + "language_loss": 0.87990278, + "learning_rate": 2.492481223656015e-06, + "loss": 0.9120785, + "num_input_tokens_seen": 908975, + "step": 48, + "time_per_iteration": 2.6966872215270996 + }, + { + "auxiliary_loss_clip": 0.01959587, + "auxiliary_loss_mlp": 0.01244453, + "balance_loss_clip": 1.36591625, + "balance_loss_mlp": 1.15137434, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.2473168101519976, + "language_loss": 0.89837682, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.93041724, + "num_input_tokens_seen": 929810, + "step": 49, + "time_per_iteration": 2.7578954696655273 + }, + { + "auxiliary_loss_clip": 0.0195209, + "auxiliary_loss_mlp": 0.01236839, + "balance_loss_clip": 1.36050057, + "balance_loss_mlp": 1.15086579, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 2.678205539545924, + "language_loss": 0.91038376, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94227308, + "num_input_tokens_seen": 948650, + "step": 50, + "time_per_iteration": 2.7679762840270996 + }, + { + "auxiliary_loss_clip": 0.01949342, + "auxiliary_loss_mlp": 0.01201979, + "balance_loss_clip": 1.36510336, + "balance_loss_mlp": 1.11948586, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 6.998995439329058, + "language_loss": 0.87146574, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90297896, + "num_input_tokens_seen": 966455, + "step": 51, + "time_per_iteration": 2.80306339263916 + }, + { + "auxiliary_loss_clip": 0.0190609, + "auxiliary_loss_mlp": 0.01209201, + "balance_loss_clip": 1.35485411, + "balance_loss_mlp": 1.12952113, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.0861152285633393, + "language_loss": 0.95259207, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98374504, + "num_input_tokens_seen": 988110, + "step": 52, + "time_per_iteration": 2.896541118621826 + }, + { + "auxiliary_loss_clip": 0.01902539, + "auxiliary_loss_mlp": 0.01242914, + "balance_loss_clip": 1.35093045, + "balance_loss_mlp": 1.16213787, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 7.435644268017929, + "language_loss": 0.92274666, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95420128, + "num_input_tokens_seen": 1008550, + "step": 53, + "time_per_iteration": 2.766141891479492 + }, + { + "auxiliary_loss_clip": 0.01892641, + "auxiliary_loss_mlp": 0.01197169, + "balance_loss_clip": 1.35497916, + "balance_loss_mlp": 1.11548698, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.450631049511502, + "language_loss": 0.82807302, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.85897124, + "num_input_tokens_seen": 1026840, + "step": 54, + "time_per_iteration": 4.120918273925781 + }, + { + "auxiliary_loss_clip": 0.01889992, + "auxiliary_loss_mlp": 0.0120935, + "balance_loss_clip": 1.34473562, + "balance_loss_mlp": 1.12957573, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.5900754745378096, + "language_loss": 0.81334317, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84433663, + "num_input_tokens_seen": 1048875, + "step": 55, + "time_per_iteration": 4.219754695892334 + }, + { + "auxiliary_loss_clip": 0.01879125, + "auxiliary_loss_mlp": 0.0120095, + "balance_loss_clip": 1.33860755, + "balance_loss_mlp": 1.1210326, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.659004278044306, + "language_loss": 0.87156689, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.90236759, + "num_input_tokens_seen": 1066435, + "step": 56, + "time_per_iteration": 2.8395962715148926 + }, + { + "auxiliary_loss_clip": 0.01880357, + "auxiliary_loss_mlp": 0.0116199, + "balance_loss_clip": 1.33138347, + "balance_loss_mlp": 1.08779407, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 1.935221967321181, + "language_loss": 0.92988372, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.96030712, + "num_input_tokens_seen": 1090330, + "step": 57, + "time_per_iteration": 2.7686638832092285 + }, + { + "auxiliary_loss_clip": 0.01843143, + "auxiliary_loss_mlp": 0.01215065, + "balance_loss_clip": 1.33584261, + "balance_loss_mlp": 1.1426816, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.15414077833082, + "language_loss": 0.99566519, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02624726, + "num_input_tokens_seen": 1109840, + "step": 58, + "time_per_iteration": 2.8083763122558594 + }, + { + "auxiliary_loss_clip": 0.01824337, + "auxiliary_loss_mlp": 0.01196799, + "balance_loss_clip": 1.32193422, + "balance_loss_mlp": 1.12532127, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.1868428241543847, + "language_loss": 0.88220567, + "learning_rate": 2.625331386578098e-06, + "loss": 0.91241705, + "num_input_tokens_seen": 1128415, + "step": 59, + "time_per_iteration": 2.7655038833618164 + }, + { + "auxiliary_loss_clip": 0.01845515, + "auxiliary_loss_mlp": 0.01158162, + "balance_loss_clip": 1.3304832, + "balance_loss_mlp": 1.0851109, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.056067662422279, + "language_loss": 0.93512869, + "learning_rate": 2.63615268640451e-06, + "loss": 0.9651655, + "num_input_tokens_seen": 1146515, + "step": 60, + "time_per_iteration": 2.7158358097076416 + }, + { + "auxiliary_loss_clip": 0.01825818, + "auxiliary_loss_mlp": 0.01171191, + "balance_loss_clip": 1.3151412, + "balance_loss_mlp": 1.10262179, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 2.191723713012442, + "language_loss": 0.89917547, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.92914557, + "num_input_tokens_seen": 1166330, + "step": 61, + "time_per_iteration": 2.739276885986328 + }, + { + "auxiliary_loss_clip": 0.01809171, + "auxiliary_loss_mlp": 0.01142422, + "balance_loss_clip": 1.31125736, + "balance_loss_mlp": 1.07523572, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 1.900677919231022, + "language_loss": 0.88398385, + "learning_rate": 2.657264485425803e-06, + "loss": 0.91349977, + "num_input_tokens_seen": 1186010, + "step": 62, + "time_per_iteration": 2.707585573196411 + }, + { + "auxiliary_loss_clip": 0.01789304, + "auxiliary_loss_mlp": 0.01161585, + "balance_loss_clip": 1.30166042, + "balance_loss_mlp": 1.09153748, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.7560077151168842, + "language_loss": 0.96264374, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99215263, + "num_input_tokens_seen": 1204985, + "step": 63, + "time_per_iteration": 2.656906843185425 + }, + { + "auxiliary_loss_clip": 0.01796334, + "auxiliary_loss_mlp": 0.01171342, + "balance_loss_clip": 1.3090297, + "balance_loss_mlp": 1.10444212, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 3.893973202840674, + "language_loss": 0.98791313, + "learning_rate": 2.677705954159056e-06, + "loss": 1.01758993, + "num_input_tokens_seen": 1223545, + "step": 64, + "time_per_iteration": 2.6722960472106934 + }, + { + "auxiliary_loss_clip": 0.01803413, + "auxiliary_loss_mlp": 0.01148376, + "balance_loss_clip": 1.30799556, + "balance_loss_mlp": 1.08066511, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.7879410686899453, + "language_loss": 0.85275209, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88226998, + "num_input_tokens_seen": 1241175, + "step": 65, + "time_per_iteration": 2.6174910068511963 + }, + { + "auxiliary_loss_clip": 0.017804, + "auxiliary_loss_mlp": 0.01155168, + "balance_loss_clip": 1.29514968, + "balance_loss_mlp": 1.08750486, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 1.9571751601986513, + "language_loss": 0.85182297, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88117868, + "num_input_tokens_seen": 1259315, + "step": 66, + "time_per_iteration": 2.658709764480591 + }, + { + "auxiliary_loss_clip": 0.01783856, + "auxiliary_loss_mlp": 0.0115392, + "balance_loss_clip": 1.29331565, + "balance_loss_mlp": 1.07791221, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.3698059868821684, + "language_loss": 0.96468216, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99406004, + "num_input_tokens_seen": 1277055, + "step": 67, + "time_per_iteration": 2.641746759414673 + }, + { + "auxiliary_loss_clip": 0.01754411, + "auxiliary_loss_mlp": 0.01155409, + "balance_loss_clip": 1.28637362, + "balance_loss_mlp": 1.08288217, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.7406767736009443, + "language_loss": 0.94616473, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97526288, + "num_input_tokens_seen": 1294355, + "step": 68, + "time_per_iteration": 2.64577579498291 + }, + { + "auxiliary_loss_clip": 0.01749401, + "auxiliary_loss_mlp": 0.0115652, + "balance_loss_clip": 1.28411579, + "balance_loss_mlp": 1.08666384, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 2.1205839470398553, + "language_loss": 0.95834875, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98740798, + "num_input_tokens_seen": 1313525, + "step": 69, + "time_per_iteration": 2.646423101425171 + }, + { + "auxiliary_loss_clip": 0.01742981, + "auxiliary_loss_mlp": 0.01158017, + "balance_loss_clip": 1.28688824, + "balance_loss_mlp": 1.09292889, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.3019682152894108, + "language_loss": 0.98015571, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00916576, + "num_input_tokens_seen": 1330505, + "step": 70, + "time_per_iteration": 2.632352352142334 + }, + { + "auxiliary_loss_clip": 0.01749632, + "auxiliary_loss_mlp": 0.01144484, + "balance_loss_clip": 1.27708316, + "balance_loss_mlp": 1.07515192, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.6068441171994383, + "language_loss": 0.9399153, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96885651, + "num_input_tokens_seen": 1349615, + "step": 71, + "time_per_iteration": 2.668095588684082 + }, + { + "auxiliary_loss_clip": 0.01823203, + "auxiliary_loss_mlp": 0.01299011, + "balance_loss_clip": 1.43772125, + "balance_loss_mlp": 1.26105523, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.410621039578377, + "language_loss": 0.65646505, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68768716, + "num_input_tokens_seen": 1410275, + "step": 72, + "time_per_iteration": 3.161329507827759 + }, + { + "auxiliary_loss_clip": 0.01806294, + "auxiliary_loss_mlp": 0.01279273, + "balance_loss_clip": 1.43114007, + "balance_loss_mlp": 1.24131632, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.3489531348975827, + "language_loss": 0.63693726, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66779292, + "num_input_tokens_seen": 1473020, + "step": 73, + "time_per_iteration": 3.2129199504852295 + }, + { + "auxiliary_loss_clip": 0.01722247, + "auxiliary_loss_mlp": 0.01143778, + "balance_loss_clip": 1.26832461, + "balance_loss_mlp": 1.0759716, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 5.099355792569285, + "language_loss": 0.85942364, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88808388, + "num_input_tokens_seen": 1490385, + "step": 74, + "time_per_iteration": 2.6345021724700928 + }, + { + "auxiliary_loss_clip": 0.01726457, + "auxiliary_loss_mlp": 0.01162006, + "balance_loss_clip": 1.26792169, + "balance_loss_mlp": 1.09281671, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.014284118337651, + "language_loss": 0.97076827, + "learning_rate": 2.779824149153005e-06, + "loss": 0.99965286, + "num_input_tokens_seen": 1509725, + "step": 75, + "time_per_iteration": 2.652470350265503 + }, + { + "auxiliary_loss_clip": 0.01704382, + "auxiliary_loss_mlp": 0.01143069, + "balance_loss_clip": 1.26351404, + "balance_loss_mlp": 1.07655072, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.3505645788118255, + "language_loss": 0.87827659, + "learning_rate": 2.788352117317012e-06, + "loss": 0.90675104, + "num_input_tokens_seen": 1527245, + "step": 76, + "time_per_iteration": 2.6256396770477295 + }, + { + "auxiliary_loss_clip": 0.01705014, + "auxiliary_loss_mlp": 0.01147183, + "balance_loss_clip": 1.26219273, + "balance_loss_mlp": 1.07746994, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 1.8559931700429257, + "language_loss": 0.91905999, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94758201, + "num_input_tokens_seen": 1548930, + "step": 77, + "time_per_iteration": 2.687666893005371 + }, + { + "auxiliary_loss_clip": 0.01694467, + "auxiliary_loss_mlp": 0.01168123, + "balance_loss_clip": 1.26229167, + "balance_loss_mlp": 1.09764647, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.1652402395719954, + "language_loss": 0.92441529, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95304114, + "num_input_tokens_seen": 1565695, + "step": 78, + "time_per_iteration": 2.58862042427063 + }, + { + "auxiliary_loss_clip": 0.0169101, + "auxiliary_loss_mlp": 0.01155126, + "balance_loss_clip": 1.25769186, + "balance_loss_mlp": 1.08693886, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.108146663949779, + "language_loss": 0.82620943, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85467076, + "num_input_tokens_seen": 1582625, + "step": 79, + "time_per_iteration": 2.6427085399627686 + }, + { + "auxiliary_loss_clip": 0.01703252, + "auxiliary_loss_mlp": 0.01134423, + "balance_loss_clip": 1.25972986, + "balance_loss_mlp": 1.06428039, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 2.1708779551822315, + "language_loss": 0.91327262, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94164932, + "num_input_tokens_seen": 1601725, + "step": 80, + "time_per_iteration": 2.6145427227020264 + }, + { + "auxiliary_loss_clip": 0.01672714, + "auxiliary_loss_mlp": 0.01144933, + "balance_loss_clip": 1.25285006, + "balance_loss_mlp": 1.07450438, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.2416963077029606, + "language_loss": 0.94922227, + "learning_rate": 2.829375683533245e-06, + "loss": 0.97739875, + "num_input_tokens_seen": 1622420, + "step": 81, + "time_per_iteration": 2.6512176990509033 + }, + { + "auxiliary_loss_clip": 0.01686987, + "auxiliary_loss_mlp": 0.01148845, + "balance_loss_clip": 1.25672615, + "balance_loss_mlp": 1.08256507, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 3.669547251248084, + "language_loss": 0.9616437, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99000204, + "num_input_tokens_seen": 1640715, + "step": 82, + "time_per_iteration": 2.643275499343872 + }, + { + "auxiliary_loss_clip": 0.01670098, + "auxiliary_loss_mlp": 0.01158464, + "balance_loss_clip": 1.24507713, + "balance_loss_mlp": 1.08875036, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 1.9758254064941683, + "language_loss": 0.86667681, + "learning_rate": 2.84508017388607e-06, + "loss": 0.89496243, + "num_input_tokens_seen": 1662210, + "step": 83, + "time_per_iteration": 2.670858144760132 + }, + { + "auxiliary_loss_clip": 0.01662818, + "auxiliary_loss_mlp": 0.01157515, + "balance_loss_clip": 1.24631083, + "balance_loss_mlp": 1.08756244, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 2.9897262230439945, + "language_loss": 0.91794491, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94614816, + "num_input_tokens_seen": 1681070, + "step": 84, + "time_per_iteration": 2.6115405559539795 + }, + { + "auxiliary_loss_clip": 0.01651316, + "auxiliary_loss_mlp": 0.01176073, + "balance_loss_clip": 1.35944915, + "balance_loss_mlp": 1.13964319, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.4054310774635628, + "language_loss": 0.62552822, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65380204, + "num_input_tokens_seen": 1747140, + "step": 85, + "time_per_iteration": 3.2172696590423584 + }, + { + "auxiliary_loss_clip": 0.01649871, + "auxiliary_loss_mlp": 0.01128349, + "balance_loss_clip": 1.23618102, + "balance_loss_mlp": 1.05791974, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.7500560553946958, + "language_loss": 0.9083637, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.9361459, + "num_input_tokens_seen": 1767475, + "step": 86, + "time_per_iteration": 2.6689977645874023 + }, + { + "auxiliary_loss_clip": 0.01653536, + "auxiliary_loss_mlp": 0.01160042, + "balance_loss_clip": 1.24235368, + "balance_loss_mlp": 1.08913648, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.4677866560037742, + "language_loss": 0.82005864, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84819448, + "num_input_tokens_seen": 1784980, + "step": 87, + "time_per_iteration": 2.670135021209717 + }, + { + "auxiliary_loss_clip": 0.01643607, + "auxiliary_loss_mlp": 0.01154, + "balance_loss_clip": 1.24081039, + "balance_loss_mlp": 1.0852406, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.961193686878651, + "language_loss": 0.9573524, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.9853285, + "num_input_tokens_seen": 1803030, + "step": 88, + "time_per_iteration": 2.6151418685913086 + }, + { + "auxiliary_loss_clip": 0.01659738, + "auxiliary_loss_mlp": 0.01148207, + "balance_loss_clip": 1.24067593, + "balance_loss_mlp": 1.08106875, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 2.347517995338878, + "language_loss": 0.85947311, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88755256, + "num_input_tokens_seen": 1822865, + "step": 89, + "time_per_iteration": 2.6218857765197754 + }, + { + "auxiliary_loss_clip": 0.01646946, + "auxiliary_loss_mlp": 0.0113309, + "balance_loss_clip": 1.23444152, + "balance_loss_mlp": 1.06652355, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 2.241965829183184, + "language_loss": 0.91555059, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94335091, + "num_input_tokens_seen": 1842435, + "step": 90, + "time_per_iteration": 2.6585636138916016 + }, + { + "auxiliary_loss_clip": 0.016283, + "auxiliary_loss_mlp": 0.01133835, + "balance_loss_clip": 1.23020971, + "balance_loss_mlp": 1.06564736, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.145498282744817, + "language_loss": 0.8583855, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88600683, + "num_input_tokens_seen": 1860065, + "step": 91, + "time_per_iteration": 2.5992393493652344 + }, + { + "auxiliary_loss_clip": 0.01626555, + "auxiliary_loss_mlp": 0.01138118, + "balance_loss_clip": 1.22574973, + "balance_loss_mlp": 1.07236195, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.2441876605118694, + "language_loss": 0.86894584, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89659262, + "num_input_tokens_seen": 1878135, + "step": 92, + "time_per_iteration": 2.618533134460449 + }, + { + "auxiliary_loss_clip": 0.01619025, + "auxiliary_loss_mlp": 0.0117656, + "balance_loss_clip": 1.21727514, + "balance_loss_mlp": 1.10861111, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 3.0834147923948843, + "language_loss": 0.91943467, + "learning_rate": 2.918324080615938e-06, + "loss": 0.9473905, + "num_input_tokens_seen": 1894895, + "step": 93, + "time_per_iteration": 3.972862720489502 + }, + { + "auxiliary_loss_clip": 0.01630959, + "auxiliary_loss_mlp": 0.01151034, + "balance_loss_clip": 1.22391486, + "balance_loss_mlp": 1.0800333, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 2.185108478248704, + "language_loss": 0.87443423, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90225422, + "num_input_tokens_seen": 1913220, + "step": 94, + "time_per_iteration": 4.0016748905181885 + }, + { + "auxiliary_loss_clip": 0.01567896, + "auxiliary_loss_mlp": 0.01056951, + "balance_loss_clip": 1.31671977, + "balance_loss_mlp": 1.02242851, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.37905093686974, + "language_loss": 0.6814155, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70766395, + "num_input_tokens_seen": 1970970, + "step": 95, + "time_per_iteration": 4.422675848007202 + }, + { + "auxiliary_loss_clip": 0.0160875, + "auxiliary_loss_mlp": 0.0115055, + "balance_loss_clip": 1.21243787, + "balance_loss_mlp": 1.08384049, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 2.32620823946676, + "language_loss": 0.90304142, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.93063444, + "num_input_tokens_seen": 1988930, + "step": 96, + "time_per_iteration": 3.933746576309204 + }, + { + "auxiliary_loss_clip": 0.01600813, + "auxiliary_loss_mlp": 0.01137477, + "balance_loss_clip": 1.21507454, + "balance_loss_mlp": 1.07539272, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 2.6204599891678457, + "language_loss": 0.89715993, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92454278, + "num_input_tokens_seen": 2006285, + "step": 97, + "time_per_iteration": 2.60235595703125 + }, + { + "auxiliary_loss_clip": 0.01587801, + "auxiliary_loss_mlp": 0.01133789, + "balance_loss_clip": 1.20708227, + "balance_loss_mlp": 1.06507659, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.0322866039359617, + "language_loss": 0.76363957, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79085547, + "num_input_tokens_seen": 2024905, + "step": 98, + "time_per_iteration": 2.591341733932495 + }, + { + "auxiliary_loss_clip": 0.01539643, + "auxiliary_loss_mlp": 0.01040228, + "balance_loss_clip": 1.29784811, + "balance_loss_mlp": 1.00627744, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.664661326294011, + "language_loss": 0.65456271, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68036139, + "num_input_tokens_seen": 2086220, + "step": 99, + "time_per_iteration": 3.197314739227295 + }, + { + "auxiliary_loss_clip": 0.0158858, + "auxiliary_loss_mlp": 0.01141251, + "balance_loss_clip": 1.20651817, + "balance_loss_mlp": 1.07096481, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 1.9390287261590322, + "language_loss": 0.90803373, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93533206, + "num_input_tokens_seen": 2103365, + "step": 100, + "time_per_iteration": 2.6139118671417236 + }, + { + "auxiliary_loss_clip": 0.01600248, + "auxiliary_loss_mlp": 0.01146735, + "balance_loss_clip": 1.2089324, + "balance_loss_mlp": 1.08059716, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 2.0278910381237685, + "language_loss": 0.90985447, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93732429, + "num_input_tokens_seen": 2121995, + "step": 101, + "time_per_iteration": 2.5757923126220703 + }, + { + "auxiliary_loss_clip": 0.01587865, + "auxiliary_loss_mlp": 0.0114934, + "balance_loss_clip": 1.209023, + "balance_loss_mlp": 1.07929277, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.1615040216640615, + "language_loss": 0.90539706, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.93276918, + "num_input_tokens_seen": 2141815, + "step": 102, + "time_per_iteration": 2.629912853240967 + }, + { + "auxiliary_loss_clip": 0.01582211, + "auxiliary_loss_mlp": 0.01131483, + "balance_loss_clip": 1.20589662, + "balance_loss_mlp": 1.06696725, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 3.2030167113942607, + "language_loss": 0.87929732, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.90643418, + "num_input_tokens_seen": 2161125, + "step": 103, + "time_per_iteration": 2.636082887649536 + }, + { + "auxiliary_loss_clip": 0.01581841, + "auxiliary_loss_mlp": 0.01136058, + "balance_loss_clip": 1.20760822, + "balance_loss_mlp": 1.06977773, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.8735352852882923, + "language_loss": 0.9355042, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96268308, + "num_input_tokens_seen": 2179510, + "step": 104, + "time_per_iteration": 2.6202988624572754 + }, + { + "auxiliary_loss_clip": 0.01573431, + "auxiliary_loss_mlp": 0.01146299, + "balance_loss_clip": 1.1980418, + "balance_loss_mlp": 1.08240306, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.7313439084549205, + "language_loss": 0.96352613, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99072349, + "num_input_tokens_seen": 2197870, + "step": 105, + "time_per_iteration": 2.5902717113494873 + }, + { + "auxiliary_loss_clip": 0.01570953, + "auxiliary_loss_mlp": 0.01158322, + "balance_loss_clip": 1.19701767, + "balance_loss_mlp": 1.08727288, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.553229446543694, + "language_loss": 0.86808527, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89537799, + "num_input_tokens_seen": 2217495, + "step": 106, + "time_per_iteration": 2.647836923599243 + }, + { + "auxiliary_loss_clip": 0.01555072, + "auxiliary_loss_mlp": 0.0114002, + "balance_loss_clip": 1.18401361, + "balance_loss_mlp": 1.07216597, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.1469546870305622, + "language_loss": 0.83395767, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86090857, + "num_input_tokens_seen": 2236520, + "step": 107, + "time_per_iteration": 2.5631537437438965 + }, + { + "auxiliary_loss_clip": 0.01482584, + "auxiliary_loss_mlp": 0.01035622, + "balance_loss_clip": 1.25975657, + "balance_loss_mlp": 1.00396001, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.998404320905639, + "language_loss": 0.64882183, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67400384, + "num_input_tokens_seen": 2300140, + "step": 108, + "time_per_iteration": 3.1637301445007324 + }, + { + "auxiliary_loss_clip": 0.01548455, + "auxiliary_loss_mlp": 0.0113263, + "balance_loss_clip": 1.18692231, + "balance_loss_mlp": 1.06463289, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 2.053521313218508, + "language_loss": 0.97542548, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00223637, + "num_input_tokens_seen": 2317320, + "step": 109, + "time_per_iteration": 2.5526187419891357 + }, + { + "auxiliary_loss_clip": 0.01545556, + "auxiliary_loss_mlp": 0.01140444, + "balance_loss_clip": 1.18802834, + "balance_loss_mlp": 1.07449782, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.630666508662532, + "language_loss": 0.8404845, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.8673445, + "num_input_tokens_seen": 2337820, + "step": 110, + "time_per_iteration": 2.616708993911743 + }, + { + "auxiliary_loss_clip": 0.01540502, + "auxiliary_loss_mlp": 0.01147396, + "balance_loss_clip": 1.18204618, + "balance_loss_mlp": 1.08125901, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.010077669083485, + "language_loss": 0.82815057, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85502958, + "num_input_tokens_seen": 2358560, + "step": 111, + "time_per_iteration": 2.625277280807495 + }, + { + "auxiliary_loss_clip": 0.01541746, + "auxiliary_loss_mlp": 0.01132896, + "balance_loss_clip": 1.18560529, + "balance_loss_mlp": 1.06923842, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 2.3265765920509334, + "language_loss": 0.93846434, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96521074, + "num_input_tokens_seen": 2379005, + "step": 112, + "time_per_iteration": 2.611826181411743 + }, + { + "auxiliary_loss_clip": 0.01544588, + "auxiliary_loss_mlp": 0.01139652, + "balance_loss_clip": 1.18151045, + "balance_loss_mlp": 1.07608914, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.191456257024638, + "language_loss": 0.79307699, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81991935, + "num_input_tokens_seen": 2395610, + "step": 113, + "time_per_iteration": 2.5578553676605225 + }, + { + "auxiliary_loss_clip": 0.01533231, + "auxiliary_loss_mlp": 0.01133007, + "balance_loss_clip": 1.1781671, + "balance_loss_mlp": 1.06748986, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 2.4098368230777236, + "language_loss": 0.93183094, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95849335, + "num_input_tokens_seen": 2415005, + "step": 114, + "time_per_iteration": 2.589203357696533 + }, + { + "auxiliary_loss_clip": 0.01540303, + "auxiliary_loss_mlp": 0.01136206, + "balance_loss_clip": 1.17805719, + "balance_loss_mlp": 1.07645822, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.2584966292907476, + "language_loss": 0.94365168, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97041684, + "num_input_tokens_seen": 2433965, + "step": 115, + "time_per_iteration": 2.646357536315918 + }, + { + "auxiliary_loss_clip": 0.01533715, + "auxiliary_loss_mlp": 0.01117057, + "balance_loss_clip": 1.1748389, + "balance_loss_mlp": 1.05087209, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 2.70043417163262, + "language_loss": 0.81703532, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84354299, + "num_input_tokens_seen": 2451605, + "step": 116, + "time_per_iteration": 2.586735725402832 + }, + { + "auxiliary_loss_clip": 0.01526877, + "auxiliary_loss_mlp": 0.01127142, + "balance_loss_clip": 1.17653263, + "balance_loss_mlp": 1.06324625, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 3.307788139369655, + "language_loss": 0.87972999, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90627015, + "num_input_tokens_seen": 2472035, + "step": 117, + "time_per_iteration": 2.620471477508545 + }, + { + "auxiliary_loss_clip": 0.01524723, + "auxiliary_loss_mlp": 0.01145442, + "balance_loss_clip": 1.17411852, + "balance_loss_mlp": 1.08030558, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 2.9924321404075283, + "language_loss": 0.84531951, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87202114, + "num_input_tokens_seen": 2489285, + "step": 118, + "time_per_iteration": 2.5893595218658447 + }, + { + "auxiliary_loss_clip": 0.01534311, + "auxiliary_loss_mlp": 0.01162676, + "balance_loss_clip": 1.17496443, + "balance_loss_mlp": 1.09849334, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.3342046067825684, + "language_loss": 0.9922536, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.01922345, + "num_input_tokens_seen": 2506460, + "step": 119, + "time_per_iteration": 2.553607225418091 + }, + { + "auxiliary_loss_clip": 0.01536343, + "auxiliary_loss_mlp": 0.0112179, + "balance_loss_clip": 1.17107284, + "balance_loss_mlp": 1.05970621, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 15.184582650523994, + "language_loss": 0.8922199, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91880119, + "num_input_tokens_seen": 2525565, + "step": 120, + "time_per_iteration": 2.594038963317871 + }, + { + "auxiliary_loss_clip": 0.0152173, + "auxiliary_loss_mlp": 0.01132875, + "balance_loss_clip": 1.17209888, + "balance_loss_mlp": 1.06893134, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.8714293252338285, + "language_loss": 0.93179202, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.95833802, + "num_input_tokens_seen": 2546605, + "step": 121, + "time_per_iteration": 2.604733467102051 + }, + { + "auxiliary_loss_clip": 0.01523365, + "auxiliary_loss_mlp": 0.01147342, + "balance_loss_clip": 1.17240238, + "balance_loss_mlp": 1.08449531, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 3.459256696633899, + "language_loss": 0.9007864, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92749345, + "num_input_tokens_seen": 2560730, + "step": 122, + "time_per_iteration": 2.574723958969116 + }, + { + "auxiliary_loss_clip": 0.01518139, + "auxiliary_loss_mlp": 0.01144373, + "balance_loss_clip": 1.16636777, + "balance_loss_mlp": 1.07761562, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.3584371957887527, + "language_loss": 0.92536861, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.9519937, + "num_input_tokens_seen": 2579550, + "step": 123, + "time_per_iteration": 2.6269428730010986 + }, + { + "auxiliary_loss_clip": 0.01516059, + "auxiliary_loss_mlp": 0.01128002, + "balance_loss_clip": 1.1623075, + "balance_loss_mlp": 1.06648982, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 2.2815383880508238, + "language_loss": 0.70950246, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73594308, + "num_input_tokens_seen": 2600390, + "step": 124, + "time_per_iteration": 2.6536858081817627 + }, + { + "auxiliary_loss_clip": 0.01505263, + "auxiliary_loss_mlp": 0.01123382, + "balance_loss_clip": 1.16403329, + "balance_loss_mlp": 1.0602963, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.209027792278896, + "language_loss": 0.88152587, + "learning_rate": 3.108720342404542e-06, + "loss": 0.9078123, + "num_input_tokens_seen": 2620770, + "step": 125, + "time_per_iteration": 2.638362407684326 + }, + { + "auxiliary_loss_clip": 0.01518682, + "auxiliary_loss_mlp": 0.01142877, + "balance_loss_clip": 1.16336763, + "balance_loss_mlp": 1.08041179, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 3.2666806178006214, + "language_loss": 0.8218801, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.84849572, + "num_input_tokens_seen": 2639900, + "step": 126, + "time_per_iteration": 2.5847151279449463 + }, + { + "auxiliary_loss_clip": 0.0151346, + "auxiliary_loss_mlp": 0.01139835, + "balance_loss_clip": 1.16303134, + "balance_loss_mlp": 1.07870495, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 3.414985913792108, + "language_loss": 0.67462319, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.70115614, + "num_input_tokens_seen": 2657450, + "step": 127, + "time_per_iteration": 2.6407558917999268 + }, + { + "auxiliary_loss_clip": 0.01501589, + "auxiliary_loss_mlp": 0.01131695, + "balance_loss_clip": 1.16464043, + "balance_loss_mlp": 1.06851387, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 4.270137040519248, + "language_loss": 0.88051999, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90685284, + "num_input_tokens_seen": 2678150, + "step": 128, + "time_per_iteration": 2.612355947494507 + }, + { + "auxiliary_loss_clip": 0.01504377, + "auxiliary_loss_mlp": 0.01141049, + "balance_loss_clip": 1.16031742, + "balance_loss_mlp": 1.0778681, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.8014598610728807, + "language_loss": 0.84627002, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87272429, + "num_input_tokens_seen": 2698290, + "step": 129, + "time_per_iteration": 2.6156222820281982 + }, + { + "auxiliary_loss_clip": 0.01497321, + "auxiliary_loss_mlp": 0.01131018, + "balance_loss_clip": 1.15940428, + "balance_loss_mlp": 1.06778908, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.192384067395493, + "language_loss": 0.97199118, + "learning_rate": 3.133972684206866e-06, + "loss": 0.99827456, + "num_input_tokens_seen": 2717630, + "step": 130, + "time_per_iteration": 2.6140406131744385 + }, + { + "auxiliary_loss_clip": 0.01492265, + "auxiliary_loss_mlp": 0.01132502, + "balance_loss_clip": 1.15676177, + "balance_loss_mlp": 1.06884396, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.6004986070079252, + "language_loss": 0.82402104, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85026872, + "num_input_tokens_seen": 2735835, + "step": 131, + "time_per_iteration": 2.5636849403381348 + }, + { + "auxiliary_loss_clip": 0.01500218, + "auxiliary_loss_mlp": 0.01127216, + "balance_loss_clip": 1.15860033, + "balance_loss_mlp": 1.06613278, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 3.1391445150548836, + "language_loss": 0.82872903, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85500336, + "num_input_tokens_seen": 2756335, + "step": 132, + "time_per_iteration": 4.080858469009399 + }, + { + "auxiliary_loss_clip": 0.01491852, + "auxiliary_loss_mlp": 0.01126826, + "balance_loss_clip": 1.15346324, + "balance_loss_mlp": 1.06493282, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.430030618541236, + "language_loss": 0.9506011, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.97678792, + "num_input_tokens_seen": 2775090, + "step": 133, + "time_per_iteration": 3.9750640392303467 + }, + { + "auxiliary_loss_clip": 0.01487712, + "auxiliary_loss_mlp": 0.01127585, + "balance_loss_clip": 1.16184449, + "balance_loss_mlp": 1.06793308, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.667690260557188, + "language_loss": 0.73300362, + "learning_rate": 3.153484849651286e-06, + "loss": 0.75915658, + "num_input_tokens_seen": 2795320, + "step": 134, + "time_per_iteration": 4.0368804931640625 + }, + { + "auxiliary_loss_clip": 0.01483885, + "auxiliary_loss_mlp": 0.01129553, + "balance_loss_clip": 1.15032005, + "balance_loss_mlp": 1.06489432, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 3.2583640340397686, + "language_loss": 0.88490343, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.9110378, + "num_input_tokens_seen": 2812815, + "step": 135, + "time_per_iteration": 2.5706255435943604 + }, + { + "auxiliary_loss_clip": 0.01487791, + "auxiliary_loss_mlp": 0.01132749, + "balance_loss_clip": 1.15467739, + "balance_loss_mlp": 1.06737447, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.3872646448381247, + "language_loss": 0.89151216, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.91771758, + "num_input_tokens_seen": 2830445, + "step": 136, + "time_per_iteration": 2.5756802558898926 + }, + { + "auxiliary_loss_clip": 0.01483359, + "auxiliary_loss_mlp": 0.01107188, + "balance_loss_clip": 1.14921665, + "balance_loss_mlp": 1.04748821, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 2.2425191343364554, + "language_loss": 0.83894992, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86485541, + "num_input_tokens_seen": 2846965, + "step": 137, + "time_per_iteration": 2.6100833415985107 + }, + { + "auxiliary_loss_clip": 0.01480652, + "auxiliary_loss_mlp": 0.01117771, + "balance_loss_clip": 1.1488812, + "balance_loss_mlp": 1.05783308, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.7736266978645918, + "language_loss": 0.90182352, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.92780781, + "num_input_tokens_seen": 2867520, + "step": 138, + "time_per_iteration": 2.6025633811950684 + }, + { + "auxiliary_loss_clip": 0.01469131, + "auxiliary_loss_mlp": 0.01120798, + "balance_loss_clip": 1.14758015, + "balance_loss_mlp": 1.05647254, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.5326850374808063, + "language_loss": 0.91454554, + "learning_rate": 3.177071816289865e-06, + "loss": 0.94044477, + "num_input_tokens_seen": 2885675, + "step": 139, + "time_per_iteration": 2.5911431312561035 + }, + { + "auxiliary_loss_clip": 0.01486186, + "auxiliary_loss_mlp": 0.01122771, + "balance_loss_clip": 1.15409458, + "balance_loss_mlp": 1.06082976, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.2806446367544484, + "language_loss": 0.85681903, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88290858, + "num_input_tokens_seen": 2905960, + "step": 140, + "time_per_iteration": 2.6238269805908203 + }, + { + "auxiliary_loss_clip": 0.01472527, + "auxiliary_loss_mlp": 0.01119801, + "balance_loss_clip": 1.14867771, + "balance_loss_mlp": 1.05890918, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 3.897387513384623, + "language_loss": 0.84215617, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86807942, + "num_input_tokens_seen": 2922780, + "step": 141, + "time_per_iteration": 2.547086715698242 + }, + { + "auxiliary_loss_clip": 0.01476949, + "auxiliary_loss_mlp": 0.01134924, + "balance_loss_clip": 1.14633572, + "balance_loss_mlp": 1.07365096, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.201710229460226, + "language_loss": 0.8140502, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.84016895, + "num_input_tokens_seen": 2938765, + "step": 142, + "time_per_iteration": 2.5310564041137695 + }, + { + "auxiliary_loss_clip": 0.01394938, + "auxiliary_loss_mlp": 0.01033514, + "balance_loss_clip": 1.20332384, + "balance_loss_mlp": 1.00261533, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0481380678861736, + "language_loss": 0.67008018, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69436479, + "num_input_tokens_seen": 3006665, + "step": 143, + "time_per_iteration": 3.2908971309661865 + }, + { + "auxiliary_loss_clip": 0.01468626, + "auxiliary_loss_mlp": 0.01123175, + "balance_loss_clip": 1.14668345, + "balance_loss_mlp": 1.06228304, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.1555867117648204, + "language_loss": 0.84040892, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86632693, + "num_input_tokens_seen": 3024335, + "step": 144, + "time_per_iteration": 2.5745999813079834 + }, + { + "auxiliary_loss_clip": 0.01455917, + "auxiliary_loss_mlp": 0.01112389, + "balance_loss_clip": 1.14087963, + "balance_loss_mlp": 1.04930329, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 1.8360984748164988, + "language_loss": 0.88471884, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91040194, + "num_input_tokens_seen": 3043300, + "step": 145, + "time_per_iteration": 2.5579302310943604 + }, + { + "auxiliary_loss_clip": 0.01471664, + "auxiliary_loss_mlp": 0.0112909, + "balance_loss_clip": 1.14207482, + "balance_loss_mlp": 1.06619477, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.7737621531723198, + "language_loss": 0.85963774, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88564527, + "num_input_tokens_seen": 3064610, + "step": 146, + "time_per_iteration": 2.5866341590881348 + }, + { + "auxiliary_loss_clip": 0.01375054, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.18814135, + "balance_loss_mlp": 1.00082064, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8628216619847996, + "language_loss": 0.60204238, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62609679, + "num_input_tokens_seen": 3130385, + "step": 147, + "time_per_iteration": 3.224196672439575 + }, + { + "auxiliary_loss_clip": 0.01463024, + "auxiliary_loss_mlp": 0.01122554, + "balance_loss_clip": 1.14601707, + "balance_loss_mlp": 1.06466651, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 2.90727007877629, + "language_loss": 0.84645188, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87230766, + "num_input_tokens_seen": 3149760, + "step": 148, + "time_per_iteration": 2.5968053340911865 + }, + { + "auxiliary_loss_clip": 0.01468888, + "auxiliary_loss_mlp": 0.01144426, + "balance_loss_clip": 1.14840221, + "balance_loss_mlp": 1.07991028, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.740979599299946, + "language_loss": 0.88686371, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91299689, + "num_input_tokens_seen": 3164500, + "step": 149, + "time_per_iteration": 2.5281078815460205 + }, + { + "auxiliary_loss_clip": 0.01464736, + "auxiliary_loss_mlp": 0.01109027, + "balance_loss_clip": 1.14274573, + "balance_loss_mlp": 1.05090094, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.0048823269355913, + "language_loss": 0.93001103, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95574868, + "num_input_tokens_seen": 3182455, + "step": 150, + "time_per_iteration": 2.6309010982513428 + }, + { + "auxiliary_loss_clip": 0.01453068, + "auxiliary_loss_mlp": 0.01110994, + "balance_loss_clip": 1.13838744, + "balance_loss_mlp": 1.05467951, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1.8804704290952083, + "language_loss": 0.74305212, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76869273, + "num_input_tokens_seen": 3203995, + "step": 151, + "time_per_iteration": 2.6726725101470947 + }, + { + "auxiliary_loss_clip": 0.01463953, + "auxiliary_loss_mlp": 0.01127498, + "balance_loss_clip": 1.14192545, + "balance_loss_mlp": 1.06899047, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.8661847016129176, + "language_loss": 0.88202453, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90793908, + "num_input_tokens_seen": 3222575, + "step": 152, + "time_per_iteration": 2.586298704147339 + }, + { + "auxiliary_loss_clip": 0.01464188, + "auxiliary_loss_mlp": 0.01121655, + "balance_loss_clip": 1.14814472, + "balance_loss_mlp": 1.0626229, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 4.25413307712974, + "language_loss": 0.84180242, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86766088, + "num_input_tokens_seen": 3240180, + "step": 153, + "time_per_iteration": 2.555903911590576 + }, + { + "auxiliary_loss_clip": 0.01453985, + "auxiliary_loss_mlp": 0.01136568, + "balance_loss_clip": 1.139691, + "balance_loss_mlp": 1.07600951, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.9241127881808737, + "language_loss": 0.89898205, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92488766, + "num_input_tokens_seen": 3259800, + "step": 154, + "time_per_iteration": 2.59694766998291 + }, + { + "auxiliary_loss_clip": 0.01458693, + "auxiliary_loss_mlp": 0.01157528, + "balance_loss_clip": 1.14241755, + "balance_loss_mlp": 1.09842443, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.016152984236956, + "language_loss": 0.89719486, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92335707, + "num_input_tokens_seen": 3280400, + "step": 155, + "time_per_iteration": 2.6048922538757324 + }, + { + "auxiliary_loss_clip": 0.01462118, + "auxiliary_loss_mlp": 0.011184, + "balance_loss_clip": 1.14079165, + "balance_loss_mlp": 1.06098843, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 2.352405376931058, + "language_loss": 0.8663311, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89213622, + "num_input_tokens_seen": 3297600, + "step": 156, + "time_per_iteration": 2.5507659912109375 + }, + { + "auxiliary_loss_clip": 0.01459403, + "auxiliary_loss_mlp": 0.01120798, + "balance_loss_clip": 1.14356971, + "balance_loss_mlp": 1.06271887, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.2212045492080152, + "language_loss": 0.99220395, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.01800597, + "num_input_tokens_seen": 3313635, + "step": 157, + "time_per_iteration": 2.513223648071289 + }, + { + "auxiliary_loss_clip": 0.01443777, + "auxiliary_loss_mlp": 0.01144576, + "balance_loss_clip": 1.13854277, + "balance_loss_mlp": 1.08635402, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.0620233773336776, + "language_loss": 0.88287854, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.9087621, + "num_input_tokens_seen": 3333735, + "step": 158, + "time_per_iteration": 2.622466802597046 + }, + { + "auxiliary_loss_clip": 0.01450484, + "auxiliary_loss_mlp": 0.01125572, + "balance_loss_clip": 1.13807034, + "balance_loss_mlp": 1.06668293, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 2.630174686943085, + "language_loss": 0.86455482, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89031541, + "num_input_tokens_seen": 3348800, + "step": 159, + "time_per_iteration": 2.5305891036987305 + }, + { + "auxiliary_loss_clip": 0.01439689, + "auxiliary_loss_mlp": 0.01130469, + "balance_loss_clip": 1.13249254, + "balance_loss_mlp": 1.07115078, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 2.1226552290314915, + "language_loss": 0.86700493, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89270651, + "num_input_tokens_seen": 3368595, + "step": 160, + "time_per_iteration": 2.589097261428833 + }, + { + "auxiliary_loss_clip": 0.01446639, + "auxiliary_loss_mlp": 0.01121242, + "balance_loss_clip": 1.13815951, + "balance_loss_mlp": 1.06647718, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.399768866187191, + "language_loss": 0.91769397, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94337279, + "num_input_tokens_seen": 3384975, + "step": 161, + "time_per_iteration": 2.5633223056793213 + }, + { + "auxiliary_loss_clip": 0.01451841, + "auxiliary_loss_mlp": 0.01112201, + "balance_loss_clip": 1.13853979, + "balance_loss_mlp": 1.05669701, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 1.7731282281010101, + "language_loss": 0.91328567, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.9389261, + "num_input_tokens_seen": 3404755, + "step": 162, + "time_per_iteration": 2.596208095550537 + }, + { + "auxiliary_loss_clip": 0.01334637, + "auxiliary_loss_mlp": 0.01027357, + "balance_loss_clip": 1.16269302, + "balance_loss_mlp": 1.0044688, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.1799250159430006, + "language_loss": 0.72456527, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74818516, + "num_input_tokens_seen": 3467210, + "step": 163, + "time_per_iteration": 3.121882915496826 + }, + { + "auxiliary_loss_clip": 0.01437328, + "auxiliary_loss_mlp": 0.01118105, + "balance_loss_clip": 1.1356113, + "balance_loss_mlp": 1.06255364, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.0775708426327135, + "language_loss": 0.84366113, + "learning_rate": 3.283560135133457e-06, + "loss": 0.86921549, + "num_input_tokens_seen": 3483220, + "step": 164, + "time_per_iteration": 2.5571401119232178 + }, + { + "auxiliary_loss_clip": 0.01428237, + "auxiliary_loss_mlp": 0.01103308, + "balance_loss_clip": 1.1271925, + "balance_loss_mlp": 1.04839993, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 2.209673035564369, + "language_loss": 0.89136529, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91668069, + "num_input_tokens_seen": 3501465, + "step": 165, + "time_per_iteration": 2.5336015224456787 + }, + { + "auxiliary_loss_clip": 0.01431874, + "auxiliary_loss_mlp": 0.01130153, + "balance_loss_clip": 1.12813318, + "balance_loss_mlp": 1.07040501, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 1.684703336109223, + "language_loss": 0.79867178, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82429206, + "num_input_tokens_seen": 3520480, + "step": 166, + "time_per_iteration": 2.6028335094451904 + }, + { + "auxiliary_loss_clip": 0.01436563, + "auxiliary_loss_mlp": 0.01124782, + "balance_loss_clip": 1.13194895, + "balance_loss_mlp": 1.06646466, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 2.2362976367249465, + "language_loss": 0.91916299, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94477642, + "num_input_tokens_seen": 3539570, + "step": 167, + "time_per_iteration": 2.677703380584717 + }, + { + "auxiliary_loss_clip": 0.01427267, + "auxiliary_loss_mlp": 0.01141527, + "balance_loss_clip": 1.12919569, + "balance_loss_mlp": 1.08573735, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 17.821889522094118, + "language_loss": 0.90689933, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93258727, + "num_input_tokens_seen": 3555465, + "step": 168, + "time_per_iteration": 2.54180908203125 + }, + { + "auxiliary_loss_clip": 0.014222, + "auxiliary_loss_mlp": 0.01113226, + "balance_loss_clip": 1.12524581, + "balance_loss_mlp": 1.05605364, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 1.7414076639704423, + "language_loss": 0.87183541, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89718962, + "num_input_tokens_seen": 3578970, + "step": 169, + "time_per_iteration": 2.643301010131836 + }, + { + "auxiliary_loss_clip": 0.01424114, + "auxiliary_loss_mlp": 0.0111806, + "balance_loss_clip": 1.12758684, + "balance_loss_mlp": 1.06136453, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 1.849921094585435, + "language_loss": 0.84461129, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87003303, + "num_input_tokens_seen": 3597275, + "step": 170, + "time_per_iteration": 2.5366506576538086 + }, + { + "auxiliary_loss_clip": 0.01432615, + "auxiliary_loss_mlp": 0.0113412, + "balance_loss_clip": 1.12817299, + "balance_loss_mlp": 1.07694769, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 1.954262740453898, + "language_loss": 0.89880979, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.9244771, + "num_input_tokens_seen": 3618905, + "step": 171, + "time_per_iteration": 3.988121509552002 + }, + { + "auxiliary_loss_clip": 0.01427192, + "auxiliary_loss_mlp": 0.01113776, + "balance_loss_clip": 1.13071418, + "balance_loss_mlp": 1.05967844, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 1.9890784317586754, + "language_loss": 0.88885248, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91426218, + "num_input_tokens_seen": 3639610, + "step": 172, + "time_per_iteration": 5.470140695571899 + }, + { + "auxiliary_loss_clip": 0.01415364, + "auxiliary_loss_mlp": 0.0112938, + "balance_loss_clip": 1.12345064, + "balance_loss_mlp": 1.07430518, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 2.4544757027320867, + "language_loss": 0.81026721, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83571458, + "num_input_tokens_seen": 3664030, + "step": 173, + "time_per_iteration": 2.6381349563598633 + }, + { + "auxiliary_loss_clip": 0.01427926, + "auxiliary_loss_mlp": 0.01111802, + "balance_loss_clip": 1.12783968, + "balance_loss_mlp": 1.05908799, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.3130663912890372, + "language_loss": 0.82624376, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85164112, + "num_input_tokens_seen": 3683615, + "step": 174, + "time_per_iteration": 3.964766025543213 + }, + { + "auxiliary_loss_clip": 0.01421404, + "auxiliary_loss_mlp": 0.01130851, + "balance_loss_clip": 1.12515438, + "balance_loss_mlp": 1.07563281, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 4.53227548004005, + "language_loss": 0.72889018, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75441277, + "num_input_tokens_seen": 3704540, + "step": 175, + "time_per_iteration": 2.6175849437713623 + }, + { + "auxiliary_loss_clip": 0.01424875, + "auxiliary_loss_mlp": 0.01134794, + "balance_loss_clip": 1.12624717, + "balance_loss_mlp": 1.077335, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.215496166423868, + "language_loss": 0.98043299, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00602973, + "num_input_tokens_seen": 3721320, + "step": 176, + "time_per_iteration": 2.536527156829834 + }, + { + "auxiliary_loss_clip": 0.01409507, + "auxiliary_loss_mlp": 0.01136615, + "balance_loss_clip": 1.12271774, + "balance_loss_mlp": 1.08325744, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.3793021769723945, + "language_loss": 0.76715404, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79261529, + "num_input_tokens_seen": 3739385, + "step": 177, + "time_per_iteration": 2.5875117778778076 + }, + { + "auxiliary_loss_clip": 0.01420815, + "auxiliary_loss_mlp": 0.01106223, + "balance_loss_clip": 1.12172461, + "balance_loss_mlp": 1.05150616, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.5208344481808145, + "language_loss": 0.7673099, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79258031, + "num_input_tokens_seen": 3756360, + "step": 178, + "time_per_iteration": 2.540008783340454 + }, + { + "auxiliary_loss_clip": 0.01429671, + "auxiliary_loss_mlp": 0.01113634, + "balance_loss_clip": 1.1279, + "balance_loss_mlp": 1.05660415, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 5.625168480636054, + "language_loss": 0.84153199, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.866965, + "num_input_tokens_seen": 3773930, + "step": 179, + "time_per_iteration": 2.5780298709869385 + }, + { + "auxiliary_loss_clip": 0.0141849, + "auxiliary_loss_mlp": 0.01115483, + "balance_loss_clip": 1.12092733, + "balance_loss_mlp": 1.05883479, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 2.3201176097769336, + "language_loss": 0.83794487, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86328459, + "num_input_tokens_seen": 3793630, + "step": 180, + "time_per_iteration": 2.653860092163086 + }, + { + "auxiliary_loss_clip": 0.01420244, + "auxiliary_loss_mlp": 0.01126542, + "balance_loss_clip": 1.12370443, + "balance_loss_mlp": 1.07032311, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 3.389870122364849, + "language_loss": 0.77760291, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80307078, + "num_input_tokens_seen": 3813610, + "step": 181, + "time_per_iteration": 2.602083206176758 + }, + { + "auxiliary_loss_clip": 0.01414328, + "auxiliary_loss_mlp": 0.01132399, + "balance_loss_clip": 1.11986363, + "balance_loss_mlp": 1.07656169, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 3.2463410828251447, + "language_loss": 0.76189518, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78736246, + "num_input_tokens_seen": 3831390, + "step": 182, + "time_per_iteration": 2.5717039108276367 + }, + { + "auxiliary_loss_clip": 0.01413239, + "auxiliary_loss_mlp": 0.0112979, + "balance_loss_clip": 1.11999297, + "balance_loss_mlp": 1.07323682, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.063151557577861, + "language_loss": 0.87629259, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.90172291, + "num_input_tokens_seen": 3849705, + "step": 183, + "time_per_iteration": 2.6329638957977295 + }, + { + "auxiliary_loss_clip": 0.01416452, + "auxiliary_loss_mlp": 0.01115786, + "balance_loss_clip": 1.12317443, + "balance_loss_mlp": 1.0637629, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.3173840021027616, + "language_loss": 0.86574775, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89107013, + "num_input_tokens_seen": 3869230, + "step": 184, + "time_per_iteration": 2.6276638507843018 + }, + { + "auxiliary_loss_clip": 0.01413436, + "auxiliary_loss_mlp": 0.01111929, + "balance_loss_clip": 1.12426162, + "balance_loss_mlp": 1.0555191, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.7694754831112893, + "language_loss": 0.8363868, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86164051, + "num_input_tokens_seen": 3889735, + "step": 185, + "time_per_iteration": 2.5675837993621826 + }, + { + "auxiliary_loss_clip": 0.01421919, + "auxiliary_loss_mlp": 0.01112162, + "balance_loss_clip": 1.12186217, + "balance_loss_mlp": 1.05322468, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 8.656498439157938, + "language_loss": 0.71250904, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73784983, + "num_input_tokens_seen": 3908855, + "step": 186, + "time_per_iteration": 2.55117130279541 + }, + { + "auxiliary_loss_clip": 0.01417885, + "auxiliary_loss_mlp": 0.01114629, + "balance_loss_clip": 1.12164783, + "balance_loss_mlp": 1.06229615, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 2.20026710514897, + "language_loss": 1.02145648, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04678166, + "num_input_tokens_seen": 3923865, + "step": 187, + "time_per_iteration": 2.4884941577911377 + }, + { + "auxiliary_loss_clip": 0.01408593, + "auxiliary_loss_mlp": 0.0111436, + "balance_loss_clip": 1.12207508, + "balance_loss_mlp": 1.05795002, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.7090552237737895, + "language_loss": 0.75063562, + "learning_rate": 3.371494591560139e-06, + "loss": 0.77586514, + "num_input_tokens_seen": 3946870, + "step": 188, + "time_per_iteration": 2.7186062335968018 + }, + { + "auxiliary_loss_clip": 0.01309838, + "auxiliary_loss_mlp": 0.01021905, + "balance_loss_clip": 1.1507796, + "balance_loss_mlp": 1.00168729, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7554724394987258, + "language_loss": 0.56205845, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.58537591, + "num_input_tokens_seen": 4010005, + "step": 189, + "time_per_iteration": 3.196989059448242 + }, + { + "auxiliary_loss_clip": 0.01405679, + "auxiliary_loss_mlp": 0.01126205, + "balance_loss_clip": 1.1169889, + "balance_loss_mlp": 1.06998563, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 6.005254765255265, + "language_loss": 0.94985455, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97517335, + "num_input_tokens_seen": 4029035, + "step": 190, + "time_per_iteration": 2.6015756130218506 + }, + { + "auxiliary_loss_clip": 0.01406775, + "auxiliary_loss_mlp": 0.0110634, + "balance_loss_clip": 1.11924791, + "balance_loss_mlp": 1.05264854, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 2.7197161783663604, + "language_loss": 0.84206676, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.86719787, + "num_input_tokens_seen": 4046995, + "step": 191, + "time_per_iteration": 2.5323266983032227 + }, + { + "auxiliary_loss_clip": 0.01404397, + "auxiliary_loss_mlp": 0.01125127, + "balance_loss_clip": 1.11493492, + "balance_loss_mlp": 1.0720073, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 2.201907053112553, + "language_loss": 0.91682249, + "learning_rate": 3.385049875042367e-06, + "loss": 0.94211769, + "num_input_tokens_seen": 4065865, + "step": 192, + "time_per_iteration": 2.597071647644043 + }, + { + "auxiliary_loss_clip": 0.01400303, + "auxiliary_loss_mlp": 0.01119157, + "balance_loss_clip": 1.1158154, + "balance_loss_mlp": 1.0609349, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.1300935308954405, + "language_loss": 0.86971748, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89491206, + "num_input_tokens_seen": 4085305, + "step": 193, + "time_per_iteration": 2.5387165546417236 + }, + { + "auxiliary_loss_clip": 0.01403924, + "auxiliary_loss_mlp": 0.01097898, + "balance_loss_clip": 1.11476374, + "balance_loss_mlp": 1.04558897, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.2112664099146864, + "language_loss": 0.92197573, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.94699395, + "num_input_tokens_seen": 4105185, + "step": 194, + "time_per_iteration": 2.621145486831665 + }, + { + "auxiliary_loss_clip": 0.01407568, + "auxiliary_loss_mlp": 0.01108546, + "balance_loss_clip": 1.11843884, + "balance_loss_mlp": 1.0549016, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 2.9085011049502643, + "language_loss": 0.89838111, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.9235422, + "num_input_tokens_seen": 4123160, + "step": 195, + "time_per_iteration": 2.6105167865753174 + }, + { + "auxiliary_loss_clip": 0.0140414, + "auxiliary_loss_mlp": 0.01122494, + "balance_loss_clip": 1.11828589, + "balance_loss_mlp": 1.06665659, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 3.818867041584348, + "language_loss": 0.85752439, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88279068, + "num_input_tokens_seen": 4140425, + "step": 196, + "time_per_iteration": 2.5276665687561035 + }, + { + "auxiliary_loss_clip": 0.01398811, + "auxiliary_loss_mlp": 0.01108084, + "balance_loss_clip": 1.11376595, + "balance_loss_mlp": 1.05334306, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 4.012936769829807, + "language_loss": 0.93208826, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95715725, + "num_input_tokens_seen": 4159555, + "step": 197, + "time_per_iteration": 2.5361878871917725 + }, + { + "auxiliary_loss_clip": 0.01397215, + "auxiliary_loss_mlp": 0.01112574, + "balance_loss_clip": 1.11556768, + "balance_loss_mlp": 1.05873883, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.102881369770901, + "language_loss": 0.79127729, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81637526, + "num_input_tokens_seen": 4180480, + "step": 198, + "time_per_iteration": 2.634080648422241 + }, + { + "auxiliary_loss_clip": 0.01395536, + "auxiliary_loss_mlp": 0.0112367, + "balance_loss_clip": 1.11708331, + "balance_loss_mlp": 1.07031214, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 1.822930891076439, + "language_loss": 0.88180757, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90699971, + "num_input_tokens_seen": 4198835, + "step": 199, + "time_per_iteration": 2.5609943866729736 + }, + { + "auxiliary_loss_clip": 0.01406401, + "auxiliary_loss_mlp": 0.01127057, + "balance_loss_clip": 1.12024951, + "balance_loss_mlp": 1.06916928, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 1.8984927758396204, + "language_loss": 0.81235689, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83769149, + "num_input_tokens_seen": 4219335, + "step": 200, + "time_per_iteration": 2.5740325450897217 + }, + { + "auxiliary_loss_clip": 0.01404394, + "auxiliary_loss_mlp": 0.01104464, + "balance_loss_clip": 1.11645484, + "balance_loss_mlp": 1.0486263, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.468785306108615, + "language_loss": 0.87407899, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.89916754, + "num_input_tokens_seen": 4236940, + "step": 201, + "time_per_iteration": 2.507051706314087 + }, + { + "auxiliary_loss_clip": 0.01402561, + "auxiliary_loss_mlp": 0.01115408, + "balance_loss_clip": 1.11822748, + "balance_loss_mlp": 1.05897415, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 1.6606607098224535, + "language_loss": 0.84100997, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86618966, + "num_input_tokens_seen": 4256755, + "step": 202, + "time_per_iteration": 2.555718421936035 + }, + { + "auxiliary_loss_clip": 0.01390737, + "auxiliary_loss_mlp": 0.01109407, + "balance_loss_clip": 1.10995722, + "balance_loss_mlp": 1.05538154, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.7696686502964045, + "language_loss": 0.9015373, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.92653871, + "num_input_tokens_seen": 4276505, + "step": 203, + "time_per_iteration": 2.53181529045105 + }, + { + "auxiliary_loss_clip": 0.01283381, + "auxiliary_loss_mlp": 0.0102524, + "balance_loss_clip": 1.13304305, + "balance_loss_mlp": 1.00664377, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.014308913471958, + "language_loss": 0.61245143, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63553762, + "num_input_tokens_seen": 4330965, + "step": 204, + "time_per_iteration": 3.04207444190979 + }, + { + "auxiliary_loss_clip": 0.01399078, + "auxiliary_loss_mlp": 0.01112439, + "balance_loss_clip": 1.1131146, + "balance_loss_mlp": 1.05912888, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 3.06258456984693, + "language_loss": 0.91636145, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.94147664, + "num_input_tokens_seen": 4348200, + "step": 205, + "time_per_iteration": 2.5142509937286377 + }, + { + "auxiliary_loss_clip": 0.01407598, + "auxiliary_loss_mlp": 0.01123055, + "balance_loss_clip": 1.11846578, + "balance_loss_mlp": 1.06788456, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.185024001407723, + "language_loss": 0.89197052, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.9172771, + "num_input_tokens_seen": 4365460, + "step": 206, + "time_per_iteration": 2.536412000656128 + }, + { + "auxiliary_loss_clip": 0.01397336, + "auxiliary_loss_mlp": 0.01104995, + "balance_loss_clip": 1.11256635, + "balance_loss_mlp": 1.05187559, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.021868357623696, + "language_loss": 0.95616609, + "learning_rate": 3.43348263905683e-06, + "loss": 0.98118943, + "num_input_tokens_seen": 4383650, + "step": 207, + "time_per_iteration": 2.5420873165130615 + }, + { + "auxiliary_loss_clip": 0.01397126, + "auxiliary_loss_mlp": 0.0111878, + "balance_loss_clip": 1.11766446, + "balance_loss_mlp": 1.06556463, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 1.8848398223328486, + "language_loss": 0.76012576, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78528482, + "num_input_tokens_seen": 4403765, + "step": 208, + "time_per_iteration": 2.558464765548706 + }, + { + "auxiliary_loss_clip": 0.01385122, + "auxiliary_loss_mlp": 0.01113114, + "balance_loss_clip": 1.11152649, + "balance_loss_mlp": 1.05994642, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.3871136578092074, + "language_loss": 0.98527992, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01026237, + "num_input_tokens_seen": 4421935, + "step": 209, + "time_per_iteration": 2.5995798110961914 + }, + { + "auxiliary_loss_clip": 0.0139087, + "auxiliary_loss_mlp": 0.01117035, + "balance_loss_clip": 1.11314762, + "balance_loss_mlp": 1.06286621, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 2.6439605937013115, + "language_loss": 0.85144472, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87652373, + "num_input_tokens_seen": 4441470, + "step": 210, + "time_per_iteration": 2.6678504943847656 + }, + { + "auxiliary_loss_clip": 0.01388797, + "auxiliary_loss_mlp": 0.01117875, + "balance_loss_clip": 1.11192298, + "balance_loss_mlp": 1.06723487, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.2254476475348524, + "language_loss": 0.97279471, + "learning_rate": 3.445805545042314e-06, + "loss": 0.9978615, + "num_input_tokens_seen": 4459950, + "step": 211, + "time_per_iteration": 5.365017890930176 + }, + { + "auxiliary_loss_clip": 0.01397705, + "auxiliary_loss_mlp": 0.0112021, + "balance_loss_clip": 1.11718869, + "balance_loss_mlp": 1.0659461, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 2.134535624308585, + "language_loss": 0.95090449, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97608364, + "num_input_tokens_seen": 4478390, + "step": 212, + "time_per_iteration": 3.974487066268921 + }, + { + "auxiliary_loss_clip": 0.01385195, + "auxiliary_loss_mlp": 0.01121101, + "balance_loss_clip": 1.11384463, + "balance_loss_mlp": 1.06798172, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 1.801155671046002, + "language_loss": 0.7605617, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78562462, + "num_input_tokens_seen": 4501665, + "step": 213, + "time_per_iteration": 4.131496906280518 + }, + { + "auxiliary_loss_clip": 0.01389486, + "auxiliary_loss_mlp": 0.01107454, + "balance_loss_clip": 1.11245823, + "balance_loss_mlp": 1.0553118, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 2.3704250774061415, + "language_loss": 0.86484456, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.8898139, + "num_input_tokens_seen": 4519055, + "step": 214, + "time_per_iteration": 2.504061460494995 + }, + { + "auxiliary_loss_clip": 0.01388202, + "auxiliary_loss_mlp": 0.01130033, + "balance_loss_clip": 1.11825395, + "balance_loss_mlp": 1.07512581, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.1660884496381674, + "language_loss": 0.7752043, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.80038661, + "num_input_tokens_seen": 4540870, + "step": 215, + "time_per_iteration": 2.600130081176758 + }, + { + "auxiliary_loss_clip": 0.01395256, + "auxiliary_loss_mlp": 0.01112857, + "balance_loss_clip": 1.11571765, + "balance_loss_mlp": 1.06069148, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.174483311634191, + "language_loss": 0.9040401, + "learning_rate": 3.460884739729461e-06, + "loss": 0.92912126, + "num_input_tokens_seen": 4560395, + "step": 216, + "time_per_iteration": 2.5753557682037354 + }, + { + "auxiliary_loss_clip": 0.01387074, + "auxiliary_loss_mlp": 0.01107028, + "balance_loss_clip": 1.10984027, + "balance_loss_mlp": 1.0543375, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 3.3866784082882018, + "language_loss": 0.93752033, + "learning_rate": 3.463858658104523e-06, + "loss": 0.96246135, + "num_input_tokens_seen": 4575785, + "step": 217, + "time_per_iteration": 2.508173942565918 + }, + { + "auxiliary_loss_clip": 0.01382646, + "auxiliary_loss_mlp": 0.01108176, + "balance_loss_clip": 1.10974121, + "balance_loss_mlp": 1.05338728, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 1.9223427116840066, + "language_loss": 0.93535614, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96026433, + "num_input_tokens_seen": 4594985, + "step": 218, + "time_per_iteration": 2.5009257793426514 + }, + { + "auxiliary_loss_clip": 0.01376981, + "auxiliary_loss_mlp": 0.01106107, + "balance_loss_clip": 1.10825384, + "balance_loss_mlp": 1.0551095, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 1.8694227413307443, + "language_loss": 0.86191523, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88674611, + "num_input_tokens_seen": 4616125, + "step": 219, + "time_per_iteration": 2.5708844661712646 + }, + { + "auxiliary_loss_clip": 0.01382943, + "auxiliary_loss_mlp": 0.01103565, + "balance_loss_clip": 1.11130381, + "balance_loss_mlp": 1.05223393, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 1.7734535489206427, + "language_loss": 0.87669885, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.901564, + "num_input_tokens_seen": 4637795, + "step": 220, + "time_per_iteration": 2.6110036373138428 + }, + { + "auxiliary_loss_clip": 0.01375176, + "auxiliary_loss_mlp": 0.01105941, + "balance_loss_clip": 1.1068542, + "balance_loss_mlp": 1.05859137, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 2.37770432935208, + "language_loss": 0.86565876, + "learning_rate": 3.475618842282164e-06, + "loss": 0.89046991, + "num_input_tokens_seen": 4656835, + "step": 221, + "time_per_iteration": 2.5673229694366455 + }, + { + "auxiliary_loss_clip": 0.01379184, + "auxiliary_loss_mlp": 0.01115692, + "balance_loss_clip": 1.10567832, + "balance_loss_mlp": 1.06335878, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.096535039758878, + "language_loss": 0.92238522, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94733399, + "num_input_tokens_seen": 4673015, + "step": 222, + "time_per_iteration": 2.4693238735198975 + }, + { + "auxiliary_loss_clip": 0.01376244, + "auxiliary_loss_mlp": 0.01107215, + "balance_loss_clip": 1.108006, + "balance_loss_mlp": 1.051759, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.301863774865024, + "language_loss": 0.95768869, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98252332, + "num_input_tokens_seen": 4692355, + "step": 223, + "time_per_iteration": 2.525021553039551 + }, + { + "auxiliary_loss_clip": 0.01377515, + "auxiliary_loss_mlp": 0.01107445, + "balance_loss_clip": 1.10881639, + "balance_loss_mlp": 1.05639923, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 4.021672774997323, + "language_loss": 0.88157892, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90642858, + "num_input_tokens_seen": 4710080, + "step": 224, + "time_per_iteration": 2.4916093349456787 + }, + { + "auxiliary_loss_clip": 0.01377218, + "auxiliary_loss_mlp": 0.0110686, + "balance_loss_clip": 1.10765481, + "balance_loss_mlp": 1.05118883, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 1.9915145769366958, + "language_loss": 0.89413893, + "learning_rate": 3.487168070036317e-06, + "loss": 0.9189797, + "num_input_tokens_seen": 4728980, + "step": 225, + "time_per_iteration": 2.5031356811523438 + }, + { + "auxiliary_loss_clip": 0.01373314, + "auxiliary_loss_mlp": 0.0112267, + "balance_loss_clip": 1.10717499, + "balance_loss_mlp": 1.06921697, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 3.1512186588840936, + "language_loss": 0.99168986, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01664972, + "num_input_tokens_seen": 4747020, + "step": 226, + "time_per_iteration": 2.5122225284576416 + }, + { + "auxiliary_loss_clip": 0.01379799, + "auxiliary_loss_mlp": 0.0111529, + "balance_loss_clip": 1.10999382, + "balance_loss_mlp": 1.06038165, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 2.1916836262515784, + "language_loss": 0.91006362, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93501449, + "num_input_tokens_seen": 4765000, + "step": 227, + "time_per_iteration": 2.5267276763916016 + }, + { + "auxiliary_loss_clip": 0.01262221, + "auxiliary_loss_mlp": 0.01031758, + "balance_loss_clip": 1.11911011, + "balance_loss_mlp": 1.01373386, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.9413837932204581, + "language_loss": 0.57647407, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59941387, + "num_input_tokens_seen": 4833210, + "step": 228, + "time_per_iteration": 3.2246227264404297 + }, + { + "auxiliary_loss_clip": 0.01368924, + "auxiliary_loss_mlp": 0.01114144, + "balance_loss_clip": 1.10611475, + "balance_loss_mlp": 1.06417131, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.448532707591361, + "language_loss": 0.87790722, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90273798, + "num_input_tokens_seen": 4850120, + "step": 229, + "time_per_iteration": 2.520725965499878 + }, + { + "auxiliary_loss_clip": 0.01378691, + "auxiliary_loss_mlp": 0.0110128, + "balance_loss_clip": 1.10664546, + "balance_loss_mlp": 1.05068755, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 3.740747102856291, + "language_loss": 0.8407408, + "learning_rate": 3.501319237118231e-06, + "loss": 0.8655405, + "num_input_tokens_seen": 4866215, + "step": 230, + "time_per_iteration": 2.516561985015869 + }, + { + "auxiliary_loss_clip": 0.01376157, + "auxiliary_loss_mlp": 0.01117923, + "balance_loss_clip": 1.1083101, + "balance_loss_mlp": 1.06799793, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 1.9950068402642451, + "language_loss": 0.90644079, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.93138158, + "num_input_tokens_seen": 4885630, + "step": 231, + "time_per_iteration": 2.5194008350372314 + }, + { + "auxiliary_loss_clip": 0.0138042, + "auxiliary_loss_mlp": 0.01110323, + "balance_loss_clip": 1.11219323, + "balance_loss_mlp": 1.05999279, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.180840857458781, + "language_loss": 0.83808231, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86298978, + "num_input_tokens_seen": 4905570, + "step": 232, + "time_per_iteration": 2.5337069034576416 + }, + { + "auxiliary_loss_clip": 0.01380607, + "auxiliary_loss_mlp": 0.01092671, + "balance_loss_clip": 1.10332799, + "balance_loss_mlp": 1.04088664, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 4.047560144586128, + "language_loss": 0.74348074, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76821351, + "num_input_tokens_seen": 4923535, + "step": 233, + "time_per_iteration": 2.509115695953369 + }, + { + "auxiliary_loss_clip": 0.01385038, + "auxiliary_loss_mlp": 0.01122754, + "balance_loss_clip": 1.11052656, + "balance_loss_mlp": 1.06982541, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 3.4904575543659124, + "language_loss": 0.85897446, + "learning_rate": 3.512420411838642e-06, + "loss": 0.8840524, + "num_input_tokens_seen": 4939200, + "step": 234, + "time_per_iteration": 2.515504837036133 + }, + { + "auxiliary_loss_clip": 0.01376332, + "auxiliary_loss_mlp": 0.01113065, + "balance_loss_clip": 1.10977292, + "balance_loss_mlp": 1.06309211, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.4531900933922306, + "language_loss": 0.89403313, + "learning_rate": 3.515166054308634e-06, + "loss": 0.91892713, + "num_input_tokens_seen": 4956620, + "step": 235, + "time_per_iteration": 2.4948620796203613 + }, + { + "auxiliary_loss_clip": 0.01377438, + "auxiliary_loss_mlp": 0.01119191, + "balance_loss_clip": 1.11209154, + "balance_loss_mlp": 1.06766915, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 2.1103530005430255, + "language_loss": 0.85574841, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88071471, + "num_input_tokens_seen": 4975650, + "step": 236, + "time_per_iteration": 2.58736252784729 + }, + { + "auxiliary_loss_clip": 0.01372902, + "auxiliary_loss_mlp": 0.01099199, + "balance_loss_clip": 1.10394585, + "balance_loss_mlp": 1.0481056, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 1.794890374651873, + "language_loss": 0.82350588, + "learning_rate": 3.520622461401154e-06, + "loss": 0.8482269, + "num_input_tokens_seen": 4997415, + "step": 237, + "time_per_iteration": 2.653890609741211 + }, + { + "auxiliary_loss_clip": 0.01372444, + "auxiliary_loss_mlp": 0.01118588, + "balance_loss_clip": 1.10811889, + "balance_loss_mlp": 1.06480098, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 2.6889564776364963, + "language_loss": 0.77308869, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79799902, + "num_input_tokens_seen": 5013905, + "step": 238, + "time_per_iteration": 2.5154969692230225 + }, + { + "auxiliary_loss_clip": 0.01370011, + "auxiliary_loss_mlp": 0.01115214, + "balance_loss_clip": 1.1105926, + "balance_loss_mlp": 1.06724429, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.901824439886258, + "language_loss": 0.87094796, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89580011, + "num_input_tokens_seen": 5033645, + "step": 239, + "time_per_iteration": 2.697768211364746 + }, + { + "auxiliary_loss_clip": 0.0135388, + "auxiliary_loss_mlp": 0.0110157, + "balance_loss_clip": 1.1003207, + "balance_loss_mlp": 1.05321884, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.1022571165152266, + "language_loss": 0.92993557, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95449007, + "num_input_tokens_seen": 5052875, + "step": 240, + "time_per_iteration": 2.5998799800872803 + }, + { + "auxiliary_loss_clip": 0.01363125, + "auxiliary_loss_mlp": 0.01108026, + "balance_loss_clip": 1.10578895, + "balance_loss_mlp": 1.06139135, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 3.7286801621874743, + "language_loss": 0.8499316, + "learning_rate": 3.531398481704111e-06, + "loss": 0.87464309, + "num_input_tokens_seen": 5075005, + "step": 241, + "time_per_iteration": 2.6013524532318115 + }, + { + "auxiliary_loss_clip": 0.01362522, + "auxiliary_loss_mlp": 0.01119205, + "balance_loss_clip": 1.11130524, + "balance_loss_mlp": 1.06911302, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 1.9842216956317018, + "language_loss": 0.88549984, + "learning_rate": 3.534064540103573e-06, + "loss": 0.91031706, + "num_input_tokens_seen": 5091875, + "step": 242, + "time_per_iteration": 2.5850167274475098 + }, + { + "auxiliary_loss_clip": 0.01362224, + "auxiliary_loss_mlp": 0.01106639, + "balance_loss_clip": 1.10480428, + "balance_loss_mlp": 1.05485404, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 3.55145796641499, + "language_loss": 0.86753094, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89221954, + "num_input_tokens_seen": 5111290, + "step": 243, + "time_per_iteration": 2.5179176330566406 + }, + { + "auxiliary_loss_clip": 0.01367839, + "auxiliary_loss_mlp": 0.0110693, + "balance_loss_clip": 1.1074779, + "balance_loss_mlp": 1.05619431, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.636914612065287, + "language_loss": 0.84211922, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86686695, + "num_input_tokens_seen": 5132265, + "step": 244, + "time_per_iteration": 2.562384843826294 + }, + { + "auxiliary_loss_clip": 0.01374775, + "auxiliary_loss_mlp": 0.01118863, + "balance_loss_clip": 1.10892034, + "balance_loss_mlp": 1.06652975, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 3.5734834474497483, + "language_loss": 0.78790694, + "learning_rate": 3.54199711087864e-06, + "loss": 0.81284332, + "num_input_tokens_seen": 5148575, + "step": 245, + "time_per_iteration": 2.5267293453216553 + }, + { + "auxiliary_loss_clip": 0.01372587, + "auxiliary_loss_mlp": 0.01106462, + "balance_loss_clip": 1.1042397, + "balance_loss_mlp": 1.05253208, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 1.9975971415531737, + "language_loss": 0.83980095, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86459142, + "num_input_tokens_seen": 5170415, + "step": 246, + "time_per_iteration": 2.5849974155426025 + }, + { + "auxiliary_loss_clip": 0.0136706, + "auxiliary_loss_mlp": 0.01101148, + "balance_loss_clip": 1.10368872, + "balance_loss_mlp": 1.05103219, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 2.438868014737639, + "language_loss": 0.90017498, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92485702, + "num_input_tokens_seen": 5188565, + "step": 247, + "time_per_iteration": 2.50834321975708 + }, + { + "auxiliary_loss_clip": 0.01365806, + "auxiliary_loss_mlp": 0.01100704, + "balance_loss_clip": 1.09931266, + "balance_loss_mlp": 1.05185175, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.3599414979908655, + "language_loss": 0.77891719, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80358231, + "num_input_tokens_seen": 5207810, + "step": 248, + "time_per_iteration": 2.5498206615448 + }, + { + "auxiliary_loss_clip": 0.01366736, + "auxiliary_loss_mlp": 0.0110798, + "balance_loss_clip": 1.10773385, + "balance_loss_mlp": 1.0578407, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 1.9976307824361628, + "language_loss": 0.83752239, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86226952, + "num_input_tokens_seen": 5226210, + "step": 249, + "time_per_iteration": 2.555947780609131 + }, + { + "auxiliary_loss_clip": 0.01360423, + "auxiliary_loss_mlp": 0.01106543, + "balance_loss_clip": 1.1018579, + "balance_loss_mlp": 1.05850208, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.130508993705231, + "language_loss": 0.93491417, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95958382, + "num_input_tokens_seen": 5241660, + "step": 250, + "time_per_iteration": 2.4924814701080322 + }, + { + "auxiliary_loss_clip": 0.0136866, + "auxiliary_loss_mlp": 0.01113585, + "balance_loss_clip": 1.10640812, + "balance_loss_mlp": 1.06261063, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.6071368708473552, + "language_loss": 0.97118521, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99600768, + "num_input_tokens_seen": 5261090, + "step": 251, + "time_per_iteration": 2.5534534454345703 + }, + { + "auxiliary_loss_clip": 0.01363485, + "auxiliary_loss_mlp": 0.01110481, + "balance_loss_clip": 1.10152173, + "balance_loss_mlp": 1.06124806, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 4.671852518599087, + "language_loss": 0.84020257, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86494225, + "num_input_tokens_seen": 5279175, + "step": 252, + "time_per_iteration": 6.8410022258758545 + }, + { + "auxiliary_loss_clip": 0.01358484, + "auxiliary_loss_mlp": 0.01114374, + "balance_loss_clip": 1.10373652, + "balance_loss_mlp": 1.06509256, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.954963566843214, + "language_loss": 0.98369592, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00842452, + "num_input_tokens_seen": 5296975, + "step": 253, + "time_per_iteration": 2.5377111434936523 + }, + { + "auxiliary_loss_clip": 0.01248912, + "auxiliary_loss_mlp": 0.01054778, + "balance_loss_clip": 1.11190462, + "balance_loss_mlp": 1.03742111, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8673010973163903, + "language_loss": 0.55665803, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57969499, + "num_input_tokens_seen": 5358375, + "step": 254, + "time_per_iteration": 3.1389174461364746 + }, + { + "auxiliary_loss_clip": 0.01362744, + "auxiliary_loss_mlp": 0.01116424, + "balance_loss_clip": 1.09876251, + "balance_loss_mlp": 1.06716669, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.000381231832675, + "language_loss": 0.89998662, + "learning_rate": 3.567754632921479e-06, + "loss": 0.92477834, + "num_input_tokens_seen": 5377255, + "step": 255, + "time_per_iteration": 2.57558274269104 + }, + { + "auxiliary_loss_clip": 0.01359343, + "auxiliary_loss_mlp": 0.01130865, + "balance_loss_clip": 1.101511, + "balance_loss_mlp": 1.08060622, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.5013527323490248, + "language_loss": 0.85574663, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.88064873, + "num_input_tokens_seen": 5395320, + "step": 256, + "time_per_iteration": 2.55669903755188 + }, + { + "auxiliary_loss_clip": 0.01365031, + "auxiliary_loss_mlp": 0.01114993, + "balance_loss_clip": 1.10173476, + "balance_loss_mlp": 1.06497252, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 4.026885389394537, + "language_loss": 0.71599841, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.74079859, + "num_input_tokens_seen": 5411970, + "step": 257, + "time_per_iteration": 2.5195844173431396 + }, + { + "auxiliary_loss_clip": 0.01359155, + "auxiliary_loss_mlp": 0.01105034, + "balance_loss_clip": 1.10190642, + "balance_loss_mlp": 1.05546713, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 1.9088300716225362, + "language_loss": 0.94414544, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.96878737, + "num_input_tokens_seen": 5430245, + "step": 258, + "time_per_iteration": 2.5318691730499268 + }, + { + "auxiliary_loss_clip": 0.0135839, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_clip": 1.09958136, + "balance_loss_mlp": 1.06302762, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 2.2241151549438865, + "language_loss": 0.93147421, + "learning_rate": 3.577775880881658e-06, + "loss": 0.9561739, + "num_input_tokens_seen": 5448905, + "step": 259, + "time_per_iteration": 2.537464141845703 + }, + { + "auxiliary_loss_clip": 0.01351674, + "auxiliary_loss_mlp": 0.01099412, + "balance_loss_clip": 1.10240221, + "balance_loss_mlp": 1.05334973, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 1.8100825929263045, + "language_loss": 0.97499704, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99950784, + "num_input_tokens_seen": 5466405, + "step": 260, + "time_per_iteration": 2.5359139442443848 + }, + { + "auxiliary_loss_clip": 0.01367381, + "auxiliary_loss_mlp": 0.01118224, + "balance_loss_clip": 1.10392773, + "balance_loss_mlp": 1.070207, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.3978499308780394, + "language_loss": 0.87622184, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90107793, + "num_input_tokens_seen": 5487055, + "step": 261, + "time_per_iteration": 2.595839500427246 + }, + { + "auxiliary_loss_clip": 0.01357842, + "auxiliary_loss_mlp": 0.01111158, + "balance_loss_clip": 1.09972167, + "balance_loss_mlp": 1.06225848, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 2.0419986323467567, + "language_loss": 0.67188448, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69657445, + "num_input_tokens_seen": 5506600, + "step": 262, + "time_per_iteration": 2.529513120651245 + }, + { + "auxiliary_loss_clip": 0.0135694, + "auxiliary_loss_mlp": 0.01121272, + "balance_loss_clip": 1.10174894, + "balance_loss_mlp": 1.07101297, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.5881735238743597, + "language_loss": 0.68002111, + "learning_rate": 3.587643540438383e-06, + "loss": 0.70480323, + "num_input_tokens_seen": 5524350, + "step": 263, + "time_per_iteration": 2.511111259460449 + }, + { + "auxiliary_loss_clip": 0.01354858, + "auxiliary_loss_mlp": 0.01106401, + "balance_loss_clip": 1.09695625, + "balance_loss_mlp": 1.05754912, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 2.8885617583956966, + "language_loss": 0.853661, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87827361, + "num_input_tokens_seen": 5542145, + "step": 264, + "time_per_iteration": 2.5206456184387207 + }, + { + "auxiliary_loss_clip": 0.01362892, + "auxiliary_loss_mlp": 0.01097039, + "balance_loss_clip": 1.102947, + "balance_loss_mlp": 1.05016589, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 2.918258430303743, + "language_loss": 1.04258764, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06718695, + "num_input_tokens_seen": 5557920, + "step": 265, + "time_per_iteration": 2.4944159984588623 + }, + { + "auxiliary_loss_clip": 0.01364934, + "auxiliary_loss_mlp": 0.01116949, + "balance_loss_clip": 1.10501623, + "balance_loss_mlp": 1.06504536, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.4039413117482957, + "language_loss": 0.7523104, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77712917, + "num_input_tokens_seen": 5576290, + "step": 266, + "time_per_iteration": 2.53471040725708 + }, + { + "auxiliary_loss_clip": 0.01351882, + "auxiliary_loss_mlp": 0.01100797, + "balance_loss_clip": 1.10244286, + "balance_loss_mlp": 1.05211234, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.8377807903895496, + "language_loss": 0.90750849, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93203533, + "num_input_tokens_seen": 5595205, + "step": 267, + "time_per_iteration": 2.5514185428619385 + }, + { + "auxiliary_loss_clip": 0.01361136, + "auxiliary_loss_mlp": 0.0111957, + "balance_loss_clip": 1.10222924, + "balance_loss_mlp": 1.07224357, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.6644671198576932, + "language_loss": 0.85729837, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88210547, + "num_input_tokens_seen": 5612645, + "step": 268, + "time_per_iteration": 2.535179615020752 + }, + { + "auxiliary_loss_clip": 0.01354218, + "auxiliary_loss_mlp": 0.01097601, + "balance_loss_clip": 1.10415888, + "balance_loss_mlp": 1.05051351, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 2.670727820412289, + "language_loss": 0.88510281, + "learning_rate": 3.602167137831432e-06, + "loss": 0.909621, + "num_input_tokens_seen": 5628345, + "step": 269, + "time_per_iteration": 2.504873514175415 + }, + { + "auxiliary_loss_clip": 0.01358593, + "auxiliary_loss_mlp": 0.01104483, + "balance_loss_clip": 1.10061765, + "balance_loss_mlp": 1.05281782, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.0895170298773316, + "language_loss": 0.97050202, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.9951328, + "num_input_tokens_seen": 5645940, + "step": 270, + "time_per_iteration": 2.5209782123565674 + }, + { + "auxiliary_loss_clip": 0.0135913, + "auxiliary_loss_mlp": 0.01118428, + "balance_loss_clip": 1.10646236, + "balance_loss_mlp": 1.07024348, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.3759671943872127, + "language_loss": 0.86464715, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88942271, + "num_input_tokens_seen": 5665690, + "step": 271, + "time_per_iteration": 2.5473685264587402 + }, + { + "auxiliary_loss_clip": 0.01354972, + "auxiliary_loss_mlp": 0.01107112, + "balance_loss_clip": 1.09663105, + "balance_loss_mlp": 1.05926085, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 2.4129579947204833, + "language_loss": 0.81222534, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83684611, + "num_input_tokens_seen": 5683190, + "step": 272, + "time_per_iteration": 2.584460496902466 + }, + { + "auxiliary_loss_clip": 0.01349206, + "auxiliary_loss_mlp": 0.01122272, + "balance_loss_clip": 1.09948301, + "balance_loss_mlp": 1.07570875, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.101619129271149, + "language_loss": 0.81057894, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83529371, + "num_input_tokens_seen": 5699780, + "step": 273, + "time_per_iteration": 2.498856782913208 + }, + { + "auxiliary_loss_clip": 0.01347155, + "auxiliary_loss_mlp": 0.01106375, + "balance_loss_clip": 1.09480536, + "balance_loss_mlp": 1.05857229, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.139345533409641, + "language_loss": 0.91233766, + "learning_rate": 3.614024787585744e-06, + "loss": 0.93687284, + "num_input_tokens_seen": 5716980, + "step": 274, + "time_per_iteration": 2.547649621963501 + }, + { + "auxiliary_loss_clip": 0.01347435, + "auxiliary_loss_mlp": 0.01114881, + "balance_loss_clip": 1.09782517, + "balance_loss_mlp": 1.06650615, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 1.759998900929906, + "language_loss": 0.87899989, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90362298, + "num_input_tokens_seen": 5737780, + "step": 275, + "time_per_iteration": 2.549131393432617 + }, + { + "auxiliary_loss_clip": 0.01350587, + "auxiliary_loss_mlp": 0.01104661, + "balance_loss_clip": 1.099159, + "balance_loss_mlp": 1.05495048, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 1.6307218884404493, + "language_loss": 0.80728763, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.8318401, + "num_input_tokens_seen": 5758330, + "step": 276, + "time_per_iteration": 2.548225164413452 + }, + { + "auxiliary_loss_clip": 0.01340912, + "auxiliary_loss_mlp": 0.01100361, + "balance_loss_clip": 1.09751666, + "balance_loss_mlp": 1.05592024, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 3.0254175799438774, + "language_loss": 0.8099457, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83435845, + "num_input_tokens_seen": 5778340, + "step": 277, + "time_per_iteration": 2.6365997791290283 + }, + { + "auxiliary_loss_clip": 0.01339161, + "auxiliary_loss_mlp": 0.01095437, + "balance_loss_clip": 1.09108841, + "balance_loss_mlp": 1.04911256, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.0563842439974733, + "language_loss": 0.80398297, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82832897, + "num_input_tokens_seen": 5794295, + "step": 278, + "time_per_iteration": 2.5450284481048584 + }, + { + "auxiliary_loss_clip": 0.01344861, + "auxiliary_loss_mlp": 0.01102195, + "balance_loss_clip": 1.09682906, + "balance_loss_mlp": 1.05629945, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 1.695341178338216, + "language_loss": 0.90399712, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.92846769, + "num_input_tokens_seen": 5814405, + "step": 279, + "time_per_iteration": 2.6079492568969727 + }, + { + "auxiliary_loss_clip": 0.01349991, + "auxiliary_loss_mlp": 0.01119104, + "balance_loss_clip": 1.09726751, + "balance_loss_mlp": 1.07122993, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 2.1942574233548098, + "language_loss": 0.94055426, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96524525, + "num_input_tokens_seen": 5832795, + "step": 280, + "time_per_iteration": 2.5388219356536865 + }, + { + "auxiliary_loss_clip": 0.01350182, + "auxiliary_loss_mlp": 0.01112538, + "balance_loss_clip": 1.09594345, + "balance_loss_mlp": 1.06454444, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 1.8832431167776666, + "language_loss": 0.74161649, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.7662437, + "num_input_tokens_seen": 5855750, + "step": 281, + "time_per_iteration": 2.6330034732818604 + }, + { + "auxiliary_loss_clip": 0.01345181, + "auxiliary_loss_mlp": 0.01118325, + "balance_loss_clip": 1.09767163, + "balance_loss_mlp": 1.07307339, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.479813267575734, + "language_loss": 0.80187267, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82650775, + "num_input_tokens_seen": 5872610, + "step": 282, + "time_per_iteration": 2.4776694774627686 + }, + { + "auxiliary_loss_clip": 0.01351329, + "auxiliary_loss_mlp": 0.01122504, + "balance_loss_clip": 1.0999676, + "balance_loss_mlp": 1.0753684, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 1.968930159867162, + "language_loss": 0.77522027, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.79995859, + "num_input_tokens_seen": 5892985, + "step": 283, + "time_per_iteration": 2.5298852920532227 + }, + { + "auxiliary_loss_clip": 0.01350023, + "auxiliary_loss_mlp": 0.01094977, + "balance_loss_clip": 1.10065103, + "balance_loss_mlp": 1.05024981, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 3.61162032230288, + "language_loss": 0.84020209, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.8646521, + "num_input_tokens_seen": 5914060, + "step": 284, + "time_per_iteration": 2.6063029766082764 + }, + { + "auxiliary_loss_clip": 0.01341853, + "auxiliary_loss_mlp": 0.01106124, + "balance_loss_clip": 1.09582341, + "balance_loss_mlp": 1.05808318, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 2.3823244694630543, + "language_loss": 0.96769047, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99217021, + "num_input_tokens_seen": 5932860, + "step": 285, + "time_per_iteration": 2.5391108989715576 + }, + { + "auxiliary_loss_clip": 0.01343575, + "auxiliary_loss_mlp": 0.0109574, + "balance_loss_clip": 1.09766126, + "balance_loss_mlp": 1.05151343, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.035805266977711, + "language_loss": 0.93776071, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96215391, + "num_input_tokens_seen": 5952725, + "step": 286, + "time_per_iteration": 2.5694897174835205 + }, + { + "auxiliary_loss_clip": 0.01337339, + "auxiliary_loss_mlp": 0.01089256, + "balance_loss_clip": 1.09304154, + "balance_loss_mlp": 1.04355073, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.7015879555808981, + "language_loss": 0.92440021, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94866616, + "num_input_tokens_seen": 5970560, + "step": 287, + "time_per_iteration": 2.587815284729004 + }, + { + "auxiliary_loss_clip": 0.01339656, + "auxiliary_loss_mlp": 0.0109326, + "balance_loss_clip": 1.09215975, + "balance_loss_mlp": 1.04784083, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 2.8754417687215152, + "language_loss": 1.01777673, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04210591, + "num_input_tokens_seen": 5982980, + "step": 288, + "time_per_iteration": 2.4412081241607666 + }, + { + "auxiliary_loss_clip": 0.01233328, + "auxiliary_loss_mlp": 0.01123814, + "balance_loss_clip": 1.10167682, + "balance_loss_mlp": 1.10893714, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9307221348947577, + "language_loss": 0.6387279, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66229939, + "num_input_tokens_seen": 6049445, + "step": 289, + "time_per_iteration": 3.2459771633148193 + }, + { + "auxiliary_loss_clip": 0.01343565, + "auxiliary_loss_mlp": 0.01113671, + "balance_loss_clip": 1.09856653, + "balance_loss_mlp": 1.07023072, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.427925743478862, + "language_loss": 0.8839581, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.90853047, + "num_input_tokens_seen": 6064150, + "step": 290, + "time_per_iteration": 5.3138415813446045 + }, + { + "auxiliary_loss_clip": 0.01339214, + "auxiliary_loss_mlp": 0.0109791, + "balance_loss_clip": 1.09575605, + "balance_loss_mlp": 1.05330157, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 1.9230198200190896, + "language_loss": 0.84609938, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87047058, + "num_input_tokens_seen": 6083920, + "step": 291, + "time_per_iteration": 2.563136339187622 + }, + { + "auxiliary_loss_clip": 0.01346307, + "auxiliary_loss_mlp": 0.01112867, + "balance_loss_clip": 1.10312796, + "balance_loss_mlp": 1.06456339, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.6011466264553436, + "language_loss": 0.7267921, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75138378, + "num_input_tokens_seen": 6105460, + "step": 292, + "time_per_iteration": 2.574538230895996 + }, + { + "auxiliary_loss_clip": 0.01334533, + "auxiliary_loss_mlp": 0.01108476, + "balance_loss_clip": 1.09444451, + "balance_loss_mlp": 1.06327212, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.3082278197228443, + "language_loss": 0.87150347, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89593351, + "num_input_tokens_seen": 6122890, + "step": 293, + "time_per_iteration": 2.5188400745391846 + }, + { + "auxiliary_loss_clip": 0.01335353, + "auxiliary_loss_mlp": 0.01117073, + "balance_loss_clip": 1.09437716, + "balance_loss_mlp": 1.07210779, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.58156296475737, + "language_loss": 0.81073326, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83525747, + "num_input_tokens_seen": 6142890, + "step": 294, + "time_per_iteration": 3.9038243293762207 + }, + { + "auxiliary_loss_clip": 0.01334855, + "auxiliary_loss_mlp": 0.01121771, + "balance_loss_clip": 1.09100759, + "balance_loss_mlp": 1.0766387, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 1.8832542294351495, + "language_loss": 0.83657086, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86113715, + "num_input_tokens_seen": 6162030, + "step": 295, + "time_per_iteration": 2.562040090560913 + }, + { + "auxiliary_loss_clip": 0.01340007, + "auxiliary_loss_mlp": 0.01122642, + "balance_loss_clip": 1.10024905, + "balance_loss_mlp": 1.07736683, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 4.3199352828927235, + "language_loss": 0.84518093, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.86980742, + "num_input_tokens_seen": 6180540, + "step": 296, + "time_per_iteration": 2.534372091293335 + }, + { + "auxiliary_loss_clip": 0.01341111, + "auxiliary_loss_mlp": 0.01109048, + "balance_loss_clip": 1.09664202, + "balance_loss_mlp": 1.06470215, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 9.47845339023804, + "language_loss": 0.87818229, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90268391, + "num_input_tokens_seen": 6199425, + "step": 297, + "time_per_iteration": 2.5354127883911133 + }, + { + "auxiliary_loss_clip": 0.01339649, + "auxiliary_loss_mlp": 0.01107176, + "balance_loss_clip": 1.09500957, + "balance_loss_mlp": 1.06306839, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.06483633729234, + "language_loss": 0.88298631, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.90745461, + "num_input_tokens_seen": 6219170, + "step": 298, + "time_per_iteration": 2.5008327960968018 + }, + { + "auxiliary_loss_clip": 0.01336008, + "auxiliary_loss_mlp": 0.01130674, + "balance_loss_clip": 1.0964514, + "balance_loss_mlp": 1.08444476, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 2.431403372387156, + "language_loss": 0.8863672, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91103399, + "num_input_tokens_seen": 6237930, + "step": 299, + "time_per_iteration": 2.5303800106048584 + }, + { + "auxiliary_loss_clip": 0.01341325, + "auxiliary_loss_mlp": 0.0111682, + "balance_loss_clip": 1.09554935, + "balance_loss_mlp": 1.07087636, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 3.7544407034872926, + "language_loss": 0.64941609, + "learning_rate": 3.672392800539357e-06, + "loss": 0.67399752, + "num_input_tokens_seen": 6257170, + "step": 300, + "time_per_iteration": 2.5367417335510254 + }, + { + "auxiliary_loss_clip": 0.01341658, + "auxiliary_loss_mlp": 0.01116804, + "balance_loss_clip": 1.09932256, + "balance_loss_mlp": 1.07140875, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.4864296136859214, + "language_loss": 0.88327873, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90786338, + "num_input_tokens_seen": 6274780, + "step": 301, + "time_per_iteration": 2.4992129802703857 + }, + { + "auxiliary_loss_clip": 0.01219983, + "auxiliary_loss_mlp": 0.01103495, + "balance_loss_clip": 1.08691525, + "balance_loss_mlp": 1.08976173, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.838361940170133, + "language_loss": 0.62254959, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64578432, + "num_input_tokens_seen": 6340435, + "step": 302, + "time_per_iteration": 3.241762399673462 + }, + { + "auxiliary_loss_clip": 0.01330805, + "auxiliary_loss_mlp": 0.01112582, + "balance_loss_clip": 1.0919019, + "balance_loss_mlp": 1.06675768, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.328279772106465, + "language_loss": 0.89759696, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92203081, + "num_input_tokens_seen": 6358160, + "step": 303, + "time_per_iteration": 2.4968881607055664 + }, + { + "auxiliary_loss_clip": 0.01337659, + "auxiliary_loss_mlp": 0.01118862, + "balance_loss_clip": 1.09677899, + "balance_loss_mlp": 1.07220364, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 1.8383419497140816, + "language_loss": 0.8044219, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82898712, + "num_input_tokens_seen": 6378485, + "step": 304, + "time_per_iteration": 2.5360758304595947 + }, + { + "auxiliary_loss_clip": 0.01331076, + "auxiliary_loss_mlp": 0.01100502, + "balance_loss_clip": 1.09878373, + "balance_loss_mlp": 1.05708599, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.520208305976294, + "language_loss": 0.82941401, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85372984, + "num_input_tokens_seen": 6397845, + "step": 305, + "time_per_iteration": 2.518463611602783 + }, + { + "auxiliary_loss_clip": 0.01332569, + "auxiliary_loss_mlp": 0.01096086, + "balance_loss_clip": 1.09137988, + "balance_loss_mlp": 1.05276537, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.7907027535521303, + "language_loss": 0.90931678, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93360329, + "num_input_tokens_seen": 6416475, + "step": 306, + "time_per_iteration": 2.4926939010620117 + }, + { + "auxiliary_loss_clip": 0.01326175, + "auxiliary_loss_mlp": 0.01092986, + "balance_loss_clip": 1.08908272, + "balance_loss_mlp": 1.04999924, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.6068671595959394, + "language_loss": 0.86325067, + "learning_rate": 3.687243426879095e-06, + "loss": 0.88744229, + "num_input_tokens_seen": 6437520, + "step": 307, + "time_per_iteration": 2.5516645908355713 + }, + { + "auxiliary_loss_clip": 0.01327815, + "auxiliary_loss_mlp": 0.01108366, + "balance_loss_clip": 1.09392428, + "balance_loss_mlp": 1.06037188, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 4.175295809441147, + "language_loss": 0.71828777, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74264956, + "num_input_tokens_seen": 6455680, + "step": 308, + "time_per_iteration": 2.5085878372192383 + }, + { + "auxiliary_loss_clip": 0.01333285, + "auxiliary_loss_mlp": 0.01103971, + "balance_loss_clip": 1.09073353, + "balance_loss_mlp": 1.06005394, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.231224728646827, + "language_loss": 0.91654509, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94091773, + "num_input_tokens_seen": 6474880, + "step": 309, + "time_per_iteration": 2.5095369815826416 + }, + { + "auxiliary_loss_clip": 0.01341962, + "auxiliary_loss_mlp": 0.01107076, + "balance_loss_clip": 1.09292269, + "balance_loss_mlp": 1.06096554, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 1.8483677204587343, + "language_loss": 0.72587085, + "learning_rate": 3.69350459956065e-06, + "loss": 0.75036126, + "num_input_tokens_seen": 6495945, + "step": 310, + "time_per_iteration": 2.5712952613830566 + }, + { + "auxiliary_loss_clip": 0.01332616, + "auxiliary_loss_mlp": 0.01111299, + "balance_loss_clip": 1.09699655, + "balance_loss_mlp": 1.06673861, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 2.1563336998008786, + "language_loss": 0.74176562, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76620477, + "num_input_tokens_seen": 6519930, + "step": 311, + "time_per_iteration": 2.712759017944336 + }, + { + "auxiliary_loss_clip": 0.01339766, + "auxiliary_loss_mlp": 0.01116196, + "balance_loss_clip": 1.09383547, + "balance_loss_mlp": 1.07249331, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.5542300347140925, + "language_loss": 0.9172585, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.94181812, + "num_input_tokens_seen": 6535070, + "step": 312, + "time_per_iteration": 2.4732766151428223 + }, + { + "auxiliary_loss_clip": 0.01339407, + "auxiliary_loss_mlp": 0.01119684, + "balance_loss_clip": 1.0953604, + "balance_loss_mlp": 1.07390773, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.161492194659412, + "language_loss": 0.89793909, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92252994, + "num_input_tokens_seen": 6554135, + "step": 313, + "time_per_iteration": 2.510206937789917 + }, + { + "auxiliary_loss_clip": 0.01341727, + "auxiliary_loss_mlp": 0.01103901, + "balance_loss_clip": 1.09423101, + "balance_loss_mlp": 1.05740905, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 1.9300042227099097, + "language_loss": 0.72875816, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75321436, + "num_input_tokens_seen": 6572275, + "step": 314, + "time_per_iteration": 2.526021718978882 + }, + { + "auxiliary_loss_clip": 0.01329749, + "auxiliary_loss_mlp": 0.01106551, + "balance_loss_clip": 1.09181952, + "balance_loss_mlp": 1.06313467, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.6946247362473437, + "language_loss": 0.8969605, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92132354, + "num_input_tokens_seen": 6594520, + "step": 315, + "time_per_iteration": 2.5963077545166016 + }, + { + "auxiliary_loss_clip": 0.01331882, + "auxiliary_loss_mlp": 0.01096604, + "balance_loss_clip": 1.09381747, + "balance_loss_mlp": 1.05237722, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 3.584483433203057, + "language_loss": 0.80503726, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.8293221, + "num_input_tokens_seen": 6614245, + "step": 316, + "time_per_iteration": 2.5061070919036865 + }, + { + "auxiliary_loss_clip": 0.01326129, + "auxiliary_loss_mlp": 0.01094618, + "balance_loss_clip": 1.09128261, + "balance_loss_mlp": 1.05172658, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.0691252717151656, + "language_loss": 0.90178567, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.9259932, + "num_input_tokens_seen": 6632015, + "step": 317, + "time_per_iteration": 2.488750696182251 + }, + { + "auxiliary_loss_clip": 0.01322913, + "auxiliary_loss_mlp": 0.01097754, + "balance_loss_clip": 1.08982813, + "balance_loss_mlp": 1.05278862, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.4249503565227197, + "language_loss": 0.90965402, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93386078, + "num_input_tokens_seen": 6649015, + "step": 318, + "time_per_iteration": 2.468766212463379 + }, + { + "auxiliary_loss_clip": 0.01325631, + "auxiliary_loss_mlp": 0.01091806, + "balance_loss_clip": 1.08999777, + "balance_loss_mlp": 1.04984403, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.37059658156526, + "language_loss": 0.93873966, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96291411, + "num_input_tokens_seen": 6669225, + "step": 319, + "time_per_iteration": 2.542235851287842 + }, + { + "auxiliary_loss_clip": 0.01208177, + "auxiliary_loss_mlp": 0.01047628, + "balance_loss_clip": 1.08579254, + "balance_loss_mlp": 1.03418112, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9394075574673466, + "language_loss": 0.59889984, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62145793, + "num_input_tokens_seen": 6725775, + "step": 320, + "time_per_iteration": 3.035402297973633 + }, + { + "auxiliary_loss_clip": 0.01324748, + "auxiliary_loss_mlp": 0.01103163, + "balance_loss_clip": 1.09002709, + "balance_loss_mlp": 1.06065321, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 2.6129098674880686, + "language_loss": 0.89904606, + "learning_rate": 3.715954969092154e-06, + "loss": 0.9233253, + "num_input_tokens_seen": 6744170, + "step": 321, + "time_per_iteration": 2.496229887008667 + }, + { + "auxiliary_loss_clip": 0.01334418, + "auxiliary_loss_mlp": 0.01118722, + "balance_loss_clip": 1.09399271, + "balance_loss_mlp": 1.07387519, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.3717172801585624, + "language_loss": 0.82467294, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.8492043, + "num_input_tokens_seen": 6764565, + "step": 322, + "time_per_iteration": 2.563275098800659 + }, + { + "auxiliary_loss_clip": 0.0133429, + "auxiliary_loss_mlp": 0.0109176, + "balance_loss_clip": 1.09193921, + "balance_loss_mlp": 1.05008447, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.1752211565318262, + "language_loss": 0.72402394, + "learning_rate": 3.719954063833981e-06, + "loss": 0.74828446, + "num_input_tokens_seen": 6785310, + "step": 323, + "time_per_iteration": 2.523047924041748 + }, + { + "auxiliary_loss_clip": 0.01322281, + "auxiliary_loss_mlp": 0.01090866, + "balance_loss_clip": 1.08683538, + "balance_loss_mlp": 1.0484277, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 1.9307618113805254, + "language_loss": 0.92239487, + "learning_rate": 3.721944334919596e-06, + "loss": 0.94652629, + "num_input_tokens_seen": 6803290, + "step": 324, + "time_per_iteration": 2.5241775512695312 + }, + { + "auxiliary_loss_clip": 0.01330676, + "auxiliary_loss_mlp": 0.01088987, + "balance_loss_clip": 1.09341323, + "balance_loss_mlp": 1.04790807, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 3.3302843913081985, + "language_loss": 0.65465653, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67885315, + "num_input_tokens_seen": 6822570, + "step": 325, + "time_per_iteration": 2.4959139823913574 + }, + { + "auxiliary_loss_clip": 0.01329969, + "auxiliary_loss_mlp": 0.0110512, + "balance_loss_clip": 1.09819543, + "balance_loss_mlp": 1.06182337, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 13.386541323931564, + "language_loss": 0.76534313, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78969401, + "num_input_tokens_seen": 6841910, + "step": 326, + "time_per_iteration": 2.5278851985931396 + }, + { + "auxiliary_loss_clip": 0.01322459, + "auxiliary_loss_mlp": 0.01104874, + "balance_loss_clip": 1.09264433, + "balance_loss_mlp": 1.06112385, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.2853184985182184, + "language_loss": 0.79478103, + "learning_rate": 3.727878498433505e-06, + "loss": 0.81905437, + "num_input_tokens_seen": 6862480, + "step": 327, + "time_per_iteration": 2.5820560455322266 + }, + { + "auxiliary_loss_clip": 0.01331935, + "auxiliary_loss_mlp": 0.01110933, + "balance_loss_clip": 1.09449315, + "balance_loss_mlp": 1.0684706, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.140462945493788, + "language_loss": 0.80812597, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.8325547, + "num_input_tokens_seen": 6882015, + "step": 328, + "time_per_iteration": 2.542212963104248 + }, + { + "auxiliary_loss_clip": 0.01327113, + "auxiliary_loss_mlp": 0.01096352, + "balance_loss_clip": 1.08755779, + "balance_loss_mlp": 1.05248344, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.2872169302560157, + "language_loss": 0.93529749, + "learning_rate": 3.731804438545683e-06, + "loss": 0.95953214, + "num_input_tokens_seen": 6899785, + "step": 329, + "time_per_iteration": 2.4751827716827393 + }, + { + "auxiliary_loss_clip": 0.01334545, + "auxiliary_loss_mlp": 0.01110017, + "balance_loss_clip": 1.09319758, + "balance_loss_mlp": 1.06681561, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 3.3340798863750694, + "language_loss": 0.74575531, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.77020097, + "num_input_tokens_seen": 6918575, + "step": 330, + "time_per_iteration": 3.8920044898986816 + }, + { + "auxiliary_loss_clip": 0.01331179, + "auxiliary_loss_mlp": 0.01117037, + "balance_loss_clip": 1.0919801, + "balance_loss_mlp": 1.07364511, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 4.940256493397481, + "language_loss": 0.93349528, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.95797753, + "num_input_tokens_seen": 6936965, + "step": 331, + "time_per_iteration": 3.857365846633911 + }, + { + "auxiliary_loss_clip": 0.01318373, + "auxiliary_loss_mlp": 0.01088302, + "balance_loss_clip": 1.08976138, + "balance_loss_mlp": 1.04741287, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.1021148078879697, + "language_loss": 0.92604125, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95010793, + "num_input_tokens_seen": 6953475, + "step": 332, + "time_per_iteration": 2.4573404788970947 + }, + { + "auxiliary_loss_clip": 0.01325031, + "auxiliary_loss_mlp": 0.01092416, + "balance_loss_clip": 1.09436011, + "balance_loss_mlp": 1.04938102, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 2.9250033335138896, + "language_loss": 0.75577587, + "learning_rate": 3.739585224276384e-06, + "loss": 0.77995026, + "num_input_tokens_seen": 6971630, + "step": 333, + "time_per_iteration": 3.9254531860351562 + }, + { + "auxiliary_loss_clip": 0.01327376, + "auxiliary_loss_mlp": 0.01087271, + "balance_loss_clip": 1.09201992, + "balance_loss_mlp": 1.04611969, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.1965969866719397, + "language_loss": 0.78864884, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.8127954, + "num_input_tokens_seen": 6992775, + "step": 334, + "time_per_iteration": 2.599663734436035 + }, + { + "auxiliary_loss_clip": 0.0132656, + "auxiliary_loss_mlp": 0.01097045, + "balance_loss_clip": 1.08734179, + "balance_loss_mlp": 1.05312824, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 1.8804950161494818, + "language_loss": 0.83226275, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85649884, + "num_input_tokens_seen": 7011425, + "step": 335, + "time_per_iteration": 2.5641684532165527 + }, + { + "auxiliary_loss_clip": 0.01322965, + "auxiliary_loss_mlp": 0.01088119, + "balance_loss_clip": 1.08998549, + "balance_loss_mlp": 1.04610991, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.8221465495189912, + "language_loss": 0.92273647, + "learning_rate": 3.745359722027911e-06, + "loss": 0.94684732, + "num_input_tokens_seen": 7029450, + "step": 336, + "time_per_iteration": 2.520214080810547 + }, + { + "auxiliary_loss_clip": 0.01322602, + "auxiliary_loss_mlp": 0.01082996, + "balance_loss_clip": 1.08685946, + "balance_loss_mlp": 1.04158306, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 2.3302545987054275, + "language_loss": 0.88378799, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90784395, + "num_input_tokens_seen": 7047555, + "step": 337, + "time_per_iteration": 2.533212900161743 + }, + { + "auxiliary_loss_clip": 0.01313623, + "auxiliary_loss_mlp": 0.01102207, + "balance_loss_clip": 1.08497429, + "balance_loss_mlp": 1.05867171, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.5362255420478748, + "language_loss": 0.89893103, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92308939, + "num_input_tokens_seen": 7068185, + "step": 338, + "time_per_iteration": 2.603816270828247 + }, + { + "auxiliary_loss_clip": 0.01324362, + "auxiliary_loss_mlp": 0.01100494, + "balance_loss_clip": 1.08938348, + "balance_loss_mlp": 1.05800724, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.3547377437641495, + "language_loss": 0.85036457, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87461311, + "num_input_tokens_seen": 7085955, + "step": 339, + "time_per_iteration": 2.526341199874878 + }, + { + "auxiliary_loss_clip": 0.01328845, + "auxiliary_loss_mlp": 0.01099023, + "balance_loss_clip": 1.0922718, + "balance_loss_mlp": 1.05632198, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 2.8083599186060364, + "language_loss": 0.88858235, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91286105, + "num_input_tokens_seen": 7106345, + "step": 340, + "time_per_iteration": 2.543834686279297 + }, + { + "auxiliary_loss_clip": 0.01323876, + "auxiliary_loss_mlp": 0.0108679, + "balance_loss_clip": 1.08833981, + "balance_loss_mlp": 1.04251611, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 4.507233545455789, + "language_loss": 0.88281071, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90691733, + "num_input_tokens_seen": 7125070, + "step": 341, + "time_per_iteration": 2.5411481857299805 + }, + { + "auxiliary_loss_clip": 0.01326993, + "auxiliary_loss_mlp": 0.01102801, + "balance_loss_clip": 1.08800399, + "balance_loss_mlp": 1.05990982, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 2.9408702366544794, + "language_loss": 0.80213362, + "learning_rate": 3.756755633390458e-06, + "loss": 0.82643157, + "num_input_tokens_seen": 7144675, + "step": 342, + "time_per_iteration": 2.560076951980591 + }, + { + "auxiliary_loss_clip": 0.01315378, + "auxiliary_loss_mlp": 0.01097228, + "balance_loss_clip": 1.08657849, + "balance_loss_mlp": 1.05252409, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 2.4694771969817886, + "language_loss": 0.89687574, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.92100179, + "num_input_tokens_seen": 7165505, + "step": 343, + "time_per_iteration": 2.5710017681121826 + }, + { + "auxiliary_loss_clip": 0.01326274, + "auxiliary_loss_mlp": 0.01096059, + "balance_loss_clip": 1.09418142, + "balance_loss_mlp": 1.05555153, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 1.9578545417081916, + "language_loss": 0.78210926, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80633259, + "num_input_tokens_seen": 7184605, + "step": 344, + "time_per_iteration": 2.5151400566101074 + }, + { + "auxiliary_loss_clip": 0.01311232, + "auxiliary_loss_mlp": 0.01102438, + "balance_loss_clip": 1.08548689, + "balance_loss_mlp": 1.05868864, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 1.856941840949793, + "language_loss": 0.74990153, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77403826, + "num_input_tokens_seen": 7203065, + "step": 345, + "time_per_iteration": 2.4936389923095703 + }, + { + "auxiliary_loss_clip": 0.01318657, + "auxiliary_loss_mlp": 0.01102253, + "balance_loss_clip": 1.09153533, + "balance_loss_mlp": 1.05962384, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.1365767476921906, + "language_loss": 0.90456986, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92877901, + "num_input_tokens_seen": 7222995, + "step": 346, + "time_per_iteration": 2.562014579772949 + }, + { + "auxiliary_loss_clip": 0.01313735, + "auxiliary_loss_mlp": 0.01093796, + "balance_loss_clip": 1.08558679, + "balance_loss_mlp": 1.05452871, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 1.9901221156972229, + "language_loss": 0.7887423, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81281763, + "num_input_tokens_seen": 7244625, + "step": 347, + "time_per_iteration": 2.5340518951416016 + }, + { + "auxiliary_loss_clip": 0.01319291, + "auxiliary_loss_mlp": 0.01098395, + "balance_loss_clip": 1.09145069, + "balance_loss_mlp": 1.05533695, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 1.8115406080892402, + "language_loss": 0.71126294, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.7354399, + "num_input_tokens_seen": 7263255, + "step": 348, + "time_per_iteration": 2.5632107257843018 + }, + { + "auxiliary_loss_clip": 0.01321399, + "auxiliary_loss_mlp": 0.01102718, + "balance_loss_clip": 1.0878005, + "balance_loss_mlp": 1.05977845, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 2.6166965018956314, + "language_loss": 0.76650506, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79074621, + "num_input_tokens_seen": 7279275, + "step": 349, + "time_per_iteration": 2.469172477722168 + }, + { + "auxiliary_loss_clip": 0.01306461, + "auxiliary_loss_mlp": 0.01103875, + "balance_loss_clip": 1.0866605, + "balance_loss_mlp": 1.06250906, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.6622499677827813, + "language_loss": 0.85162234, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87572575, + "num_input_tokens_seen": 7300180, + "step": 350, + "time_per_iteration": 2.5673046112060547 + }, + { + "auxiliary_loss_clip": 0.01314907, + "auxiliary_loss_mlp": 0.01090451, + "balance_loss_clip": 1.09089875, + "balance_loss_mlp": 1.05115986, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.1384174562302634, + "language_loss": 0.79781115, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82186472, + "num_input_tokens_seen": 7317430, + "step": 351, + "time_per_iteration": 2.4930734634399414 + }, + { + "auxiliary_loss_clip": 0.0132186, + "auxiliary_loss_mlp": 0.01104357, + "balance_loss_clip": 1.09109211, + "balance_loss_mlp": 1.06144142, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 2.639562652236807, + "language_loss": 0.87569571, + "learning_rate": 3.775311735671078e-06, + "loss": 0.89995795, + "num_input_tokens_seen": 7334875, + "step": 352, + "time_per_iteration": 2.4874041080474854 + }, + { + "auxiliary_loss_clip": 0.01314178, + "auxiliary_loss_mlp": 0.01103263, + "balance_loss_clip": 1.08946502, + "balance_loss_mlp": 1.06001389, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 1.8311512367302993, + "language_loss": 0.82362443, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.84779876, + "num_input_tokens_seen": 7355185, + "step": 353, + "time_per_iteration": 2.551184892654419 + }, + { + "auxiliary_loss_clip": 0.01311988, + "auxiliary_loss_mlp": 0.01097338, + "balance_loss_clip": 1.08867431, + "balance_loss_mlp": 1.0570451, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.760516330628348, + "language_loss": 0.81079543, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83488864, + "num_input_tokens_seen": 7374425, + "step": 354, + "time_per_iteration": 2.5483357906341553 + }, + { + "auxiliary_loss_clip": 0.0131645, + "auxiliary_loss_mlp": 0.01092506, + "balance_loss_clip": 1.08512473, + "balance_loss_mlp": 1.04899466, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.033471943140301, + "language_loss": 0.81049144, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83458096, + "num_input_tokens_seen": 7394175, + "step": 355, + "time_per_iteration": 2.531355142593384 + }, + { + "auxiliary_loss_clip": 0.01313725, + "auxiliary_loss_mlp": 0.01090674, + "balance_loss_clip": 1.08578539, + "balance_loss_mlp": 1.048998, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.1836311633094274, + "language_loss": 0.89332211, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.91736603, + "num_input_tokens_seen": 7412645, + "step": 356, + "time_per_iteration": 2.487934112548828 + }, + { + "auxiliary_loss_clip": 0.01310702, + "auxiliary_loss_mlp": 0.01085635, + "balance_loss_clip": 1.0877552, + "balance_loss_mlp": 1.04314899, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 1.9999570316142263, + "language_loss": 0.8024286, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82639205, + "num_input_tokens_seen": 7432275, + "step": 357, + "time_per_iteration": 2.559035062789917 + }, + { + "auxiliary_loss_clip": 0.01312653, + "auxiliary_loss_mlp": 0.01085897, + "balance_loss_clip": 1.08524656, + "balance_loss_mlp": 1.04705894, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.3131962209489867, + "language_loss": 0.76576149, + "learning_rate": 3.786194003461506e-06, + "loss": 0.78974694, + "num_input_tokens_seen": 7450245, + "step": 358, + "time_per_iteration": 2.481630325317383 + }, + { + "auxiliary_loss_clip": 0.01310087, + "auxiliary_loss_mlp": 0.01091409, + "balance_loss_clip": 1.08329225, + "balance_loss_mlp": 1.04837501, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 2.7019611385348297, + "language_loss": 0.88065517, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90467018, + "num_input_tokens_seen": 7466845, + "step": 359, + "time_per_iteration": 2.4632513523101807 + }, + { + "auxiliary_loss_clip": 0.01317968, + "auxiliary_loss_mlp": 0.0108999, + "balance_loss_clip": 1.08885562, + "balance_loss_mlp": 1.05041277, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 2.829075214936934, + "language_loss": 0.75971377, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78379339, + "num_input_tokens_seen": 7485450, + "step": 360, + "time_per_iteration": 2.520082473754883 + }, + { + "auxiliary_loss_clip": 0.0119617, + "auxiliary_loss_mlp": 0.0104815, + "balance_loss_clip": 1.07889986, + "balance_loss_mlp": 1.03508461, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.9443858592817307, + "language_loss": 0.64945179, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67189497, + "num_input_tokens_seen": 7553780, + "step": 361, + "time_per_iteration": 3.2192747592926025 + }, + { + "auxiliary_loss_clip": 0.01309203, + "auxiliary_loss_mlp": 0.01088367, + "balance_loss_clip": 1.08199942, + "balance_loss_mlp": 1.04855132, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 3.0445687010131754, + "language_loss": 0.78258491, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80656058, + "num_input_tokens_seen": 7574155, + "step": 362, + "time_per_iteration": 2.5585684776306152 + }, + { + "auxiliary_loss_clip": 0.01311552, + "auxiliary_loss_mlp": 0.01094549, + "balance_loss_clip": 1.08443356, + "balance_loss_mlp": 1.05444729, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.0296218972829916, + "language_loss": 0.92699635, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.95105743, + "num_input_tokens_seen": 7592320, + "step": 363, + "time_per_iteration": 2.4929096698760986 + }, + { + "auxiliary_loss_clip": 0.01307803, + "auxiliary_loss_mlp": 0.0110003, + "balance_loss_clip": 1.08417583, + "balance_loss_mlp": 1.06114376, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.3473632528503794, + "language_loss": 0.89641362, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92049193, + "num_input_tokens_seen": 7611185, + "step": 364, + "time_per_iteration": 2.5262603759765625 + }, + { + "auxiliary_loss_clip": 0.01313328, + "auxiliary_loss_mlp": 0.01094652, + "balance_loss_clip": 1.08764839, + "balance_loss_mlp": 1.05285728, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 1.9697107324262233, + "language_loss": 0.79484671, + "learning_rate": 3.798661793553676e-06, + "loss": 0.81892651, + "num_input_tokens_seen": 7631970, + "step": 365, + "time_per_iteration": 2.4976956844329834 + }, + { + "auxiliary_loss_clip": 0.0130722, + "auxiliary_loss_mlp": 0.01095163, + "balance_loss_clip": 1.08438301, + "balance_loss_mlp": 1.05274868, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 1.8231594100674484, + "language_loss": 0.84438133, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.8684051, + "num_input_tokens_seen": 7649745, + "step": 366, + "time_per_iteration": 2.5000321865081787 + }, + { + "auxiliary_loss_clip": 0.01313517, + "auxiliary_loss_mlp": 0.01090237, + "balance_loss_clip": 1.08552587, + "balance_loss_mlp": 1.05199432, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 1.844294205659595, + "language_loss": 0.86911905, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89315659, + "num_input_tokens_seen": 7668830, + "step": 367, + "time_per_iteration": 2.4984028339385986 + }, + { + "auxiliary_loss_clip": 0.01314941, + "auxiliary_loss_mlp": 0.01097101, + "balance_loss_clip": 1.0840205, + "balance_loss_mlp": 1.05552065, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 2.166900845087149, + "language_loss": 0.84954393, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87366438, + "num_input_tokens_seen": 7687240, + "step": 368, + "time_per_iteration": 3.8960788249969482 + }, + { + "auxiliary_loss_clip": 0.01312568, + "auxiliary_loss_mlp": 0.01085779, + "balance_loss_clip": 1.08075666, + "balance_loss_mlp": 1.04622507, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 4.5238428897406475, + "language_loss": 0.75726801, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.78125143, + "num_input_tokens_seen": 7704440, + "step": 369, + "time_per_iteration": 3.908716917037964 + }, + { + "auxiliary_loss_clip": 0.01307742, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_clip": 1.08364892, + "balance_loss_mlp": 1.0634867, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.1716404296819607, + "language_loss": 0.8283217, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.85242546, + "num_input_tokens_seen": 7727160, + "step": 370, + "time_per_iteration": 3.9636473655700684 + }, + { + "auxiliary_loss_clip": 0.01306043, + "auxiliary_loss_mlp": 0.01097403, + "balance_loss_clip": 1.08258677, + "balance_loss_mlp": 1.05722928, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 3.290609403606036, + "language_loss": 0.81994444, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.84397888, + "num_input_tokens_seen": 7747730, + "step": 371, + "time_per_iteration": 2.505772113800049 + }, + { + "auxiliary_loss_clip": 0.01312886, + "auxiliary_loss_mlp": 0.010942, + "balance_loss_clip": 1.08814073, + "balance_loss_mlp": 1.05288219, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.0477414870826864, + "language_loss": 0.83611, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.86018085, + "num_input_tokens_seen": 7766765, + "step": 372, + "time_per_iteration": 2.5135765075683594 + }, + { + "auxiliary_loss_clip": 0.01306217, + "auxiliary_loss_mlp": 0.01091586, + "balance_loss_clip": 1.08448815, + "balance_loss_mlp": 1.05112648, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 4.033468741804688, + "language_loss": 0.78865242, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.81263047, + "num_input_tokens_seen": 7784010, + "step": 373, + "time_per_iteration": 3.9449522495269775 + }, + { + "auxiliary_loss_clip": 0.01310325, + "auxiliary_loss_mlp": 0.01089303, + "balance_loss_clip": 1.08591986, + "balance_loss_mlp": 1.04769862, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 2.5650283271208494, + "language_loss": 0.77905834, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.80305457, + "num_input_tokens_seen": 7801305, + "step": 374, + "time_per_iteration": 2.476372718811035 + }, + { + "auxiliary_loss_clip": 0.01302677, + "auxiliary_loss_mlp": 0.01078784, + "balance_loss_clip": 1.07821178, + "balance_loss_mlp": 1.03739512, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 1.9737072839295156, + "language_loss": 0.86098623, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.88480085, + "num_input_tokens_seen": 7823965, + "step": 375, + "time_per_iteration": 2.568939208984375 + }, + { + "auxiliary_loss_clip": 0.01308314, + "auxiliary_loss_mlp": 0.01100359, + "balance_loss_clip": 1.08594131, + "balance_loss_mlp": 1.05839717, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 2.238733232050167, + "language_loss": 0.88854146, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91262823, + "num_input_tokens_seen": 7842115, + "step": 376, + "time_per_iteration": 2.514871120452881 + }, + { + "auxiliary_loss_clip": 0.01307735, + "auxiliary_loss_mlp": 0.01084168, + "balance_loss_clip": 1.08049881, + "balance_loss_mlp": 1.04542482, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 2.1383514500195613, + "language_loss": 0.74967128, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77359033, + "num_input_tokens_seen": 7857830, + "step": 377, + "time_per_iteration": 2.4893674850463867 + }, + { + "auxiliary_loss_clip": 0.01299877, + "auxiliary_loss_mlp": 0.01095816, + "balance_loss_clip": 1.08405828, + "balance_loss_mlp": 1.0550468, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 2.3126811011248107, + "language_loss": 0.9934442, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01740122, + "num_input_tokens_seen": 7875840, + "step": 378, + "time_per_iteration": 2.535586357116699 + }, + { + "auxiliary_loss_clip": 0.0118095, + "auxiliary_loss_mlp": 0.01062227, + "balance_loss_clip": 1.06609571, + "balance_loss_mlp": 1.04992473, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 1.0027553635834887, + "language_loss": 0.75430381, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77673554, + "num_input_tokens_seen": 7940190, + "step": 379, + "time_per_iteration": 3.14353609085083 + }, + { + "auxiliary_loss_clip": 0.01309714, + "auxiliary_loss_mlp": 0.01091818, + "balance_loss_clip": 1.0808115, + "balance_loss_mlp": 1.05331302, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 2.177222785802864, + "language_loss": 0.783696, + "learning_rate": 3.824592231451859e-06, + "loss": 0.8077113, + "num_input_tokens_seen": 7960840, + "step": 380, + "time_per_iteration": 2.6313724517822266 + }, + { + "auxiliary_loss_clip": 0.01303885, + "auxiliary_loss_mlp": 0.01086314, + "balance_loss_clip": 1.08352447, + "balance_loss_mlp": 1.04768968, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.2643992893925593, + "language_loss": 0.96738076, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99128282, + "num_input_tokens_seen": 7975500, + "step": 381, + "time_per_iteration": 2.4949941635131836 + }, + { + "auxiliary_loss_clip": 0.0131303, + "auxiliary_loss_mlp": 0.01088939, + "balance_loss_clip": 1.08519292, + "balance_loss_mlp": 1.04940903, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.07957927508733, + "language_loss": 0.88007009, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90408981, + "num_input_tokens_seen": 7993880, + "step": 382, + "time_per_iteration": 2.5108087062835693 + }, + { + "auxiliary_loss_clip": 0.01305117, + "auxiliary_loss_mlp": 0.01097925, + "balance_loss_clip": 1.08524728, + "balance_loss_mlp": 1.05882406, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.308514563186946, + "language_loss": 0.84561276, + "learning_rate": 3.829655315342268e-06, + "loss": 0.86964315, + "num_input_tokens_seen": 8012730, + "step": 383, + "time_per_iteration": 2.514528274536133 + }, + { + "auxiliary_loss_clip": 0.01303385, + "auxiliary_loss_mlp": 0.01112403, + "balance_loss_clip": 1.08614123, + "balance_loss_mlp": 1.07435107, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.179193204534442, + "language_loss": 0.83356506, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85772288, + "num_input_tokens_seen": 8031275, + "step": 384, + "time_per_iteration": 2.490983009338379 + }, + { + "auxiliary_loss_clip": 0.01303931, + "auxiliary_loss_mlp": 0.01092374, + "balance_loss_clip": 1.08914554, + "balance_loss_mlp": 1.05608618, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.7668915939961258, + "language_loss": 0.88927078, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91323388, + "num_input_tokens_seen": 8051600, + "step": 385, + "time_per_iteration": 2.5229649543762207 + }, + { + "auxiliary_loss_clip": 0.01307087, + "auxiliary_loss_mlp": 0.01120547, + "balance_loss_clip": 1.0859611, + "balance_loss_mlp": 1.08216166, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 1.7522415908177935, + "language_loss": 0.69699174, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72126812, + "num_input_tokens_seen": 8070600, + "step": 386, + "time_per_iteration": 2.489877700805664 + }, + { + "auxiliary_loss_clip": 0.01308004, + "auxiliary_loss_mlp": 0.01092931, + "balance_loss_clip": 1.08742118, + "balance_loss_mlp": 1.05521369, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 2.0080706504945542, + "language_loss": 0.87942457, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90343398, + "num_input_tokens_seen": 8090680, + "step": 387, + "time_per_iteration": 2.5519814491271973 + }, + { + "auxiliary_loss_clip": 0.01308927, + "auxiliary_loss_mlp": 0.01085443, + "balance_loss_clip": 1.08691835, + "balance_loss_mlp": 1.04591358, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 2.7454276149442975, + "language_loss": 0.83656776, + "learning_rate": 3.838006303795566e-06, + "loss": 0.86051142, + "num_input_tokens_seen": 8114610, + "step": 388, + "time_per_iteration": 2.5873231887817383 + }, + { + "auxiliary_loss_clip": 0.01305076, + "auxiliary_loss_mlp": 0.01088436, + "balance_loss_clip": 1.08552885, + "balance_loss_mlp": 1.05219591, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 6.0999110916679085, + "language_loss": 0.94020915, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96414423, + "num_input_tokens_seen": 8133975, + "step": 389, + "time_per_iteration": 2.556816816329956 + }, + { + "auxiliary_loss_clip": 0.01298351, + "auxiliary_loss_mlp": 0.01084319, + "balance_loss_clip": 1.08533287, + "balance_loss_mlp": 1.04595757, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.2801711694030815, + "language_loss": 0.87858617, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90241289, + "num_input_tokens_seen": 8153570, + "step": 390, + "time_per_iteration": 2.5291900634765625 + }, + { + "auxiliary_loss_clip": 0.01303754, + "auxiliary_loss_mlp": 0.0109218, + "balance_loss_clip": 1.08749747, + "balance_loss_mlp": 1.05639374, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.184041115767674, + "language_loss": 0.89398754, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91794682, + "num_input_tokens_seen": 8170075, + "step": 391, + "time_per_iteration": 2.543987274169922 + }, + { + "auxiliary_loss_clip": 0.0130094, + "auxiliary_loss_mlp": 0.01076434, + "balance_loss_clip": 1.08453941, + "balance_loss_mlp": 1.03974128, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.0509513442390026, + "language_loss": 0.8607955, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88456917, + "num_input_tokens_seen": 8190420, + "step": 392, + "time_per_iteration": 2.5236165523529053 + }, + { + "auxiliary_loss_clip": 0.0129821, + "auxiliary_loss_mlp": 0.01089614, + "balance_loss_clip": 1.08452177, + "balance_loss_mlp": 1.05227804, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 2.061757974902649, + "language_loss": 0.88949513, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91337335, + "num_input_tokens_seen": 8208790, + "step": 393, + "time_per_iteration": 2.495410680770874 + }, + { + "auxiliary_loss_clip": 0.01308041, + "auxiliary_loss_mlp": 0.01108463, + "balance_loss_clip": 1.08971059, + "balance_loss_mlp": 1.06881428, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 1.9445143273333805, + "language_loss": 0.81455553, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83872056, + "num_input_tokens_seen": 8226885, + "step": 394, + "time_per_iteration": 2.471144676208496 + }, + { + "auxiliary_loss_clip": 0.01299191, + "auxiliary_loss_mlp": 0.01090305, + "balance_loss_clip": 1.08261597, + "balance_loss_mlp": 1.05158556, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.1154644933840387, + "language_loss": 0.85621691, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88011181, + "num_input_tokens_seen": 8246825, + "step": 395, + "time_per_iteration": 2.5246951580047607 + }, + { + "auxiliary_loss_clip": 0.0118279, + "auxiliary_loss_mlp": 0.01038124, + "balance_loss_clip": 1.07161582, + "balance_loss_mlp": 1.02663231, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9798267886652037, + "language_loss": 0.63866138, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66087043, + "num_input_tokens_seen": 8302835, + "step": 396, + "time_per_iteration": 2.929664373397827 + }, + { + "auxiliary_loss_clip": 0.01296999, + "auxiliary_loss_mlp": 0.01072306, + "balance_loss_clip": 1.08221042, + "balance_loss_mlp": 1.03537536, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.5270554548033073, + "language_loss": 0.83809316, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86178619, + "num_input_tokens_seen": 8320745, + "step": 397, + "time_per_iteration": 2.5194251537323 + }, + { + "auxiliary_loss_clip": 0.01301068, + "auxiliary_loss_mlp": 0.0109072, + "balance_loss_clip": 1.08536923, + "balance_loss_mlp": 1.05309761, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.217874653480365, + "language_loss": 0.84606141, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86997926, + "num_input_tokens_seen": 8339540, + "step": 398, + "time_per_iteration": 2.5117952823638916 + }, + { + "auxiliary_loss_clip": 0.01300739, + "auxiliary_loss_mlp": 0.01077769, + "balance_loss_clip": 1.08120346, + "balance_loss_mlp": 1.0391928, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.6147474600277825, + "language_loss": 0.86140633, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88519132, + "num_input_tokens_seen": 8354890, + "step": 399, + "time_per_iteration": 2.4803245067596436 + }, + { + "auxiliary_loss_clip": 0.01296234, + "auxiliary_loss_mlp": 0.01088423, + "balance_loss_clip": 1.08429503, + "balance_loss_mlp": 1.05132508, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 1.9520564483724356, + "language_loss": 0.86424005, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88808668, + "num_input_tokens_seen": 8375845, + "step": 400, + "time_per_iteration": 2.547632932662964 + }, + { + "auxiliary_loss_clip": 0.01302918, + "auxiliary_loss_mlp": 0.01082507, + "balance_loss_clip": 1.08725262, + "balance_loss_mlp": 1.04552794, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 5.219954263965941, + "language_loss": 0.79198027, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.81583452, + "num_input_tokens_seen": 8395240, + "step": 401, + "time_per_iteration": 2.5446009635925293 + }, + { + "auxiliary_loss_clip": 0.01295131, + "auxiliary_loss_mlp": 0.0108941, + "balance_loss_clip": 1.08119714, + "balance_loss_mlp": 1.05283642, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.042010850322047, + "language_loss": 0.7874651, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.81131053, + "num_input_tokens_seen": 8416950, + "step": 402, + "time_per_iteration": 2.5405514240264893 + }, + { + "auxiliary_loss_clip": 0.01301418, + "auxiliary_loss_mlp": 0.01081609, + "balance_loss_clip": 1.08259165, + "balance_loss_mlp": 1.04305673, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.5081682172187754, + "language_loss": 0.94641459, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97024494, + "num_input_tokens_seen": 8433660, + "step": 403, + "time_per_iteration": 2.4966673851013184 + }, + { + "auxiliary_loss_clip": 0.01304372, + "auxiliary_loss_mlp": 0.01091688, + "balance_loss_clip": 1.08116198, + "balance_loss_mlp": 1.05220604, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.6112802569762135, + "language_loss": 0.99814641, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02210689, + "num_input_tokens_seen": 8450180, + "step": 404, + "time_per_iteration": 2.466996669769287 + }, + { + "auxiliary_loss_clip": 0.01304505, + "auxiliary_loss_mlp": 0.0110411, + "balance_loss_clip": 1.08484054, + "balance_loss_mlp": 1.06696403, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.9939423114247674, + "language_loss": 0.87750989, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90159607, + "num_input_tokens_seen": 8467775, + "step": 405, + "time_per_iteration": 2.4818620681762695 + }, + { + "auxiliary_loss_clip": 0.01311309, + "auxiliary_loss_mlp": 0.01101068, + "balance_loss_clip": 1.08778858, + "balance_loss_mlp": 1.06308818, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 2.3720614800987323, + "language_loss": 0.9329437, + "learning_rate": 3.867203596705844e-06, + "loss": 0.95706749, + "num_input_tokens_seen": 8486765, + "step": 406, + "time_per_iteration": 2.4722554683685303 + }, + { + "auxiliary_loss_clip": 0.013037, + "auxiliary_loss_mlp": 0.01091249, + "balance_loss_clip": 1.08672905, + "balance_loss_mlp": 1.05231547, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.039096268801717, + "language_loss": 0.87003422, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89398366, + "num_input_tokens_seen": 8506515, + "step": 407, + "time_per_iteration": 2.50523042678833 + }, + { + "auxiliary_loss_clip": 0.01300194, + "auxiliary_loss_mlp": 0.01084337, + "balance_loss_clip": 1.08679223, + "balance_loss_mlp": 1.04723918, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.0433415160358543, + "language_loss": 0.74289078, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76673603, + "num_input_tokens_seen": 8528035, + "step": 408, + "time_per_iteration": 3.954195499420166 + }, + { + "auxiliary_loss_clip": 0.01305114, + "auxiliary_loss_mlp": 0.0109265, + "balance_loss_clip": 1.08610761, + "balance_loss_mlp": 1.05462217, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 3.308419862807477, + "language_loss": 0.92414629, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94812393, + "num_input_tokens_seen": 8546455, + "step": 409, + "time_per_iteration": 3.9460253715515137 + }, + { + "auxiliary_loss_clip": 0.01302177, + "auxiliary_loss_mlp": 0.01077419, + "balance_loss_clip": 1.08635283, + "balance_loss_mlp": 1.04318166, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 3.5334584229635304, + "language_loss": 0.82585609, + "learning_rate": 3.873515923575128e-06, + "loss": 0.84965205, + "num_input_tokens_seen": 8568450, + "step": 410, + "time_per_iteration": 3.980009078979492 + }, + { + "auxiliary_loss_clip": 0.01304367, + "auxiliary_loss_mlp": 0.0109329, + "balance_loss_clip": 1.08642793, + "balance_loss_mlp": 1.05616808, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 2.5289261243078394, + "language_loss": 0.77953219, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80350882, + "num_input_tokens_seen": 8589340, + "step": 411, + "time_per_iteration": 2.5406899452209473 + }, + { + "auxiliary_loss_clip": 0.01302626, + "auxiliary_loss_mlp": 0.01099911, + "balance_loss_clip": 1.08339143, + "balance_loss_mlp": 1.06166887, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.2736611440976167, + "language_loss": 0.8649956, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88902104, + "num_input_tokens_seen": 8607150, + "step": 412, + "time_per_iteration": 2.4936506748199463 + }, + { + "auxiliary_loss_clip": 0.01169076, + "auxiliary_loss_mlp": 0.01061884, + "balance_loss_clip": 1.06054318, + "balance_loss_mlp": 1.05091715, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8498793146915012, + "language_loss": 0.5854373, + "learning_rate": 3.878209884949994e-06, + "loss": 0.6077469, + "num_input_tokens_seen": 8669865, + "step": 413, + "time_per_iteration": 4.616122722625732 + }, + { + "auxiliary_loss_clip": 0.01294664, + "auxiliary_loss_mlp": 0.01093039, + "balance_loss_clip": 1.0805521, + "balance_loss_mlp": 1.05267429, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.6834506535470901, + "language_loss": 0.80654544, + "learning_rate": 3.879766964750006e-06, + "loss": 0.83042252, + "num_input_tokens_seen": 8690235, + "step": 414, + "time_per_iteration": 2.613189458847046 + }, + { + "auxiliary_loss_clip": 0.01291478, + "auxiliary_loss_mlp": 0.01096323, + "balance_loss_clip": 1.08050799, + "balance_loss_mlp": 1.05917728, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.3857012662037014, + "language_loss": 0.80220962, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82608765, + "num_input_tokens_seen": 8706295, + "step": 415, + "time_per_iteration": 2.4934582710266113 + }, + { + "auxiliary_loss_clip": 0.01309313, + "auxiliary_loss_mlp": 0.01085492, + "balance_loss_clip": 1.08717442, + "balance_loss_mlp": 1.04834676, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 2.9143216112797234, + "language_loss": 0.96300751, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98695552, + "num_input_tokens_seen": 8724200, + "step": 416, + "time_per_iteration": 2.508213520050049 + }, + { + "auxiliary_loss_clip": 0.0129781, + "auxiliary_loss_mlp": 0.01077933, + "balance_loss_clip": 1.08289409, + "balance_loss_mlp": 1.03883266, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 1.5542633821098661, + "language_loss": 0.77392864, + "learning_rate": 3.884415737173176e-06, + "loss": 0.7976861, + "num_input_tokens_seen": 8744170, + "step": 417, + "time_per_iteration": 2.537775754928589 + }, + { + "auxiliary_loss_clip": 0.01294253, + "auxiliary_loss_mlp": 0.01094293, + "balance_loss_clip": 1.08534837, + "balance_loss_mlp": 1.05674255, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.6428277988881896, + "language_loss": 0.77056283, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79444826, + "num_input_tokens_seen": 8765120, + "step": 418, + "time_per_iteration": 2.568434953689575 + }, + { + "auxiliary_loss_clip": 0.01302401, + "auxiliary_loss_mlp": 0.01076298, + "balance_loss_clip": 1.08733559, + "balance_loss_mlp": 1.04051173, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.5073441006542896, + "language_loss": 0.8146444, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83843136, + "num_input_tokens_seen": 8783500, + "step": 419, + "time_per_iteration": 2.4727859497070312 + }, + { + "auxiliary_loss_clip": 0.01296939, + "auxiliary_loss_mlp": 0.01085368, + "balance_loss_clip": 1.08577549, + "balance_loss_mlp": 1.04664934, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 2.441449583315054, + "language_loss": 0.73835105, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.76217413, + "num_input_tokens_seen": 8801175, + "step": 420, + "time_per_iteration": 2.4857852458953857 + }, + { + "auxiliary_loss_clip": 0.01296221, + "auxiliary_loss_mlp": 0.01096019, + "balance_loss_clip": 1.0814327, + "balance_loss_mlp": 1.05920732, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.7161945115572104, + "language_loss": 0.78964794, + "learning_rate": 3.890562344079484e-06, + "loss": 0.81357038, + "num_input_tokens_seen": 8820215, + "step": 421, + "time_per_iteration": 2.5336453914642334 + }, + { + "auxiliary_loss_clip": 0.01294997, + "auxiliary_loss_mlp": 0.01090051, + "balance_loss_clip": 1.08439362, + "balance_loss_mlp": 1.05085516, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.869340971483033, + "language_loss": 0.81685144, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84070194, + "num_input_tokens_seen": 8839660, + "step": 422, + "time_per_iteration": 2.573496103286743 + }, + { + "auxiliary_loss_clip": 0.01298319, + "auxiliary_loss_mlp": 0.0108221, + "balance_loss_clip": 1.08178818, + "balance_loss_mlp": 1.04699564, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 1.8121756505453113, + "language_loss": 0.83335894, + "learning_rate": 3.893613781940409e-06, + "loss": 0.8571642, + "num_input_tokens_seen": 8859280, + "step": 423, + "time_per_iteration": 2.52799129486084 + }, + { + "auxiliary_loss_clip": 0.0129138, + "auxiliary_loss_mlp": 0.0108122, + "balance_loss_clip": 1.07955217, + "balance_loss_mlp": 1.04545772, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 1.9244656666327609, + "language_loss": 0.74307859, + "learning_rate": 3.895134094768415e-06, + "loss": 0.76680458, + "num_input_tokens_seen": 8880560, + "step": 424, + "time_per_iteration": 2.7121007442474365 + }, + { + "auxiliary_loss_clip": 0.01300802, + "auxiliary_loss_mlp": 0.01100311, + "balance_loss_clip": 1.0853703, + "balance_loss_mlp": 1.06481028, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 3.73347394985447, + "language_loss": 0.83358854, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85759968, + "num_input_tokens_seen": 8899155, + "step": 425, + "time_per_iteration": 2.481843948364258 + }, + { + "auxiliary_loss_clip": 0.01296658, + "auxiliary_loss_mlp": 0.01088809, + "balance_loss_clip": 1.07773089, + "balance_loss_mlp": 1.05028021, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 5.574313270590668, + "language_loss": 0.85630953, + "learning_rate": 3.898163992988186e-06, + "loss": 0.88016415, + "num_input_tokens_seen": 8917890, + "step": 426, + "time_per_iteration": 2.594860076904297 + }, + { + "auxiliary_loss_clip": 0.01169802, + "auxiliary_loss_mlp": 0.01028576, + "balance_loss_clip": 1.06239998, + "balance_loss_mlp": 1.01727474, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.902800011789361, + "language_loss": 0.57298642, + "learning_rate": 3.899673611929491e-06, + "loss": 0.59497017, + "num_input_tokens_seen": 8978260, + "step": 427, + "time_per_iteration": 3.1978750228881836 + }, + { + "auxiliary_loss_clip": 0.01297447, + "auxiliary_loss_mlp": 0.01093804, + "balance_loss_clip": 1.08717608, + "balance_loss_mlp": 1.05827975, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.370365355975273, + "language_loss": 0.8783105, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90222299, + "num_input_tokens_seen": 8994460, + "step": 428, + "time_per_iteration": 2.5779666900634766 + }, + { + "auxiliary_loss_clip": 0.0129053, + "auxiliary_loss_mlp": 0.01077985, + "balance_loss_clip": 1.08059037, + "balance_loss_mlp": 1.03945637, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.5888378280551276, + "language_loss": 0.85852361, + "learning_rate": 3.902682272467353e-06, + "loss": 0.8822087, + "num_input_tokens_seen": 9016670, + "step": 429, + "time_per_iteration": 2.579270124435425 + }, + { + "auxiliary_loss_clip": 0.01295644, + "auxiliary_loss_mlp": 0.01081738, + "balance_loss_clip": 1.0792129, + "balance_loss_mlp": 1.04354286, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.306694592278278, + "language_loss": 0.88351828, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90729213, + "num_input_tokens_seen": 9039720, + "step": 430, + "time_per_iteration": 2.5945487022399902 + }, + { + "auxiliary_loss_clip": 0.01297089, + "auxiliary_loss_mlp": 0.01081621, + "balance_loss_clip": 1.08788157, + "balance_loss_mlp": 1.0467639, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 1.866764061918992, + "language_loss": 0.8402797, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86406684, + "num_input_tokens_seen": 9059850, + "step": 431, + "time_per_iteration": 2.5026040077209473 + }, + { + "auxiliary_loss_clip": 0.01293406, + "auxiliary_loss_mlp": 0.0107492, + "balance_loss_clip": 1.08274579, + "balance_loss_mlp": 1.0407784, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 4.540804590060525, + "language_loss": 0.86289227, + "learning_rate": 3.907169065422638e-06, + "loss": 0.88657558, + "num_input_tokens_seen": 9077590, + "step": 432, + "time_per_iteration": 2.4859097003936768 + }, + { + "auxiliary_loss_clip": 0.01294266, + "auxiliary_loss_mlp": 0.01072584, + "balance_loss_clip": 1.08290887, + "balance_loss_mlp": 1.03825152, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 2.1806656000718854, + "language_loss": 0.75949878, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78316724, + "num_input_tokens_seen": 9099880, + "step": 433, + "time_per_iteration": 2.563768148422241 + }, + { + "auxiliary_loss_clip": 0.01295576, + "auxiliary_loss_mlp": 0.01093916, + "balance_loss_clip": 1.08065009, + "balance_loss_mlp": 1.05469584, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 1.8972918397434833, + "language_loss": 0.89419562, + "learning_rate": 3.910142983797699e-06, + "loss": 0.91809046, + "num_input_tokens_seen": 9118620, + "step": 434, + "time_per_iteration": 2.4933855533599854 + }, + { + "auxiliary_loss_clip": 0.01296454, + "auxiliary_loss_mlp": 0.011044, + "balance_loss_clip": 1.08591163, + "balance_loss_mlp": 1.06625271, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 2.4857059990355808, + "language_loss": 0.80055976, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82456827, + "num_input_tokens_seen": 9135655, + "step": 435, + "time_per_iteration": 2.472984552383423 + }, + { + "auxiliary_loss_clip": 0.01291785, + "auxiliary_loss_mlp": 0.01087401, + "balance_loss_clip": 1.07963979, + "balance_loss_mlp": 1.05039859, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 2.2315123560050645, + "language_loss": 0.86554694, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88933885, + "num_input_tokens_seen": 9153520, + "step": 436, + "time_per_iteration": 2.4957656860351562 + }, + { + "auxiliary_loss_clip": 0.01295969, + "auxiliary_loss_mlp": 0.01094134, + "balance_loss_clip": 1.08391356, + "balance_loss_mlp": 1.05751312, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.1859195461373577, + "language_loss": 0.74891704, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77281809, + "num_input_tokens_seen": 9170750, + "step": 437, + "time_per_iteration": 2.493433713912964 + }, + { + "auxiliary_loss_clip": 0.01294013, + "auxiliary_loss_mlp": 0.01094299, + "balance_loss_clip": 1.08470142, + "balance_loss_mlp": 1.05557966, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 3.1187635584237485, + "language_loss": 0.91061419, + "learning_rate": 3.916049925995316e-06, + "loss": 0.9344973, + "num_input_tokens_seen": 9188430, + "step": 438, + "time_per_iteration": 2.480621099472046 + }, + { + "auxiliary_loss_clip": 0.01161758, + "auxiliary_loss_mlp": 0.01022264, + "balance_loss_clip": 1.05759215, + "balance_loss_mlp": 1.01248932, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.8679921724154448, + "language_loss": 0.62627381, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64811409, + "num_input_tokens_seen": 9255835, + "step": 439, + "time_per_iteration": 3.195735454559326 + }, + { + "auxiliary_loss_clip": 0.01302715, + "auxiliary_loss_mlp": 0.01091455, + "balance_loss_clip": 1.08799112, + "balance_loss_mlp": 1.05385661, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 3.493478589745417, + "language_loss": 0.75755179, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78149343, + "num_input_tokens_seen": 9276835, + "step": 440, + "time_per_iteration": 2.5442724227905273 + }, + { + "auxiliary_loss_clip": 0.01293268, + "auxiliary_loss_mlp": 0.01076078, + "balance_loss_clip": 1.08369565, + "balance_loss_mlp": 1.03983831, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 3.0311629252525614, + "language_loss": 0.83290154, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85659492, + "num_input_tokens_seen": 9295075, + "step": 441, + "time_per_iteration": 2.5400731563568115 + }, + { + "auxiliary_loss_clip": 0.01296894, + "auxiliary_loss_mlp": 0.01086063, + "balance_loss_clip": 1.08225048, + "balance_loss_mlp": 1.04801142, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.214105421451439, + "language_loss": 0.78743005, + "learning_rate": 3.92190316797534e-06, + "loss": 0.81125963, + "num_input_tokens_seen": 9314205, + "step": 442, + "time_per_iteration": 2.514350175857544 + }, + { + "auxiliary_loss_clip": 0.01159651, + "auxiliary_loss_mlp": 0.01014663, + "balance_loss_clip": 1.05674577, + "balance_loss_mlp": 1.00464976, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9579676985050465, + "language_loss": 0.64539683, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66714001, + "num_input_tokens_seen": 9367395, + "step": 443, + "time_per_iteration": 2.980947971343994 + }, + { + "auxiliary_loss_clip": 0.01295985, + "auxiliary_loss_mlp": 0.01087874, + "balance_loss_clip": 1.08613324, + "balance_loss_mlp": 1.0515871, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 4.077748732564081, + "language_loss": 0.82599366, + "learning_rate": 3.924809954779425e-06, + "loss": 0.84983224, + "num_input_tokens_seen": 9385185, + "step": 444, + "time_per_iteration": 2.4806153774261475 + }, + { + "auxiliary_loss_clip": 0.01298424, + "auxiliary_loss_mlp": 0.01087132, + "balance_loss_clip": 1.08269048, + "balance_loss_mlp": 1.04769778, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.147313420151961, + "language_loss": 0.95792741, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.98178297, + "num_input_tokens_seen": 9403225, + "step": 445, + "time_per_iteration": 2.5299737453460693 + }, + { + "auxiliary_loss_clip": 0.01294879, + "auxiliary_loss_mlp": 0.01094843, + "balance_loss_clip": 1.08487761, + "balance_loss_mlp": 1.05562341, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.343351243696989, + "language_loss": 0.91824687, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.9421441, + "num_input_tokens_seen": 9420540, + "step": 446, + "time_per_iteration": 2.499513626098633 + }, + { + "auxiliary_loss_clip": 0.01289724, + "auxiliary_loss_mlp": 0.0108143, + "balance_loss_clip": 1.08277738, + "balance_loss_mlp": 1.04402161, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.2114211953160776, + "language_loss": 0.79940355, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82311511, + "num_input_tokens_seen": 9438840, + "step": 447, + "time_per_iteration": 2.512451410293579 + }, + { + "auxiliary_loss_clip": 0.01293725, + "auxiliary_loss_mlp": 0.01075116, + "balance_loss_clip": 1.08448386, + "balance_loss_mlp": 1.04047394, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 3.0785195002079484, + "language_loss": 0.8635205, + "learning_rate": 3.930584452530952e-06, + "loss": 0.88720894, + "num_input_tokens_seen": 9457215, + "step": 448, + "time_per_iteration": 3.8676347732543945 + }, + { + "auxiliary_loss_clip": 0.01287436, + "auxiliary_loss_mlp": 0.01094766, + "balance_loss_clip": 1.08288908, + "balance_loss_mlp": 1.06071949, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 3.6800706122280054, + "language_loss": 0.88914955, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91297162, + "num_input_tokens_seen": 9475615, + "step": 449, + "time_per_iteration": 3.8611867427825928 + }, + { + "auxiliary_loss_clip": 0.01297281, + "auxiliary_loss_mlp": 0.01090528, + "balance_loss_clip": 1.08358872, + "balance_loss_mlp": 1.05309629, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 2.237156602718341, + "language_loss": 0.80366027, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82753831, + "num_input_tokens_seen": 9493975, + "step": 450, + "time_per_iteration": 2.487452983856201 + }, + { + "auxiliary_loss_clip": 0.01290778, + "auxiliary_loss_mlp": 0.01080574, + "balance_loss_clip": 1.0869211, + "balance_loss_mlp": 1.04390478, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 1.711941683483557, + "language_loss": 0.81541884, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83913231, + "num_input_tokens_seen": 9514810, + "step": 451, + "time_per_iteration": 3.931917428970337 + }, + { + "auxiliary_loss_clip": 0.01289713, + "auxiliary_loss_mlp": 0.01094382, + "balance_loss_clip": 1.08662915, + "balance_loss_mlp": 1.05697417, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.6293262699563116, + "language_loss": 0.76990795, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79374886, + "num_input_tokens_seen": 9533635, + "step": 452, + "time_per_iteration": 2.5682435035705566 + }, + { + "auxiliary_loss_clip": 0.01289312, + "auxiliary_loss_mlp": 0.01086099, + "balance_loss_clip": 1.08467674, + "balance_loss_mlp": 1.049263, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.8070407921933467, + "language_loss": 0.72996998, + "learning_rate": 3.937730499067294e-06, + "loss": 0.7537241, + "num_input_tokens_seen": 9555420, + "step": 453, + "time_per_iteration": 3.963440418243408 + }, + { + "auxiliary_loss_clip": 0.01284835, + "auxiliary_loss_mlp": 0.01084017, + "balance_loss_clip": 1.08236194, + "balance_loss_mlp": 1.0483973, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 1.8750299677014342, + "language_loss": 0.82110214, + "learning_rate": 3.939150239848748e-06, + "loss": 0.8447907, + "num_input_tokens_seen": 9578950, + "step": 454, + "time_per_iteration": 2.7149899005889893 + }, + { + "auxiliary_loss_clip": 0.01289311, + "auxiliary_loss_mlp": 0.01082397, + "balance_loss_clip": 1.08481574, + "balance_loss_mlp": 1.04939997, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 1.9082207368311865, + "language_loss": 0.75332922, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.77704632, + "num_input_tokens_seen": 9598160, + "step": 455, + "time_per_iteration": 2.5228545665740967 + }, + { + "auxiliary_loss_clip": 0.01287275, + "auxiliary_loss_mlp": 0.01094347, + "balance_loss_clip": 1.08059645, + "balance_loss_mlp": 1.05953836, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.7086095501431653, + "language_loss": 0.80716991, + "learning_rate": 3.941980363893499e-06, + "loss": 0.83098614, + "num_input_tokens_seen": 9616010, + "step": 456, + "time_per_iteration": 2.516589403152466 + }, + { + "auxiliary_loss_clip": 0.01285345, + "auxiliary_loss_mlp": 0.01078621, + "balance_loss_clip": 1.08194852, + "balance_loss_mlp": 1.04240477, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.1735788221840195, + "language_loss": 0.81588703, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.83952665, + "num_input_tokens_seen": 9634000, + "step": 457, + "time_per_iteration": 2.5105857849121094 + }, + { + "auxiliary_loss_clip": 0.01289553, + "auxiliary_loss_mlp": 0.0108411, + "balance_loss_clip": 1.08104074, + "balance_loss_mlp": 1.04896736, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.1549821104524036, + "language_loss": 0.9370966, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96083325, + "num_input_tokens_seen": 9653455, + "step": 458, + "time_per_iteration": 2.5741076469421387 + }, + { + "auxiliary_loss_clip": 0.01287464, + "auxiliary_loss_mlp": 0.010925, + "balance_loss_clip": 1.08236825, + "balance_loss_mlp": 1.05816746, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.3494062019256106, + "language_loss": 0.79069924, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.8144989, + "num_input_tokens_seen": 9669650, + "step": 459, + "time_per_iteration": 2.497664451599121 + }, + { + "auxiliary_loss_clip": 0.0129346, + "auxiliary_loss_mlp": 0.01085722, + "balance_loss_clip": 1.08861709, + "balance_loss_mlp": 1.04697895, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 1.9124675039781192, + "language_loss": 0.8325392, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85633105, + "num_input_tokens_seen": 9691415, + "step": 460, + "time_per_iteration": 2.5582635402679443 + }, + { + "auxiliary_loss_clip": 0.01151267, + "auxiliary_loss_mlp": 0.0101601, + "balance_loss_clip": 1.05213904, + "balance_loss_mlp": 1.00680721, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.6215983804402434, + "language_loss": 0.73642015, + "learning_rate": 3.949001722282675e-06, + "loss": 0.75809288, + "num_input_tokens_seen": 9755605, + "step": 461, + "time_per_iteration": 3.087291717529297 + }, + { + "auxiliary_loss_clip": 0.01287992, + "auxiliary_loss_mlp": 0.01083537, + "balance_loss_clip": 1.08910763, + "balance_loss_mlp": 1.05011082, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.7804600514429563, + "language_loss": 0.8082794, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83199471, + "num_input_tokens_seen": 9776270, + "step": 462, + "time_per_iteration": 2.6029269695281982 + }, + { + "auxiliary_loss_clip": 0.01286385, + "auxiliary_loss_mlp": 0.01076973, + "balance_loss_clip": 1.08374095, + "balance_loss_mlp": 1.04450035, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 3.08259408141153, + "language_loss": 0.9058404, + "learning_rate": 3.951788965525118e-06, + "loss": 0.929474, + "num_input_tokens_seen": 9794465, + "step": 463, + "time_per_iteration": 2.498464822769165 + }, + { + "auxiliary_loss_clip": 0.01147477, + "auxiliary_loss_mlp": 0.01009215, + "balance_loss_clip": 1.04916048, + "balance_loss_mlp": 1.00001204, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8875297763799335, + "language_loss": 0.59043205, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61199898, + "num_input_tokens_seen": 9849685, + "step": 464, + "time_per_iteration": 3.052896022796631 + }, + { + "auxiliary_loss_clip": 0.01298732, + "auxiliary_loss_mlp": 0.01099971, + "balance_loss_clip": 1.08889878, + "balance_loss_mlp": 1.06413662, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 3.3047635515512463, + "language_loss": 0.81387413, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83786106, + "num_input_tokens_seen": 9869505, + "step": 465, + "time_per_iteration": 2.5674421787261963 + }, + { + "auxiliary_loss_clip": 0.01284941, + "auxiliary_loss_mlp": 0.01086584, + "balance_loss_clip": 1.08092511, + "balance_loss_mlp": 1.05105984, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 3.297863121086697, + "language_loss": 0.78347957, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80719483, + "num_input_tokens_seen": 9890950, + "step": 466, + "time_per_iteration": 2.543804407119751 + }, + { + "auxiliary_loss_clip": 0.01285657, + "auxiliary_loss_mlp": 0.01088145, + "balance_loss_clip": 1.08362174, + "balance_loss_mlp": 1.05390835, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 1.7710125411846602, + "language_loss": 0.87694323, + "learning_rate": 3.957327513084761e-06, + "loss": 0.90068126, + "num_input_tokens_seen": 9911265, + "step": 467, + "time_per_iteration": 2.578993320465088 + }, + { + "auxiliary_loss_clip": 0.01290305, + "auxiliary_loss_mlp": 0.01103747, + "balance_loss_clip": 1.08553684, + "balance_loss_mlp": 1.06707859, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.029708135486315, + "language_loss": 0.86303067, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88697124, + "num_input_tokens_seen": 9929025, + "step": 468, + "time_per_iteration": 2.4949283599853516 + }, + { + "auxiliary_loss_clip": 0.01286382, + "auxiliary_loss_mlp": 0.01081463, + "balance_loss_clip": 1.08134127, + "balance_loss_mlp": 1.04457951, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 2.75835791171987, + "language_loss": 0.91922009, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.94289857, + "num_input_tokens_seen": 9945190, + "step": 469, + "time_per_iteration": 2.5109784603118896 + }, + { + "auxiliary_loss_clip": 0.0128354, + "auxiliary_loss_mlp": 0.01092779, + "balance_loss_clip": 1.082798, + "balance_loss_mlp": 1.0563004, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 2.6045041952410464, + "language_loss": 0.81770122, + "learning_rate": 3.96145038000181e-06, + "loss": 0.8414644, + "num_input_tokens_seen": 9962820, + "step": 470, + "time_per_iteration": 2.490478515625 + }, + { + "auxiliary_loss_clip": 0.01286234, + "auxiliary_loss_mlp": 0.01088199, + "balance_loss_clip": 1.08035004, + "balance_loss_mlp": 1.05112529, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.8763918805517323, + "language_loss": 0.93138921, + "learning_rate": 3.962818822989861e-06, + "loss": 0.9551335, + "num_input_tokens_seen": 9982595, + "step": 471, + "time_per_iteration": 2.5220422744750977 + }, + { + "auxiliary_loss_clip": 0.01280883, + "auxiliary_loss_mlp": 0.01094213, + "balance_loss_clip": 1.07921922, + "balance_loss_mlp": 1.05861735, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 1.830050496122536, + "language_loss": 0.76000166, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78375256, + "num_input_tokens_seen": 10004645, + "step": 472, + "time_per_iteration": 2.579195976257324 + }, + { + "auxiliary_loss_clip": 0.01288117, + "auxiliary_loss_mlp": 0.01078818, + "balance_loss_clip": 1.07968354, + "balance_loss_mlp": 1.04448617, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 1.6841795139999398, + "language_loss": 0.93579555, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95946491, + "num_input_tokens_seen": 10022555, + "step": 473, + "time_per_iteration": 2.5237925052642822 + }, + { + "auxiliary_loss_clip": 0.01293871, + "auxiliary_loss_mlp": 0.011125, + "balance_loss_clip": 1.08397865, + "balance_loss_mlp": 1.07771468, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 2.4989010098811377, + "language_loss": 0.88353378, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90759754, + "num_input_tokens_seen": 10041025, + "step": 474, + "time_per_iteration": 2.5154855251312256 + }, + { + "auxiliary_loss_clip": 0.01284451, + "auxiliary_loss_mlp": 0.01088887, + "balance_loss_clip": 1.08153296, + "balance_loss_mlp": 1.0524807, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.3295711215100265, + "language_loss": 0.78736937, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81110275, + "num_input_tokens_seen": 10060775, + "step": 475, + "time_per_iteration": 2.598573684692383 + }, + { + "auxiliary_loss_clip": 0.01141951, + "auxiliary_loss_mlp": 0.01019321, + "balance_loss_clip": 1.04605865, + "balance_loss_mlp": 1.01078606, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9542091715056401, + "language_loss": 0.66972047, + "learning_rate": 3.969617747661569e-06, + "loss": 0.69133323, + "num_input_tokens_seen": 10120225, + "step": 476, + "time_per_iteration": 3.0931899547576904 + }, + { + "auxiliary_loss_clip": 0.01286595, + "auxiliary_loss_mlp": 0.01085553, + "balance_loss_clip": 1.08214438, + "balance_loss_mlp": 1.04824078, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.3198832276840786, + "language_loss": 0.83700073, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86072218, + "num_input_tokens_seen": 10137880, + "step": 477, + "time_per_iteration": 2.5587594509124756 + }, + { + "auxiliary_loss_clip": 0.01293006, + "auxiliary_loss_mlp": 0.01090454, + "balance_loss_clip": 1.08577788, + "balance_loss_mlp": 1.05354643, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 1.9992978548442268, + "language_loss": 0.81995547, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84379011, + "num_input_tokens_seen": 10156930, + "step": 478, + "time_per_iteration": 2.577746868133545 + }, + { + "auxiliary_loss_clip": 0.01284946, + "auxiliary_loss_mlp": 0.01083995, + "balance_loss_clip": 1.08103776, + "balance_loss_mlp": 1.04787433, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 1.6753054313428546, + "language_loss": 0.80957401, + "learning_rate": 3.973662905576082e-06, + "loss": 0.8332634, + "num_input_tokens_seen": 10176295, + "step": 479, + "time_per_iteration": 2.546095371246338 + }, + { + "auxiliary_loss_clip": 0.01281748, + "auxiliary_loss_mlp": 0.01084051, + "balance_loss_clip": 1.07977986, + "balance_loss_mlp": 1.04542732, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.5180577648561915, + "language_loss": 0.73924029, + "learning_rate": 3.975005663484038e-06, + "loss": 0.76289821, + "num_input_tokens_seen": 10195790, + "step": 480, + "time_per_iteration": 2.536846876144409 + }, + { + "auxiliary_loss_clip": 0.01279903, + "auxiliary_loss_mlp": 0.01073384, + "balance_loss_clip": 1.08160353, + "balance_loss_mlp": 1.04095864, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.8458472054812554, + "language_loss": 0.87754673, + "learning_rate": 3.976345626888605e-06, + "loss": 0.9010796, + "num_input_tokens_seen": 10218405, + "step": 481, + "time_per_iteration": 2.5575757026672363 + }, + { + "auxiliary_loss_clip": 0.01139935, + "auxiliary_loss_mlp": 0.01008764, + "balance_loss_clip": 1.04474068, + "balance_loss_mlp": 1.00080097, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8263788188585188, + "language_loss": 0.66088074, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68236774, + "num_input_tokens_seen": 10271005, + "step": 482, + "time_per_iteration": 2.8484065532684326 + }, + { + "auxiliary_loss_clip": 0.01296542, + "auxiliary_loss_mlp": 0.01079778, + "balance_loss_clip": 1.08520031, + "balance_loss_mlp": 1.04544604, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.7450364491080927, + "language_loss": 0.79021853, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81398177, + "num_input_tokens_seen": 10288405, + "step": 483, + "time_per_iteration": 2.492056369781494 + }, + { + "auxiliary_loss_clip": 0.01292591, + "auxiliary_loss_mlp": 0.01091694, + "balance_loss_clip": 1.08602524, + "balance_loss_mlp": 1.05500102, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 1.9281853500685082, + "language_loss": 0.75804806, + "learning_rate": 3.980348865796749e-06, + "loss": 0.78189087, + "num_input_tokens_seen": 10306875, + "step": 484, + "time_per_iteration": 2.4888498783111572 + }, + { + "auxiliary_loss_clip": 0.01288126, + "auxiliary_loss_mlp": 0.01085602, + "balance_loss_clip": 1.08308113, + "balance_loss_mlp": 1.05129325, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.1407580525097365, + "language_loss": 0.83864254, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86237979, + "num_input_tokens_seen": 10323965, + "step": 485, + "time_per_iteration": 2.4940836429595947 + }, + { + "auxiliary_loss_clip": 0.01294474, + "auxiliary_loss_mlp": 0.01087138, + "balance_loss_clip": 1.09030056, + "balance_loss_mlp": 1.05147076, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 2.563064328038223, + "language_loss": 0.84566319, + "learning_rate": 3.983003930109732e-06, + "loss": 0.8694793, + "num_input_tokens_seen": 10342620, + "step": 486, + "time_per_iteration": 2.5129122734069824 + }, + { + "auxiliary_loss_clip": 0.01284369, + "auxiliary_loss_mlp": 0.01092371, + "balance_loss_clip": 1.08095777, + "balance_loss_mlp": 1.05658412, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 2.054091571761709, + "language_loss": 0.89366752, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91743493, + "num_input_tokens_seen": 10364610, + "step": 487, + "time_per_iteration": 2.540311098098755 + }, + { + "auxiliary_loss_clip": 0.01289396, + "auxiliary_loss_mlp": 0.01066399, + "balance_loss_clip": 1.0853796, + "balance_loss_mlp": 1.03340197, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.742621271370116, + "language_loss": 0.88385177, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90740979, + "num_input_tokens_seen": 10380910, + "step": 488, + "time_per_iteration": 2.4969539642333984 + }, + { + "auxiliary_loss_clip": 0.01284013, + "auxiliary_loss_mlp": 0.01082493, + "balance_loss_clip": 1.08265543, + "balance_loss_mlp": 1.04718328, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 1.7755787439662505, + "language_loss": 0.88873637, + "learning_rate": 3.986966109896785e-06, + "loss": 0.91240144, + "num_input_tokens_seen": 10400665, + "step": 489, + "time_per_iteration": 6.790199041366577 + }, + { + "auxiliary_loss_clip": 0.01278338, + "auxiliary_loss_mlp": 0.01076341, + "balance_loss_clip": 1.07796097, + "balance_loss_mlp": 1.04110324, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 1.8706154609460035, + "language_loss": 0.88531554, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90886235, + "num_input_tokens_seen": 10420150, + "step": 490, + "time_per_iteration": 2.512439250946045 + }, + { + "auxiliary_loss_clip": 0.01284785, + "auxiliary_loss_mlp": 0.01085391, + "balance_loss_clip": 1.08060074, + "balance_loss_mlp": 1.05105901, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.321623302917756, + "language_loss": 0.91213524, + "learning_rate": 3.989594081641164e-06, + "loss": 0.93583703, + "num_input_tokens_seen": 10438210, + "step": 491, + "time_per_iteration": 2.504706859588623 + }, + { + "auxiliary_loss_clip": 0.01276362, + "auxiliary_loss_mlp": 0.01072244, + "balance_loss_clip": 1.08065987, + "balance_loss_mlp": 1.03929424, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 2.1340502694181165, + "language_loss": 0.85581422, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.8793003, + "num_input_tokens_seen": 10455125, + "step": 492, + "time_per_iteration": 2.4902396202087402 + }, + { + "auxiliary_loss_clip": 0.01287623, + "auxiliary_loss_mlp": 0.01101975, + "balance_loss_clip": 1.08603764, + "balance_loss_mlp": 1.0665462, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 3.462192112926633, + "language_loss": 0.84109783, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86499381, + "num_input_tokens_seen": 10470990, + "step": 493, + "time_per_iteration": 2.4672958850860596 + }, + { + "auxiliary_loss_clip": 0.01282306, + "auxiliary_loss_mlp": 0.01075822, + "balance_loss_clip": 1.0805037, + "balance_loss_mlp": 1.04246759, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 2.2472980944724905, + "language_loss": 0.86534023, + "learning_rate": 3.99351603600268e-06, + "loss": 0.88892156, + "num_input_tokens_seen": 10490685, + "step": 494, + "time_per_iteration": 3.902836322784424 + }, + { + "auxiliary_loss_clip": 0.01287378, + "auxiliary_loss_mlp": 0.01083918, + "balance_loss_clip": 1.0839076, + "balance_loss_mlp": 1.05261397, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 2.1647534696936215, + "language_loss": 0.86475515, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88846815, + "num_input_tokens_seen": 10509435, + "step": 495, + "time_per_iteration": 2.514432191848755 + }, + { + "auxiliary_loss_clip": 0.01275081, + "auxiliary_loss_mlp": 0.01075853, + "balance_loss_clip": 1.08007073, + "balance_loss_mlp": 1.04328489, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 2.059852336325344, + "language_loss": 0.62328988, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64679921, + "num_input_tokens_seen": 10530050, + "step": 496, + "time_per_iteration": 2.5275046825408936 + }, + { + "auxiliary_loss_clip": 0.01279255, + "auxiliary_loss_mlp": 0.01086962, + "balance_loss_clip": 1.08461344, + "balance_loss_mlp": 1.05448961, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 2.5208037597506583, + "language_loss": 0.88890362, + "learning_rate": 3.997414244783595e-06, + "loss": 0.91256583, + "num_input_tokens_seen": 10551370, + "step": 497, + "time_per_iteration": 2.5553793907165527 + }, + { + "auxiliary_loss_clip": 0.01282831, + "auxiliary_loss_mlp": 0.01079375, + "balance_loss_clip": 1.08253932, + "balance_loss_mlp": 1.0456146, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 12.174860950256292, + "language_loss": 0.8480438, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87166584, + "num_input_tokens_seen": 10569225, + "step": 498, + "time_per_iteration": 2.4612936973571777 + }, + { + "auxiliary_loss_clip": 0.0127958, + "auxiliary_loss_mlp": 0.01081799, + "balance_loss_clip": 1.08111262, + "balance_loss_mlp": 1.04994583, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 4.883971593763153, + "language_loss": 0.78737938, + "learning_rate": 4e-06, + "loss": 0.81099319, + "num_input_tokens_seen": 10586170, + "step": 499, + "time_per_iteration": 2.510436534881592 + }, + { + "auxiliary_loss_clip": 0.01282371, + "auxiliary_loss_mlp": 0.01084829, + "balance_loss_clip": 1.0840838, + "balance_loss_mlp": 1.05197501, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 2.554895241732742, + "language_loss": 0.82729942, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85097146, + "num_input_tokens_seen": 10606205, + "step": 500, + "time_per_iteration": 2.5825767517089844 + }, + { + "auxiliary_loss_clip": 0.01273273, + "auxiliary_loss_mlp": 0.0108432, + "balance_loss_clip": 1.07897866, + "balance_loss_mlp": 1.04927194, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.70584552463603, + "language_loss": 0.88422096, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90779686, + "num_input_tokens_seen": 10625995, + "step": 501, + "time_per_iteration": 2.4917614459991455 + }, + { + "auxiliary_loss_clip": 0.01283542, + "auxiliary_loss_mlp": 0.01075952, + "balance_loss_clip": 1.08245063, + "balance_loss_mlp": 1.04388428, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.5946176285018385, + "language_loss": 0.87230331, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.89589822, + "num_input_tokens_seen": 10644105, + "step": 502, + "time_per_iteration": 2.4917454719543457 + }, + { + "auxiliary_loss_clip": 0.01277821, + "auxiliary_loss_mlp": 0.01079999, + "balance_loss_clip": 1.08339572, + "balance_loss_mlp": 1.04745483, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.489175126715502, + "language_loss": 0.84597397, + "learning_rate": 3.999999393278425e-06, + "loss": 0.86955214, + "num_input_tokens_seen": 10661090, + "step": 503, + "time_per_iteration": 2.4932639598846436 + }, + { + "auxiliary_loss_clip": 0.01271936, + "auxiliary_loss_mlp": 0.01086758, + "balance_loss_clip": 1.08046758, + "balance_loss_mlp": 1.05407047, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.5807164771259137, + "language_loss": 0.88186139, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90544832, + "num_input_tokens_seen": 10682380, + "step": 504, + "time_per_iteration": 2.56295108795166 + }, + { + "auxiliary_loss_clip": 0.01275223, + "auxiliary_loss_mlp": 0.01091183, + "balance_loss_clip": 1.08020985, + "balance_loss_mlp": 1.05842459, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.3838600279076, + "language_loss": 0.78126585, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80492997, + "num_input_tokens_seen": 10699925, + "step": 505, + "time_per_iteration": 2.480445146560669 + }, + { + "auxiliary_loss_clip": 0.01140739, + "auxiliary_loss_mlp": 0.01028196, + "balance_loss_clip": 1.04706335, + "balance_loss_mlp": 1.02061462, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.870952608927018, + "language_loss": 0.5500617, + "learning_rate": 3.999998141915371e-06, + "loss": 0.571751, + "num_input_tokens_seen": 10766525, + "step": 506, + "time_per_iteration": 3.2477879524230957 + }, + { + "auxiliary_loss_clip": 0.01274462, + "auxiliary_loss_mlp": 0.01084991, + "balance_loss_clip": 1.07882833, + "balance_loss_mlp": 1.05254245, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 2.0466558741268903, + "language_loss": 0.83180654, + "learning_rate": 3.999997573114069e-06, + "loss": 0.85540104, + "num_input_tokens_seen": 10786725, + "step": 507, + "time_per_iteration": 2.5006020069122314 + }, + { + "auxiliary_loss_clip": 0.01278781, + "auxiliary_loss_mlp": 0.0107368, + "balance_loss_clip": 1.08043861, + "balance_loss_mlp": 1.04156542, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.3023127570760886, + "language_loss": 0.88539124, + "learning_rate": 3.999996928472659e-06, + "loss": 0.90891588, + "num_input_tokens_seen": 10805390, + "step": 508, + "time_per_iteration": 2.491718053817749 + }, + { + "auxiliary_loss_clip": 0.01281478, + "auxiliary_loss_mlp": 0.01069494, + "balance_loss_clip": 1.08113074, + "balance_loss_mlp": 1.03590083, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.393352617223587, + "language_loss": 0.71333444, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73684418, + "num_input_tokens_seen": 10828030, + "step": 509, + "time_per_iteration": 2.6113429069519043 + }, + { + "auxiliary_loss_clip": 0.01273273, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_clip": 1.08074403, + "balance_loss_mlp": 1.03961766, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 1.9777524453834636, + "language_loss": 0.82129699, + "learning_rate": 3.999995411669614e-06, + "loss": 0.84473372, + "num_input_tokens_seen": 10845240, + "step": 510, + "time_per_iteration": 2.497653007507324 + }, + { + "auxiliary_loss_clip": 0.01279349, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_clip": 1.08482146, + "balance_loss_mlp": 1.04567599, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 2.2365081371497673, + "language_loss": 0.83666289, + "learning_rate": 3.999994539508036e-06, + "loss": 0.86023962, + "num_input_tokens_seen": 10864325, + "step": 511, + "time_per_iteration": 2.5393166542053223 + }, + { + "auxiliary_loss_clip": 0.01279135, + "auxiliary_loss_mlp": 0.01079795, + "balance_loss_clip": 1.07971609, + "balance_loss_mlp": 1.04834712, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.1820433155217915, + "language_loss": 0.81997889, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.84356821, + "num_input_tokens_seen": 10883860, + "step": 512, + "time_per_iteration": 2.5206189155578613 + }, + { + "auxiliary_loss_clip": 0.01276475, + "auxiliary_loss_mlp": 0.01080265, + "balance_loss_clip": 1.07971728, + "balance_loss_mlp": 1.04769683, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 1.8765872012861904, + "language_loss": 0.86925894, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89282638, + "num_input_tokens_seen": 10904555, + "step": 513, + "time_per_iteration": 2.5361359119415283 + }, + { + "auxiliary_loss_clip": 0.01284275, + "auxiliary_loss_mlp": 0.01082116, + "balance_loss_clip": 1.08333755, + "balance_loss_mlp": 1.04909563, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 1.6939573488395416, + "language_loss": 0.79182673, + "learning_rate": 3.999991467983491e-06, + "loss": 0.8154906, + "num_input_tokens_seen": 10923700, + "step": 514, + "time_per_iteration": 2.4916253089904785 + }, + { + "auxiliary_loss_clip": 0.01275758, + "auxiliary_loss_mlp": 0.01064672, + "balance_loss_clip": 1.08312798, + "balance_loss_mlp": 1.03422558, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 2.73808374141246, + "language_loss": 0.7710793, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79448354, + "num_input_tokens_seen": 10942730, + "step": 515, + "time_per_iteration": 2.5126349925994873 + }, + { + "auxiliary_loss_clip": 0.0127282, + "auxiliary_loss_mlp": 0.01068888, + "balance_loss_clip": 1.07649148, + "balance_loss_mlp": 1.03615308, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 2.000979851494337, + "language_loss": 0.82705641, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85047352, + "num_input_tokens_seen": 10967120, + "step": 516, + "time_per_iteration": 2.6882431507110596 + }, + { + "auxiliary_loss_clip": 0.01272552, + "auxiliary_loss_mlp": 0.0107441, + "balance_loss_clip": 1.07974148, + "balance_loss_mlp": 1.04117489, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 2.778314691075019, + "language_loss": 0.78658086, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81005049, + "num_input_tokens_seen": 10986775, + "step": 517, + "time_per_iteration": 2.527066707611084 + }, + { + "auxiliary_loss_clip": 0.01269635, + "auxiliary_loss_mlp": 0.0107296, + "balance_loss_clip": 1.0796026, + "balance_loss_mlp": 1.04105997, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.984569413089594, + "language_loss": 0.90648293, + "learning_rate": 3.999986310859396e-06, + "loss": 0.92990887, + "num_input_tokens_seen": 11011360, + "step": 518, + "time_per_iteration": 2.561100482940674 + }, + { + "auxiliary_loss_clip": 0.01282975, + "auxiliary_loss_mlp": 0.01094649, + "balance_loss_clip": 1.08737111, + "balance_loss_mlp": 1.05976832, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 1.9318406331711715, + "language_loss": 0.86248553, + "learning_rate": 3.999984831979039e-06, + "loss": 0.8862617, + "num_input_tokens_seen": 11030150, + "step": 519, + "time_per_iteration": 2.522749900817871 + }, + { + "auxiliary_loss_clip": 0.01279583, + "auxiliary_loss_mlp": 0.010891, + "balance_loss_clip": 1.07926714, + "balance_loss_mlp": 1.05750918, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.0505355165339667, + "language_loss": 0.86720949, + "learning_rate": 3.999983277259057e-06, + "loss": 0.89089626, + "num_input_tokens_seen": 11049145, + "step": 520, + "time_per_iteration": 2.499417304992676 + }, + { + "auxiliary_loss_clip": 0.01281129, + "auxiliary_loss_mlp": 0.01090228, + "balance_loss_clip": 1.08172321, + "balance_loss_mlp": 1.05734992, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 2.0575898356951234, + "language_loss": 0.89220858, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91592216, + "num_input_tokens_seen": 11068835, + "step": 521, + "time_per_iteration": 2.5391504764556885 + }, + { + "auxiliary_loss_clip": 0.01273869, + "auxiliary_loss_mlp": 0.01084906, + "balance_loss_clip": 1.08069706, + "balance_loss_mlp": 1.05090737, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.046365236274203, + "language_loss": 0.71245646, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73604417, + "num_input_tokens_seen": 11088980, + "step": 522, + "time_per_iteration": 2.5181522369384766 + }, + { + "auxiliary_loss_clip": 0.01277435, + "auxiliary_loss_mlp": 0.01087087, + "balance_loss_clip": 1.07822466, + "balance_loss_mlp": 1.05554461, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.235221947515852, + "language_loss": 0.84783804, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87148327, + "num_input_tokens_seen": 11104300, + "step": 523, + "time_per_iteration": 2.472031354904175 + }, + { + "auxiliary_loss_clip": 0.01282311, + "auxiliary_loss_mlp": 0.01075782, + "balance_loss_clip": 1.08011234, + "balance_loss_mlp": 1.04261756, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 3.53419866585608, + "language_loss": 0.90326703, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92684793, + "num_input_tokens_seen": 11123335, + "step": 524, + "time_per_iteration": 2.509885549545288 + }, + { + "auxiliary_loss_clip": 0.01285763, + "auxiliary_loss_mlp": 0.01084426, + "balance_loss_clip": 1.08335018, + "balance_loss_mlp": 1.05066633, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 3.1148100584455056, + "language_loss": 0.80058461, + "learning_rate": 3.999974366066933e-06, + "loss": 0.82428652, + "num_input_tokens_seen": 11140880, + "step": 525, + "time_per_iteration": 2.519080877304077 + }, + { + "auxiliary_loss_clip": 0.012756, + "auxiliary_loss_mlp": 0.01083036, + "balance_loss_clip": 1.0773108, + "balance_loss_mlp": 1.05046833, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.0852517719392165, + "language_loss": 0.80368161, + "learning_rate": 3.999972356310538e-06, + "loss": 0.827268, + "num_input_tokens_seen": 11158710, + "step": 526, + "time_per_iteration": 2.4859261512756348 + }, + { + "auxiliary_loss_clip": 0.01285138, + "auxiliary_loss_mlp": 0.010677, + "balance_loss_clip": 1.08423853, + "balance_loss_mlp": 1.03329587, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 2.800288459454659, + "language_loss": 0.81673521, + "learning_rate": 3.999970270714991e-06, + "loss": 0.84026361, + "num_input_tokens_seen": 11177550, + "step": 527, + "time_per_iteration": 3.8811967372894287 + }, + { + "auxiliary_loss_clip": 0.01271994, + "auxiliary_loss_mlp": 0.01085217, + "balance_loss_clip": 1.07671165, + "balance_loss_mlp": 1.05162358, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 1.9558711426931759, + "language_loss": 0.93774784, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96131992, + "num_input_tokens_seen": 11196230, + "step": 528, + "time_per_iteration": 2.4739744663238525 + }, + { + "auxiliary_loss_clip": 0.01273306, + "auxiliary_loss_mlp": 0.01074173, + "balance_loss_clip": 1.07699466, + "balance_loss_mlp": 1.04198682, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.87565134130626, + "language_loss": 0.83922398, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86269873, + "num_input_tokens_seen": 11214935, + "step": 529, + "time_per_iteration": 5.339153289794922 + }, + { + "auxiliary_loss_clip": 0.01277911, + "auxiliary_loss_mlp": 0.01083384, + "balance_loss_clip": 1.08498645, + "balance_loss_mlp": 1.05182922, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 1.9475299213372474, + "language_loss": 0.90660334, + "learning_rate": 3.999963558894243e-06, + "loss": 0.93021625, + "num_input_tokens_seen": 11235310, + "step": 530, + "time_per_iteration": 2.5239710807800293 + }, + { + "auxiliary_loss_clip": 0.01272192, + "auxiliary_loss_mlp": 0.01075399, + "balance_loss_clip": 1.07449484, + "balance_loss_mlp": 1.04109061, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.201504003702701, + "language_loss": 0.76071054, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78418648, + "num_input_tokens_seen": 11254425, + "step": 531, + "time_per_iteration": 2.4894371032714844 + }, + { + "auxiliary_loss_clip": 0.01270717, + "auxiliary_loss_mlp": 0.01061048, + "balance_loss_clip": 1.07549882, + "balance_loss_mlp": 1.02859974, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 3.073395398223609, + "language_loss": 0.90312344, + "learning_rate": 3.999958705152843e-06, + "loss": 0.92644107, + "num_input_tokens_seen": 11274595, + "step": 532, + "time_per_iteration": 3.9291367530822754 + }, + { + "auxiliary_loss_clip": 0.01139221, + "auxiliary_loss_mlp": 0.01013038, + "balance_loss_clip": 1.04662085, + "balance_loss_mlp": 1.00555122, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7398484017489936, + "language_loss": 0.57966787, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60119051, + "num_input_tokens_seen": 11336705, + "step": 533, + "time_per_iteration": 3.1299264430999756 + }, + { + "auxiliary_loss_clip": 0.01270256, + "auxiliary_loss_mlp": 0.0108017, + "balance_loss_clip": 1.07609022, + "balance_loss_mlp": 1.04848409, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.984427574189356, + "language_loss": 0.86596537, + "learning_rate": 3.999953548056907e-06, + "loss": 0.88946962, + "num_input_tokens_seen": 11356820, + "step": 534, + "time_per_iteration": 2.548802137374878 + }, + { + "auxiliary_loss_clip": 0.01270592, + "auxiliary_loss_mlp": 0.0106526, + "balance_loss_clip": 1.07795644, + "balance_loss_mlp": 1.03414667, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.8101864068775853, + "language_loss": 0.77494359, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79830205, + "num_input_tokens_seen": 11376645, + "step": 535, + "time_per_iteration": 2.5143158435821533 + }, + { + "auxiliary_loss_clip": 0.01273824, + "auxiliary_loss_mlp": 0.01080524, + "balance_loss_clip": 1.07955217, + "balance_loss_mlp": 1.04905248, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.21633001275123, + "language_loss": 0.80781323, + "learning_rate": 3.999948087607219e-06, + "loss": 0.8313567, + "num_input_tokens_seen": 11397310, + "step": 536, + "time_per_iteration": 2.571321964263916 + }, + { + "auxiliary_loss_clip": 0.0127437, + "auxiliary_loss_mlp": 0.01073678, + "balance_loss_clip": 1.07974696, + "balance_loss_mlp": 1.04039514, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 3.1319395201691136, + "language_loss": 0.70305437, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72653484, + "num_input_tokens_seen": 11418475, + "step": 537, + "time_per_iteration": 2.5643370151519775 + }, + { + "auxiliary_loss_clip": 0.01274361, + "auxiliary_loss_mlp": 0.0107843, + "balance_loss_clip": 1.08375478, + "balance_loss_mlp": 1.04705405, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.270865022404991, + "language_loss": 0.828614, + "learning_rate": 3.999942323804607e-06, + "loss": 0.85214186, + "num_input_tokens_seen": 11436630, + "step": 538, + "time_per_iteration": 2.5318655967712402 + }, + { + "auxiliary_loss_clip": 0.01282063, + "auxiliary_loss_mlp": 0.01079965, + "balance_loss_clip": 1.08134007, + "balance_loss_mlp": 1.04801726, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 1.7570041660039513, + "language_loss": 0.79285616, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81647646, + "num_input_tokens_seen": 11457275, + "step": 539, + "time_per_iteration": 2.5331685543060303 + }, + { + "auxiliary_loss_clip": 0.01271826, + "auxiliary_loss_mlp": 0.01070166, + "balance_loss_clip": 1.07804561, + "balance_loss_mlp": 1.03652549, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.1578675588173466, + "language_loss": 0.77643526, + "learning_rate": 3.999936256649943e-06, + "loss": 0.79985517, + "num_input_tokens_seen": 11476925, + "step": 540, + "time_per_iteration": 2.5724873542785645 + }, + { + "auxiliary_loss_clip": 0.01281682, + "auxiliary_loss_mlp": 0.01070801, + "balance_loss_clip": 1.08246756, + "balance_loss_mlp": 1.03882873, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 2.097653839066907, + "language_loss": 0.85517079, + "learning_rate": 3.999933109315878e-06, + "loss": 0.87869561, + "num_input_tokens_seen": 11496830, + "step": 541, + "time_per_iteration": 2.495028018951416 + }, + { + "auxiliary_loss_clip": 0.01269965, + "auxiliary_loss_mlp": 0.01079159, + "balance_loss_clip": 1.07954597, + "balance_loss_mlp": 1.04613841, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.4102764613127583, + "language_loss": 0.89100009, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91449136, + "num_input_tokens_seen": 11515605, + "step": 542, + "time_per_iteration": 2.490851402282715 + }, + { + "auxiliary_loss_clip": 0.01274224, + "auxiliary_loss_mlp": 0.0107854, + "balance_loss_clip": 1.07985973, + "balance_loss_mlp": 1.04641259, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 1.8370824708215936, + "language_loss": 0.71028399, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73381162, + "num_input_tokens_seen": 11536230, + "step": 543, + "time_per_iteration": 2.5286896228790283 + }, + { + "auxiliary_loss_clip": 0.01268757, + "auxiliary_loss_mlp": 0.01088585, + "balance_loss_clip": 1.07183158, + "balance_loss_mlp": 1.0565412, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 3.1021649006178613, + "language_loss": 0.91572207, + "learning_rate": 3.999923212288192e-06, + "loss": 0.93929553, + "num_input_tokens_seen": 11554715, + "step": 544, + "time_per_iteration": 2.4921436309814453 + }, + { + "auxiliary_loss_clip": 0.01274905, + "auxiliary_loss_mlp": 0.01083245, + "balance_loss_clip": 1.0798986, + "balance_loss_mlp": 1.05353785, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 3.7502744098743688, + "language_loss": 0.65763068, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68121219, + "num_input_tokens_seen": 11571370, + "step": 545, + "time_per_iteration": 2.4578700065612793 + }, + { + "auxiliary_loss_clip": 0.01271545, + "auxiliary_loss_mlp": 0.01067537, + "balance_loss_clip": 1.07631397, + "balance_loss_mlp": 1.03656638, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.139599197895755, + "language_loss": 0.91862822, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94201905, + "num_input_tokens_seen": 11588560, + "step": 546, + "time_per_iteration": 2.5076115131378174 + }, + { + "auxiliary_loss_clip": 0.0126959, + "auxiliary_loss_mlp": 0.01070576, + "balance_loss_clip": 1.07379556, + "balance_loss_mlp": 1.03765059, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.3647671834954482, + "language_loss": 0.81899041, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84239209, + "num_input_tokens_seen": 11605685, + "step": 547, + "time_per_iteration": 2.4750301837921143 + }, + { + "auxiliary_loss_clip": 0.01271216, + "auxiliary_loss_mlp": 0.01071361, + "balance_loss_clip": 1.07726562, + "balance_loss_mlp": 1.03912687, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 1.76780359875636, + "language_loss": 0.81039995, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83382571, + "num_input_tokens_seen": 11626290, + "step": 548, + "time_per_iteration": 2.5423567295074463 + }, + { + "auxiliary_loss_clip": 0.01271097, + "auxiliary_loss_mlp": 0.01078742, + "balance_loss_clip": 1.07667351, + "balance_loss_mlp": 1.04491043, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 3.070341316265579, + "language_loss": 0.67900735, + "learning_rate": 3.999905200498087e-06, + "loss": 0.70250577, + "num_input_tokens_seen": 11643950, + "step": 549, + "time_per_iteration": 2.461216688156128 + }, + { + "auxiliary_loss_clip": 0.01264902, + "auxiliary_loss_mlp": 0.01075252, + "balance_loss_clip": 1.0763402, + "balance_loss_mlp": 1.04375708, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 1.955900987754541, + "language_loss": 0.86177558, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88517714, + "num_input_tokens_seen": 11662560, + "step": 550, + "time_per_iteration": 2.47871732711792 + }, + { + "auxiliary_loss_clip": 0.01274555, + "auxiliary_loss_mlp": 0.01086557, + "balance_loss_clip": 1.08146811, + "balance_loss_mlp": 1.05499077, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 1.8347107206490618, + "language_loss": 0.80992532, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83353639, + "num_input_tokens_seen": 11682265, + "step": 551, + "time_per_iteration": 2.4783577919006348 + }, + { + "auxiliary_loss_clip": 0.01279485, + "auxiliary_loss_mlp": 0.01085097, + "balance_loss_clip": 1.08024275, + "balance_loss_mlp": 1.05300593, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 5.198533295083856, + "language_loss": 0.8628543, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88650012, + "num_input_tokens_seen": 11699300, + "step": 552, + "time_per_iteration": 2.5497093200683594 + }, + { + "auxiliary_loss_clip": 0.01276707, + "auxiliary_loss_mlp": 0.01079008, + "balance_loss_clip": 1.08169341, + "balance_loss_mlp": 1.04500961, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 3.35803272880239, + "language_loss": 0.93045783, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95401502, + "num_input_tokens_seen": 11716955, + "step": 553, + "time_per_iteration": 2.481351852416992 + }, + { + "auxiliary_loss_clip": 0.01271001, + "auxiliary_loss_mlp": 0.0107448, + "balance_loss_clip": 1.07799721, + "balance_loss_mlp": 1.04045725, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 5.347904269112907, + "language_loss": 0.7888574, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81231219, + "num_input_tokens_seen": 11736130, + "step": 554, + "time_per_iteration": 2.5355350971221924 + }, + { + "auxiliary_loss_clip": 0.01267049, + "auxiliary_loss_mlp": 0.01081849, + "balance_loss_clip": 1.07656705, + "balance_loss_mlp": 1.04737329, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.502474885874672, + "language_loss": 0.82027483, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84376377, + "num_input_tokens_seen": 11754425, + "step": 555, + "time_per_iteration": 2.480738639831543 + }, + { + "auxiliary_loss_clip": 0.01272573, + "auxiliary_loss_mlp": 0.0107962, + "balance_loss_clip": 1.07746768, + "balance_loss_mlp": 1.0466466, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.6150130205433646, + "language_loss": 0.88445431, + "learning_rate": 3.999876798858914e-06, + "loss": 0.90797627, + "num_input_tokens_seen": 11772845, + "step": 556, + "time_per_iteration": 2.5686826705932617 + }, + { + "auxiliary_loss_clip": 0.01270657, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_clip": 1.07758415, + "balance_loss_mlp": 1.05289841, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.65398686665017, + "language_loss": 0.83658433, + "learning_rate": 3.999872438138503e-06, + "loss": 0.86015987, + "num_input_tokens_seen": 11792850, + "step": 557, + "time_per_iteration": 2.4996397495269775 + }, + { + "auxiliary_loss_clip": 0.0127623, + "auxiliary_loss_mlp": 0.01065361, + "balance_loss_clip": 1.08202505, + "balance_loss_mlp": 1.03388989, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 3.071701654278524, + "language_loss": 0.94121724, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96463311, + "num_input_tokens_seen": 11809670, + "step": 558, + "time_per_iteration": 2.469963312149048 + }, + { + "auxiliary_loss_clip": 0.01267375, + "auxiliary_loss_mlp": 0.01073513, + "balance_loss_clip": 1.07459259, + "balance_loss_mlp": 1.04099238, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.1421963364554015, + "language_loss": 0.77135956, + "learning_rate": 3.99986348919176e-06, + "loss": 0.79476845, + "num_input_tokens_seen": 11829665, + "step": 559, + "time_per_iteration": 2.4862797260284424 + }, + { + "auxiliary_loss_clip": 0.01270665, + "auxiliary_loss_mlp": 0.0108025, + "balance_loss_clip": 1.07785761, + "balance_loss_mlp": 1.04868329, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.2136735764504754, + "language_loss": 0.87594247, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.89945161, + "num_input_tokens_seen": 11848190, + "step": 560, + "time_per_iteration": 2.562744140625 + }, + { + "auxiliary_loss_clip": 0.01264831, + "auxiliary_loss_mlp": 0.01067122, + "balance_loss_clip": 1.07568228, + "balance_loss_mlp": 1.03689086, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 4.9504778131490115, + "language_loss": 0.81872606, + "learning_rate": 3.999854236904925e-06, + "loss": 0.84204555, + "num_input_tokens_seen": 11864795, + "step": 561, + "time_per_iteration": 2.4801275730133057 + }, + { + "auxiliary_loss_clip": 0.01265414, + "auxiliary_loss_mlp": 0.0107009, + "balance_loss_clip": 1.07690668, + "balance_loss_mlp": 1.03911924, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.7699075076382416, + "language_loss": 0.82244611, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84580117, + "num_input_tokens_seen": 11885275, + "step": 562, + "time_per_iteration": 2.519102096557617 + }, + { + "auxiliary_loss_clip": 0.01272253, + "auxiliary_loss_mlp": 0.0108053, + "balance_loss_clip": 1.0792439, + "balance_loss_mlp": 1.04862928, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 2.243640090035936, + "language_loss": 0.84315574, + "learning_rate": 3.999844681279401e-06, + "loss": 0.8666836, + "num_input_tokens_seen": 11903595, + "step": 563, + "time_per_iteration": 2.4492974281311035 + }, + { + "auxiliary_loss_clip": 0.01270524, + "auxiliary_loss_mlp": 0.01083307, + "balance_loss_clip": 1.07820928, + "balance_loss_mlp": 1.05107248, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.243490051481645, + "language_loss": 0.94325721, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96679556, + "num_input_tokens_seen": 11917815, + "step": 564, + "time_per_iteration": 2.4681451320648193 + }, + { + "auxiliary_loss_clip": 0.01269156, + "auxiliary_loss_mlp": 0.0106997, + "balance_loss_clip": 1.07518101, + "balance_loss_mlp": 1.03682995, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 2.478504994341654, + "language_loss": 0.9405092, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96390051, + "num_input_tokens_seen": 11936305, + "step": 565, + "time_per_iteration": 2.4794232845306396 + }, + { + "auxiliary_loss_clip": 0.01142712, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.05036592, + "balance_loss_mlp": 1.03192317, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.0848581903627883, + "language_loss": 0.54903507, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57085627, + "num_input_tokens_seen": 11998940, + "step": 566, + "time_per_iteration": 4.444222450256348 + }, + { + "auxiliary_loss_clip": 0.01272133, + "auxiliary_loss_mlp": 0.01074469, + "balance_loss_clip": 1.0785104, + "balance_loss_mlp": 1.04037571, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 3.000057000328792, + "language_loss": 0.76652658, + "learning_rate": 3.999824660018126e-06, + "loss": 0.78999257, + "num_input_tokens_seen": 12018860, + "step": 567, + "time_per_iteration": 2.5354883670806885 + }, + { + "auxiliary_loss_clip": 0.01265446, + "auxiliary_loss_mlp": 0.01079722, + "balance_loss_clip": 1.07800078, + "balance_loss_mlp": 1.04910874, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 1.8408513697147326, + "language_loss": 0.80652207, + "learning_rate": 3.999819465118447e-06, + "loss": 0.8299737, + "num_input_tokens_seen": 12039675, + "step": 568, + "time_per_iteration": 3.996645212173462 + }, + { + "auxiliary_loss_clip": 0.01266546, + "auxiliary_loss_mlp": 0.0108589, + "balance_loss_clip": 1.07965386, + "balance_loss_mlp": 1.05365551, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.9195599722514187, + "language_loss": 0.86389709, + "learning_rate": 3.999814194385413e-06, + "loss": 0.88742137, + "num_input_tokens_seen": 12057680, + "step": 569, + "time_per_iteration": 3.916688919067383 + }, + { + "auxiliary_loss_clip": 0.01265683, + "auxiliary_loss_mlp": 0.01075379, + "balance_loss_clip": 1.07660246, + "balance_loss_mlp": 1.0441227, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.815386908674507, + "language_loss": 0.95938987, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98280048, + "num_input_tokens_seen": 12076135, + "step": 570, + "time_per_iteration": 2.4799740314483643 + }, + { + "auxiliary_loss_clip": 0.01267132, + "auxiliary_loss_mlp": 0.01076246, + "balance_loss_clip": 1.07352054, + "balance_loss_mlp": 1.04127038, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.1779943541162785, + "language_loss": 0.79948843, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.82292223, + "num_input_tokens_seen": 12094785, + "step": 571, + "time_per_iteration": 3.8558449745178223 + }, + { + "auxiliary_loss_clip": 0.01267811, + "auxiliary_loss_mlp": 0.01083382, + "balance_loss_clip": 1.07853591, + "balance_loss_mlp": 1.05014658, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 4.571174304564672, + "language_loss": 0.80325431, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82676625, + "num_input_tokens_seen": 12114590, + "step": 572, + "time_per_iteration": 2.5146992206573486 + }, + { + "auxiliary_loss_clip": 0.01275728, + "auxiliary_loss_mlp": 0.01072275, + "balance_loss_clip": 1.08139789, + "balance_loss_mlp": 1.03987396, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 1.8272520339256504, + "language_loss": 0.84589422, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86937428, + "num_input_tokens_seen": 12132390, + "step": 573, + "time_per_iteration": 2.4722063541412354 + }, + { + "auxiliary_loss_clip": 0.01268498, + "auxiliary_loss_mlp": 0.01066585, + "balance_loss_clip": 1.07571602, + "balance_loss_mlp": 1.03625786, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 2.4119711403306447, + "language_loss": 0.7641983, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78754914, + "num_input_tokens_seen": 12149035, + "step": 574, + "time_per_iteration": 2.478041648864746 + }, + { + "auxiliary_loss_clip": 0.01268233, + "auxiliary_loss_mlp": 0.01070726, + "balance_loss_clip": 1.07792163, + "balance_loss_mlp": 1.03989804, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 2.5343917115280123, + "language_loss": 0.83672482, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86011446, + "num_input_tokens_seen": 12167530, + "step": 575, + "time_per_iteration": 2.473597764968872 + }, + { + "auxiliary_loss_clip": 0.01263558, + "auxiliary_loss_mlp": 0.01075363, + "balance_loss_clip": 1.07851744, + "balance_loss_mlp": 1.04424894, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 1.9706343163024558, + "language_loss": 0.84135616, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86474538, + "num_input_tokens_seen": 12186340, + "step": 576, + "time_per_iteration": 2.4883697032928467 + }, + { + "auxiliary_loss_clip": 0.01270893, + "auxiliary_loss_mlp": 0.01071544, + "balance_loss_clip": 1.08502579, + "balance_loss_mlp": 1.0416944, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.13412835635318, + "language_loss": 0.86487353, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88829786, + "num_input_tokens_seen": 12204090, + "step": 577, + "time_per_iteration": 2.5000736713409424 + }, + { + "auxiliary_loss_clip": 0.01265697, + "auxiliary_loss_mlp": 0.01076331, + "balance_loss_clip": 1.08083272, + "balance_loss_mlp": 1.04476428, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 1.8650738549208734, + "language_loss": 0.72178662, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74520695, + "num_input_tokens_seen": 12224850, + "step": 578, + "time_per_iteration": 2.5300705432891846 + }, + { + "auxiliary_loss_clip": 0.01266366, + "auxiliary_loss_mlp": 0.01082288, + "balance_loss_clip": 1.07687616, + "balance_loss_mlp": 1.04945803, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 2.3731535740735743, + "language_loss": 0.77602941, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79951596, + "num_input_tokens_seen": 12244935, + "step": 579, + "time_per_iteration": 2.4892470836639404 + }, + { + "auxiliary_loss_clip": 0.01265137, + "auxiliary_loss_mlp": 0.01080635, + "balance_loss_clip": 1.07855868, + "balance_loss_mlp": 1.04816198, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 1.9065227364371031, + "language_loss": 0.86615121, + "learning_rate": 3.999751211379863e-06, + "loss": 0.88960898, + "num_input_tokens_seen": 12262140, + "step": 580, + "time_per_iteration": 2.496328353881836 + }, + { + "auxiliary_loss_clip": 0.01270842, + "auxiliary_loss_mlp": 0.0106775, + "balance_loss_clip": 1.07863641, + "balance_loss_mlp": 1.03856754, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 2.7049339501381997, + "language_loss": 0.8202002, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84358615, + "num_input_tokens_seen": 12280930, + "step": 581, + "time_per_iteration": 2.4669504165649414 + }, + { + "auxiliary_loss_clip": 0.01267412, + "auxiliary_loss_mlp": 0.01071462, + "balance_loss_clip": 1.08008754, + "balance_loss_mlp": 1.04111099, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.132807967618713, + "language_loss": 0.77002013, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79340887, + "num_input_tokens_seen": 12299125, + "step": 582, + "time_per_iteration": 2.5026347637176514 + }, + { + "auxiliary_loss_clip": 0.01265817, + "auxiliary_loss_mlp": 0.01078064, + "balance_loss_clip": 1.08111489, + "balance_loss_mlp": 1.04580593, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 1.9906301773961352, + "language_loss": 0.87732178, + "learning_rate": 3.999732441737877e-06, + "loss": 0.90076059, + "num_input_tokens_seen": 12316905, + "step": 583, + "time_per_iteration": 2.450899600982666 + }, + { + "auxiliary_loss_clip": 0.01270841, + "auxiliary_loss_mlp": 0.01087585, + "balance_loss_clip": 1.07904279, + "balance_loss_mlp": 1.05628026, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.6634599873608877, + "language_loss": 0.81370026, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83728456, + "num_input_tokens_seen": 12335070, + "step": 584, + "time_per_iteration": 2.5133345127105713 + }, + { + "auxiliary_loss_clip": 0.01266393, + "auxiliary_loss_mlp": 0.01070571, + "balance_loss_clip": 1.0767653, + "balance_loss_mlp": 1.0396955, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 3.401273540972964, + "language_loss": 0.92766976, + "learning_rate": 3.999719549492551e-06, + "loss": 0.95103943, + "num_input_tokens_seen": 12350315, + "step": 585, + "time_per_iteration": 2.4437124729156494 + }, + { + "auxiliary_loss_clip": 0.01267492, + "auxiliary_loss_mlp": 0.01075471, + "balance_loss_clip": 1.07925153, + "balance_loss_mlp": 1.04435754, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 3.549505062469986, + "language_loss": 0.87795913, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.90138876, + "num_input_tokens_seen": 12366030, + "step": 586, + "time_per_iteration": 2.461998701095581 + }, + { + "auxiliary_loss_clip": 0.01270725, + "auxiliary_loss_mlp": 0.01079455, + "balance_loss_clip": 1.08072937, + "balance_loss_mlp": 1.04862714, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.8672240406682776, + "language_loss": 0.76943094, + "learning_rate": 3.999706353928965e-06, + "loss": 0.79293275, + "num_input_tokens_seen": 12384895, + "step": 587, + "time_per_iteration": 2.461668014526367 + }, + { + "auxiliary_loss_clip": 0.01272382, + "auxiliary_loss_mlp": 0.01067356, + "balance_loss_clip": 1.07962024, + "balance_loss_mlp": 1.03502691, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 6.087030229719717, + "language_loss": 0.78839141, + "learning_rate": 3.999699642403449e-06, + "loss": 0.8117888, + "num_input_tokens_seen": 12404980, + "step": 588, + "time_per_iteration": 2.504309892654419 + }, + { + "auxiliary_loss_clip": 0.01269633, + "auxiliary_loss_mlp": 0.01078179, + "balance_loss_clip": 1.07796383, + "balance_loss_mlp": 1.04418015, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.055504965058517, + "language_loss": 0.94321144, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96668953, + "num_input_tokens_seen": 12423835, + "step": 589, + "time_per_iteration": 2.4861948490142822 + }, + { + "auxiliary_loss_clip": 0.01271455, + "auxiliary_loss_mlp": 0.01076565, + "balance_loss_clip": 1.07956159, + "balance_loss_mlp": 1.04623866, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.322929290027473, + "language_loss": 0.83637238, + "learning_rate": 3.99968599186624e-06, + "loss": 0.85985255, + "num_input_tokens_seen": 12443135, + "step": 590, + "time_per_iteration": 2.5837550163269043 + }, + { + "auxiliary_loss_clip": 0.01262852, + "auxiliary_loss_mlp": 0.01069764, + "balance_loss_clip": 1.07772732, + "balance_loss_mlp": 1.04074848, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.880934480499134, + "language_loss": 0.87028849, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89361459, + "num_input_tokens_seen": 12462895, + "step": 591, + "time_per_iteration": 2.471809148788452 + }, + { + "auxiliary_loss_clip": 0.01266617, + "auxiliary_loss_mlp": 0.01078025, + "balance_loss_clip": 1.07488751, + "balance_loss_mlp": 1.04536152, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 2.324912267432322, + "language_loss": 0.82935798, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85280442, + "num_input_tokens_seen": 12481515, + "step": 592, + "time_per_iteration": 2.475825548171997 + }, + { + "auxiliary_loss_clip": 0.01163403, + "auxiliary_loss_mlp": 0.01042793, + "balance_loss_clip": 1.06023741, + "balance_loss_mlp": 1.0369513, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8946623584723222, + "language_loss": 0.59819162, + "learning_rate": 3.999664947348893e-06, + "loss": 0.62025356, + "num_input_tokens_seen": 12548220, + "step": 593, + "time_per_iteration": 3.1318681240081787 + }, + { + "auxiliary_loss_clip": 0.01267086, + "auxiliary_loss_mlp": 0.01071923, + "balance_loss_clip": 1.08094192, + "balance_loss_mlp": 1.03983164, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 2.707095985408309, + "language_loss": 0.86923981, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89262986, + "num_input_tokens_seen": 12566105, + "step": 594, + "time_per_iteration": 2.480548620223999 + }, + { + "auxiliary_loss_clip": 0.01264633, + "auxiliary_loss_mlp": 0.01071495, + "balance_loss_clip": 1.0750066, + "balance_loss_mlp": 1.04054785, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.022232648847156, + "language_loss": 0.83355463, + "learning_rate": 3.999650538532742e-06, + "loss": 0.85691583, + "num_input_tokens_seen": 12586680, + "step": 595, + "time_per_iteration": 2.5608370304107666 + }, + { + "auxiliary_loss_clip": 0.01264116, + "auxiliary_loss_mlp": 0.01077248, + "balance_loss_clip": 1.07853663, + "balance_loss_mlp": 1.04570556, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 3.975459646866357, + "language_loss": 0.96219558, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98560917, + "num_input_tokens_seen": 12601605, + "step": 596, + "time_per_iteration": 2.4495489597320557 + }, + { + "auxiliary_loss_clip": 0.01266936, + "auxiliary_loss_mlp": 0.01076897, + "balance_loss_clip": 1.07881594, + "balance_loss_mlp": 1.04685605, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.546966433500354, + "language_loss": 0.8273415, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85077983, + "num_input_tokens_seen": 12620365, + "step": 597, + "time_per_iteration": 2.5030415058135986 + }, + { + "auxiliary_loss_clip": 0.01263799, + "auxiliary_loss_mlp": 0.01069854, + "balance_loss_clip": 1.07954168, + "balance_loss_mlp": 1.03885996, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 1.778157566472502, + "language_loss": 0.81453472, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83787131, + "num_input_tokens_seen": 12641140, + "step": 598, + "time_per_iteration": 2.554788589477539 + }, + { + "auxiliary_loss_clip": 0.01259513, + "auxiliary_loss_mlp": 0.01071988, + "balance_loss_clip": 1.07961512, + "balance_loss_mlp": 1.03968191, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.924807245635329, + "language_loss": 0.81037521, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83369023, + "num_input_tokens_seen": 12661080, + "step": 599, + "time_per_iteration": 2.5619709491729736 + }, + { + "auxiliary_loss_clip": 0.01267438, + "auxiliary_loss_mlp": 0.01081938, + "balance_loss_clip": 1.07717586, + "balance_loss_mlp": 1.05218291, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.4656747586614496, + "language_loss": 0.85833454, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88182831, + "num_input_tokens_seen": 12678270, + "step": 600, + "time_per_iteration": 2.550051689147949 + }, + { + "auxiliary_loss_clip": 0.01259526, + "auxiliary_loss_mlp": 0.01085252, + "balance_loss_clip": 1.07389736, + "balance_loss_mlp": 1.05494857, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 2.0754472473818395, + "language_loss": 0.82199258, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84544039, + "num_input_tokens_seen": 12697295, + "step": 601, + "time_per_iteration": 2.4767348766326904 + }, + { + "auxiliary_loss_clip": 0.01256591, + "auxiliary_loss_mlp": 0.01060178, + "balance_loss_clip": 1.07390904, + "balance_loss_mlp": 1.02966058, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.3440356128618745, + "language_loss": 0.7542612, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77742893, + "num_input_tokens_seen": 12716165, + "step": 602, + "time_per_iteration": 2.471942663192749 + }, + { + "auxiliary_loss_clip": 0.01255861, + "auxiliary_loss_mlp": 0.01064639, + "balance_loss_clip": 1.07274139, + "balance_loss_mlp": 1.03354931, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 1.9754909640674911, + "language_loss": 0.79593658, + "learning_rate": 3.999589870212761e-06, + "loss": 0.81914151, + "num_input_tokens_seen": 12735475, + "step": 603, + "time_per_iteration": 2.553173780441284 + }, + { + "auxiliary_loss_clip": 0.0126047, + "auxiliary_loss_mlp": 0.01066756, + "balance_loss_clip": 1.07811737, + "balance_loss_mlp": 1.03723943, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 3.384710501025295, + "language_loss": 0.86839509, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89166737, + "num_input_tokens_seen": 12754540, + "step": 604, + "time_per_iteration": 2.480534791946411 + }, + { + "auxiliary_loss_clip": 0.01263579, + "auxiliary_loss_mlp": 0.01071492, + "balance_loss_clip": 1.07967842, + "balance_loss_mlp": 1.03890014, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 2.1549927094794272, + "language_loss": 0.80670166, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83005238, + "num_input_tokens_seen": 12773050, + "step": 605, + "time_per_iteration": 3.824784517288208 + }, + { + "auxiliary_loss_clip": 0.01261027, + "auxiliary_loss_mlp": 0.01069907, + "balance_loss_clip": 1.07714224, + "balance_loss_mlp": 1.04121375, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.2164480756607836, + "language_loss": 0.85706699, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.88037634, + "num_input_tokens_seen": 12791240, + "step": 606, + "time_per_iteration": 2.45992112159729 + }, + { + "auxiliary_loss_clip": 0.01265168, + "auxiliary_loss_mlp": 0.0107834, + "balance_loss_clip": 1.07771611, + "balance_loss_mlp": 1.04770291, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.2349045283287383, + "language_loss": 0.82342446, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84685957, + "num_input_tokens_seen": 12812245, + "step": 607, + "time_per_iteration": 3.967804431915283 + }, + { + "auxiliary_loss_clip": 0.01258655, + "auxiliary_loss_mlp": 0.01071174, + "balance_loss_clip": 1.07661784, + "balance_loss_mlp": 1.04194403, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 2.1604117318991154, + "language_loss": 0.8344844, + "learning_rate": 3.999549488202358e-06, + "loss": 0.85778272, + "num_input_tokens_seen": 12831085, + "step": 608, + "time_per_iteration": 3.9322891235351562 + }, + { + "auxiliary_loss_clip": 0.01263897, + "auxiliary_loss_mlp": 0.01069309, + "balance_loss_clip": 1.07878816, + "balance_loss_mlp": 1.03674114, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.1656117325242477, + "language_loss": 0.81954956, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84288162, + "num_input_tokens_seen": 12849115, + "step": 609, + "time_per_iteration": 2.4642422199249268 + }, + { + "auxiliary_loss_clip": 0.01270212, + "auxiliary_loss_mlp": 0.01089263, + "balance_loss_clip": 1.08474493, + "balance_loss_mlp": 1.05958033, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 1.9050483483729914, + "language_loss": 0.79137301, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81496775, + "num_input_tokens_seen": 12868005, + "step": 610, + "time_per_iteration": 2.523236036300659 + }, + { + "auxiliary_loss_clip": 0.01266789, + "auxiliary_loss_mlp": 0.01087561, + "balance_loss_clip": 1.07946277, + "balance_loss_mlp": 1.056638, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 2.058157630182323, + "language_loss": 0.87253487, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89607835, + "num_input_tokens_seen": 12886890, + "step": 611, + "time_per_iteration": 2.4754512310028076 + }, + { + "auxiliary_loss_clip": 0.01262467, + "auxiliary_loss_mlp": 0.01092681, + "balance_loss_clip": 1.08016503, + "balance_loss_mlp": 1.06298554, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 8.001205117711043, + "language_loss": 0.72764426, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75119567, + "num_input_tokens_seen": 12906130, + "step": 612, + "time_per_iteration": 3.910191535949707 + }, + { + "auxiliary_loss_clip": 0.01262253, + "auxiliary_loss_mlp": 0.01076147, + "balance_loss_clip": 1.07695389, + "balance_loss_mlp": 1.04493773, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 3.0170571918431355, + "language_loss": 0.79205078, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81543475, + "num_input_tokens_seen": 12925260, + "step": 613, + "time_per_iteration": 2.4712016582489014 + }, + { + "auxiliary_loss_clip": 0.01258812, + "auxiliary_loss_mlp": 0.01080852, + "balance_loss_clip": 1.07804215, + "balance_loss_mlp": 1.05052531, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.0334073345437895, + "language_loss": 0.93529367, + "learning_rate": 3.9994985276307e-06, + "loss": 0.95869035, + "num_input_tokens_seen": 12944590, + "step": 614, + "time_per_iteration": 2.481989860534668 + }, + { + "auxiliary_loss_clip": 0.01265953, + "auxiliary_loss_mlp": 0.01078354, + "balance_loss_clip": 1.08072233, + "balance_loss_mlp": 1.04585779, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 4.282530987571834, + "language_loss": 0.72850531, + "learning_rate": 3.999489768826041e-06, + "loss": 0.75194836, + "num_input_tokens_seen": 12964785, + "step": 615, + "time_per_iteration": 2.590994119644165 + }, + { + "auxiliary_loss_clip": 0.01261948, + "auxiliary_loss_mlp": 0.01073271, + "balance_loss_clip": 1.07618034, + "balance_loss_mlp": 1.04387379, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 2.752167712145806, + "language_loss": 0.81885445, + "learning_rate": 3.999480934200528e-06, + "loss": 0.84220666, + "num_input_tokens_seen": 12986705, + "step": 616, + "time_per_iteration": 2.5316333770751953 + }, + { + "auxiliary_loss_clip": 0.01261862, + "auxiliary_loss_mlp": 0.01069787, + "balance_loss_clip": 1.07772827, + "balance_loss_mlp": 1.04062891, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 2.105602933460105, + "language_loss": 0.68274242, + "learning_rate": 3.999472023754499e-06, + "loss": 0.70605892, + "num_input_tokens_seen": 13010560, + "step": 617, + "time_per_iteration": 2.572199821472168 + }, + { + "auxiliary_loss_clip": 0.01266493, + "auxiliary_loss_mlp": 0.01070664, + "balance_loss_clip": 1.08158422, + "balance_loss_mlp": 1.0387398, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 2.2786636072509285, + "language_loss": 0.80186707, + "learning_rate": 3.99946303748829e-06, + "loss": 0.82523859, + "num_input_tokens_seen": 13028935, + "step": 618, + "time_per_iteration": 2.4802162647247314 + }, + { + "auxiliary_loss_clip": 0.01266185, + "auxiliary_loss_mlp": 0.0107521, + "balance_loss_clip": 1.0783639, + "balance_loss_mlp": 1.04328597, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 3.648389328328535, + "language_loss": 0.91155064, + "learning_rate": 3.999453975402242e-06, + "loss": 0.9349646, + "num_input_tokens_seen": 13046000, + "step": 619, + "time_per_iteration": 2.5026469230651855 + }, + { + "auxiliary_loss_clip": 0.01261508, + "auxiliary_loss_mlp": 0.01079631, + "balance_loss_clip": 1.07998168, + "balance_loss_mlp": 1.04937613, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.378477328271392, + "language_loss": 0.94196492, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96537626, + "num_input_tokens_seen": 13062995, + "step": 620, + "time_per_iteration": 2.4629948139190674 + }, + { + "auxiliary_loss_clip": 0.01259356, + "auxiliary_loss_mlp": 0.01079201, + "balance_loss_clip": 1.07666337, + "balance_loss_mlp": 1.04627514, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 2.275627158655139, + "language_loss": 0.77419603, + "learning_rate": 3.999435623772008e-06, + "loss": 0.79758161, + "num_input_tokens_seen": 13084120, + "step": 621, + "time_per_iteration": 2.5375559329986572 + }, + { + "auxiliary_loss_clip": 0.01258357, + "auxiliary_loss_mlp": 0.01066698, + "balance_loss_clip": 1.07940221, + "balance_loss_mlp": 1.03498816, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 3.7693986113650273, + "language_loss": 0.86629665, + "learning_rate": 3.999426334228518e-06, + "loss": 0.88954723, + "num_input_tokens_seen": 13100035, + "step": 622, + "time_per_iteration": 2.475057363510132 + }, + { + "auxiliary_loss_clip": 0.01260206, + "auxiliary_loss_mlp": 0.01068153, + "balance_loss_clip": 1.07839918, + "balance_loss_mlp": 1.03782654, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.2395120542559743, + "language_loss": 0.90277684, + "learning_rate": 3.999416968866581e-06, + "loss": 0.92606038, + "num_input_tokens_seen": 13118070, + "step": 623, + "time_per_iteration": 2.4850597381591797 + }, + { + "auxiliary_loss_clip": 0.01264985, + "auxiliary_loss_mlp": 0.01081644, + "balance_loss_clip": 1.08187222, + "balance_loss_mlp": 1.0506258, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.9167099724974273, + "language_loss": 0.8409313, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86439753, + "num_input_tokens_seen": 13136355, + "step": 624, + "time_per_iteration": 2.4656646251678467 + }, + { + "auxiliary_loss_clip": 0.01262814, + "auxiliary_loss_mlp": 0.01071085, + "balance_loss_clip": 1.07678711, + "balance_loss_mlp": 1.0400908, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 6.194727931453058, + "language_loss": 0.66913795, + "learning_rate": 3.999398010688788e-06, + "loss": 0.69247699, + "num_input_tokens_seen": 13155435, + "step": 625, + "time_per_iteration": 2.602140426635742 + }, + { + "auxiliary_loss_clip": 0.01257444, + "auxiliary_loss_mlp": 0.01070195, + "balance_loss_clip": 1.0778625, + "balance_loss_mlp": 1.03741217, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 3.957133360189669, + "language_loss": 0.76833189, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79160833, + "num_input_tokens_seen": 13174295, + "step": 626, + "time_per_iteration": 2.510777473449707 + }, + { + "auxiliary_loss_clip": 0.01261254, + "auxiliary_loss_mlp": 0.01076318, + "balance_loss_clip": 1.07908964, + "balance_loss_mlp": 1.04501367, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 1.9189899510225719, + "language_loss": 0.8159169, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83929259, + "num_input_tokens_seen": 13192500, + "step": 627, + "time_per_iteration": 2.4697675704956055 + }, + { + "auxiliary_loss_clip": 0.01265322, + "auxiliary_loss_mlp": 0.01078306, + "balance_loss_clip": 1.08127236, + "balance_loss_mlp": 1.04745483, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 2.057187205426108, + "language_loss": 0.88823539, + "learning_rate": 3.999369004792719e-06, + "loss": 0.91167164, + "num_input_tokens_seen": 13213470, + "step": 628, + "time_per_iteration": 2.5090696811676025 + }, + { + "auxiliary_loss_clip": 0.01259195, + "auxiliary_loss_mlp": 0.01073796, + "balance_loss_clip": 1.07494068, + "balance_loss_mlp": 1.04368377, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.2501177098618843, + "language_loss": 0.79715705, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82048696, + "num_input_tokens_seen": 13232365, + "step": 629, + "time_per_iteration": 2.4849469661712646 + }, + { + "auxiliary_loss_clip": 0.01259917, + "auxiliary_loss_mlp": 0.01066528, + "balance_loss_clip": 1.07699418, + "balance_loss_mlp": 1.03695226, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.7642934692815957, + "language_loss": 0.76698381, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79024827, + "num_input_tokens_seen": 13251920, + "step": 630, + "time_per_iteration": 2.4914400577545166 + }, + { + "auxiliary_loss_clip": 0.01265778, + "auxiliary_loss_mlp": 0.0107072, + "balance_loss_clip": 1.07917047, + "balance_loss_mlp": 1.0404408, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 2.7240984363816225, + "language_loss": 0.91351134, + "learning_rate": 3.99933931655021e-06, + "loss": 0.9368763, + "num_input_tokens_seen": 13267440, + "step": 631, + "time_per_iteration": 2.4541170597076416 + }, + { + "auxiliary_loss_clip": 0.01255653, + "auxiliary_loss_mlp": 0.01083744, + "balance_loss_clip": 1.0763135, + "balance_loss_mlp": 1.05060351, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.6180283359924887, + "language_loss": 0.92133188, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94472587, + "num_input_tokens_seen": 13287850, + "step": 632, + "time_per_iteration": 2.5121684074401855 + }, + { + "auxiliary_loss_clip": 0.01259781, + "auxiliary_loss_mlp": 0.01062413, + "balance_loss_clip": 1.07871342, + "balance_loss_mlp": 1.0330162, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.0148989286941728, + "language_loss": 0.83056724, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85378921, + "num_input_tokens_seen": 13307760, + "step": 633, + "time_per_iteration": 2.5197947025299072 + }, + { + "auxiliary_loss_clip": 0.01259317, + "auxiliary_loss_mlp": 0.01067856, + "balance_loss_clip": 1.07732654, + "balance_loss_mlp": 1.03760028, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.6227433678314849, + "language_loss": 0.69665444, + "learning_rate": 3.999308945971392e-06, + "loss": 0.71992618, + "num_input_tokens_seen": 13331230, + "step": 634, + "time_per_iteration": 2.55253267288208 + }, + { + "auxiliary_loss_clip": 0.01192886, + "auxiliary_loss_mlp": 0.0103806, + "balance_loss_clip": 1.09763944, + "balance_loss_mlp": 1.03066874, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8982167014853317, + "language_loss": 0.61641932, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63872874, + "num_input_tokens_seen": 13394760, + "step": 635, + "time_per_iteration": 3.146446466445923 + }, + { + "auxiliary_loss_clip": 0.01254231, + "auxiliary_loss_mlp": 0.01068562, + "balance_loss_clip": 1.07546675, + "balance_loss_mlp": 1.0386641, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.340059841025445, + "language_loss": 0.83915699, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86238492, + "num_input_tokens_seen": 13412775, + "step": 636, + "time_per_iteration": 2.476250648498535 + }, + { + "auxiliary_loss_clip": 0.01259703, + "auxiliary_loss_mlp": 0.0109219, + "balance_loss_clip": 1.07833409, + "balance_loss_mlp": 1.06164861, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 3.1027326803037427, + "language_loss": 0.79340589, + "learning_rate": 3.999277893066632e-06, + "loss": 0.81692487, + "num_input_tokens_seen": 13427835, + "step": 637, + "time_per_iteration": 2.456866979598999 + }, + { + "auxiliary_loss_clip": 0.01256788, + "auxiliary_loss_mlp": 0.01081746, + "balance_loss_clip": 1.07472932, + "balance_loss_mlp": 1.04982221, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 2.631098707569537, + "language_loss": 0.84100312, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86438853, + "num_input_tokens_seen": 13447295, + "step": 638, + "time_per_iteration": 2.4854836463928223 + }, + { + "auxiliary_loss_clip": 0.01264593, + "auxiliary_loss_mlp": 0.01066857, + "balance_loss_clip": 1.07729816, + "balance_loss_mlp": 1.0358386, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.2610814988316514, + "language_loss": 0.70198774, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72530222, + "num_input_tokens_seen": 13468455, + "step": 639, + "time_per_iteration": 2.5341312885284424 + }, + { + "auxiliary_loss_clip": 0.01261178, + "auxiliary_loss_mlp": 0.01083601, + "balance_loss_clip": 1.07824373, + "balance_loss_mlp": 1.05117583, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 3.161721484167637, + "language_loss": 0.84965062, + "learning_rate": 3.999246157846526e-06, + "loss": 0.87309849, + "num_input_tokens_seen": 13489085, + "step": 640, + "time_per_iteration": 2.479825735092163 + }, + { + "auxiliary_loss_clip": 0.01263059, + "auxiliary_loss_mlp": 0.01084914, + "balance_loss_clip": 1.07908535, + "balance_loss_mlp": 1.05244112, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.3790389278669632, + "language_loss": 0.82238561, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84586537, + "num_input_tokens_seen": 13509120, + "step": 641, + "time_per_iteration": 2.5051443576812744 + }, + { + "auxiliary_loss_clip": 0.01156679, + "auxiliary_loss_mlp": 0.01008662, + "balance_loss_clip": 1.06533051, + "balance_loss_mlp": 1.00179517, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9278082749220062, + "language_loss": 0.65519875, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67685217, + "num_input_tokens_seen": 13562005, + "step": 642, + "time_per_iteration": 3.0849153995513916 + }, + { + "auxiliary_loss_clip": 0.01255475, + "auxiliary_loss_mlp": 0.0106688, + "balance_loss_clip": 1.07520175, + "balance_loss_mlp": 1.03738749, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 2.0768647898304433, + "language_loss": 0.79402781, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81725138, + "num_input_tokens_seen": 13582185, + "step": 643, + "time_per_iteration": 2.5406243801116943 + }, + { + "auxiliary_loss_clip": 0.01255064, + "auxiliary_loss_mlp": 0.01079353, + "balance_loss_clip": 1.07431197, + "balance_loss_mlp": 1.04965806, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 1.8254828721768888, + "language_loss": 0.83234519, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85568935, + "num_input_tokens_seen": 13599555, + "step": 644, + "time_per_iteration": 2.5066418647766113 + }, + { + "auxiliary_loss_clip": 0.01257015, + "auxiliary_loss_mlp": 0.01072592, + "balance_loss_clip": 1.0748713, + "balance_loss_mlp": 1.04054856, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.146945430305243, + "language_loss": 0.82316136, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84645748, + "num_input_tokens_seen": 13621160, + "step": 645, + "time_per_iteration": 3.9618613719940186 + }, + { + "auxiliary_loss_clip": 0.01260651, + "auxiliary_loss_mlp": 0.01071265, + "balance_loss_clip": 1.07684183, + "balance_loss_mlp": 1.04043758, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.6479130491560428, + "language_loss": 0.82056963, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84388882, + "num_input_tokens_seen": 13641915, + "step": 646, + "time_per_iteration": 5.284180641174316 + }, + { + "auxiliary_loss_clip": 0.01259478, + "auxiliary_loss_mlp": 0.01083434, + "balance_loss_clip": 1.0797646, + "balance_loss_mlp": 1.05196238, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 3.0451762793030377, + "language_loss": 0.81926131, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84269047, + "num_input_tokens_seen": 13661410, + "step": 647, + "time_per_iteration": 2.4871902465820312 + }, + { + "auxiliary_loss_clip": 0.01258569, + "auxiliary_loss_mlp": 0.01068944, + "balance_loss_clip": 1.07886529, + "balance_loss_mlp": 1.03952289, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 2.403922103412966, + "language_loss": 0.84255946, + "learning_rate": 3.999158194912106e-06, + "loss": 0.86583459, + "num_input_tokens_seen": 13681705, + "step": 648, + "time_per_iteration": 2.5516202449798584 + }, + { + "auxiliary_loss_clip": 0.01256716, + "auxiliary_loss_mlp": 0.01077625, + "balance_loss_clip": 1.07713509, + "balance_loss_mlp": 1.04643965, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 2.8062318683260097, + "language_loss": 0.84226859, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86561203, + "num_input_tokens_seen": 13700400, + "step": 649, + "time_per_iteration": 2.4766805171966553 + }, + { + "auxiliary_loss_clip": 0.01256804, + "auxiliary_loss_mlp": 0.01072138, + "balance_loss_clip": 1.07573342, + "balance_loss_mlp": 1.04047608, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.8952921502191973, + "language_loss": 0.80048692, + "learning_rate": 3.999135446087263e-06, + "loss": 0.82377625, + "num_input_tokens_seen": 13720145, + "step": 650, + "time_per_iteration": 2.4821808338165283 + }, + { + "auxiliary_loss_clip": 0.01252613, + "auxiliary_loss_mlp": 0.01075616, + "balance_loss_clip": 1.07297516, + "balance_loss_mlp": 1.04424036, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.273368604882989, + "language_loss": 0.78747356, + "learning_rate": 3.9991239579635e-06, + "loss": 0.81075591, + "num_input_tokens_seen": 13737500, + "step": 651, + "time_per_iteration": 2.46699857711792 + }, + { + "auxiliary_loss_clip": 0.0125271, + "auxiliary_loss_mlp": 0.0108055, + "balance_loss_clip": 1.07314301, + "balance_loss_mlp": 1.0471952, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.3704974938140313, + "language_loss": 0.87563097, + "learning_rate": 3.999112394032757e-06, + "loss": 0.89896357, + "num_input_tokens_seen": 13754750, + "step": 652, + "time_per_iteration": 3.8582844734191895 + }, + { + "auxiliary_loss_clip": 0.01248335, + "auxiliary_loss_mlp": 0.01069296, + "balance_loss_clip": 1.07328165, + "balance_loss_mlp": 1.0398035, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 2.3604241463018427, + "language_loss": 0.79046333, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81363964, + "num_input_tokens_seen": 13771990, + "step": 653, + "time_per_iteration": 2.526585578918457 + }, + { + "auxiliary_loss_clip": 0.01262864, + "auxiliary_loss_mlp": 0.01073018, + "balance_loss_clip": 1.07725561, + "balance_loss_mlp": 1.04185724, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.56684786017853, + "language_loss": 0.85858661, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88194543, + "num_input_tokens_seen": 13792750, + "step": 654, + "time_per_iteration": 2.5386204719543457 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01018074, + "balance_loss_clip": 1.05075288, + "balance_loss_mlp": 1.01125526, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7282546574417021, + "language_loss": 0.49919501, + "learning_rate": 3.999077247403041e-06, + "loss": 0.5207029, + "num_input_tokens_seen": 13858570, + "step": 655, + "time_per_iteration": 3.144606590270996 + }, + { + "auxiliary_loss_clip": 0.01251074, + "auxiliary_loss_mlp": 0.01071078, + "balance_loss_clip": 1.07508349, + "balance_loss_mlp": 1.04141855, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.077461336000576, + "language_loss": 0.80552369, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.82874525, + "num_input_tokens_seen": 13876335, + "step": 656, + "time_per_iteration": 2.5098366737365723 + }, + { + "auxiliary_loss_clip": 0.0126201, + "auxiliary_loss_mlp": 0.01089813, + "balance_loss_clip": 1.07760072, + "balance_loss_mlp": 1.05393112, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.455662967782196, + "language_loss": 0.76272631, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78624451, + "num_input_tokens_seen": 13892640, + "step": 657, + "time_per_iteration": 2.4509992599487305 + }, + { + "auxiliary_loss_clip": 0.0125788, + "auxiliary_loss_mlp": 0.01071534, + "balance_loss_clip": 1.07601774, + "balance_loss_mlp": 1.0404917, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 1.8797334727118136, + "language_loss": 0.81405187, + "learning_rate": 3.999041418526457e-06, + "loss": 0.83734608, + "num_input_tokens_seen": 13910085, + "step": 658, + "time_per_iteration": 2.53115177154541 + }, + { + "auxiliary_loss_clip": 0.01251363, + "auxiliary_loss_mlp": 0.01073824, + "balance_loss_clip": 1.07394862, + "balance_loss_mlp": 1.04073143, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 3.168404237022681, + "language_loss": 0.9142555, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93750733, + "num_input_tokens_seen": 13928800, + "step": 659, + "time_per_iteration": 2.4812357425689697 + }, + { + "auxiliary_loss_clip": 0.01257327, + "auxiliary_loss_mlp": 0.01070502, + "balance_loss_clip": 1.07588959, + "balance_loss_mlp": 1.03981745, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.2204490384318465, + "language_loss": 0.79300106, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81627929, + "num_input_tokens_seen": 13948325, + "step": 660, + "time_per_iteration": 2.489722728729248 + }, + { + "auxiliary_loss_clip": 0.01254503, + "auxiliary_loss_mlp": 0.01071884, + "balance_loss_clip": 1.07736778, + "balance_loss_mlp": 1.04017472, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.6167554534474062, + "language_loss": 0.81583601, + "learning_rate": 3.999004907415231e-06, + "loss": 0.83909988, + "num_input_tokens_seen": 13969090, + "step": 661, + "time_per_iteration": 2.509481430053711 + }, + { + "auxiliary_loss_clip": 0.01126748, + "auxiliary_loss_mlp": 0.0101096, + "balance_loss_clip": 1.04644966, + "balance_loss_mlp": 1.00375926, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9158705297985051, + "language_loss": 0.69400656, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71538365, + "num_input_tokens_seen": 14037555, + "step": 662, + "time_per_iteration": 3.2361743450164795 + }, + { + "auxiliary_loss_clip": 0.01258082, + "auxiliary_loss_mlp": 0.01076716, + "balance_loss_clip": 1.07840908, + "balance_loss_mlp": 1.04491115, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 1.920196942889544, + "language_loss": 0.82885408, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85220212, + "num_input_tokens_seen": 14055765, + "step": 663, + "time_per_iteration": 2.4980053901672363 + }, + { + "auxiliary_loss_clip": 0.012629, + "auxiliary_loss_mlp": 0.0106596, + "balance_loss_clip": 1.079687, + "balance_loss_mlp": 1.03346324, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.163861249384643, + "language_loss": 0.86956382, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89285243, + "num_input_tokens_seen": 14074195, + "step": 664, + "time_per_iteration": 2.4950802326202393 + }, + { + "auxiliary_loss_clip": 0.0125038, + "auxiliary_loss_mlp": 0.01066368, + "balance_loss_clip": 1.07546103, + "balance_loss_mlp": 1.03521848, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 1.8984004696743426, + "language_loss": 0.84463418, + "learning_rate": 3.998955164701281e-06, + "loss": 0.86780167, + "num_input_tokens_seen": 14090215, + "step": 665, + "time_per_iteration": 2.423750400543213 + }, + { + "auxiliary_loss_clip": 0.01265943, + "auxiliary_loss_mlp": 0.01084658, + "balance_loss_clip": 1.08004594, + "balance_loss_mlp": 1.05130303, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 1.8826682892978763, + "language_loss": 0.81620437, + "learning_rate": 3.998942539520158e-06, + "loss": 0.83971035, + "num_input_tokens_seen": 14112150, + "step": 666, + "time_per_iteration": 2.5383729934692383 + }, + { + "auxiliary_loss_clip": 0.01254627, + "auxiliary_loss_mlp": 0.01073062, + "balance_loss_clip": 1.07571125, + "balance_loss_mlp": 1.03956389, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 1.962142242362111, + "language_loss": 0.8700698, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89334673, + "num_input_tokens_seen": 14131475, + "step": 667, + "time_per_iteration": 2.5061609745025635 + }, + { + "auxiliary_loss_clip": 0.01255584, + "auxiliary_loss_mlp": 0.0107014, + "balance_loss_clip": 1.08016467, + "balance_loss_mlp": 1.03964663, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.1698844712723084, + "language_loss": 0.80154771, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82480502, + "num_input_tokens_seen": 14146165, + "step": 668, + "time_per_iteration": 2.4839277267456055 + }, + { + "auxiliary_loss_clip": 0.01126079, + "auxiliary_loss_mlp": 0.01018097, + "balance_loss_clip": 1.04595125, + "balance_loss_mlp": 1.01127839, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7882309858167069, + "language_loss": 0.60060382, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62204564, + "num_input_tokens_seen": 14215005, + "step": 669, + "time_per_iteration": 3.215690851211548 + }, + { + "auxiliary_loss_clip": 0.01256024, + "auxiliary_loss_mlp": 0.01079718, + "balance_loss_clip": 1.07496119, + "balance_loss_mlp": 1.04807949, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.8127088321060816, + "language_loss": 0.86165673, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88501418, + "num_input_tokens_seen": 14235510, + "step": 670, + "time_per_iteration": 2.5203728675842285 + }, + { + "auxiliary_loss_clip": 0.0125233, + "auxiliary_loss_mlp": 0.01079248, + "balance_loss_clip": 1.07622802, + "balance_loss_mlp": 1.04889703, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 2.168893746212811, + "language_loss": 0.75634074, + "learning_rate": 3.998878276622692e-06, + "loss": 0.77965653, + "num_input_tokens_seen": 14254565, + "step": 671, + "time_per_iteration": 2.4886200428009033 + }, + { + "auxiliary_loss_clip": 0.01262422, + "auxiliary_loss_mlp": 0.01074617, + "balance_loss_clip": 1.08058035, + "balance_loss_mlp": 1.04274058, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 1.9058916643122634, + "language_loss": 0.9249537, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94832402, + "num_input_tokens_seen": 14271885, + "step": 672, + "time_per_iteration": 2.451547145843506 + }, + { + "auxiliary_loss_clip": 0.012548, + "auxiliary_loss_mlp": 0.01071366, + "balance_loss_clip": 1.07658672, + "balance_loss_mlp": 1.03748655, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 1.996862517924773, + "language_loss": 0.90170383, + "learning_rate": 3.998852040876622e-06, + "loss": 0.9249655, + "num_input_tokens_seen": 14289670, + "step": 673, + "time_per_iteration": 2.484511137008667 + }, + { + "auxiliary_loss_clip": 0.01252367, + "auxiliary_loss_mlp": 0.0108116, + "balance_loss_clip": 1.07483292, + "balance_loss_mlp": 1.0481385, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 1.9896264085857376, + "language_loss": 0.75430405, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77763939, + "num_input_tokens_seen": 14309285, + "step": 674, + "time_per_iteration": 2.5134782791137695 + }, + { + "auxiliary_loss_clip": 0.01261879, + "auxiliary_loss_mlp": 0.01066614, + "balance_loss_clip": 1.07696247, + "balance_loss_mlp": 1.03423643, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 3.09904855653083, + "language_loss": 0.78010499, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80338991, + "num_input_tokens_seen": 14328300, + "step": 675, + "time_per_iteration": 2.496034860610962 + }, + { + "auxiliary_loss_clip": 0.01255584, + "auxiliary_loss_mlp": 0.01082005, + "balance_loss_clip": 1.07585895, + "balance_loss_mlp": 1.04874527, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.8333487957403605, + "language_loss": 0.77007121, + "learning_rate": 3.998812118783757e-06, + "loss": 0.79344714, + "num_input_tokens_seen": 14346395, + "step": 676, + "time_per_iteration": 2.4976956844329834 + }, + { + "auxiliary_loss_clip": 0.01260003, + "auxiliary_loss_mlp": 0.01081987, + "balance_loss_clip": 1.07815659, + "balance_loss_mlp": 1.04901397, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 3.3940211917861802, + "language_loss": 0.85783064, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.88125062, + "num_input_tokens_seen": 14364605, + "step": 677, + "time_per_iteration": 2.469655990600586 + }, + { + "auxiliary_loss_clip": 0.0125407, + "auxiliary_loss_mlp": 0.01069729, + "balance_loss_clip": 1.07640111, + "balance_loss_mlp": 1.03711295, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 1.8070210356236929, + "language_loss": 0.76470745, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78794551, + "num_input_tokens_seen": 14385265, + "step": 678, + "time_per_iteration": 2.5258257389068604 + }, + { + "auxiliary_loss_clip": 0.01255311, + "auxiliary_loss_mlp": 0.01073823, + "balance_loss_clip": 1.07597566, + "balance_loss_mlp": 1.04242349, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.846638510811123, + "language_loss": 0.8250283, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84831965, + "num_input_tokens_seen": 14406090, + "step": 679, + "time_per_iteration": 2.6074466705322266 + }, + { + "auxiliary_loss_clip": 0.01259219, + "auxiliary_loss_mlp": 0.01066007, + "balance_loss_clip": 1.08097255, + "balance_loss_mlp": 1.03477407, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.9566088282602192, + "language_loss": 0.76550275, + "learning_rate": 3.998757828196835e-06, + "loss": 0.78875506, + "num_input_tokens_seen": 14425130, + "step": 680, + "time_per_iteration": 2.5173959732055664 + }, + { + "auxiliary_loss_clip": 0.01258014, + "auxiliary_loss_mlp": 0.01074278, + "balance_loss_clip": 1.07527781, + "balance_loss_mlp": 1.03975499, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 1.793116836579398, + "language_loss": 0.83212912, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.855452, + "num_input_tokens_seen": 14447355, + "step": 681, + "time_per_iteration": 2.533306121826172 + }, + { + "auxiliary_loss_clip": 0.01256628, + "auxiliary_loss_mlp": 0.01070229, + "balance_loss_clip": 1.07486796, + "balance_loss_mlp": 1.03763771, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.923237819898852, + "language_loss": 0.71372402, + "learning_rate": 3.998730228142726e-06, + "loss": 0.7369926, + "num_input_tokens_seen": 14466790, + "step": 682, + "time_per_iteration": 2.469986915588379 + }, + { + "auxiliary_loss_clip": 0.01254337, + "auxiliary_loss_mlp": 0.01078318, + "balance_loss_clip": 1.07603168, + "balance_loss_mlp": 1.0472523, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.7890478386380715, + "language_loss": 0.72652054, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74984711, + "num_input_tokens_seen": 14485195, + "step": 683, + "time_per_iteration": 3.851506471633911 + }, + { + "auxiliary_loss_clip": 0.0125478, + "auxiliary_loss_mlp": 0.01076279, + "balance_loss_clip": 1.08191979, + "balance_loss_mlp": 1.04571402, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.5919534416718935, + "language_loss": 0.8154422, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83875281, + "num_input_tokens_seen": 14503370, + "step": 684, + "time_per_iteration": 2.450343608856201 + }, + { + "auxiliary_loss_clip": 0.01257497, + "auxiliary_loss_mlp": 0.01070754, + "balance_loss_clip": 1.08056235, + "balance_loss_mlp": 1.03744721, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.4567538840531897, + "language_loss": 0.90762269, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.93090522, + "num_input_tokens_seen": 14526415, + "step": 685, + "time_per_iteration": 2.5699713230133057 + }, + { + "auxiliary_loss_clip": 0.01256911, + "auxiliary_loss_mlp": 0.01075932, + "balance_loss_clip": 1.07822669, + "balance_loss_mlp": 1.04441333, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.3615197925156055, + "language_loss": 0.88268769, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90601611, + "num_input_tokens_seen": 14546595, + "step": 686, + "time_per_iteration": 3.9234211444854736 + }, + { + "auxiliary_loss_clip": 0.0125873, + "auxiliary_loss_mlp": 0.01076943, + "balance_loss_clip": 1.076828, + "balance_loss_mlp": 1.04530478, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 1.8318210298611695, + "language_loss": 0.71688837, + "learning_rate": 3.998659901655851e-06, + "loss": 0.74024504, + "num_input_tokens_seen": 14566590, + "step": 687, + "time_per_iteration": 3.942305088043213 + }, + { + "auxiliary_loss_clip": 0.01255608, + "auxiliary_loss_mlp": 0.01072926, + "balance_loss_clip": 1.08176732, + "balance_loss_mlp": 1.04331398, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 1.7497910595364712, + "language_loss": 0.86064255, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88392788, + "num_input_tokens_seen": 14585965, + "step": 688, + "time_per_iteration": 2.4797966480255127 + }, + { + "auxiliary_loss_clip": 0.01254938, + "auxiliary_loss_mlp": 0.01081282, + "balance_loss_clip": 1.07974577, + "balance_loss_mlp": 1.04992962, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.771328456523103, + "language_loss": 0.83210683, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85546899, + "num_input_tokens_seen": 14606015, + "step": 689, + "time_per_iteration": 2.489409923553467 + }, + { + "auxiliary_loss_clip": 0.01252353, + "auxiliary_loss_mlp": 0.01080227, + "balance_loss_clip": 1.07592487, + "balance_loss_mlp": 1.04935205, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1.8939316836370124, + "language_loss": 0.68170244, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70502818, + "num_input_tokens_seen": 14629955, + "step": 690, + "time_per_iteration": 3.948004961013794 + }, + { + "auxiliary_loss_clip": 0.01252039, + "auxiliary_loss_mlp": 0.01076404, + "balance_loss_clip": 1.07605553, + "balance_loss_mlp": 1.04524314, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.7162383688328091, + "language_loss": 0.74996936, + "learning_rate": 3.998602276254286e-06, + "loss": 0.7732538, + "num_input_tokens_seen": 14648000, + "step": 691, + "time_per_iteration": 2.4792983531951904 + }, + { + "auxiliary_loss_clip": 0.01251074, + "auxiliary_loss_mlp": 0.0108103, + "balance_loss_clip": 1.07716656, + "balance_loss_mlp": 1.04889083, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 1.9914756311841812, + "language_loss": 0.84468967, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86801076, + "num_input_tokens_seen": 14662235, + "step": 692, + "time_per_iteration": 2.4478113651275635 + }, + { + "auxiliary_loss_clip": 0.01256471, + "auxiliary_loss_mlp": 0.01077471, + "balance_loss_clip": 1.07570076, + "balance_loss_mlp": 1.04409242, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 2.159535987280252, + "language_loss": 0.89016283, + "learning_rate": 3.99857300882812e-06, + "loss": 0.91350228, + "num_input_tokens_seen": 14676065, + "step": 693, + "time_per_iteration": 2.4439892768859863 + }, + { + "auxiliary_loss_clip": 0.01259553, + "auxiliary_loss_mlp": 0.01066019, + "balance_loss_clip": 1.08082128, + "balance_loss_mlp": 1.03531075, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.8487276133935975, + "language_loss": 0.81734371, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84059942, + "num_input_tokens_seen": 14694955, + "step": 694, + "time_per_iteration": 2.5195086002349854 + }, + { + "auxiliary_loss_clip": 0.01254915, + "auxiliary_loss_mlp": 0.01072142, + "balance_loss_clip": 1.07507145, + "balance_loss_mlp": 1.04117191, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.3671278321000675, + "language_loss": 0.83598286, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85925347, + "num_input_tokens_seen": 14715510, + "step": 695, + "time_per_iteration": 2.5144309997558594 + }, + { + "auxiliary_loss_clip": 0.01255687, + "auxiliary_loss_mlp": 0.01080167, + "balance_loss_clip": 1.07851648, + "balance_loss_mlp": 1.04769409, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.322678800478601, + "language_loss": 0.84757161, + "learning_rate": 3.99852853929461e-06, + "loss": 0.87093008, + "num_input_tokens_seen": 14731755, + "step": 696, + "time_per_iteration": 2.4426229000091553 + }, + { + "auxiliary_loss_clip": 0.01252212, + "auxiliary_loss_mlp": 0.01085219, + "balance_loss_clip": 1.07542408, + "balance_loss_mlp": 1.05243671, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.0853579128612427, + "language_loss": 0.93085659, + "learning_rate": 3.998513564547216e-06, + "loss": 0.9542309, + "num_input_tokens_seen": 14750810, + "step": 697, + "time_per_iteration": 2.538133144378662 + }, + { + "auxiliary_loss_clip": 0.01248941, + "auxiliary_loss_mlp": 0.01076087, + "balance_loss_clip": 1.07568896, + "balance_loss_mlp": 1.04573655, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 3.234299987591336, + "language_loss": 0.83806789, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86131817, + "num_input_tokens_seen": 14768435, + "step": 698, + "time_per_iteration": 2.4838712215423584 + }, + { + "auxiliary_loss_clip": 0.01252099, + "auxiliary_loss_mlp": 0.01096734, + "balance_loss_clip": 1.07491469, + "balance_loss_mlp": 1.06333137, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 2.583247213264913, + "language_loss": 0.91226494, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93575329, + "num_input_tokens_seen": 14786690, + "step": 699, + "time_per_iteration": 2.5066189765930176 + }, + { + "auxiliary_loss_clip": 0.01129553, + "auxiliary_loss_mlp": 0.01009266, + "balance_loss_clip": 1.05193877, + "balance_loss_mlp": 1.00390112, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.9052507285585839, + "language_loss": 0.67849785, + "learning_rate": 3.998468185604312e-06, + "loss": 0.69988608, + "num_input_tokens_seen": 14853840, + "step": 700, + "time_per_iteration": 3.123793840408325 + }, + { + "auxiliary_loss_clip": 0.01259235, + "auxiliary_loss_mlp": 0.01083059, + "balance_loss_clip": 1.07865882, + "balance_loss_mlp": 1.04951322, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.987524473886307, + "language_loss": 0.89117229, + "learning_rate": 3.998452907725016e-06, + "loss": 0.91459525, + "num_input_tokens_seen": 14869580, + "step": 701, + "time_per_iteration": 2.49013352394104 + }, + { + "auxiliary_loss_clip": 0.01255575, + "auxiliary_loss_mlp": 0.01079551, + "balance_loss_clip": 1.08155131, + "balance_loss_mlp": 1.04753149, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 3.997151742114414, + "language_loss": 0.67208856, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69543982, + "num_input_tokens_seen": 14891065, + "step": 702, + "time_per_iteration": 2.5157768726348877 + }, + { + "auxiliary_loss_clip": 0.01122044, + "auxiliary_loss_mlp": 0.01004701, + "balance_loss_clip": 1.04455519, + "balance_loss_mlp": 0.99945587, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8445266931710613, + "language_loss": 0.60818982, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62945735, + "num_input_tokens_seen": 14954815, + "step": 703, + "time_per_iteration": 3.1593618392944336 + }, + { + "auxiliary_loss_clip": 0.01119477, + "auxiliary_loss_mlp": 0.01005413, + "balance_loss_clip": 1.04243517, + "balance_loss_mlp": 1.00016809, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0184104379284098, + "language_loss": 0.57682896, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59807789, + "num_input_tokens_seen": 15003050, + "step": 704, + "time_per_iteration": 2.9713449478149414 + }, + { + "auxiliary_loss_clip": 0.01255192, + "auxiliary_loss_mlp": 0.01074428, + "balance_loss_clip": 1.07865191, + "balance_loss_mlp": 1.04212236, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.2978268656013348, + "language_loss": 0.87722951, + "learning_rate": 3.998391038398319e-06, + "loss": 0.90052569, + "num_input_tokens_seen": 15021990, + "step": 705, + "time_per_iteration": 2.504945993423462 + }, + { + "auxiliary_loss_clip": 0.01243634, + "auxiliary_loss_mlp": 0.01070954, + "balance_loss_clip": 1.07341313, + "balance_loss_mlp": 1.04127133, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.7895424711153658, + "language_loss": 0.71079034, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73393619, + "num_input_tokens_seen": 15040700, + "step": 706, + "time_per_iteration": 2.4908573627471924 + }, + { + "auxiliary_loss_clip": 0.01247668, + "auxiliary_loss_mlp": 0.01068946, + "balance_loss_clip": 1.07594621, + "balance_loss_mlp": 1.03621149, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.607427980261203, + "language_loss": 0.93483216, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95799834, + "num_input_tokens_seen": 15056725, + "step": 707, + "time_per_iteration": 2.5100858211517334 + }, + { + "auxiliary_loss_clip": 0.01253305, + "auxiliary_loss_mlp": 0.01070148, + "balance_loss_clip": 1.07454944, + "balance_loss_mlp": 1.03819966, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 4.925244098397407, + "language_loss": 0.81191194, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83514655, + "num_input_tokens_seen": 15077550, + "step": 708, + "time_per_iteration": 2.5856199264526367 + }, + { + "auxiliary_loss_clip": 0.01257358, + "auxiliary_loss_mlp": 0.01080113, + "balance_loss_clip": 1.07808971, + "balance_loss_mlp": 1.04668701, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.347093696339523, + "language_loss": 0.82389688, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84727162, + "num_input_tokens_seen": 15094955, + "step": 709, + "time_per_iteration": 2.4977023601531982 + }, + { + "auxiliary_loss_clip": 0.01261804, + "auxiliary_loss_mlp": 0.01070312, + "balance_loss_clip": 1.08144331, + "balance_loss_mlp": 1.03838801, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.666747884559168, + "language_loss": 0.85512519, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87844628, + "num_input_tokens_seen": 15113395, + "step": 710, + "time_per_iteration": 2.490229845046997 + }, + { + "auxiliary_loss_clip": 0.0125992, + "auxiliary_loss_mlp": 0.01071554, + "balance_loss_clip": 1.08292031, + "balance_loss_mlp": 1.04008269, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 2.414943052683915, + "language_loss": 0.84779704, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87111175, + "num_input_tokens_seen": 15132920, + "step": 711, + "time_per_iteration": 2.475520133972168 + }, + { + "auxiliary_loss_clip": 0.01248932, + "auxiliary_loss_mlp": 0.01067966, + "balance_loss_clip": 1.07379985, + "balance_loss_mlp": 1.03647137, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 1.654643123845041, + "language_loss": 0.85393977, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87710875, + "num_input_tokens_seen": 15153115, + "step": 712, + "time_per_iteration": 2.5120770931243896 + }, + { + "auxiliary_loss_clip": 0.01256493, + "auxiliary_loss_mlp": 0.01073196, + "balance_loss_clip": 1.07418346, + "balance_loss_mlp": 1.04255915, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 2.51691310507806, + "language_loss": 0.90784538, + "learning_rate": 3.998263662382328e-06, + "loss": 0.93114227, + "num_input_tokens_seen": 15172770, + "step": 713, + "time_per_iteration": 2.4832911491394043 + }, + { + "auxiliary_loss_clip": 0.01121091, + "auxiliary_loss_mlp": 0.01023259, + "balance_loss_clip": 1.04465437, + "balance_loss_mlp": 1.0183239, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8811728709994043, + "language_loss": 0.63713288, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65857637, + "num_input_tokens_seen": 15240055, + "step": 714, + "time_per_iteration": 3.2018778324127197 + }, + { + "auxiliary_loss_clip": 0.01251504, + "auxiliary_loss_mlp": 0.0108492, + "balance_loss_clip": 1.07920837, + "balance_loss_mlp": 1.05392528, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 1.838967079899082, + "language_loss": 0.74468529, + "learning_rate": 3.998231060622563e-06, + "loss": 0.76804948, + "num_input_tokens_seen": 15261585, + "step": 715, + "time_per_iteration": 2.571183204650879 + }, + { + "auxiliary_loss_clip": 0.01254444, + "auxiliary_loss_mlp": 0.01075104, + "balance_loss_clip": 1.07951045, + "balance_loss_mlp": 1.04232144, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 2.255241924457185, + "language_loss": 0.72790945, + "learning_rate": 3.998214646082688e-06, + "loss": 0.75120491, + "num_input_tokens_seen": 15281160, + "step": 716, + "time_per_iteration": 2.6089766025543213 + }, + { + "auxiliary_loss_clip": 0.01110715, + "auxiliary_loss_mlp": 0.01006259, + "balance_loss_clip": 1.03539634, + "balance_loss_mlp": 1.00139511, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 1.36413784072223, + "language_loss": 0.65578079, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67695051, + "num_input_tokens_seen": 15344505, + "step": 717, + "time_per_iteration": 3.1340420246124268 + }, + { + "auxiliary_loss_clip": 0.01110468, + "auxiliary_loss_mlp": 0.01006442, + "balance_loss_clip": 1.03563023, + "balance_loss_mlp": 1.00155497, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9831549069726905, + "language_loss": 0.5879755, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60914469, + "num_input_tokens_seen": 15404050, + "step": 718, + "time_per_iteration": 2.9251046180725098 + }, + { + "auxiliary_loss_clip": 0.0125152, + "auxiliary_loss_mlp": 0.01074686, + "balance_loss_clip": 1.07893729, + "balance_loss_mlp": 1.04130697, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 1.8847359734447025, + "language_loss": 0.91464567, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93790781, + "num_input_tokens_seen": 15424190, + "step": 719, + "time_per_iteration": 2.480520725250244 + }, + { + "auxiliary_loss_clip": 0.01247591, + "auxiliary_loss_mlp": 0.01071594, + "balance_loss_clip": 1.07297122, + "balance_loss_mlp": 1.04157722, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.6197938620912151, + "language_loss": 0.66681176, + "learning_rate": 3.99814823020446e-06, + "loss": 0.69000363, + "num_input_tokens_seen": 15446500, + "step": 720, + "time_per_iteration": 2.5536956787109375 + }, + { + "auxiliary_loss_clip": 0.01247406, + "auxiliary_loss_mlp": 0.01075771, + "balance_loss_clip": 1.07514811, + "balance_loss_mlp": 1.04399025, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 1.9383275550222532, + "language_loss": 0.77480465, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.79803646, + "num_input_tokens_seen": 15465830, + "step": 721, + "time_per_iteration": 2.487926721572876 + }, + { + "auxiliary_loss_clip": 0.01252422, + "auxiliary_loss_mlp": 0.01086979, + "balance_loss_clip": 1.07756495, + "balance_loss_mlp": 1.0560087, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 2.511747489707896, + "language_loss": 0.88315809, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90655208, + "num_input_tokens_seen": 15479985, + "step": 722, + "time_per_iteration": 2.4527525901794434 + }, + { + "auxiliary_loss_clip": 0.01256942, + "auxiliary_loss_mlp": 0.01076551, + "balance_loss_clip": 1.08006072, + "balance_loss_mlp": 1.04653418, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 1.8231900395995235, + "language_loss": 0.84438741, + "learning_rate": 3.998097622708792e-06, + "loss": 0.86772239, + "num_input_tokens_seen": 15501545, + "step": 723, + "time_per_iteration": 3.894731283187866 + }, + { + "auxiliary_loss_clip": 0.01260079, + "auxiliary_loss_mlp": 0.01079058, + "balance_loss_clip": 1.0832659, + "balance_loss_mlp": 1.04796767, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 2.045688703605771, + "language_loss": 0.82843268, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85182405, + "num_input_tokens_seen": 15521725, + "step": 724, + "time_per_iteration": 2.553359031677246 + }, + { + "auxiliary_loss_clip": 0.01252676, + "auxiliary_loss_mlp": 0.01086063, + "balance_loss_clip": 1.0775435, + "balance_loss_mlp": 1.05359006, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.988237117483734, + "language_loss": 0.79552901, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81891632, + "num_input_tokens_seen": 15540910, + "step": 725, + "time_per_iteration": 3.8466293811798096 + }, + { + "auxiliary_loss_clip": 0.01261587, + "auxiliary_loss_mlp": 0.01072773, + "balance_loss_clip": 1.07915306, + "balance_loss_mlp": 1.04049134, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.091345909136279, + "language_loss": 0.87197322, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89531684, + "num_input_tokens_seen": 15558640, + "step": 726, + "time_per_iteration": 3.849003791809082 + }, + { + "auxiliary_loss_clip": 0.01126927, + "auxiliary_loss_mlp": 0.010231, + "balance_loss_clip": 1.05230141, + "balance_loss_mlp": 1.01828372, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9173065387859451, + "language_loss": 0.55872881, + "learning_rate": 3.998029085298079e-06, + "loss": 0.58022904, + "num_input_tokens_seen": 15612975, + "step": 727, + "time_per_iteration": 3.1951780319213867 + }, + { + "auxiliary_loss_clip": 0.01253438, + "auxiliary_loss_mlp": 0.01077901, + "balance_loss_clip": 1.07788634, + "balance_loss_mlp": 1.04626238, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.02170613627145, + "language_loss": 0.82037175, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84368515, + "num_input_tokens_seen": 15631070, + "step": 728, + "time_per_iteration": 2.482454776763916 + }, + { + "auxiliary_loss_clip": 0.01247585, + "auxiliary_loss_mlp": 0.01070672, + "balance_loss_clip": 1.07657981, + "balance_loss_mlp": 1.04048872, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.15447269369695, + "language_loss": 0.76986587, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79304838, + "num_input_tokens_seen": 15647825, + "step": 729, + "time_per_iteration": 3.9494788646698 + }, + { + "auxiliary_loss_clip": 0.01255064, + "auxiliary_loss_mlp": 0.01079642, + "balance_loss_clip": 1.07632136, + "balance_loss_mlp": 1.04876661, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 1.9931661930854947, + "language_loss": 0.94973999, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97308707, + "num_input_tokens_seen": 15668260, + "step": 730, + "time_per_iteration": 2.5052812099456787 + }, + { + "auxiliary_loss_clip": 0.01249149, + "auxiliary_loss_mlp": 0.01072001, + "balance_loss_clip": 1.07240653, + "balance_loss_mlp": 1.03897977, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 3.497273825391156, + "language_loss": 0.88617849, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90938997, + "num_input_tokens_seen": 15685630, + "step": 731, + "time_per_iteration": 2.4540822505950928 + }, + { + "auxiliary_loss_clip": 0.01249975, + "auxiliary_loss_mlp": 0.01070735, + "balance_loss_clip": 1.07549679, + "balance_loss_mlp": 1.04187393, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.172461984134687, + "language_loss": 0.88943565, + "learning_rate": 3.997941708816791e-06, + "loss": 0.91264272, + "num_input_tokens_seen": 15698645, + "step": 732, + "time_per_iteration": 2.443603277206421 + }, + { + "auxiliary_loss_clip": 0.01251924, + "auxiliary_loss_mlp": 0.01078384, + "balance_loss_clip": 1.07617939, + "balance_loss_mlp": 1.04731798, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.2864011189564546, + "language_loss": 0.85841179, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88171488, + "num_input_tokens_seen": 15716775, + "step": 733, + "time_per_iteration": 2.4763846397399902 + }, + { + "auxiliary_loss_clip": 0.01255541, + "auxiliary_loss_mlp": 0.01087581, + "balance_loss_clip": 1.07699037, + "balance_loss_mlp": 1.05377281, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.4033747210251843, + "language_loss": 0.91343981, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93687105, + "num_input_tokens_seen": 15733320, + "step": 734, + "time_per_iteration": 2.479966402053833 + }, + { + "auxiliary_loss_clip": 0.01247769, + "auxiliary_loss_mlp": 0.01067713, + "balance_loss_clip": 1.07781398, + "balance_loss_mlp": 1.03811288, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 2.4223649138158003, + "language_loss": 0.78063184, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.80378664, + "num_input_tokens_seen": 15752705, + "step": 735, + "time_per_iteration": 2.5412886142730713 + }, + { + "auxiliary_loss_clip": 0.01243273, + "auxiliary_loss_mlp": 0.01067476, + "balance_loss_clip": 1.07247901, + "balance_loss_mlp": 1.03785276, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.4100771569239217, + "language_loss": 0.88533014, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90843761, + "num_input_tokens_seen": 15772800, + "step": 736, + "time_per_iteration": 2.5225746631622314 + }, + { + "auxiliary_loss_clip": 0.01246785, + "auxiliary_loss_mlp": 0.01080797, + "balance_loss_clip": 1.07856536, + "balance_loss_mlp": 1.05097103, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 3.9013177773359704, + "language_loss": 0.84129977, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86457562, + "num_input_tokens_seen": 15793665, + "step": 737, + "time_per_iteration": 2.5188817977905273 + }, + { + "auxiliary_loss_clip": 0.01252148, + "auxiliary_loss_mlp": 0.01075241, + "balance_loss_clip": 1.07787871, + "balance_loss_mlp": 1.04183817, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.422845689764848, + "language_loss": 0.84817713, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87145102, + "num_input_tokens_seen": 15813175, + "step": 738, + "time_per_iteration": 2.577906608581543 + }, + { + "auxiliary_loss_clip": 0.01115685, + "auxiliary_loss_mlp": 0.01022955, + "balance_loss_clip": 1.0399214, + "balance_loss_mlp": 1.0186398, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8785747309996341, + "language_loss": 0.59174645, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61313283, + "num_input_tokens_seen": 15872050, + "step": 739, + "time_per_iteration": 3.050217628479004 + }, + { + "auxiliary_loss_clip": 0.01249794, + "auxiliary_loss_mlp": 0.01072794, + "balance_loss_clip": 1.07896817, + "balance_loss_mlp": 1.04344428, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.144007634602174, + "language_loss": 0.91327178, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93649769, + "num_input_tokens_seen": 15891085, + "step": 740, + "time_per_iteration": 2.5430283546447754 + }, + { + "auxiliary_loss_clip": 0.01254131, + "auxiliary_loss_mlp": 0.01071002, + "balance_loss_clip": 1.08104539, + "balance_loss_mlp": 1.04218912, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.2230877648367002, + "language_loss": 0.72081542, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74406677, + "num_input_tokens_seen": 15914225, + "step": 741, + "time_per_iteration": 2.61557674407959 + }, + { + "auxiliary_loss_clip": 0.01243308, + "auxiliary_loss_mlp": 0.01080602, + "balance_loss_clip": 1.0737747, + "balance_loss_mlp": 1.05108547, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.7620407795085722, + "language_loss": 0.8850745, + "learning_rate": 3.997761273778037e-06, + "loss": 0.90831363, + "num_input_tokens_seen": 15934540, + "step": 742, + "time_per_iteration": 2.523158073425293 + }, + { + "auxiliary_loss_clip": 0.0124367, + "auxiliary_loss_mlp": 0.01060847, + "balance_loss_clip": 1.07474065, + "balance_loss_mlp": 1.03094912, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 1.8629622203053944, + "language_loss": 0.8397727, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86281782, + "num_input_tokens_seen": 15952560, + "step": 743, + "time_per_iteration": 2.475792169570923 + }, + { + "auxiliary_loss_clip": 0.01250785, + "auxiliary_loss_mlp": 0.01070553, + "balance_loss_clip": 1.0769738, + "balance_loss_mlp": 1.04115546, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 4.313392535547785, + "language_loss": 0.79953408, + "learning_rate": 3.997724277684479e-06, + "loss": 0.82274747, + "num_input_tokens_seen": 15970620, + "step": 744, + "time_per_iteration": 2.4453978538513184 + }, + { + "auxiliary_loss_clip": 0.0124343, + "auxiliary_loss_mlp": 0.01070831, + "balance_loss_clip": 1.07467628, + "balance_loss_mlp": 1.04162514, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 1.8845812783584215, + "language_loss": 0.8536855, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87682807, + "num_input_tokens_seen": 15987325, + "step": 745, + "time_per_iteration": 2.478398323059082 + }, + { + "auxiliary_loss_clip": 0.01243613, + "auxiliary_loss_mlp": 0.01068784, + "balance_loss_clip": 1.07413411, + "balance_loss_mlp": 1.03874326, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 1.6666171424001954, + "language_loss": 0.68985176, + "learning_rate": 3.997686978575302e-06, + "loss": 0.71297574, + "num_input_tokens_seen": 16008310, + "step": 746, + "time_per_iteration": 2.550433874130249 + }, + { + "auxiliary_loss_clip": 0.01253217, + "auxiliary_loss_mlp": 0.01081538, + "balance_loss_clip": 1.08141601, + "balance_loss_mlp": 1.05059147, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 5.231731256899493, + "language_loss": 0.6895206, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71286809, + "num_input_tokens_seen": 16029620, + "step": 747, + "time_per_iteration": 2.5539510250091553 + }, + { + "auxiliary_loss_clip": 0.01250931, + "auxiliary_loss_mlp": 0.01083399, + "balance_loss_clip": 1.07707548, + "balance_loss_mlp": 1.05225003, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.9367419301393973, + "language_loss": 0.66610992, + "learning_rate": 3.997649376456168e-06, + "loss": 0.68945324, + "num_input_tokens_seen": 16049065, + "step": 748, + "time_per_iteration": 2.4767324924468994 + }, + { + "auxiliary_loss_clip": 0.01254079, + "auxiliary_loss_mlp": 0.01082655, + "balance_loss_clip": 1.0833205, + "balance_loss_mlp": 1.05297232, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.524246522525193, + "language_loss": 0.76673996, + "learning_rate": 3.997630461769647e-06, + "loss": 0.79010731, + "num_input_tokens_seen": 16066765, + "step": 749, + "time_per_iteration": 2.4764151573181152 + }, + { + "auxiliary_loss_clip": 0.01251115, + "auxiliary_loss_mlp": 0.01080557, + "balance_loss_clip": 1.07920742, + "balance_loss_mlp": 1.05139875, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.2951621087839453, + "language_loss": 0.88605535, + "learning_rate": 3.997611471332778e-06, + "loss": 0.90937209, + "num_input_tokens_seen": 16085980, + "step": 750, + "time_per_iteration": 2.441727638244629 + }, + { + "auxiliary_loss_clip": 0.01248424, + "auxiliary_loss_mlp": 0.01077257, + "balance_loss_clip": 1.07408965, + "balance_loss_mlp": 1.0438782, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 1.7534666436342763, + "language_loss": 0.74666762, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.7699244, + "num_input_tokens_seen": 16106260, + "step": 751, + "time_per_iteration": 2.5410470962524414 + }, + { + "auxiliary_loss_clip": 0.01243635, + "auxiliary_loss_mlp": 0.01079546, + "balance_loss_clip": 1.07410443, + "balance_loss_mlp": 1.05074525, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.295971916288258, + "language_loss": 0.69418418, + "learning_rate": 3.997573263210883e-06, + "loss": 0.71741599, + "num_input_tokens_seen": 16123475, + "step": 752, + "time_per_iteration": 2.479640483856201 + }, + { + "auxiliary_loss_clip": 0.0124532, + "auxiliary_loss_mlp": 0.01065509, + "balance_loss_clip": 1.07489073, + "balance_loss_mlp": 1.03668404, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.436510867077347, + "language_loss": 0.9194591, + "learning_rate": 3.997554045527305e-06, + "loss": 0.94256735, + "num_input_tokens_seen": 16138335, + "step": 753, + "time_per_iteration": 2.48738169670105 + }, + { + "auxiliary_loss_clip": 0.01249109, + "auxiliary_loss_mlp": 0.01084318, + "balance_loss_clip": 1.07751989, + "balance_loss_mlp": 1.05508828, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.2098293179081305, + "language_loss": 0.91399074, + "learning_rate": 3.997534752096277e-06, + "loss": 0.937325, + "num_input_tokens_seen": 16157110, + "step": 754, + "time_per_iteration": 2.49340558052063 + }, + { + "auxiliary_loss_clip": 0.01239653, + "auxiliary_loss_mlp": 0.01078328, + "balance_loss_clip": 1.07571316, + "balance_loss_mlp": 1.04742885, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.2623222092758892, + "language_loss": 0.78551352, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80869335, + "num_input_tokens_seen": 16174155, + "step": 755, + "time_per_iteration": 2.477518320083618 + }, + { + "auxiliary_loss_clip": 0.01251429, + "auxiliary_loss_mlp": 0.0107947, + "balance_loss_clip": 1.07907486, + "balance_loss_mlp": 1.05004895, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 1.9951826329934674, + "language_loss": 0.78643048, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.80973947, + "num_input_tokens_seen": 16192240, + "step": 756, + "time_per_iteration": 2.458444833755493 + }, + { + "auxiliary_loss_clip": 0.01114259, + "auxiliary_loss_mlp": 0.01039793, + "balance_loss_clip": 1.04198599, + "balance_loss_mlp": 1.03504896, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8212813490298987, + "language_loss": 0.62675792, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64829844, + "num_input_tokens_seen": 16255775, + "step": 757, + "time_per_iteration": 3.1493735313415527 + }, + { + "auxiliary_loss_clip": 0.01245854, + "auxiliary_loss_mlp": 0.01070045, + "balance_loss_clip": 1.0769403, + "balance_loss_mlp": 1.04174519, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.44699575784267, + "language_loss": 0.84337306, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86653209, + "num_input_tokens_seen": 16277015, + "step": 758, + "time_per_iteration": 2.503798007965088 + }, + { + "auxiliary_loss_clip": 0.01240061, + "auxiliary_loss_mlp": 0.01067799, + "balance_loss_clip": 1.07182825, + "balance_loss_mlp": 1.04024935, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 1.884062487466806, + "language_loss": 0.88095313, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90403175, + "num_input_tokens_seen": 16296005, + "step": 759, + "time_per_iteration": 2.508530855178833 + }, + { + "auxiliary_loss_clip": 0.01250368, + "auxiliary_loss_mlp": 0.01078588, + "balance_loss_clip": 1.07985139, + "balance_loss_mlp": 1.04801106, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 1.9368538333189138, + "language_loss": 0.73655021, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75983977, + "num_input_tokens_seen": 16315300, + "step": 760, + "time_per_iteration": 2.5314486026763916 + }, + { + "auxiliary_loss_clip": 0.01251546, + "auxiliary_loss_mlp": 0.01074986, + "balance_loss_clip": 1.0838654, + "balance_loss_mlp": 1.0469594, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 1.9591846113694338, + "language_loss": 0.82625586, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84952116, + "num_input_tokens_seen": 16333820, + "step": 761, + "time_per_iteration": 2.489737033843994 + }, + { + "auxiliary_loss_clip": 0.01244902, + "auxiliary_loss_mlp": 0.01075711, + "balance_loss_clip": 1.07635701, + "balance_loss_mlp": 1.04683852, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.8086329031035762, + "language_loss": 0.79603302, + "learning_rate": 3.997377677828266e-06, + "loss": 0.81923914, + "num_input_tokens_seen": 16355290, + "step": 762, + "time_per_iteration": 2.529679775238037 + }, + { + "auxiliary_loss_clip": 0.01107278, + "auxiliary_loss_mlp": 0.01013927, + "balance_loss_clip": 1.03486323, + "balance_loss_mlp": 1.00932586, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.040188661578853, + "language_loss": 0.58747685, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60868889, + "num_input_tokens_seen": 16415995, + "step": 763, + "time_per_iteration": 4.526393175125122 + }, + { + "auxiliary_loss_clip": 0.01247215, + "auxiliary_loss_mlp": 0.01074598, + "balance_loss_clip": 1.07709277, + "balance_loss_mlp": 1.04574943, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.375432560898797, + "language_loss": 0.87829518, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.90151328, + "num_input_tokens_seen": 16433120, + "step": 764, + "time_per_iteration": 3.922713279724121 + }, + { + "auxiliary_loss_clip": 0.01248823, + "auxiliary_loss_mlp": 0.01079525, + "balance_loss_clip": 1.07897139, + "balance_loss_mlp": 1.050879, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 2.1029605367212705, + "language_loss": 0.85938191, + "learning_rate": 3.997317525234592e-06, + "loss": 0.8826654, + "num_input_tokens_seen": 16453360, + "step": 765, + "time_per_iteration": 2.58488392829895 + }, + { + "auxiliary_loss_clip": 0.01251573, + "auxiliary_loss_mlp": 0.01072392, + "balance_loss_clip": 1.07816458, + "balance_loss_mlp": 1.04087281, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 3.967786082390371, + "language_loss": 0.88215971, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90539932, + "num_input_tokens_seen": 16471160, + "step": 766, + "time_per_iteration": 3.8616089820861816 + }, + { + "auxiliary_loss_clip": 0.01246871, + "auxiliary_loss_mlp": 0.01076691, + "balance_loss_clip": 1.07713246, + "balance_loss_mlp": 1.04738927, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.4426135146880097, + "language_loss": 0.83775514, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86099076, + "num_input_tokens_seen": 16488940, + "step": 767, + "time_per_iteration": 2.5143752098083496 + }, + { + "auxiliary_loss_clip": 0.01247751, + "auxiliary_loss_mlp": 0.01063066, + "balance_loss_clip": 1.0803442, + "balance_loss_mlp": 1.03327525, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.665062574564417, + "language_loss": 0.87119257, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89430076, + "num_input_tokens_seen": 16509505, + "step": 768, + "time_per_iteration": 2.5956757068634033 + }, + { + "auxiliary_loss_clip": 0.01242505, + "auxiliary_loss_mlp": 0.01069907, + "balance_loss_clip": 1.07306993, + "balance_loss_mlp": 1.04260862, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.3409045144873186, + "language_loss": 0.74993503, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77305919, + "num_input_tokens_seen": 16528840, + "step": 769, + "time_per_iteration": 3.953342914581299 + }, + { + "auxiliary_loss_clip": 0.01241863, + "auxiliary_loss_mlp": 0.01068419, + "balance_loss_clip": 1.07704031, + "balance_loss_mlp": 1.04214561, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 3.942513204832819, + "language_loss": 0.86191607, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88501894, + "num_input_tokens_seen": 16548335, + "step": 770, + "time_per_iteration": 2.489542245864868 + }, + { + "auxiliary_loss_clip": 0.01250659, + "auxiliary_loss_mlp": 0.01070067, + "balance_loss_clip": 1.07556844, + "balance_loss_mlp": 1.04126596, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.090621220760043, + "language_loss": 0.86928719, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89249444, + "num_input_tokens_seen": 16567725, + "step": 771, + "time_per_iteration": 2.477332830429077 + }, + { + "auxiliary_loss_clip": 0.01248303, + "auxiliary_loss_mlp": 0.01080548, + "balance_loss_clip": 1.07343459, + "balance_loss_mlp": 1.05045962, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 3.480609359772438, + "language_loss": 0.83834875, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86163729, + "num_input_tokens_seen": 16588175, + "step": 772, + "time_per_iteration": 2.558382987976074 + }, + { + "auxiliary_loss_clip": 0.01246311, + "auxiliary_loss_mlp": 0.01066675, + "balance_loss_clip": 1.07794011, + "balance_loss_mlp": 1.03984118, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 1.8278369870749573, + "language_loss": 0.73804945, + "learning_rate": 3.997153785881557e-06, + "loss": 0.76117933, + "num_input_tokens_seen": 16607735, + "step": 773, + "time_per_iteration": 2.511962413787842 + }, + { + "auxiliary_loss_clip": 0.01242208, + "auxiliary_loss_mlp": 0.01069037, + "balance_loss_clip": 1.0767622, + "balance_loss_mlp": 1.03909159, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.159746583844111, + "language_loss": 0.78493178, + "learning_rate": 3.997132977658996e-06, + "loss": 0.8080442, + "num_input_tokens_seen": 16627225, + "step": 774, + "time_per_iteration": 2.5290379524230957 + }, + { + "auxiliary_loss_clip": 0.01238986, + "auxiliary_loss_mlp": 0.01068362, + "balance_loss_clip": 1.07366967, + "balance_loss_mlp": 1.04169452, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.6148932968465695, + "language_loss": 0.73463553, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75770903, + "num_input_tokens_seen": 16647785, + "step": 775, + "time_per_iteration": 2.601954221725464 + }, + { + "auxiliary_loss_clip": 0.01243921, + "auxiliary_loss_mlp": 0.01060676, + "balance_loss_clip": 1.07473838, + "balance_loss_mlp": 1.03225648, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.8466338060532805, + "language_loss": 0.77191961, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79496562, + "num_input_tokens_seen": 16667555, + "step": 776, + "time_per_iteration": 2.4902849197387695 + }, + { + "auxiliary_loss_clip": 0.01238718, + "auxiliary_loss_mlp": 0.01070349, + "balance_loss_clip": 1.07383513, + "balance_loss_mlp": 1.04366958, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 2.093483963289726, + "language_loss": 0.71109039, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73418105, + "num_input_tokens_seen": 16686875, + "step": 777, + "time_per_iteration": 2.647341728210449 + }, + { + "auxiliary_loss_clip": 0.01243771, + "auxiliary_loss_mlp": 0.01079351, + "balance_loss_clip": 1.07673287, + "balance_loss_mlp": 1.05002534, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 1.9743188650380956, + "language_loss": 0.76996791, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79319906, + "num_input_tokens_seen": 16706420, + "step": 778, + "time_per_iteration": 2.646841526031494 + }, + { + "auxiliary_loss_clip": 0.01241011, + "auxiliary_loss_mlp": 0.0106939, + "balance_loss_clip": 1.07557082, + "balance_loss_mlp": 1.04001689, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.005052451336899, + "language_loss": 0.79054034, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81364441, + "num_input_tokens_seen": 16726390, + "step": 779, + "time_per_iteration": 2.565671443939209 + }, + { + "auxiliary_loss_clip": 0.01231719, + "auxiliary_loss_mlp": 0.01067857, + "balance_loss_clip": 1.07208145, + "balance_loss_mlp": 1.04123771, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.6658308374977617, + "language_loss": 0.77407861, + "learning_rate": 3.997006537990308e-06, + "loss": 0.79707432, + "num_input_tokens_seen": 16748965, + "step": 780, + "time_per_iteration": 2.643284320831299 + }, + { + "auxiliary_loss_clip": 0.01238833, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_clip": 1.0750674, + "balance_loss_mlp": 1.04891324, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 1.7802463794733479, + "language_loss": 0.76502752, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78816861, + "num_input_tokens_seen": 16768620, + "step": 781, + "time_per_iteration": 2.5727956295013428 + }, + { + "auxiliary_loss_clip": 0.01249443, + "auxiliary_loss_mlp": 0.01073532, + "balance_loss_clip": 1.07888877, + "balance_loss_mlp": 1.04477882, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.67820962447853, + "language_loss": 0.73549056, + "learning_rate": 3.99696378561201e-06, + "loss": 0.75872028, + "num_input_tokens_seen": 16789755, + "step": 782, + "time_per_iteration": 2.6457245349884033 + }, + { + "auxiliary_loss_clip": 0.01242569, + "auxiliary_loss_mlp": 0.01069719, + "balance_loss_clip": 1.07818317, + "balance_loss_mlp": 1.04343307, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 2.5240838310153864, + "language_loss": 0.80276, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82588285, + "num_input_tokens_seen": 16807585, + "step": 783, + "time_per_iteration": 2.5471854209899902 + }, + { + "auxiliary_loss_clip": 0.01235665, + "auxiliary_loss_mlp": 0.01064256, + "balance_loss_clip": 1.07603633, + "balance_loss_mlp": 1.03834033, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.326109942048762, + "language_loss": 0.81375659, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83675581, + "num_input_tokens_seen": 16827220, + "step": 784, + "time_per_iteration": 2.561842679977417 + }, + { + "auxiliary_loss_clip": 0.01241403, + "auxiliary_loss_mlp": 0.01068954, + "balance_loss_clip": 1.07363915, + "balance_loss_mlp": 1.04285896, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 3.200522746687193, + "language_loss": 0.80228728, + "learning_rate": 3.996899089108607e-06, + "loss": 0.82539082, + "num_input_tokens_seen": 16846230, + "step": 785, + "time_per_iteration": 2.4844813346862793 + }, + { + "auxiliary_loss_clip": 0.01242893, + "auxiliary_loss_mlp": 0.01063234, + "balance_loss_clip": 1.07928646, + "balance_loss_mlp": 1.0381999, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 2.186008494275165, + "language_loss": 0.89448529, + "learning_rate": 3.996877372161152e-06, + "loss": 0.91754657, + "num_input_tokens_seen": 16865325, + "step": 786, + "time_per_iteration": 2.4741830825805664 + }, + { + "auxiliary_loss_clip": 0.01241003, + "auxiliary_loss_mlp": 0.01068337, + "balance_loss_clip": 1.06833756, + "balance_loss_mlp": 1.03915501, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 2.2738306717570125, + "language_loss": 0.76539093, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.78848433, + "num_input_tokens_seen": 16882930, + "step": 787, + "time_per_iteration": 2.448213577270508 + }, + { + "auxiliary_loss_clip": 0.01250633, + "auxiliary_loss_mlp": 0.01068563, + "balance_loss_clip": 1.0820123, + "balance_loss_mlp": 1.0407393, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 2.7142084372379203, + "language_loss": 0.81182063, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83501261, + "num_input_tokens_seen": 16900710, + "step": 788, + "time_per_iteration": 2.5112040042877197 + }, + { + "auxiliary_loss_clip": 0.01242586, + "auxiliary_loss_mlp": 0.01078506, + "balance_loss_clip": 1.07943511, + "balance_loss_mlp": 1.04953814, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 1.877253285300506, + "language_loss": 0.84504247, + "learning_rate": 3.996811766991355e-06, + "loss": 0.86825335, + "num_input_tokens_seen": 16919210, + "step": 789, + "time_per_iteration": 2.568422317504883 + }, + { + "auxiliary_loss_clip": 0.0124699, + "auxiliary_loss_mlp": 0.01071354, + "balance_loss_clip": 1.07941151, + "balance_loss_mlp": 1.04455578, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 2.3772834910979395, + "language_loss": 0.81960428, + "learning_rate": 3.996789747161709e-06, + "loss": 0.8427878, + "num_input_tokens_seen": 16937125, + "step": 790, + "time_per_iteration": 2.533909559249878 + }, + { + "auxiliary_loss_clip": 0.01240714, + "auxiliary_loss_mlp": 0.01064129, + "balance_loss_clip": 1.07476413, + "balance_loss_mlp": 1.03594792, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.057979567772816, + "language_loss": 0.88179904, + "learning_rate": 3.996767651613597e-06, + "loss": 0.9048475, + "num_input_tokens_seen": 16958610, + "step": 791, + "time_per_iteration": 2.70927357673645 + }, + { + "auxiliary_loss_clip": 0.01242579, + "auxiliary_loss_mlp": 0.01063953, + "balance_loss_clip": 1.07735348, + "balance_loss_mlp": 1.0357604, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.6759705848844186, + "language_loss": 0.90337658, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92644191, + "num_input_tokens_seen": 16977300, + "step": 792, + "time_per_iteration": 2.5280330181121826 + }, + { + "auxiliary_loss_clip": 0.01244433, + "auxiliary_loss_mlp": 0.0107635, + "balance_loss_clip": 1.07612813, + "balance_loss_mlp": 1.04950356, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.979640043812563, + "language_loss": 0.73490667, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75811452, + "num_input_tokens_seen": 16994950, + "step": 793, + "time_per_iteration": 2.4967477321624756 + }, + { + "auxiliary_loss_clip": 0.01251229, + "auxiliary_loss_mlp": 0.01067985, + "balance_loss_clip": 1.08046937, + "balance_loss_mlp": 1.03958917, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 1.9734411627117254, + "language_loss": 0.86496353, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88815564, + "num_input_tokens_seen": 17014760, + "step": 794, + "time_per_iteration": 2.541639804840088 + }, + { + "auxiliary_loss_clip": 0.01245251, + "auxiliary_loss_mlp": 0.01078193, + "balance_loss_clip": 1.07593322, + "balance_loss_mlp": 1.04908252, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 35.03331845758941, + "language_loss": 0.70094812, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72418255, + "num_input_tokens_seen": 17032715, + "step": 795, + "time_per_iteration": 2.5224030017852783 + }, + { + "auxiliary_loss_clip": 0.01244542, + "auxiliary_loss_mlp": 0.01076427, + "balance_loss_clip": 1.07734275, + "balance_loss_mlp": 1.04797149, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 1.7880897926234682, + "language_loss": 0.80784398, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83105361, + "num_input_tokens_seen": 17052215, + "step": 796, + "time_per_iteration": 2.4957098960876465 + }, + { + "auxiliary_loss_clip": 0.01245762, + "auxiliary_loss_mlp": 0.01063612, + "balance_loss_clip": 1.07751513, + "balance_loss_mlp": 1.03538287, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 3.0904967361896576, + "language_loss": 0.81703985, + "learning_rate": 3.996633488284228e-06, + "loss": 0.84013361, + "num_input_tokens_seen": 17069225, + "step": 797, + "time_per_iteration": 2.451547861099243 + }, + { + "auxiliary_loss_clip": 0.01116969, + "auxiliary_loss_mlp": 0.01009572, + "balance_loss_clip": 1.0456568, + "balance_loss_mlp": 1.0050658, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9161351180304066, + "language_loss": 0.64460701, + "learning_rate": 3.996610862730465e-06, + "loss": 0.6658724, + "num_input_tokens_seen": 17126680, + "step": 798, + "time_per_iteration": 3.0242950916290283 + }, + { + "auxiliary_loss_clip": 0.01248157, + "auxiliary_loss_mlp": 0.01071013, + "balance_loss_clip": 1.07410622, + "balance_loss_mlp": 1.0438807, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 1.851269406555387, + "language_loss": 0.91196877, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93516046, + "num_input_tokens_seen": 17144835, + "step": 799, + "time_per_iteration": 2.5169241428375244 + }, + { + "auxiliary_loss_clip": 0.01245512, + "auxiliary_loss_mlp": 0.01075066, + "balance_loss_clip": 1.07970643, + "balance_loss_mlp": 1.04621708, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.2827408794087605, + "language_loss": 0.86596709, + "learning_rate": 3.996565384488748e-06, + "loss": 0.88917285, + "num_input_tokens_seen": 17165030, + "step": 800, + "time_per_iteration": 2.4946484565734863 + }, + { + "auxiliary_loss_clip": 0.01246489, + "auxiliary_loss_mlp": 0.01075286, + "balance_loss_clip": 1.07660556, + "balance_loss_mlp": 1.04871416, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 3.458195410231451, + "language_loss": 0.83910966, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86232746, + "num_input_tokens_seen": 17184895, + "step": 801, + "time_per_iteration": 2.5180957317352295 + }, + { + "auxiliary_loss_clip": 0.012468, + "auxiliary_loss_mlp": 0.01072515, + "balance_loss_clip": 1.07785225, + "balance_loss_mlp": 1.04501331, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 2.3504046974825106, + "language_loss": 0.79904783, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82224095, + "num_input_tokens_seen": 17208225, + "step": 802, + "time_per_iteration": 2.664400815963745 + }, + { + "auxiliary_loss_clip": 0.01244724, + "auxiliary_loss_mlp": 0.01069605, + "balance_loss_clip": 1.07769346, + "balance_loss_mlp": 1.04253221, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 1.9975675206562173, + "language_loss": 0.86513817, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88828146, + "num_input_tokens_seen": 17226305, + "step": 803, + "time_per_iteration": 3.847806215286255 + }, + { + "auxiliary_loss_clip": 0.01239669, + "auxiliary_loss_mlp": 0.01064092, + "balance_loss_clip": 1.07638907, + "balance_loss_mlp": 1.03712749, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 3.3957759857752925, + "language_loss": 0.85617793, + "learning_rate": 3.996473519492753e-06, + "loss": 0.8792156, + "num_input_tokens_seen": 17244545, + "step": 804, + "time_per_iteration": 3.9010961055755615 + }, + { + "auxiliary_loss_clip": 0.01243587, + "auxiliary_loss_mlp": 0.01073093, + "balance_loss_clip": 1.07667589, + "balance_loss_mlp": 1.04588962, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 3.9632404868175786, + "language_loss": 0.86168587, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88485265, + "num_input_tokens_seen": 17265730, + "step": 805, + "time_per_iteration": 2.493422508239746 + }, + { + "auxiliary_loss_clip": 0.01237, + "auxiliary_loss_mlp": 0.01068786, + "balance_loss_clip": 1.07276845, + "balance_loss_mlp": 1.04251266, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 2.0428492173876864, + "language_loss": 0.68074137, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70379931, + "num_input_tokens_seen": 17284820, + "step": 806, + "time_per_iteration": 2.5187482833862305 + }, + { + "auxiliary_loss_clip": 0.01238216, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_clip": 1.07449675, + "balance_loss_mlp": 1.03463221, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 1.9891827639438235, + "language_loss": 0.77223712, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79524237, + "num_input_tokens_seen": 17305085, + "step": 807, + "time_per_iteration": 2.5002756118774414 + }, + { + "auxiliary_loss_clip": 0.01233753, + "auxiliary_loss_mlp": 0.01073023, + "balance_loss_clip": 1.07018268, + "balance_loss_mlp": 1.04611778, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.372314994282698, + "language_loss": 0.86724633, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.89031404, + "num_input_tokens_seen": 17322715, + "step": 808, + "time_per_iteration": 3.8439013957977295 + }, + { + "auxiliary_loss_clip": 0.01244054, + "auxiliary_loss_mlp": 0.01069863, + "balance_loss_clip": 1.07447898, + "balance_loss_mlp": 1.04224205, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 1.9519769957507593, + "language_loss": 0.89985967, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92299885, + "num_input_tokens_seen": 17341455, + "step": 809, + "time_per_iteration": 2.5018386840820312 + }, + { + "auxiliary_loss_clip": 0.01243507, + "auxiliary_loss_mlp": 0.01073199, + "balance_loss_clip": 1.07807541, + "balance_loss_mlp": 1.04556584, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.0426929709655415, + "language_loss": 0.84630668, + "learning_rate": 3.996333450822208e-06, + "loss": 0.8694737, + "num_input_tokens_seen": 17360765, + "step": 810, + "time_per_iteration": 3.883864641189575 + }, + { + "auxiliary_loss_clip": 0.01247299, + "auxiliary_loss_mlp": 0.01069101, + "balance_loss_clip": 1.07698536, + "balance_loss_mlp": 1.04195702, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.6850177970647835, + "language_loss": 0.80617166, + "learning_rate": 3.99630984108452e-06, + "loss": 0.82933569, + "num_input_tokens_seen": 17380625, + "step": 811, + "time_per_iteration": 2.5062718391418457 + }, + { + "auxiliary_loss_clip": 0.01234437, + "auxiliary_loss_mlp": 0.01075688, + "balance_loss_clip": 1.07247007, + "balance_loss_mlp": 1.04890203, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 1.8283325809864828, + "language_loss": 0.74756324, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.77066457, + "num_input_tokens_seen": 17399355, + "step": 812, + "time_per_iteration": 2.5213494300842285 + }, + { + "auxiliary_loss_clip": 0.01239913, + "auxiliary_loss_mlp": 0.0108147, + "balance_loss_clip": 1.07801151, + "balance_loss_mlp": 1.05531573, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 2.133409844960452, + "language_loss": 0.90314323, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92635709, + "num_input_tokens_seen": 17418240, + "step": 813, + "time_per_iteration": 2.5120720863342285 + }, + { + "auxiliary_loss_clip": 0.01235485, + "auxiliary_loss_mlp": 0.01060716, + "balance_loss_clip": 1.07429349, + "balance_loss_mlp": 1.03528905, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.1912719187944285, + "language_loss": 0.75002581, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.77298784, + "num_input_tokens_seen": 17436250, + "step": 814, + "time_per_iteration": 2.479597568511963 + }, + { + "auxiliary_loss_clip": 0.01238339, + "auxiliary_loss_mlp": 0.01073285, + "balance_loss_clip": 1.07385266, + "balance_loss_mlp": 1.04597437, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 1.9812739330943436, + "language_loss": 0.83839464, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.86151087, + "num_input_tokens_seen": 17455750, + "step": 815, + "time_per_iteration": 2.545210599899292 + }, + { + "auxiliary_loss_clip": 0.01242332, + "auxiliary_loss_mlp": 0.01066371, + "balance_loss_clip": 1.07592249, + "balance_loss_mlp": 1.03902447, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.3654767481053676, + "language_loss": 0.90904683, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93213379, + "num_input_tokens_seen": 17474995, + "step": 816, + "time_per_iteration": 2.5946240425109863 + }, + { + "auxiliary_loss_clip": 0.01243095, + "auxiliary_loss_mlp": 0.01063479, + "balance_loss_clip": 1.07603812, + "balance_loss_mlp": 1.0364306, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 2.4563360718106364, + "language_loss": 0.80206913, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82513487, + "num_input_tokens_seen": 17493395, + "step": 817, + "time_per_iteration": 2.542229413986206 + }, + { + "auxiliary_loss_clip": 0.0123988, + "auxiliary_loss_mlp": 0.01076566, + "balance_loss_clip": 1.07686591, + "balance_loss_mlp": 1.05004203, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.8170935039504736, + "language_loss": 0.84975958, + "learning_rate": 3.996142453363656e-06, + "loss": 0.87292403, + "num_input_tokens_seen": 17514565, + "step": 818, + "time_per_iteration": 2.508615255355835 + }, + { + "auxiliary_loss_clip": 0.01245111, + "auxiliary_loss_mlp": 0.01068236, + "balance_loss_clip": 1.07409656, + "balance_loss_mlp": 1.04022145, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 3.2968567874784522, + "language_loss": 0.75655162, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77968508, + "num_input_tokens_seen": 17534590, + "step": 819, + "time_per_iteration": 2.5291829109191895 + }, + { + "auxiliary_loss_clip": 0.01241697, + "auxiliary_loss_mlp": 0.01062928, + "balance_loss_clip": 1.07796013, + "balance_loss_mlp": 1.03834665, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.2047996342106155, + "language_loss": 0.85105658, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87410283, + "num_input_tokens_seen": 17551900, + "step": 820, + "time_per_iteration": 2.456725597381592 + }, + { + "auxiliary_loss_clip": 0.01240709, + "auxiliary_loss_mlp": 0.01065732, + "balance_loss_clip": 1.07420337, + "balance_loss_mlp": 1.03892159, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.9313174443054077, + "language_loss": 0.90514338, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92820781, + "num_input_tokens_seen": 17571485, + "step": 821, + "time_per_iteration": 2.552177667617798 + }, + { + "auxiliary_loss_clip": 0.01237725, + "auxiliary_loss_mlp": 0.01080742, + "balance_loss_clip": 1.07364774, + "balance_loss_mlp": 1.05432534, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 1.9457102468323255, + "language_loss": 0.89642507, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91960973, + "num_input_tokens_seen": 17591410, + "step": 822, + "time_per_iteration": 2.4803619384765625 + }, + { + "auxiliary_loss_clip": 0.0124025, + "auxiliary_loss_mlp": 0.01063365, + "balance_loss_clip": 1.07704973, + "balance_loss_mlp": 1.03444457, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 1.6721570864429145, + "language_loss": 0.67191553, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69495165, + "num_input_tokens_seen": 17612010, + "step": 823, + "time_per_iteration": 2.5521817207336426 + }, + { + "auxiliary_loss_clip": 0.01113993, + "auxiliary_loss_mlp": 0.01005757, + "balance_loss_clip": 1.04316378, + "balance_loss_mlp": 1.00120342, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3597094069857931, + "language_loss": 0.62263799, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64383548, + "num_input_tokens_seen": 17673430, + "step": 824, + "time_per_iteration": 3.1534554958343506 + }, + { + "auxiliary_loss_clip": 0.01242674, + "auxiliary_loss_mlp": 0.01072878, + "balance_loss_clip": 1.07462335, + "balance_loss_mlp": 1.04458928, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 1.8949680508776576, + "language_loss": 0.90363204, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92678756, + "num_input_tokens_seen": 17689545, + "step": 825, + "time_per_iteration": 2.4887239933013916 + }, + { + "auxiliary_loss_clip": 0.01240682, + "auxiliary_loss_mlp": 0.01068896, + "balance_loss_clip": 1.07500732, + "balance_loss_mlp": 1.04081035, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 3.4454821184335183, + "language_loss": 0.66797441, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.6910702, + "num_input_tokens_seen": 17705965, + "step": 826, + "time_per_iteration": 2.474905490875244 + }, + { + "auxiliary_loss_clip": 0.0124177, + "auxiliary_loss_mlp": 0.01067724, + "balance_loss_clip": 1.07563317, + "balance_loss_mlp": 1.04032993, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 2.6053337035041992, + "language_loss": 0.78375435, + "learning_rate": 3.995921790662459e-06, + "loss": 0.8068493, + "num_input_tokens_seen": 17724580, + "step": 827, + "time_per_iteration": 2.5347914695739746 + }, + { + "auxiliary_loss_clip": 0.01242948, + "auxiliary_loss_mlp": 0.01079771, + "balance_loss_clip": 1.07559633, + "balance_loss_mlp": 1.05137515, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 2.9075729392404845, + "language_loss": 0.78656793, + "learning_rate": 3.995896894144294e-06, + "loss": 0.80979514, + "num_input_tokens_seen": 17747755, + "step": 828, + "time_per_iteration": 2.669144868850708 + }, + { + "auxiliary_loss_clip": 0.01234163, + "auxiliary_loss_mlp": 0.01059369, + "balance_loss_clip": 1.07293701, + "balance_loss_mlp": 1.03341699, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 1.801032769760919, + "language_loss": 0.83437985, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85731518, + "num_input_tokens_seen": 17768550, + "step": 829, + "time_per_iteration": 2.541841745376587 + }, + { + "auxiliary_loss_clip": 0.01238873, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_clip": 1.07341337, + "balance_loss_mlp": 1.05266857, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.379943050940154, + "language_loss": 0.75488675, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77810264, + "num_input_tokens_seen": 17786080, + "step": 830, + "time_per_iteration": 2.5370733737945557 + }, + { + "auxiliary_loss_clip": 0.01240261, + "auxiliary_loss_mlp": 0.0107173, + "balance_loss_clip": 1.07410598, + "balance_loss_mlp": 1.04359674, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.986546773729812, + "language_loss": 0.79863107, + "learning_rate": 3.995821750485929e-06, + "loss": 0.821751, + "num_input_tokens_seen": 17803635, + "step": 831, + "time_per_iteration": 2.4857072830200195 + }, + { + "auxiliary_loss_clip": 0.01196039, + "auxiliary_loss_mlp": 0.01076753, + "balance_loss_clip": 1.08537984, + "balance_loss_mlp": 1.05020475, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.2083579232609147, + "language_loss": 0.91245604, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93518388, + "num_input_tokens_seen": 17822190, + "step": 832, + "time_per_iteration": 2.6722090244293213 + }, + { + "auxiliary_loss_clip": 0.01208011, + "auxiliary_loss_mlp": 0.01079697, + "balance_loss_clip": 1.072855, + "balance_loss_mlp": 1.05417418, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.9986415376887188, + "language_loss": 0.83324468, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85612178, + "num_input_tokens_seen": 17846915, + "step": 833, + "time_per_iteration": 2.7380354404449463 + }, + { + "auxiliary_loss_clip": 0.01212716, + "auxiliary_loss_mlp": 0.01062455, + "balance_loss_clip": 1.0718925, + "balance_loss_mlp": 1.03475058, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.1953223908704627, + "language_loss": 0.8219378, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84468949, + "num_input_tokens_seen": 17867270, + "step": 834, + "time_per_iteration": 2.7344141006469727 + }, + { + "auxiliary_loss_clip": 0.01229672, + "auxiliary_loss_mlp": 0.01065577, + "balance_loss_clip": 1.07608032, + "balance_loss_mlp": 1.03726506, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.126276246460614, + "language_loss": 0.91635203, + "learning_rate": 3.995720499401282e-06, + "loss": 0.93930453, + "num_input_tokens_seen": 17884880, + "step": 835, + "time_per_iteration": 2.5124807357788086 + }, + { + "auxiliary_loss_clip": 0.01241332, + "auxiliary_loss_mlp": 0.01066821, + "balance_loss_clip": 1.07261693, + "balance_loss_mlp": 1.03865206, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 1.9954982039182585, + "language_loss": 0.76064563, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78372711, + "num_input_tokens_seen": 17903695, + "step": 836, + "time_per_iteration": 2.4958484172821045 + }, + { + "auxiliary_loss_clip": 0.01223456, + "auxiliary_loss_mlp": 0.01071503, + "balance_loss_clip": 1.07376409, + "balance_loss_mlp": 1.04534841, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.1861736002081855, + "language_loss": 0.83395815, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.85690773, + "num_input_tokens_seen": 17920745, + "step": 837, + "time_per_iteration": 2.535252332687378 + }, + { + "auxiliary_loss_clip": 0.01210587, + "auxiliary_loss_mlp": 0.00827203, + "balance_loss_clip": 1.07563329, + "balance_loss_mlp": 1.07636249, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 3.1550277814872705, + "language_loss": 0.73076892, + "learning_rate": 3.995643766466275e-06, + "loss": 0.75114679, + "num_input_tokens_seen": 17938220, + "step": 838, + "time_per_iteration": 2.5712361335754395 + }, + { + "auxiliary_loss_clip": 0.01203015, + "auxiliary_loss_mlp": 0.01073303, + "balance_loss_clip": 1.06845963, + "balance_loss_mlp": 1.04625463, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.7153880318213135, + "language_loss": 0.83041525, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85317838, + "num_input_tokens_seen": 17957325, + "step": 839, + "time_per_iteration": 2.510937452316284 + }, + { + "auxiliary_loss_clip": 0.01234295, + "auxiliary_loss_mlp": 0.01070547, + "balance_loss_clip": 1.07371879, + "balance_loss_mlp": 1.04411781, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.0404168296533074, + "language_loss": 0.85622895, + "learning_rate": 3.995592232799595e-06, + "loss": 0.87927735, + "num_input_tokens_seen": 17975875, + "step": 840, + "time_per_iteration": 2.508007764816284 + }, + { + "auxiliary_loss_clip": 0.01215873, + "auxiliary_loss_mlp": 0.01062745, + "balance_loss_clip": 1.08593535, + "balance_loss_mlp": 1.03384829, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 2.9750553662566777, + "language_loss": 0.94513535, + "learning_rate": 3.99556635245618e-06, + "loss": 0.9679215, + "num_input_tokens_seen": 17994340, + "step": 841, + "time_per_iteration": 2.5414319038391113 + }, + { + "auxiliary_loss_clip": 0.01240704, + "auxiliary_loss_mlp": 0.01077376, + "balance_loss_clip": 1.07565808, + "balance_loss_mlp": 1.0493381, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.523214183916328, + "language_loss": 0.77575576, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79893655, + "num_input_tokens_seen": 18015260, + "step": 842, + "time_per_iteration": 3.9567034244537354 + }, + { + "auxiliary_loss_clip": 0.01227883, + "auxiliary_loss_mlp": 0.01072986, + "balance_loss_clip": 1.07589078, + "balance_loss_mlp": 1.04580653, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.0162273758978047, + "language_loss": 0.78336877, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80637741, + "num_input_tokens_seen": 18033960, + "step": 843, + "time_per_iteration": 3.8950881958007812 + }, + { + "auxiliary_loss_clip": 0.01228292, + "auxiliary_loss_mlp": 0.01066261, + "balance_loss_clip": 1.07687461, + "balance_loss_mlp": 1.04030919, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 1.792246861780705, + "language_loss": 0.83359987, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85654533, + "num_input_tokens_seen": 18056700, + "step": 844, + "time_per_iteration": 2.66231107711792 + }, + { + "auxiliary_loss_clip": 0.01226355, + "auxiliary_loss_mlp": 0.0106437, + "balance_loss_clip": 1.07186198, + "balance_loss_mlp": 1.03758371, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.155546461550287, + "language_loss": 0.76555032, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78845751, + "num_input_tokens_seen": 18075815, + "step": 845, + "time_per_iteration": 2.578859567642212 + }, + { + "auxiliary_loss_clip": 0.01217867, + "auxiliary_loss_mlp": 0.01072954, + "balance_loss_clip": 1.06965065, + "balance_loss_mlp": 1.04511893, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 1.867128996955264, + "language_loss": 0.87666845, + "learning_rate": 3.99543581567769e-06, + "loss": 0.89957666, + "num_input_tokens_seen": 18095095, + "step": 846, + "time_per_iteration": 2.514352321624756 + }, + { + "auxiliary_loss_clip": 0.01218399, + "auxiliary_loss_mlp": 0.01070723, + "balance_loss_clip": 1.07218754, + "balance_loss_mlp": 1.04388928, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.8377556550718166, + "language_loss": 0.87795925, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.90085053, + "num_input_tokens_seen": 18112675, + "step": 847, + "time_per_iteration": 3.9573960304260254 + }, + { + "auxiliary_loss_clip": 0.01184846, + "auxiliary_loss_mlp": 0.01068434, + "balance_loss_clip": 1.07143998, + "balance_loss_mlp": 1.04092073, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.628172156180215, + "language_loss": 0.82061779, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84315062, + "num_input_tokens_seen": 18130745, + "step": 848, + "time_per_iteration": 2.6481239795684814 + }, + { + "auxiliary_loss_clip": 0.01237024, + "auxiliary_loss_mlp": 0.01078139, + "balance_loss_clip": 1.07553422, + "balance_loss_mlp": 1.05095935, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 2.6271318484109183, + "language_loss": 0.87401921, + "learning_rate": 3.995356585597158e-06, + "loss": 0.8971709, + "num_input_tokens_seen": 18152410, + "step": 849, + "time_per_iteration": 3.9041121006011963 + }, + { + "auxiliary_loss_clip": 0.01232527, + "auxiliary_loss_mlp": 0.01065631, + "balance_loss_clip": 1.07164991, + "balance_loss_mlp": 1.03917861, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 2.2562986068911015, + "language_loss": 0.83392179, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85690331, + "num_input_tokens_seen": 18170870, + "step": 850, + "time_per_iteration": 2.4967684745788574 + }, + { + "auxiliary_loss_clip": 0.01222735, + "auxiliary_loss_mlp": 0.01064767, + "balance_loss_clip": 1.07124782, + "balance_loss_mlp": 1.03839827, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.8399936564895736, + "language_loss": 0.6495626, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67243755, + "num_input_tokens_seen": 18191555, + "step": 851, + "time_per_iteration": 2.6590030193328857 + }, + { + "auxiliary_loss_clip": 0.01221832, + "auxiliary_loss_mlp": 0.01079729, + "balance_loss_clip": 1.07160592, + "balance_loss_mlp": 1.05023658, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.2558170343702675, + "language_loss": 0.83235192, + "learning_rate": 3.995276674539547e-06, + "loss": 0.85536754, + "num_input_tokens_seen": 18208620, + "step": 852, + "time_per_iteration": 2.5351204872131348 + }, + { + "auxiliary_loss_clip": 0.01209264, + "auxiliary_loss_mlp": 0.01076837, + "balance_loss_clip": 1.07325673, + "balance_loss_mlp": 1.04915655, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 1.9365288850717792, + "language_loss": 0.80546242, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82832342, + "num_input_tokens_seen": 18226370, + "step": 853, + "time_per_iteration": 2.5126612186431885 + }, + { + "auxiliary_loss_clip": 0.01234795, + "auxiliary_loss_mlp": 0.01072971, + "balance_loss_clip": 1.07312644, + "balance_loss_mlp": 1.04480219, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 1.9945722649075286, + "language_loss": 0.75480604, + "learning_rate": 3.995223022193999e-06, + "loss": 0.77788365, + "num_input_tokens_seen": 18247075, + "step": 854, + "time_per_iteration": 2.521136999130249 + }, + { + "auxiliary_loss_clip": 0.01214391, + "auxiliary_loss_mlp": 0.01071637, + "balance_loss_clip": 1.07310426, + "balance_loss_mlp": 1.04350328, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.1119381876064898, + "language_loss": 0.81608504, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83894527, + "num_input_tokens_seen": 18265680, + "step": 855, + "time_per_iteration": 2.5770468711853027 + }, + { + "auxiliary_loss_clip": 0.01084305, + "auxiliary_loss_mlp": 0.00839029, + "balance_loss_clip": 1.04575634, + "balance_loss_mlp": 1.15224361, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 1.0042976504068604, + "language_loss": 0.65651131, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67574465, + "num_input_tokens_seen": 18327015, + "step": 856, + "time_per_iteration": 3.1591265201568604 + }, + { + "auxiliary_loss_clip": 0.01204665, + "auxiliary_loss_mlp": 0.01060097, + "balance_loss_clip": 1.07042229, + "balance_loss_mlp": 1.03271437, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 2.2446012315865076, + "language_loss": 0.76849931, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79114693, + "num_input_tokens_seen": 18345235, + "step": 857, + "time_per_iteration": 2.6026453971862793 + }, + { + "auxiliary_loss_clip": 0.01195276, + "auxiliary_loss_mlp": 0.01060537, + "balance_loss_clip": 1.06812525, + "balance_loss_mlp": 1.03252316, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 2.376093041208949, + "language_loss": 0.89038754, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91294569, + "num_input_tokens_seen": 18362350, + "step": 858, + "time_per_iteration": 2.5573222637176514 + }, + { + "auxiliary_loss_clip": 0.01207491, + "auxiliary_loss_mlp": 0.01060131, + "balance_loss_clip": 1.07195282, + "balance_loss_mlp": 1.03298724, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 1.7732660427176947, + "language_loss": 0.74813759, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77081382, + "num_input_tokens_seen": 18383390, + "step": 859, + "time_per_iteration": 2.6116816997528076 + }, + { + "auxiliary_loss_clip": 0.01200518, + "auxiliary_loss_mlp": 0.01078996, + "balance_loss_clip": 1.07151818, + "balance_loss_mlp": 1.04926562, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.464442423931956, + "language_loss": 0.91136885, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93416399, + "num_input_tokens_seen": 18399220, + "step": 860, + "time_per_iteration": 2.5147926807403564 + }, + { + "auxiliary_loss_clip": 0.01236353, + "auxiliary_loss_mlp": 0.01063292, + "balance_loss_clip": 1.07511687, + "balance_loss_mlp": 1.03722048, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.888055019229181, + "language_loss": 0.82224113, + "learning_rate": 3.99503285577813e-06, + "loss": 0.84523761, + "num_input_tokens_seen": 18419005, + "step": 861, + "time_per_iteration": 2.5715174674987793 + }, + { + "auxiliary_loss_clip": 0.0120857, + "auxiliary_loss_mlp": 0.01060628, + "balance_loss_clip": 1.07130527, + "balance_loss_mlp": 1.03410411, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 1.8381376592938383, + "language_loss": 0.78692919, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80962121, + "num_input_tokens_seen": 18440550, + "step": 862, + "time_per_iteration": 2.6206045150756836 + }, + { + "auxiliary_loss_clip": 0.01200965, + "auxiliary_loss_mlp": 0.01073814, + "balance_loss_clip": 1.06939781, + "balance_loss_mlp": 1.04799294, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 2.043088376184506, + "language_loss": 0.89069796, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91344577, + "num_input_tokens_seen": 18461950, + "step": 863, + "time_per_iteration": 2.6068830490112305 + }, + { + "auxiliary_loss_clip": 0.0121325, + "auxiliary_loss_mlp": 0.01067477, + "balance_loss_clip": 1.07461524, + "balance_loss_mlp": 1.03905749, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 3.125802667825891, + "language_loss": 0.76103586, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78384316, + "num_input_tokens_seen": 18480555, + "step": 864, + "time_per_iteration": 2.5935940742492676 + }, + { + "auxiliary_loss_clip": 0.01185308, + "auxiliary_loss_mlp": 0.0106913, + "balance_loss_clip": 1.05829477, + "balance_loss_mlp": 1.0409255, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 2.0739711194129105, + "language_loss": 0.78992319, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81246763, + "num_input_tokens_seen": 18499645, + "step": 865, + "time_per_iteration": 2.559828996658325 + }, + { + "auxiliary_loss_clip": 0.01220478, + "auxiliary_loss_mlp": 0.01067321, + "balance_loss_clip": 1.07081628, + "balance_loss_mlp": 1.04053497, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.8354941581406194, + "language_loss": 0.85845405, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88133204, + "num_input_tokens_seen": 18516810, + "step": 866, + "time_per_iteration": 2.4799506664276123 + }, + { + "auxiliary_loss_clip": 0.01198853, + "auxiliary_loss_mlp": 0.01064054, + "balance_loss_clip": 1.0777576, + "balance_loss_mlp": 1.03785205, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.5149075774074072, + "language_loss": 0.87631929, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89894837, + "num_input_tokens_seen": 18532510, + "step": 867, + "time_per_iteration": 2.5681934356689453 + }, + { + "auxiliary_loss_clip": 0.01168289, + "auxiliary_loss_mlp": 0.01070839, + "balance_loss_clip": 1.06075227, + "balance_loss_mlp": 1.04626989, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.4800476005793581, + "language_loss": 0.63637638, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65876764, + "num_input_tokens_seen": 18557380, + "step": 868, + "time_per_iteration": 2.6763460636138916 + }, + { + "auxiliary_loss_clip": 0.01235884, + "auxiliary_loss_mlp": 0.01067624, + "balance_loss_clip": 1.073071, + "balance_loss_mlp": 1.03888226, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 1.9793633351175597, + "language_loss": 0.83025193, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85328698, + "num_input_tokens_seen": 18575720, + "step": 869, + "time_per_iteration": 2.515263557434082 + }, + { + "auxiliary_loss_clip": 0.01223619, + "auxiliary_loss_mlp": 0.01058445, + "balance_loss_clip": 1.07171845, + "balance_loss_mlp": 1.03206372, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 1.9993669762815234, + "language_loss": 0.87700975, + "learning_rate": 3.994782909218751e-06, + "loss": 0.8998304, + "num_input_tokens_seen": 18592185, + "step": 870, + "time_per_iteration": 2.4927549362182617 + }, + { + "auxiliary_loss_clip": 0.01235293, + "auxiliary_loss_mlp": 0.01065714, + "balance_loss_clip": 1.07340336, + "balance_loss_mlp": 1.03980994, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 2.099647956412021, + "language_loss": 0.81003368, + "learning_rate": 3.994754759152854e-06, + "loss": 0.83304381, + "num_input_tokens_seen": 18609560, + "step": 871, + "time_per_iteration": 2.5126662254333496 + }, + { + "auxiliary_loss_clip": 0.01205697, + "auxiliary_loss_mlp": 0.0106062, + "balance_loss_clip": 1.07587111, + "balance_loss_mlp": 1.03582406, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.9524730775383052, + "language_loss": 0.81130838, + "learning_rate": 3.994726533445656e-06, + "loss": 0.8339715, + "num_input_tokens_seen": 18629405, + "step": 872, + "time_per_iteration": 2.560051918029785 + }, + { + "auxiliary_loss_clip": 0.01096864, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.05232286, + "balance_loss_mlp": 1.02519214, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.9019198494168122, + "language_loss": 0.61693221, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63820398, + "num_input_tokens_seen": 18681480, + "step": 873, + "time_per_iteration": 3.06003475189209 + }, + { + "auxiliary_loss_clip": 0.01206362, + "auxiliary_loss_mlp": 0.01055572, + "balance_loss_clip": 1.07213938, + "balance_loss_mlp": 1.03020477, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.93122624115511, + "language_loss": 0.88771755, + "learning_rate": 3.994669855111643e-06, + "loss": 0.91033685, + "num_input_tokens_seen": 18700390, + "step": 874, + "time_per_iteration": 2.576326608657837 + }, + { + "auxiliary_loss_clip": 0.01205856, + "auxiliary_loss_mlp": 0.01063366, + "balance_loss_clip": 1.07055449, + "balance_loss_mlp": 1.03733051, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 1.883844190560903, + "language_loss": 0.74723351, + "learning_rate": 3.994641402486977e-06, + "loss": 0.76992565, + "num_input_tokens_seen": 18721280, + "step": 875, + "time_per_iteration": 2.64450740814209 + }, + { + "auxiliary_loss_clip": 0.01214965, + "auxiliary_loss_mlp": 0.01056066, + "balance_loss_clip": 1.07321906, + "balance_loss_mlp": 1.02922034, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 1.724593397416063, + "language_loss": 0.93172318, + "learning_rate": 3.99461287422531e-06, + "loss": 0.95443344, + "num_input_tokens_seen": 18741545, + "step": 876, + "time_per_iteration": 2.552605390548706 + }, + { + "auxiliary_loss_clip": 0.01126476, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.04974389, + "balance_loss_mlp": 1.02594244, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.8173137274975907, + "language_loss": 0.62917572, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65074402, + "num_input_tokens_seen": 18801400, + "step": 877, + "time_per_iteration": 3.1108858585357666 + }, + { + "auxiliary_loss_clip": 0.01210986, + "auxiliary_loss_mlp": 0.01072648, + "balance_loss_clip": 1.07201231, + "balance_loss_mlp": 1.04484797, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.1294198440006737, + "language_loss": 0.85420358, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87703985, + "num_input_tokens_seen": 18819670, + "step": 878, + "time_per_iteration": 2.573084592819214 + }, + { + "auxiliary_loss_clip": 0.01232923, + "auxiliary_loss_mlp": 0.01063167, + "balance_loss_clip": 1.07050157, + "balance_loss_mlp": 1.03752494, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 1.9502374454425166, + "language_loss": 0.83035666, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.8533175, + "num_input_tokens_seen": 18840580, + "step": 879, + "time_per_iteration": 2.522031545639038 + }, + { + "auxiliary_loss_clip": 0.01200795, + "auxiliary_loss_mlp": 0.0106825, + "balance_loss_clip": 1.07149363, + "balance_loss_mlp": 1.04122543, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 2.071213064451731, + "language_loss": 0.84218007, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86487055, + "num_input_tokens_seen": 18859295, + "step": 880, + "time_per_iteration": 3.934206247329712 + }, + { + "auxiliary_loss_clip": 0.01192925, + "auxiliary_loss_mlp": 0.0106315, + "balance_loss_clip": 1.07080853, + "balance_loss_mlp": 1.03706717, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 2.1498725176777214, + "language_loss": 0.87145591, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89401662, + "num_input_tokens_seen": 18877485, + "step": 881, + "time_per_iteration": 2.5754125118255615 + }, + { + "auxiliary_loss_clip": 0.01218282, + "auxiliary_loss_mlp": 0.01067732, + "balance_loss_clip": 1.070889, + "balance_loss_mlp": 1.04021811, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.9092364171747176, + "language_loss": 0.87847406, + "learning_rate": 3.994440116339046e-06, + "loss": 0.90133423, + "num_input_tokens_seen": 18898275, + "step": 882, + "time_per_iteration": 3.947329044342041 + }, + { + "auxiliary_loss_clip": 0.01233567, + "auxiliary_loss_mlp": 0.01056917, + "balance_loss_clip": 1.07226801, + "balance_loss_mlp": 1.02988076, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 3.021160918032195, + "language_loss": 0.69564617, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71855098, + "num_input_tokens_seen": 18920665, + "step": 883, + "time_per_iteration": 2.6199071407318115 + }, + { + "auxiliary_loss_clip": 0.01170083, + "auxiliary_loss_mlp": 0.01066639, + "balance_loss_clip": 1.06000412, + "balance_loss_mlp": 1.04105711, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 1.815307914756506, + "language_loss": 0.75968301, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78205025, + "num_input_tokens_seen": 18939835, + "step": 884, + "time_per_iteration": 2.5956761837005615 + }, + { + "auxiliary_loss_clip": 0.01182458, + "auxiliary_loss_mlp": 0.01069316, + "balance_loss_clip": 1.07529938, + "balance_loss_mlp": 1.04336381, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.0766826139352164, + "language_loss": 0.85653013, + "learning_rate": 3.994352716384659e-06, + "loss": 0.87904787, + "num_input_tokens_seen": 18958405, + "step": 885, + "time_per_iteration": 2.6230082511901855 + }, + { + "auxiliary_loss_clip": 0.01185331, + "auxiliary_loss_mlp": 0.01069384, + "balance_loss_clip": 1.05735052, + "balance_loss_mlp": 1.04251468, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 3.4819633081118546, + "language_loss": 0.85972977, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88227689, + "num_input_tokens_seen": 18975445, + "step": 886, + "time_per_iteration": 3.9707014560699463 + }, + { + "auxiliary_loss_clip": 0.01187475, + "auxiliary_loss_mlp": 0.01065515, + "balance_loss_clip": 1.07113957, + "balance_loss_mlp": 1.03824019, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.1228156076950615, + "language_loss": 0.89277673, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91530657, + "num_input_tokens_seen": 18991930, + "step": 887, + "time_per_iteration": 2.5956554412841797 + }, + { + "auxiliary_loss_clip": 0.01156326, + "auxiliary_loss_mlp": 0.01075968, + "balance_loss_clip": 1.06784678, + "balance_loss_mlp": 1.04721451, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 1.98232819460545, + "language_loss": 0.75045687, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77277982, + "num_input_tokens_seen": 19009790, + "step": 888, + "time_per_iteration": 4.084397554397583 + }, + { + "auxiliary_loss_clip": 0.0117827, + "auxiliary_loss_mlp": 0.01075751, + "balance_loss_clip": 1.07159567, + "balance_loss_mlp": 1.04752207, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 1.9056828682042726, + "language_loss": 0.88298452, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90552473, + "num_input_tokens_seen": 19030170, + "step": 889, + "time_per_iteration": 2.637068033218384 + }, + { + "auxiliary_loss_clip": 0.01227286, + "auxiliary_loss_mlp": 0.01047467, + "balance_loss_clip": 1.07140684, + "balance_loss_mlp": 1.02270699, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.7807413465836133, + "language_loss": 0.89102387, + "learning_rate": 3.994205537287791e-06, + "loss": 0.91377139, + "num_input_tokens_seen": 19048075, + "step": 890, + "time_per_iteration": 2.497800588607788 + }, + { + "auxiliary_loss_clip": 0.01206129, + "auxiliary_loss_mlp": 0.01065784, + "balance_loss_clip": 1.0695436, + "balance_loss_mlp": 1.04133403, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.4296235003731823, + "language_loss": 0.93274426, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95546335, + "num_input_tokens_seen": 19067465, + "step": 891, + "time_per_iteration": 2.5771398544311523 + }, + { + "auxiliary_loss_clip": 0.01205061, + "auxiliary_loss_mlp": 0.01065856, + "balance_loss_clip": 1.07170367, + "balance_loss_mlp": 1.03747201, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.1834680020691946, + "language_loss": 0.71693456, + "learning_rate": 3.994146136297893e-06, + "loss": 0.73964375, + "num_input_tokens_seen": 19085505, + "step": 892, + "time_per_iteration": 2.536660671234131 + }, + { + "auxiliary_loss_clip": 0.012045, + "auxiliary_loss_mlp": 0.0112842, + "balance_loss_clip": 1.06974065, + "balance_loss_mlp": 1.6300416, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 1.9794291813656961, + "language_loss": 0.8236239, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84695309, + "num_input_tokens_seen": 19104360, + "step": 893, + "time_per_iteration": 2.602376937866211 + }, + { + "auxiliary_loss_clip": 0.01205823, + "auxiliary_loss_mlp": 0.01067895, + "balance_loss_clip": 1.07102883, + "balance_loss_mlp": 1.04189563, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 1.9212043746376017, + "language_loss": 0.81746852, + "learning_rate": 3.994086432835114e-06, + "loss": 0.84020567, + "num_input_tokens_seen": 19124680, + "step": 894, + "time_per_iteration": 2.631514310836792 + }, + { + "auxiliary_loss_clip": 0.0121284, + "auxiliary_loss_mlp": 0.01062131, + "balance_loss_clip": 1.06932735, + "balance_loss_mlp": 1.03633404, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.7376740373639628, + "language_loss": 0.75101012, + "learning_rate": 3.994056467679221e-06, + "loss": 0.77375978, + "num_input_tokens_seen": 19142895, + "step": 895, + "time_per_iteration": 2.5075786113739014 + }, + { + "auxiliary_loss_clip": 0.01213128, + "auxiliary_loss_mlp": 0.01061693, + "balance_loss_clip": 1.07725394, + "balance_loss_mlp": 1.03589654, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.111736705586089, + "language_loss": 0.86780262, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.89055085, + "num_input_tokens_seen": 19163125, + "step": 896, + "time_per_iteration": 2.603149890899658 + }, + { + "auxiliary_loss_clip": 0.01233952, + "auxiliary_loss_mlp": 0.01068852, + "balance_loss_clip": 1.07206035, + "balance_loss_mlp": 1.52363825, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.223050599621852, + "language_loss": 0.87642455, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.89945269, + "num_input_tokens_seen": 19179385, + "step": 897, + "time_per_iteration": 2.4829087257385254 + }, + { + "auxiliary_loss_clip": 0.01213993, + "auxiliary_loss_mlp": 0.01063299, + "balance_loss_clip": 1.07268429, + "balance_loss_mlp": 1.03578568, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.7753305616385673, + "language_loss": 0.90397179, + "learning_rate": 3.993966118527175e-06, + "loss": 0.9267447, + "num_input_tokens_seen": 19198725, + "step": 898, + "time_per_iteration": 2.5142648220062256 + }, + { + "auxiliary_loss_clip": 0.01212827, + "auxiliary_loss_mlp": 0.01080085, + "balance_loss_clip": 1.07265902, + "balance_loss_mlp": 1.054479, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 3.0103010763471603, + "language_loss": 0.92109048, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94401962, + "num_input_tokens_seen": 19212380, + "step": 899, + "time_per_iteration": 2.5852553844451904 + }, + { + "auxiliary_loss_clip": 0.01197865, + "auxiliary_loss_mlp": 0.01072378, + "balance_loss_clip": 1.06803334, + "balance_loss_mlp": 1.04436338, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.1574361704972804, + "language_loss": 0.75842959, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.78113198, + "num_input_tokens_seen": 19232235, + "step": 900, + "time_per_iteration": 2.583963394165039 + }, + { + "auxiliary_loss_clip": 0.01221599, + "auxiliary_loss_mlp": 0.0105628, + "balance_loss_clip": 1.06833267, + "balance_loss_mlp": 1.03127027, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.1765975908191035, + "language_loss": 0.73895329, + "learning_rate": 3.993875088872592e-06, + "loss": 0.7617321, + "num_input_tokens_seen": 19251460, + "step": 901, + "time_per_iteration": 2.5631070137023926 + }, + { + "auxiliary_loss_clip": 0.01189995, + "auxiliary_loss_mlp": 0.01065611, + "balance_loss_clip": 1.0692575, + "balance_loss_mlp": 1.04074383, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.7497067550315157, + "language_loss": 0.84882116, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87137723, + "num_input_tokens_seen": 19269060, + "step": 902, + "time_per_iteration": 2.5651910305023193 + }, + { + "auxiliary_loss_clip": 0.01179484, + "auxiliary_loss_mlp": 0.01065627, + "balance_loss_clip": 1.07582521, + "balance_loss_mlp": 1.03875756, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 1.7756390456982243, + "language_loss": 0.86547661, + "learning_rate": 3.993814024394569e-06, + "loss": 0.88792765, + "num_input_tokens_seen": 19288620, + "step": 903, + "time_per_iteration": 2.6966285705566406 + }, + { + "auxiliary_loss_clip": 0.01218427, + "auxiliary_loss_mlp": 0.01058187, + "balance_loss_clip": 1.07121587, + "balance_loss_mlp": 1.03333211, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.248947337203656, + "language_loss": 0.74946427, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77223045, + "num_input_tokens_seen": 19306615, + "step": 904, + "time_per_iteration": 2.534850597381592 + }, + { + "auxiliary_loss_clip": 0.01216298, + "auxiliary_loss_mlp": 0.01071087, + "balance_loss_clip": 1.06919765, + "balance_loss_mlp": 1.04627907, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.217768241182223, + "language_loss": 0.85702819, + "learning_rate": 3.993752657494039e-06, + "loss": 0.87990201, + "num_input_tokens_seen": 19321680, + "step": 905, + "time_per_iteration": 2.557027816772461 + }, + { + "auxiliary_loss_clip": 0.01207649, + "auxiliary_loss_mlp": 0.01070205, + "balance_loss_clip": 1.07417011, + "balance_loss_mlp": 1.04515886, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.8010172629073642, + "language_loss": 0.74334836, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76612693, + "num_input_tokens_seen": 19339760, + "step": 906, + "time_per_iteration": 2.5589680671691895 + }, + { + "auxiliary_loss_clip": 0.01203725, + "auxiliary_loss_mlp": 0.01064354, + "balance_loss_clip": 1.07185113, + "balance_loss_mlp": 1.03877187, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.0601340770347467, + "language_loss": 0.87160665, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89428747, + "num_input_tokens_seen": 19359585, + "step": 907, + "time_per_iteration": 2.6042590141296387 + }, + { + "auxiliary_loss_clip": 0.01215902, + "auxiliary_loss_mlp": 0.01072223, + "balance_loss_clip": 1.07247162, + "balance_loss_mlp": 1.04599679, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.9147250809076852, + "language_loss": 0.87107456, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89395589, + "num_input_tokens_seen": 19378590, + "step": 908, + "time_per_iteration": 2.5289340019226074 + }, + { + "auxiliary_loss_clip": 0.0120781, + "auxiliary_loss_mlp": 0.01067651, + "balance_loss_clip": 1.06887841, + "balance_loss_mlp": 1.04075694, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.5870235586430548, + "language_loss": 0.89760447, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.92035908, + "num_input_tokens_seen": 19397910, + "step": 909, + "time_per_iteration": 2.586700916290283 + }, + { + "auxiliary_loss_clip": 0.01209808, + "auxiliary_loss_mlp": 0.01074114, + "balance_loss_clip": 1.07129347, + "balance_loss_mlp": 1.04730344, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.618369487881624, + "language_loss": 0.70765096, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73049021, + "num_input_tokens_seen": 19415950, + "step": 910, + "time_per_iteration": 2.53511905670166 + }, + { + "auxiliary_loss_clip": 0.01200062, + "auxiliary_loss_mlp": 0.01053318, + "balance_loss_clip": 1.06863356, + "balance_loss_mlp": 1.02862978, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 1.8868595352342914, + "language_loss": 0.83306777, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85560155, + "num_input_tokens_seen": 19435275, + "step": 911, + "time_per_iteration": 2.587833881378174 + }, + { + "auxiliary_loss_clip": 0.01200872, + "auxiliary_loss_mlp": 0.01070358, + "balance_loss_clip": 1.06841397, + "balance_loss_mlp": 1.04303503, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.4222115748246744, + "language_loss": 0.75998247, + "learning_rate": 3.993535491899736e-06, + "loss": 0.78269482, + "num_input_tokens_seen": 19452090, + "step": 912, + "time_per_iteration": 2.5228497982025146 + }, + { + "auxiliary_loss_clip": 0.01198495, + "auxiliary_loss_mlp": 0.01050651, + "balance_loss_clip": 1.06942177, + "balance_loss_mlp": 1.02553356, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.250017035664635, + "language_loss": 0.82952851, + "learning_rate": 3.993504165853694e-06, + "loss": 0.85202003, + "num_input_tokens_seen": 19470865, + "step": 913, + "time_per_iteration": 2.5697543621063232 + }, + { + "auxiliary_loss_clip": 0.01212633, + "auxiliary_loss_mlp": 0.01058776, + "balance_loss_clip": 1.08327341, + "balance_loss_mlp": 1.03445721, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 2.2020157072855473, + "language_loss": 0.83744383, + "learning_rate": 3.993472764213772e-06, + "loss": 0.86015797, + "num_input_tokens_seen": 19492145, + "step": 914, + "time_per_iteration": 2.5590310096740723 + }, + { + "auxiliary_loss_clip": 0.01220928, + "auxiliary_loss_mlp": 0.00848193, + "balance_loss_clip": 1.07384896, + "balance_loss_mlp": 1.1141088, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 3.3950793702037205, + "language_loss": 0.90389013, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92458141, + "num_input_tokens_seen": 19511015, + "step": 915, + "time_per_iteration": 2.596266984939575 + }, + { + "auxiliary_loss_clip": 0.01213258, + "auxiliary_loss_mlp": 0.01055884, + "balance_loss_clip": 1.07529211, + "balance_loss_mlp": 1.03152943, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.9940370684154272, + "language_loss": 0.897946, + "learning_rate": 3.993409734157064e-06, + "loss": 0.92063737, + "num_input_tokens_seen": 19529040, + "step": 916, + "time_per_iteration": 2.4945449829101562 + }, + { + "auxiliary_loss_clip": 0.01193556, + "auxiliary_loss_mlp": 0.01062865, + "balance_loss_clip": 1.08301973, + "balance_loss_mlp": 1.03829622, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 2.4024600693564215, + "language_loss": 0.80377972, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82634395, + "num_input_tokens_seen": 19549540, + "step": 917, + "time_per_iteration": 2.6394553184509277 + }, + { + "auxiliary_loss_clip": 0.01153868, + "auxiliary_loss_mlp": 0.01061776, + "balance_loss_clip": 1.0677619, + "balance_loss_mlp": 1.03607464, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 1.7691600682540194, + "language_loss": 0.79320288, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81535935, + "num_input_tokens_seen": 19567570, + "step": 918, + "time_per_iteration": 2.6425232887268066 + }, + { + "auxiliary_loss_clip": 0.01211273, + "auxiliary_loss_mlp": 0.01056598, + "balance_loss_clip": 1.07026434, + "balance_loss_mlp": 1.03180277, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 2.2911785852361013, + "language_loss": 0.89679003, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91946876, + "num_input_tokens_seen": 19585330, + "step": 919, + "time_per_iteration": 2.5472095012664795 + }, + { + "auxiliary_loss_clip": 0.01225496, + "auxiliary_loss_mlp": 0.01072748, + "balance_loss_clip": 1.07110202, + "balance_loss_mlp": 1.04705799, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.7296299240980435, + "language_loss": 0.87570059, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89868307, + "num_input_tokens_seen": 19604970, + "step": 920, + "time_per_iteration": 5.368548154830933 + }, + { + "auxiliary_loss_clip": 0.01202036, + "auxiliary_loss_mlp": 0.01058711, + "balance_loss_clip": 1.07139444, + "balance_loss_mlp": 1.03392744, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 1.9243711142471018, + "language_loss": 0.65750051, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68010795, + "num_input_tokens_seen": 19626235, + "step": 921, + "time_per_iteration": 2.6874630451202393 + }, + { + "auxiliary_loss_clip": 0.01223011, + "auxiliary_loss_mlp": 0.01067371, + "balance_loss_clip": 1.07324648, + "balance_loss_mlp": 1.03917766, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 2.584578921048857, + "language_loss": 0.72042388, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74332768, + "num_input_tokens_seen": 19644305, + "step": 922, + "time_per_iteration": 2.543826103210449 + }, + { + "auxiliary_loss_clip": 0.01208275, + "auxiliary_loss_mlp": 0.01067381, + "balance_loss_clip": 1.08123207, + "balance_loss_mlp": 1.04109502, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 4.640836319641821, + "language_loss": 0.82212853, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84488511, + "num_input_tokens_seen": 19662130, + "step": 923, + "time_per_iteration": 2.5858380794525146 + }, + { + "auxiliary_loss_clip": 0.01215078, + "auxiliary_loss_mlp": 0.01068384, + "balance_loss_clip": 1.06863916, + "balance_loss_mlp": 1.04315984, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 2.2588118448663743, + "language_loss": 0.78730309, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81013775, + "num_input_tokens_seen": 19680715, + "step": 924, + "time_per_iteration": 2.5336220264434814 + }, + { + "auxiliary_loss_clip": 0.01179954, + "auxiliary_loss_mlp": 0.01058713, + "balance_loss_clip": 1.07005489, + "balance_loss_mlp": 1.03252292, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 2.6713120625843105, + "language_loss": 1.02390254, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04628932, + "num_input_tokens_seen": 19700535, + "step": 925, + "time_per_iteration": 4.010425090789795 + }, + { + "auxiliary_loss_clip": 0.01163645, + "auxiliary_loss_mlp": 0.01054933, + "balance_loss_clip": 1.05221725, + "balance_loss_mlp": 1.02862406, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.200777159822035, + "language_loss": 0.8130753, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83526105, + "num_input_tokens_seen": 19718825, + "step": 926, + "time_per_iteration": 4.102551698684692 + }, + { + "auxiliary_loss_clip": 0.01222813, + "auxiliary_loss_mlp": 0.01069178, + "balance_loss_clip": 1.0737493, + "balance_loss_mlp": 1.04271317, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.964969896398472, + "language_loss": 0.73120177, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75412166, + "num_input_tokens_seen": 19739080, + "step": 927, + "time_per_iteration": 2.527183771133423 + }, + { + "auxiliary_loss_clip": 0.01130848, + "auxiliary_loss_mlp": 0.01007181, + "balance_loss_clip": 1.05468082, + "balance_loss_mlp": 1.00277066, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7820258878125067, + "language_loss": 0.59870601, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62008631, + "num_input_tokens_seen": 19802960, + "step": 928, + "time_per_iteration": 3.1208837032318115 + }, + { + "auxiliary_loss_clip": 0.01217137, + "auxiliary_loss_mlp": 0.01063333, + "balance_loss_clip": 1.07297695, + "balance_loss_mlp": 1.03786993, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.657287173913943, + "language_loss": 0.95007539, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97288007, + "num_input_tokens_seen": 19822765, + "step": 929, + "time_per_iteration": 2.550182819366455 + }, + { + "auxiliary_loss_clip": 0.01187307, + "auxiliary_loss_mlp": 0.01068717, + "balance_loss_clip": 1.06842649, + "balance_loss_mlp": 1.0409174, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.1842478256239244, + "language_loss": 0.7206881, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74324834, + "num_input_tokens_seen": 19843590, + "step": 930, + "time_per_iteration": 2.704300880432129 + }, + { + "auxiliary_loss_clip": 0.01205205, + "auxiliary_loss_mlp": 0.0106243, + "balance_loss_clip": 1.07465911, + "balance_loss_mlp": 1.03601325, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.992742214691456, + "language_loss": 0.85260862, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87528497, + "num_input_tokens_seen": 19860230, + "step": 931, + "time_per_iteration": 2.5343477725982666 + }, + { + "auxiliary_loss_clip": 0.01237587, + "auxiliary_loss_mlp": 0.00823184, + "balance_loss_clip": 1.10563719, + "balance_loss_mlp": 1.06617951, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 2.37893649935355, + "language_loss": 0.83495522, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85556287, + "num_input_tokens_seen": 19880795, + "step": 932, + "time_per_iteration": 2.5562260150909424 + }, + { + "auxiliary_loss_clip": 0.0121563, + "auxiliary_loss_mlp": 0.01070444, + "balance_loss_clip": 1.07321918, + "balance_loss_mlp": 1.04335928, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 5.889604115011838, + "language_loss": 0.74117213, + "learning_rate": 3.992861771819365e-06, + "loss": 0.76403284, + "num_input_tokens_seen": 19897960, + "step": 933, + "time_per_iteration": 2.498112678527832 + }, + { + "auxiliary_loss_clip": 0.0117536, + "auxiliary_loss_mlp": 0.01069474, + "balance_loss_clip": 1.07733071, + "balance_loss_mlp": 1.04281878, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 2.331500448204897, + "language_loss": 0.86854899, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89099741, + "num_input_tokens_seen": 19913315, + "step": 934, + "time_per_iteration": 2.609607458114624 + }, + { + "auxiliary_loss_clip": 0.01178463, + "auxiliary_loss_mlp": 0.01069507, + "balance_loss_clip": 1.07047081, + "balance_loss_mlp": 1.04447246, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.6067818713856035, + "language_loss": 0.8048414, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82732105, + "num_input_tokens_seen": 19928790, + "step": 935, + "time_per_iteration": 2.630005121231079 + }, + { + "auxiliary_loss_clip": 0.01121719, + "auxiliary_loss_mlp": 0.01009546, + "balance_loss_clip": 1.04589105, + "balance_loss_mlp": 1.00532579, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8268353941187083, + "language_loss": 0.69203669, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71334934, + "num_input_tokens_seen": 19988785, + "step": 936, + "time_per_iteration": 2.99532413482666 + }, + { + "auxiliary_loss_clip": 0.01229328, + "auxiliary_loss_mlp": 0.01061983, + "balance_loss_clip": 1.07230377, + "balance_loss_mlp": 1.03730655, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.1979195646578087, + "language_loss": 0.75785667, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78076977, + "num_input_tokens_seen": 20007685, + "step": 937, + "time_per_iteration": 2.4748282432556152 + }, + { + "auxiliary_loss_clip": 0.01115146, + "auxiliary_loss_mlp": 0.01004898, + "balance_loss_clip": 1.05102253, + "balance_loss_mlp": 1.00079715, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8610421630523332, + "language_loss": 0.64420635, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66540682, + "num_input_tokens_seen": 20072750, + "step": 938, + "time_per_iteration": 3.0760161876678467 + }, + { + "auxiliary_loss_clip": 0.01189525, + "auxiliary_loss_mlp": 0.01063866, + "balance_loss_clip": 1.07609892, + "balance_loss_mlp": 1.0372225, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 2.909173545331531, + "language_loss": 0.79749429, + "learning_rate": 3.992663158738745e-06, + "loss": 0.82002819, + "num_input_tokens_seen": 20089070, + "step": 939, + "time_per_iteration": 2.5867676734924316 + }, + { + "auxiliary_loss_clip": 0.01194965, + "auxiliary_loss_mlp": 0.0106925, + "balance_loss_clip": 1.07582033, + "balance_loss_mlp": 1.04378676, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.7871310028209755, + "language_loss": 0.74238747, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76502961, + "num_input_tokens_seen": 20108790, + "step": 940, + "time_per_iteration": 2.5813028812408447 + }, + { + "auxiliary_loss_clip": 0.01214268, + "auxiliary_loss_mlp": 0.01063601, + "balance_loss_clip": 1.07232594, + "balance_loss_mlp": 1.03754222, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 2.4397014242942725, + "language_loss": 0.70547098, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72824967, + "num_input_tokens_seen": 20128455, + "step": 941, + "time_per_iteration": 2.5666110515594482 + }, + { + "auxiliary_loss_clip": 0.01154887, + "auxiliary_loss_mlp": 0.01064707, + "balance_loss_clip": 1.07694435, + "balance_loss_mlp": 1.03849268, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 2.120997432076242, + "language_loss": 0.80879784, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83099383, + "num_input_tokens_seen": 20145775, + "step": 942, + "time_per_iteration": 2.615006923675537 + }, + { + "auxiliary_loss_clip": 0.01198012, + "auxiliary_loss_mlp": 0.01070202, + "balance_loss_clip": 1.06569827, + "balance_loss_mlp": 1.04354692, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.1205185095260695, + "language_loss": 0.88050014, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.90318227, + "num_input_tokens_seen": 20164315, + "step": 943, + "time_per_iteration": 2.537804126739502 + }, + { + "auxiliary_loss_clip": 0.01211865, + "auxiliary_loss_mlp": 0.01058704, + "balance_loss_clip": 1.07266855, + "balance_loss_mlp": 1.03454065, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.3912290263554676, + "language_loss": 0.75273067, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77543634, + "num_input_tokens_seen": 20182760, + "step": 944, + "time_per_iteration": 2.5158066749572754 + }, + { + "auxiliary_loss_clip": 0.01214209, + "auxiliary_loss_mlp": 0.01061553, + "balance_loss_clip": 1.06859994, + "balance_loss_mlp": 1.03779483, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.8489148838888723, + "language_loss": 0.79560876, + "learning_rate": 3.992461825426906e-06, + "loss": 0.81836641, + "num_input_tokens_seen": 20203830, + "step": 945, + "time_per_iteration": 2.5499508380889893 + }, + { + "auxiliary_loss_clip": 0.01208364, + "auxiliary_loss_mlp": 0.01057077, + "balance_loss_clip": 1.06919873, + "balance_loss_mlp": 1.03236508, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 4.081894037189643, + "language_loss": 0.82539171, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84804618, + "num_input_tokens_seen": 20220365, + "step": 946, + "time_per_iteration": 2.492144823074341 + }, + { + "auxiliary_loss_clip": 0.01227495, + "auxiliary_loss_mlp": 0.01055883, + "balance_loss_clip": 1.06973875, + "balance_loss_mlp": 1.03013396, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 1.8767368385453804, + "language_loss": 0.78982449, + "learning_rate": 3.992394109874529e-06, + "loss": 0.81265831, + "num_input_tokens_seen": 20238640, + "step": 947, + "time_per_iteration": 2.480041980743408 + }, + { + "auxiliary_loss_clip": 0.01187302, + "auxiliary_loss_mlp": 0.01067591, + "balance_loss_clip": 1.07060218, + "balance_loss_mlp": 1.04150772, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 4.744162497354199, + "language_loss": 0.85213029, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.87467921, + "num_input_tokens_seen": 20251025, + "step": 948, + "time_per_iteration": 2.522787570953369 + }, + { + "auxiliary_loss_clip": 0.01225527, + "auxiliary_loss_mlp": 0.01063168, + "balance_loss_clip": 1.07010472, + "balance_loss_mlp": 1.03602374, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 2.4978031255856035, + "language_loss": 0.87184274, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89472973, + "num_input_tokens_seen": 20269775, + "step": 949, + "time_per_iteration": 2.484736680984497 + }, + { + "auxiliary_loss_clip": 0.01206128, + "auxiliary_loss_mlp": 0.01065205, + "balance_loss_clip": 1.06985974, + "balance_loss_mlp": 1.04169714, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 2.1560561914792413, + "language_loss": 0.78873062, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81144392, + "num_input_tokens_seen": 20287715, + "step": 950, + "time_per_iteration": 2.4868617057800293 + }, + { + "auxiliary_loss_clip": 0.01194313, + "auxiliary_loss_mlp": 0.01066897, + "balance_loss_clip": 1.06735468, + "balance_loss_mlp": 1.04267406, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 1.9471889257276236, + "language_loss": 0.82429546, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84690756, + "num_input_tokens_seen": 20307070, + "step": 951, + "time_per_iteration": 2.6099483966827393 + }, + { + "auxiliary_loss_clip": 0.01178076, + "auxiliary_loss_mlp": 0.0105885, + "balance_loss_clip": 1.06287479, + "balance_loss_mlp": 1.03256392, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.4020693099392014, + "language_loss": 0.86390215, + "learning_rate": 3.992223498859958e-06, + "loss": 0.88627142, + "num_input_tokens_seen": 20324945, + "step": 952, + "time_per_iteration": 2.553435802459717 + }, + { + "auxiliary_loss_clip": 0.01204458, + "auxiliary_loss_mlp": 0.01061521, + "balance_loss_clip": 1.06802976, + "balance_loss_mlp": 1.03349531, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 2.2137311881131576, + "language_loss": 0.79160559, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81426537, + "num_input_tokens_seen": 20346135, + "step": 953, + "time_per_iteration": 2.5988008975982666 + }, + { + "auxiliary_loss_clip": 0.01198522, + "auxiliary_loss_mlp": 0.01063322, + "balance_loss_clip": 1.06947112, + "balance_loss_mlp": 1.03789473, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 1.9838836779139906, + "language_loss": 0.86833405, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89095247, + "num_input_tokens_seen": 20364450, + "step": 954, + "time_per_iteration": 2.5365943908691406 + }, + { + "auxiliary_loss_clip": 0.01219935, + "auxiliary_loss_mlp": 0.01062331, + "balance_loss_clip": 1.07086492, + "balance_loss_mlp": 1.03745222, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.5886432241836483, + "language_loss": 0.88173354, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90455616, + "num_input_tokens_seen": 20383500, + "step": 955, + "time_per_iteration": 2.514401912689209 + }, + { + "auxiliary_loss_clip": 0.01196086, + "auxiliary_loss_mlp": 0.01061478, + "balance_loss_clip": 1.07444859, + "balance_loss_mlp": 1.03609824, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 2.292065801486436, + "language_loss": 0.89249617, + "learning_rate": 3.992085650224914e-06, + "loss": 0.91507185, + "num_input_tokens_seen": 20400295, + "step": 956, + "time_per_iteration": 2.5152671337127686 + }, + { + "auxiliary_loss_clip": 0.01177877, + "auxiliary_loss_mlp": 0.01054682, + "balance_loss_clip": 1.06748939, + "balance_loss_mlp": 1.0295881, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 2.7306624775191684, + "language_loss": 0.75743902, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77976459, + "num_input_tokens_seen": 20419085, + "step": 957, + "time_per_iteration": 2.583271026611328 + }, + { + "auxiliary_loss_clip": 0.0118643, + "auxiliary_loss_mlp": 0.01066111, + "balance_loss_clip": 1.06577969, + "balance_loss_mlp": 1.03917027, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 2.2018324367838487, + "language_loss": 0.79883564, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82136106, + "num_input_tokens_seen": 20437465, + "step": 958, + "time_per_iteration": 3.997917413711548 + }, + { + "auxiliary_loss_clip": 0.01195151, + "auxiliary_loss_mlp": 0.01057337, + "balance_loss_clip": 1.06888437, + "balance_loss_mlp": 1.03424597, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.576731120965464, + "language_loss": 0.88333404, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90585899, + "num_input_tokens_seen": 20456235, + "step": 959, + "time_per_iteration": 4.038728952407837 + }, + { + "auxiliary_loss_clip": 0.01181518, + "auxiliary_loss_mlp": 0.01061272, + "balance_loss_clip": 1.06792104, + "balance_loss_mlp": 1.03691745, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 1.9840624198998669, + "language_loss": 0.78611934, + "learning_rate": 3.991946592948529e-06, + "loss": 0.80854726, + "num_input_tokens_seen": 20476825, + "step": 960, + "time_per_iteration": 2.6978962421417236 + }, + { + "auxiliary_loss_clip": 0.01147103, + "auxiliary_loss_mlp": 0.0106759, + "balance_loss_clip": 1.06781864, + "balance_loss_mlp": 1.04155433, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 2.9027025976053795, + "language_loss": 0.93228149, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95442843, + "num_input_tokens_seen": 20496965, + "step": 961, + "time_per_iteration": 2.6799280643463135 + }, + { + "auxiliary_loss_clip": 0.01190444, + "auxiliary_loss_mlp": 0.01065825, + "balance_loss_clip": 1.06575835, + "balance_loss_mlp": 1.03943253, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 1.9922293158632958, + "language_loss": 0.67869675, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70125949, + "num_input_tokens_seen": 20518035, + "step": 962, + "time_per_iteration": 2.6245076656341553 + }, + { + "auxiliary_loss_clip": 0.0115926, + "auxiliary_loss_mlp": 0.01065757, + "balance_loss_clip": 1.05397344, + "balance_loss_mlp": 1.04215336, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.9175803394960362, + "language_loss": 0.885342, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90759218, + "num_input_tokens_seen": 20534740, + "step": 963, + "time_per_iteration": 2.563469171524048 + }, + { + "auxiliary_loss_clip": 0.01194357, + "auxiliary_loss_mlp": 0.01057861, + "balance_loss_clip": 1.07305384, + "balance_loss_mlp": 1.03317273, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.858711691247023, + "language_loss": 0.84866524, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87118739, + "num_input_tokens_seen": 20553485, + "step": 964, + "time_per_iteration": 5.440260648727417 + }, + { + "auxiliary_loss_clip": 0.01201285, + "auxiliary_loss_mlp": 0.01067832, + "balance_loss_clip": 1.07195854, + "balance_loss_mlp": 1.04263127, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 6.0364997886788965, + "language_loss": 0.77651596, + "learning_rate": 3.99177107182976e-06, + "loss": 0.79920709, + "num_input_tokens_seen": 20572155, + "step": 965, + "time_per_iteration": 2.607905864715576 + }, + { + "auxiliary_loss_clip": 0.01229002, + "auxiliary_loss_mlp": 0.0106954, + "balance_loss_clip": 1.1747005, + "balance_loss_mlp": 1.04432702, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 2.005634553913885, + "language_loss": 0.81018603, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83317143, + "num_input_tokens_seen": 20590395, + "step": 966, + "time_per_iteration": 2.6351230144500732 + }, + { + "auxiliary_loss_clip": 0.01208459, + "auxiliary_loss_mlp": 0.01060168, + "balance_loss_clip": 1.06918192, + "balance_loss_mlp": 1.03759027, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.8928077223393354, + "language_loss": 0.76280046, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78548682, + "num_input_tokens_seen": 20608435, + "step": 967, + "time_per_iteration": 2.5184640884399414 + }, + { + "auxiliary_loss_clip": 0.01121109, + "auxiliary_loss_mlp": 0.01090119, + "balance_loss_clip": 1.05865812, + "balance_loss_mlp": 1.08535051, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.8115790733219859, + "language_loss": 0.57345188, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59556425, + "num_input_tokens_seen": 20668575, + "step": 968, + "time_per_iteration": 3.04945969581604 + }, + { + "auxiliary_loss_clip": 0.0120065, + "auxiliary_loss_mlp": 0.01055805, + "balance_loss_clip": 1.07094705, + "balance_loss_mlp": 1.03063989, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.1014976469854396, + "language_loss": 0.82199919, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84456372, + "num_input_tokens_seen": 20687355, + "step": 969, + "time_per_iteration": 2.5850179195404053 + }, + { + "auxiliary_loss_clip": 0.01216189, + "auxiliary_loss_mlp": 0.00808659, + "balance_loss_clip": 1.07261205, + "balance_loss_mlp": 1.04277611, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.347194270274993, + "language_loss": 0.77391684, + "learning_rate": 3.991593662507167e-06, + "loss": 0.79416531, + "num_input_tokens_seen": 20705710, + "step": 970, + "time_per_iteration": 2.5915369987487793 + }, + { + "auxiliary_loss_clip": 0.01193036, + "auxiliary_loss_mlp": 0.01060076, + "balance_loss_clip": 1.06819022, + "balance_loss_mlp": 1.03426743, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.6318922983924216, + "language_loss": 0.92249507, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94502622, + "num_input_tokens_seen": 20722405, + "step": 971, + "time_per_iteration": 2.585118055343628 + }, + { + "auxiliary_loss_clip": 0.0119261, + "auxiliary_loss_mlp": 0.0105587, + "balance_loss_clip": 1.0672096, + "balance_loss_mlp": 1.03218293, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 1.842121426587099, + "language_loss": 0.85853374, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88101852, + "num_input_tokens_seen": 20741480, + "step": 972, + "time_per_iteration": 2.590047597885132 + }, + { + "auxiliary_loss_clip": 0.01183313, + "auxiliary_loss_mlp": 0.01062928, + "balance_loss_clip": 1.07254517, + "balance_loss_mlp": 1.03999245, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.0283360681900664, + "language_loss": 0.87479758, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89726001, + "num_input_tokens_seen": 20759685, + "step": 973, + "time_per_iteration": 2.658601760864258 + }, + { + "auxiliary_loss_clip": 0.01217088, + "auxiliary_loss_mlp": 0.00811407, + "balance_loss_clip": 1.07249939, + "balance_loss_mlp": 1.04588127, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 2.116011697756705, + "language_loss": 0.7512033, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77148825, + "num_input_tokens_seen": 20778180, + "step": 974, + "time_per_iteration": 2.5226144790649414 + }, + { + "auxiliary_loss_clip": 0.01211687, + "auxiliary_loss_mlp": 0.00806364, + "balance_loss_clip": 1.07348013, + "balance_loss_mlp": 1.03797174, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 7.1136996950571305, + "language_loss": 0.76723361, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78741407, + "num_input_tokens_seen": 20802705, + "step": 975, + "time_per_iteration": 2.6912994384765625 + }, + { + "auxiliary_loss_clip": 0.01226424, + "auxiliary_loss_mlp": 0.01065896, + "balance_loss_clip": 1.07146168, + "balance_loss_mlp": 1.04193461, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 2.7474774104515616, + "language_loss": 0.77054274, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79346591, + "num_input_tokens_seen": 20822540, + "step": 976, + "time_per_iteration": 2.510050058364868 + }, + { + "auxiliary_loss_clip": 0.01188474, + "auxiliary_loss_mlp": 0.01078487, + "balance_loss_clip": 1.06816506, + "balance_loss_mlp": 1.05233264, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 2.0060845212821086, + "language_loss": 0.87296009, + "learning_rate": 3.991342117593679e-06, + "loss": 0.89562976, + "num_input_tokens_seen": 20844175, + "step": 977, + "time_per_iteration": 2.6715548038482666 + }, + { + "auxiliary_loss_clip": 0.01195426, + "auxiliary_loss_mlp": 0.0106558, + "balance_loss_clip": 1.07209563, + "balance_loss_mlp": 1.04041481, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.5162413927943732, + "language_loss": 0.792575, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81518501, + "num_input_tokens_seen": 20864730, + "step": 978, + "time_per_iteration": 2.577559471130371 + }, + { + "auxiliary_loss_clip": 0.01207888, + "auxiliary_loss_mlp": 0.01076532, + "balance_loss_clip": 1.17905569, + "balance_loss_mlp": 1.05031765, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 2.0463819775454537, + "language_loss": 0.80693382, + "learning_rate": 3.991269567990855e-06, + "loss": 0.82977808, + "num_input_tokens_seen": 20885200, + "step": 979, + "time_per_iteration": 3.206923007965088 + }, + { + "auxiliary_loss_clip": 0.01099098, + "auxiliary_loss_mlp": 0.0103787, + "balance_loss_clip": 1.04910827, + "balance_loss_mlp": 1.03343523, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.9469908296516281, + "language_loss": 0.59043694, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61180663, + "num_input_tokens_seen": 20940325, + "step": 980, + "time_per_iteration": 3.4336395263671875 + }, + { + "auxiliary_loss_clip": 0.01219429, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_clip": 1.0707798, + "balance_loss_mlp": 1.04025424, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.282963099956916, + "language_loss": 0.86809731, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.89095175, + "num_input_tokens_seen": 20958220, + "step": 981, + "time_per_iteration": 2.4974305629730225 + }, + { + "auxiliary_loss_clip": 0.0120085, + "auxiliary_loss_mlp": 0.01055677, + "balance_loss_clip": 1.07026434, + "balance_loss_mlp": 1.03235948, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 2.457992680729703, + "language_loss": 0.79750216, + "learning_rate": 3.991160177271513e-06, + "loss": 0.82006741, + "num_input_tokens_seen": 20978920, + "step": 982, + "time_per_iteration": 2.5989937782287598 + }, + { + "auxiliary_loss_clip": 0.01261767, + "auxiliary_loss_mlp": 0.01061672, + "balance_loss_clip": 1.17790926, + "balance_loss_mlp": 1.03754449, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.2154336956594216, + "language_loss": 0.84149545, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86472988, + "num_input_tokens_seen": 20999490, + "step": 983, + "time_per_iteration": 2.592073917388916 + }, + { + "auxiliary_loss_clip": 0.01206933, + "auxiliary_loss_mlp": 0.01060914, + "balance_loss_clip": 1.0672071, + "balance_loss_mlp": 1.03745413, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.9993185416718362, + "language_loss": 0.8466599, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86933845, + "num_input_tokens_seen": 21017865, + "step": 984, + "time_per_iteration": 2.499333620071411 + }, + { + "auxiliary_loss_clip": 0.01206577, + "auxiliary_loss_mlp": 0.01054064, + "balance_loss_clip": 1.07903004, + "balance_loss_mlp": 1.03094912, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 4.879450703343181, + "language_loss": 0.77561241, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.79821885, + "num_input_tokens_seen": 21035900, + "step": 985, + "time_per_iteration": 2.539445161819458 + }, + { + "auxiliary_loss_clip": 0.01155959, + "auxiliary_loss_mlp": 0.0106455, + "balance_loss_clip": 1.06829143, + "balance_loss_mlp": 1.04099405, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 2.052150881378456, + "language_loss": 0.9066211, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92882621, + "num_input_tokens_seen": 21053235, + "step": 986, + "time_per_iteration": 2.58915638923645 + }, + { + "auxiliary_loss_clip": 0.01210589, + "auxiliary_loss_mlp": 0.0106358, + "balance_loss_clip": 1.06605864, + "balance_loss_mlp": 1.03712702, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 2.0737832676270753, + "language_loss": 0.75812083, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78086257, + "num_input_tokens_seen": 21073090, + "step": 987, + "time_per_iteration": 2.56268572807312 + }, + { + "auxiliary_loss_clip": 0.01210099, + "auxiliary_loss_mlp": 0.01055015, + "balance_loss_clip": 1.06846976, + "balance_loss_mlp": 1.03127992, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.2575136837372436, + "language_loss": 0.71988893, + "learning_rate": 3.990939357235621e-06, + "loss": 0.74254012, + "num_input_tokens_seen": 21094895, + "step": 988, + "time_per_iteration": 2.659303665161133 + }, + { + "auxiliary_loss_clip": 0.01080044, + "auxiliary_loss_mlp": 0.01043521, + "balance_loss_clip": 1.04851151, + "balance_loss_mlp": 1.03863323, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9643695972733062, + "language_loss": 0.71181428, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73304999, + "num_input_tokens_seen": 21147555, + "step": 989, + "time_per_iteration": 3.0398290157318115 + }, + { + "auxiliary_loss_clip": 0.01185191, + "auxiliary_loss_mlp": 0.01068021, + "balance_loss_clip": 1.06462073, + "balance_loss_mlp": 1.04100776, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 3.0693769626661345, + "language_loss": 0.78354836, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80608046, + "num_input_tokens_seen": 21167845, + "step": 990, + "time_per_iteration": 2.628450870513916 + }, + { + "auxiliary_loss_clip": 0.01205954, + "auxiliary_loss_mlp": 0.0105944, + "balance_loss_clip": 1.07024026, + "balance_loss_mlp": 1.03458452, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.1111790395980563, + "language_loss": 0.86311287, + "learning_rate": 3.990827927994434e-06, + "loss": 0.8857668, + "num_input_tokens_seen": 21185085, + "step": 991, + "time_per_iteration": 2.530669927597046 + }, + { + "auxiliary_loss_clip": 0.0122565, + "auxiliary_loss_mlp": 0.0106215, + "balance_loss_clip": 1.06936622, + "balance_loss_mlp": 1.03766441, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 1.933745670090089, + "language_loss": 0.77218014, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79505813, + "num_input_tokens_seen": 21204230, + "step": 992, + "time_per_iteration": 2.483783483505249 + }, + { + "auxiliary_loss_clip": 0.01155944, + "auxiliary_loss_mlp": 0.01060697, + "balance_loss_clip": 1.06729412, + "balance_loss_mlp": 1.03755796, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.960335431540363, + "language_loss": 0.75085944, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.77302587, + "num_input_tokens_seen": 21222655, + "step": 993, + "time_per_iteration": 2.6115572452545166 + }, + { + "auxiliary_loss_clip": 0.01174158, + "auxiliary_loss_mlp": 0.01077981, + "balance_loss_clip": 1.07896256, + "balance_loss_mlp": 1.05098021, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 1.9686551783607777, + "language_loss": 0.78871083, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81123221, + "num_input_tokens_seen": 21242310, + "step": 994, + "time_per_iteration": 2.6749072074890137 + }, + { + "auxiliary_loss_clip": 0.01224067, + "auxiliary_loss_mlp": 0.01076125, + "balance_loss_clip": 1.0720942, + "balance_loss_mlp": 1.05193758, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.506593804803191, + "language_loss": 0.80195391, + "learning_rate": 3.99067829878596e-06, + "loss": 0.82495588, + "num_input_tokens_seen": 21261410, + "step": 995, + "time_per_iteration": 2.481417179107666 + }, + { + "auxiliary_loss_clip": 0.01177044, + "auxiliary_loss_mlp": 0.01063555, + "balance_loss_clip": 1.06830716, + "balance_loss_mlp": 1.03812742, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 1.9668162029316845, + "language_loss": 0.86790198, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89030802, + "num_input_tokens_seen": 21280080, + "step": 996, + "time_per_iteration": 2.625706672668457 + }, + { + "auxiliary_loss_clip": 0.01189764, + "auxiliary_loss_mlp": 0.01072742, + "balance_loss_clip": 1.0757277, + "balance_loss_mlp": 1.04431021, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 3.0767806845943197, + "language_loss": 0.88084173, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90346682, + "num_input_tokens_seen": 21296765, + "step": 997, + "time_per_iteration": 3.9969847202301025 + }, + { + "auxiliary_loss_clip": 0.01157596, + "auxiliary_loss_mlp": 0.01031384, + "balance_loss_clip": 1.15298545, + "balance_loss_mlp": 1.02647305, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.02057964995631, + "language_loss": 0.75460589, + "learning_rate": 3.990565284264083e-06, + "loss": 0.7764957, + "num_input_tokens_seen": 21363345, + "step": 998, + "time_per_iteration": 4.631951808929443 + }, + { + "auxiliary_loss_clip": 0.01183423, + "auxiliary_loss_mlp": 0.01062806, + "balance_loss_clip": 1.07864499, + "balance_loss_mlp": 1.03776038, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.8757349560470702, + "language_loss": 0.75966203, + "learning_rate": 3.990527461790013e-06, + "loss": 0.78212428, + "num_input_tokens_seen": 21385290, + "step": 999, + "time_per_iteration": 2.6595571041107178 + }, + { + "auxiliary_loss_clip": 0.01204512, + "auxiliary_loss_mlp": 0.01056483, + "balance_loss_clip": 1.0655539, + "balance_loss_mlp": 1.03171158, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 1.7916408498364047, + "language_loss": 0.82484126, + "learning_rate": 3.990489563834943e-06, + "loss": 0.84745121, + "num_input_tokens_seen": 21407625, + "step": 1000, + "time_per_iteration": 2.5711843967437744 + }, + { + "auxiliary_loss_clip": 0.0119475, + "auxiliary_loss_mlp": 0.01070673, + "balance_loss_clip": 1.07067955, + "balance_loss_mlp": 1.0454843, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 2.171629907366366, + "language_loss": 0.86091167, + "learning_rate": 3.990451590400309e-06, + "loss": 0.8835659, + "num_input_tokens_seen": 21426835, + "step": 1001, + "time_per_iteration": 2.630709409713745 + }, + { + "auxiliary_loss_clip": 0.01197385, + "auxiliary_loss_mlp": 0.01055398, + "balance_loss_clip": 1.06689107, + "balance_loss_mlp": 1.03180695, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.1489276183072756, + "language_loss": 0.73945892, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76198673, + "num_input_tokens_seen": 21444920, + "step": 1002, + "time_per_iteration": 4.0176098346710205 + }, + { + "auxiliary_loss_clip": 0.01221868, + "auxiliary_loss_mlp": 0.01058949, + "balance_loss_clip": 1.06987667, + "balance_loss_mlp": 1.03488088, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 2.778439871108027, + "language_loss": 0.75676692, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77957511, + "num_input_tokens_seen": 21463555, + "step": 1003, + "time_per_iteration": 2.58474063873291 + }, + { + "auxiliary_loss_clip": 0.01200914, + "auxiliary_loss_mlp": 0.0105964, + "balance_loss_clip": 1.07945132, + "balance_loss_mlp": 1.03578663, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.026264775996035, + "language_loss": 0.70297629, + "learning_rate": 3.990337217233437e-06, + "loss": 0.72558177, + "num_input_tokens_seen": 21481990, + "step": 1004, + "time_per_iteration": 2.5648927688598633 + }, + { + "auxiliary_loss_clip": 0.01219348, + "auxiliary_loss_mlp": 0.01070984, + "balance_loss_clip": 1.07260704, + "balance_loss_mlp": 1.04626, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.633538805133491, + "language_loss": 0.83320796, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85611123, + "num_input_tokens_seen": 21500385, + "step": 1005, + "time_per_iteration": 2.4894700050354004 + }, + { + "auxiliary_loss_clip": 0.01107081, + "auxiliary_loss_mlp": 0.01010858, + "balance_loss_clip": 1.04905033, + "balance_loss_mlp": 1.00666142, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.89587268647693, + "language_loss": 0.59039563, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61157501, + "num_input_tokens_seen": 21561040, + "step": 1006, + "time_per_iteration": 3.185408592224121 + }, + { + "auxiliary_loss_clip": 0.01190072, + "auxiliary_loss_mlp": 0.01054852, + "balance_loss_clip": 1.06259263, + "balance_loss_mlp": 1.03024673, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.387725534704207, + "language_loss": 0.7446155, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76706469, + "num_input_tokens_seen": 21580655, + "step": 1007, + "time_per_iteration": 2.550104856491089 + }, + { + "auxiliary_loss_clip": 0.01199285, + "auxiliary_loss_mlp": 0.01062672, + "balance_loss_clip": 1.06600547, + "balance_loss_mlp": 1.03832912, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.9477385052047491, + "language_loss": 0.80524254, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.82786202, + "num_input_tokens_seen": 21599650, + "step": 1008, + "time_per_iteration": 2.576491594314575 + }, + { + "auxiliary_loss_clip": 0.01243083, + "auxiliary_loss_mlp": 0.01057452, + "balance_loss_clip": 1.17467427, + "balance_loss_mlp": 1.03330016, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 1.9072104984405902, + "language_loss": 0.78198016, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80498552, + "num_input_tokens_seen": 21617550, + "step": 1009, + "time_per_iteration": 2.567054271697998 + }, + { + "auxiliary_loss_clip": 0.01207874, + "auxiliary_loss_mlp": 0.01058349, + "balance_loss_clip": 1.08072305, + "balance_loss_mlp": 1.03492451, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 1.7897389105736101, + "language_loss": 0.92961478, + "learning_rate": 3.990106433146769e-06, + "loss": 0.952277, + "num_input_tokens_seen": 21635865, + "step": 1010, + "time_per_iteration": 2.549360513687134 + }, + { + "auxiliary_loss_clip": 0.0116607, + "auxiliary_loss_mlp": 0.00826994, + "balance_loss_clip": 1.07665706, + "balance_loss_mlp": 1.07715666, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 2.20160227278177, + "language_loss": 0.72036451, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.74029517, + "num_input_tokens_seen": 21653945, + "step": 1011, + "time_per_iteration": 2.6423544883728027 + }, + { + "auxiliary_loss_clip": 0.01191718, + "auxiliary_loss_mlp": 0.01068049, + "balance_loss_clip": 1.05802798, + "balance_loss_mlp": 1.04100049, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 5.982863501147838, + "language_loss": 0.87598157, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89857924, + "num_input_tokens_seen": 21671230, + "step": 1012, + "time_per_iteration": 2.534186840057373 + }, + { + "auxiliary_loss_clip": 0.01191472, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_clip": 1.06487715, + "balance_loss_mlp": 1.04120481, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 2.020234721219651, + "language_loss": 0.76824385, + "learning_rate": 3.989990022305734e-06, + "loss": 0.79080725, + "num_input_tokens_seen": 21691155, + "step": 1013, + "time_per_iteration": 2.562086343765259 + }, + { + "auxiliary_loss_clip": 0.01211172, + "auxiliary_loss_mlp": 0.00804544, + "balance_loss_clip": 1.07038319, + "balance_loss_mlp": 1.03521681, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.7763004057192417, + "language_loss": 0.8597998, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87995696, + "num_input_tokens_seen": 21707405, + "step": 1014, + "time_per_iteration": 2.5221683979034424 + }, + { + "auxiliary_loss_clip": 0.01219479, + "auxiliary_loss_mlp": 0.01064693, + "balance_loss_clip": 1.06985402, + "balance_loss_mlp": 1.03822851, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.3211893848740623, + "language_loss": 0.73360002, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75644171, + "num_input_tokens_seen": 21728090, + "step": 1015, + "time_per_iteration": 2.5620009899139404 + }, + { + "auxiliary_loss_clip": 0.01190099, + "auxiliary_loss_mlp": 0.01059671, + "balance_loss_clip": 1.06643105, + "balance_loss_mlp": 1.03550756, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.7278419064041828, + "language_loss": 0.79242778, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.81492543, + "num_input_tokens_seen": 21747950, + "step": 1016, + "time_per_iteration": 2.5964720249176025 + }, + { + "auxiliary_loss_clip": 0.01176857, + "auxiliary_loss_mlp": 0.0105824, + "balance_loss_clip": 1.06588602, + "balance_loss_mlp": 1.0341959, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 2.2925296645510995, + "language_loss": 0.75948983, + "learning_rate": 3.989833751409254e-06, + "loss": 0.7818408, + "num_input_tokens_seen": 21767900, + "step": 1017, + "time_per_iteration": 2.631725788116455 + }, + { + "auxiliary_loss_clip": 0.01191442, + "auxiliary_loss_mlp": 0.01076628, + "balance_loss_clip": 1.0714395, + "balance_loss_mlp": 1.05117667, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 2.283524948878038, + "language_loss": 0.85985792, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88253856, + "num_input_tokens_seen": 21787375, + "step": 1018, + "time_per_iteration": 2.572498083114624 + }, + { + "auxiliary_loss_clip": 0.01239632, + "auxiliary_loss_mlp": 0.01072175, + "balance_loss_clip": 1.17450309, + "balance_loss_mlp": 1.04565132, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 3.1387369909267524, + "language_loss": 0.77724612, + "learning_rate": 3.989755163226909e-06, + "loss": 0.8003642, + "num_input_tokens_seen": 21806275, + "step": 1019, + "time_per_iteration": 2.559656858444214 + }, + { + "auxiliary_loss_clip": 0.01168654, + "auxiliary_loss_mlp": 0.0105951, + "balance_loss_clip": 1.0626719, + "balance_loss_mlp": 1.0347141, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 1.9015181061021196, + "language_loss": 0.84547591, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86775762, + "num_input_tokens_seen": 21826430, + "step": 1020, + "time_per_iteration": 2.66471004486084 + }, + { + "auxiliary_loss_clip": 0.01201459, + "auxiliary_loss_mlp": 0.01059094, + "balance_loss_clip": 1.06926239, + "balance_loss_mlp": 1.03174746, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 1.9328704699728074, + "language_loss": 0.79373932, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81634492, + "num_input_tokens_seen": 21847800, + "step": 1021, + "time_per_iteration": 2.701507806777954 + }, + { + "auxiliary_loss_clip": 0.01193468, + "auxiliary_loss_mlp": 0.01058916, + "balance_loss_clip": 1.06506681, + "balance_loss_mlp": 1.03645647, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.202858582003627, + "language_loss": 0.87798876, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.90051258, + "num_input_tokens_seen": 21863385, + "step": 1022, + "time_per_iteration": 2.520646810531616 + }, + { + "auxiliary_loss_clip": 0.01256946, + "auxiliary_loss_mlp": 0.01059836, + "balance_loss_clip": 1.17252076, + "balance_loss_mlp": 1.03574407, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 2.0719514898472906, + "language_loss": 0.83114821, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85431606, + "num_input_tokens_seen": 21881880, + "step": 1023, + "time_per_iteration": 2.5900027751922607 + }, + { + "auxiliary_loss_clip": 0.01120008, + "auxiliary_loss_mlp": 0.01015397, + "balance_loss_clip": 1.0508641, + "balance_loss_mlp": 1.01139164, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8969573229655349, + "language_loss": 0.65092325, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67227727, + "num_input_tokens_seen": 21940550, + "step": 1024, + "time_per_iteration": 3.1315290927886963 + }, + { + "auxiliary_loss_clip": 0.01161621, + "auxiliary_loss_mlp": 0.01066837, + "balance_loss_clip": 1.05619431, + "balance_loss_mlp": 1.03986037, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 2.047035854799912, + "language_loss": 0.88249445, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90477902, + "num_input_tokens_seen": 21958390, + "step": 1025, + "time_per_iteration": 2.5561535358428955 + }, + { + "auxiliary_loss_clip": 0.01196872, + "auxiliary_loss_mlp": 0.01057575, + "balance_loss_clip": 1.06642127, + "balance_loss_mlp": 1.03455544, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 6.10241395841338, + "language_loss": 0.84530538, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86784983, + "num_input_tokens_seen": 21978625, + "step": 1026, + "time_per_iteration": 2.571310520172119 + }, + { + "auxiliary_loss_clip": 0.01171062, + "auxiliary_loss_mlp": 0.01067544, + "balance_loss_clip": 1.07499468, + "balance_loss_mlp": 1.04230785, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 1.8992643429412435, + "language_loss": 0.82404929, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84643531, + "num_input_tokens_seen": 21996035, + "step": 1027, + "time_per_iteration": 2.578012466430664 + }, + { + "auxiliary_loss_clip": 0.01167124, + "auxiliary_loss_mlp": 0.01057718, + "balance_loss_clip": 1.0709635, + "balance_loss_mlp": 1.03316092, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.5755062592196, + "language_loss": 0.84035152, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86259997, + "num_input_tokens_seen": 22011625, + "step": 1028, + "time_per_iteration": 2.5923354625701904 + }, + { + "auxiliary_loss_clip": 0.01087213, + "auxiliary_loss_mlp": 0.01006276, + "balance_loss_clip": 1.04401517, + "balance_loss_mlp": 1.002056, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9386424605360532, + "language_loss": 0.60446727, + "learning_rate": 3.989357695452323e-06, + "loss": 0.62540221, + "num_input_tokens_seen": 22066035, + "step": 1029, + "time_per_iteration": 2.943061590194702 + }, + { + "auxiliary_loss_clip": 0.01183189, + "auxiliary_loss_mlp": 0.01063623, + "balance_loss_clip": 1.06694174, + "balance_loss_mlp": 1.03749275, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 2.416411101689549, + "language_loss": 0.82904816, + "learning_rate": 3.98931753374834e-06, + "loss": 0.85151625, + "num_input_tokens_seen": 22085015, + "step": 1030, + "time_per_iteration": 2.6117117404937744 + }, + { + "auxiliary_loss_clip": 0.01224888, + "auxiliary_loss_mlp": 0.01067935, + "balance_loss_clip": 1.07139075, + "balance_loss_mlp": 1.04329407, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 4.110198106305974, + "language_loss": 0.79837704, + "learning_rate": 3.989277296609237e-06, + "loss": 0.82130527, + "num_input_tokens_seen": 22102775, + "step": 1031, + "time_per_iteration": 2.470902442932129 + }, + { + "auxiliary_loss_clip": 0.01193966, + "auxiliary_loss_mlp": 0.01076398, + "balance_loss_clip": 1.07010484, + "balance_loss_mlp": 1.05001688, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.7198605640623903, + "language_loss": 0.77513051, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79783416, + "num_input_tokens_seen": 22121680, + "step": 1032, + "time_per_iteration": 2.5531489849090576 + }, + { + "auxiliary_loss_clip": 0.01202595, + "auxiliary_loss_mlp": 0.01069016, + "balance_loss_clip": 1.06348825, + "balance_loss_mlp": 1.04395819, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 2.8219982169132094, + "language_loss": 0.89283907, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91555524, + "num_input_tokens_seen": 22138155, + "step": 1033, + "time_per_iteration": 2.508654832839966 + }, + { + "auxiliary_loss_clip": 0.01212642, + "auxiliary_loss_mlp": 0.01058993, + "balance_loss_clip": 1.06752968, + "balance_loss_mlp": 1.03493631, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.7005893466601063, + "language_loss": 0.85055178, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87326813, + "num_input_tokens_seen": 22157420, + "step": 1034, + "time_per_iteration": 2.545006513595581 + }, + { + "auxiliary_loss_clip": 0.01188412, + "auxiliary_loss_mlp": 0.01058543, + "balance_loss_clip": 1.06625164, + "balance_loss_mlp": 1.03371203, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 2.0278922021140775, + "language_loss": 0.80742383, + "learning_rate": 3.989115593732182e-06, + "loss": 0.82989335, + "num_input_tokens_seen": 22178620, + "step": 1035, + "time_per_iteration": 4.0297064781188965 + }, + { + "auxiliary_loss_clip": 0.01168223, + "auxiliary_loss_mlp": 0.01069993, + "balance_loss_clip": 1.07188118, + "balance_loss_mlp": 1.04281306, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 3.3885385627443125, + "language_loss": 0.78227878, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80466092, + "num_input_tokens_seen": 22197125, + "step": 1036, + "time_per_iteration": 4.044435024261475 + }, + { + "auxiliary_loss_clip": 0.01202217, + "auxiliary_loss_mlp": 0.01066537, + "balance_loss_clip": 1.0693171, + "balance_loss_mlp": 1.04237366, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.794658920648856, + "language_loss": 0.86704051, + "learning_rate": 3.989034289722739e-06, + "loss": 0.88972807, + "num_input_tokens_seen": 22217575, + "step": 1037, + "time_per_iteration": 2.5914485454559326 + }, + { + "auxiliary_loss_clip": 0.01204047, + "auxiliary_loss_mlp": 0.01056904, + "balance_loss_clip": 1.06681371, + "balance_loss_mlp": 1.03102326, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 3.3720356162683984, + "language_loss": 0.80837935, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83098882, + "num_input_tokens_seen": 22236840, + "step": 1038, + "time_per_iteration": 2.6048426628112793 + }, + { + "auxiliary_loss_clip": 0.01145765, + "auxiliary_loss_mlp": 0.0107656, + "balance_loss_clip": 1.05513287, + "balance_loss_mlp": 1.04828358, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 3.965360659677508, + "language_loss": 0.85608822, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87831151, + "num_input_tokens_seen": 22256465, + "step": 1039, + "time_per_iteration": 2.67110276222229 + }, + { + "auxiliary_loss_clip": 0.01198135, + "auxiliary_loss_mlp": 0.01068964, + "balance_loss_clip": 1.06665421, + "balance_loss_mlp": 1.04356086, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 1.8227440915158006, + "language_loss": 0.8106916, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83336258, + "num_input_tokens_seen": 22274025, + "step": 1040, + "time_per_iteration": 2.6068663597106934 + }, + { + "auxiliary_loss_clip": 0.0122329, + "auxiliary_loss_mlp": 0.01065123, + "balance_loss_clip": 1.07256353, + "balance_loss_mlp": 1.03946877, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.818621148230795, + "language_loss": 0.69767642, + "learning_rate": 3.988870776623685e-06, + "loss": 0.72056055, + "num_input_tokens_seen": 22292245, + "step": 1041, + "time_per_iteration": 3.932661533355713 + }, + { + "auxiliary_loss_clip": 0.0122029, + "auxiliary_loss_mlp": 0.01054424, + "balance_loss_clip": 1.06687617, + "balance_loss_mlp": 1.02885389, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 1.9892516945938106, + "language_loss": 0.81552386, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83827102, + "num_input_tokens_seen": 22311455, + "step": 1042, + "time_per_iteration": 3.93043851852417 + }, + { + "auxiliary_loss_clip": 0.01218153, + "auxiliary_loss_mlp": 0.01050677, + "balance_loss_clip": 1.06817603, + "balance_loss_mlp": 1.02702618, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 1.7645122934528015, + "language_loss": 0.76377797, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78646624, + "num_input_tokens_seen": 22333750, + "step": 1043, + "time_per_iteration": 2.64691424369812 + }, + { + "auxiliary_loss_clip": 0.01197614, + "auxiliary_loss_mlp": 0.0106415, + "balance_loss_clip": 1.06762719, + "balance_loss_mlp": 1.03991449, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 2.007423210682907, + "language_loss": 0.92504358, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94766128, + "num_input_tokens_seen": 22351940, + "step": 1044, + "time_per_iteration": 2.5594093799591064 + }, + { + "auxiliary_loss_clip": 0.0119986, + "auxiliary_loss_mlp": 0.01071861, + "balance_loss_clip": 1.06418705, + "balance_loss_mlp": 1.04686236, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 1.8799293467685745, + "language_loss": 0.86093235, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88364947, + "num_input_tokens_seen": 22372085, + "step": 1045, + "time_per_iteration": 2.565340995788574 + }, + { + "auxiliary_loss_clip": 0.01188895, + "auxiliary_loss_mlp": 0.01062511, + "balance_loss_clip": 1.0657934, + "balance_loss_mlp": 1.03789473, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.2263567792633947, + "language_loss": 0.78175974, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.80427372, + "num_input_tokens_seen": 22392020, + "step": 1046, + "time_per_iteration": 2.6921207904815674 + }, + { + "auxiliary_loss_clip": 0.01203741, + "auxiliary_loss_mlp": 0.0107082, + "balance_loss_clip": 1.06820488, + "balance_loss_mlp": 1.04714513, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 4.060171943681887, + "language_loss": 0.77613354, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79887915, + "num_input_tokens_seen": 22411180, + "step": 1047, + "time_per_iteration": 2.5352025032043457 + }, + { + "auxiliary_loss_clip": 0.01209618, + "auxiliary_loss_mlp": 0.01060305, + "balance_loss_clip": 1.07002759, + "balance_loss_mlp": 1.0356046, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 4.920300748383546, + "language_loss": 0.77172208, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79442132, + "num_input_tokens_seen": 22435105, + "step": 1048, + "time_per_iteration": 2.684413194656372 + }, + { + "auxiliary_loss_clip": 0.0119128, + "auxiliary_loss_mlp": 0.010663, + "balance_loss_clip": 1.06998539, + "balance_loss_mlp": 1.04125404, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 3.550967429240787, + "language_loss": 0.77603805, + "learning_rate": 3.988540130453087e-06, + "loss": 0.79861379, + "num_input_tokens_seen": 22452710, + "step": 1049, + "time_per_iteration": 2.5848257541656494 + }, + { + "auxiliary_loss_clip": 0.01202964, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_clip": 1.06777334, + "balance_loss_mlp": 1.03208041, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 3.607397545677041, + "language_loss": 0.82718027, + "learning_rate": 3.988498460339862e-06, + "loss": 0.8497777, + "num_input_tokens_seen": 22470175, + "step": 1050, + "time_per_iteration": 2.523500442504883 + }, + { + "auxiliary_loss_clip": 0.01220408, + "auxiliary_loss_mlp": 0.01063028, + "balance_loss_clip": 1.07249928, + "balance_loss_mlp": 1.03889978, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 2.6112443242256607, + "language_loss": 0.77305663, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79589105, + "num_input_tokens_seen": 22490020, + "step": 1051, + "time_per_iteration": 2.5461220741271973 + }, + { + "auxiliary_loss_clip": 0.01197116, + "auxiliary_loss_mlp": 0.01068972, + "balance_loss_clip": 1.071787, + "balance_loss_mlp": 1.04398596, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 2.2949919161433647, + "language_loss": 0.80194324, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82460415, + "num_input_tokens_seen": 22509685, + "step": 1052, + "time_per_iteration": 2.584480047225952 + }, + { + "auxiliary_loss_clip": 0.01222693, + "auxiliary_loss_mlp": 0.01058389, + "balance_loss_clip": 1.07188749, + "balance_loss_mlp": 1.03405833, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 3.913642491201759, + "language_loss": 0.77650714, + "learning_rate": 3.988372997582155e-06, + "loss": 0.79931796, + "num_input_tokens_seen": 22527905, + "step": 1053, + "time_per_iteration": 2.5122177600860596 + }, + { + "auxiliary_loss_clip": 0.01199802, + "auxiliary_loss_mlp": 0.00837031, + "balance_loss_clip": 1.06940889, + "balance_loss_mlp": 1.09944177, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 2.511052655569475, + "language_loss": 0.84855127, + "learning_rate": 3.988331025862195e-06, + "loss": 0.86891961, + "num_input_tokens_seen": 22546335, + "step": 1054, + "time_per_iteration": 2.6102490425109863 + }, + { + "auxiliary_loss_clip": 0.0116997, + "auxiliary_loss_mlp": 0.01063845, + "balance_loss_clip": 1.05432224, + "balance_loss_mlp": 1.03877497, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 2.0776474324760117, + "language_loss": 0.85550886, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87784696, + "num_input_tokens_seen": 22563885, + "step": 1055, + "time_per_iteration": 2.5325145721435547 + }, + { + "auxiliary_loss_clip": 0.01161531, + "auxiliary_loss_mlp": 0.01064731, + "balance_loss_clip": 1.05427635, + "balance_loss_mlp": 1.03997159, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.4658233574034476, + "language_loss": 0.80740762, + "learning_rate": 3.988246856230734e-06, + "loss": 0.82967019, + "num_input_tokens_seen": 22583035, + "step": 1056, + "time_per_iteration": 2.623706579208374 + }, + { + "auxiliary_loss_clip": 0.01154617, + "auxiliary_loss_mlp": 0.01059581, + "balance_loss_clip": 1.06178474, + "balance_loss_mlp": 1.032866, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 4.222766734071776, + "language_loss": 0.81305873, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83520067, + "num_input_tokens_seen": 22605055, + "step": 1057, + "time_per_iteration": 2.8047425746917725 + }, + { + "auxiliary_loss_clip": 0.01196525, + "auxiliary_loss_mlp": 0.01061224, + "balance_loss_clip": 1.15195322, + "balance_loss_mlp": 1.03828776, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 3.6279079620434347, + "language_loss": 0.83423674, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85681427, + "num_input_tokens_seen": 22623760, + "step": 1058, + "time_per_iteration": 2.8255083560943604 + }, + { + "auxiliary_loss_clip": 0.01188552, + "auxiliary_loss_mlp": 0.01063326, + "balance_loss_clip": 1.06800032, + "balance_loss_mlp": 1.03687358, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 2.4874884186023927, + "language_loss": 0.87744606, + "learning_rate": 3.988120036328651e-06, + "loss": 0.89996487, + "num_input_tokens_seen": 22643000, + "step": 1059, + "time_per_iteration": 2.6268696784973145 + }, + { + "auxiliary_loss_clip": 0.01165884, + "auxiliary_loss_mlp": 0.01064609, + "balance_loss_clip": 1.0592078, + "balance_loss_mlp": 1.03875279, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.2562275321471947, + "language_loss": 0.91517717, + "learning_rate": 3.988077612246394e-06, + "loss": 0.93748206, + "num_input_tokens_seen": 22660460, + "step": 1060, + "time_per_iteration": 2.599611520767212 + }, + { + "auxiliary_loss_clip": 0.01190239, + "auxiliary_loss_mlp": 0.01064788, + "balance_loss_clip": 1.07670641, + "balance_loss_mlp": 1.03888416, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 2.008884687205767, + "language_loss": 0.86711103, + "learning_rate": 3.988035112776035e-06, + "loss": 0.88966131, + "num_input_tokens_seen": 22679270, + "step": 1061, + "time_per_iteration": 2.5593855381011963 + }, + { + "auxiliary_loss_clip": 0.01199466, + "auxiliary_loss_mlp": 0.01060984, + "balance_loss_clip": 1.06607914, + "balance_loss_mlp": 1.03459072, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.1716157622158554, + "language_loss": 0.77480567, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79741013, + "num_input_tokens_seen": 22699330, + "step": 1062, + "time_per_iteration": 2.634864091873169 + }, + { + "auxiliary_loss_clip": 0.01181947, + "auxiliary_loss_mlp": 0.01062999, + "balance_loss_clip": 1.0666213, + "balance_loss_mlp": 1.03919315, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.361444996122634, + "language_loss": 0.86546463, + "learning_rate": 3.987949887677459e-06, + "loss": 0.88791406, + "num_input_tokens_seen": 22717945, + "step": 1063, + "time_per_iteration": 2.621023654937744 + }, + { + "auxiliary_loss_clip": 0.0121612, + "auxiliary_loss_mlp": 0.01059545, + "balance_loss_clip": 1.06560588, + "balance_loss_mlp": 1.0348444, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.099732942199714, + "language_loss": 0.80368745, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82644415, + "num_input_tokens_seen": 22736790, + "step": 1064, + "time_per_iteration": 2.497345209121704 + }, + { + "auxiliary_loss_clip": 0.0120656, + "auxiliary_loss_mlp": 0.01066726, + "balance_loss_clip": 1.0665164, + "balance_loss_mlp": 1.04077411, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.3856589810131514, + "language_loss": 0.84151387, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86424673, + "num_input_tokens_seen": 22754745, + "step": 1065, + "time_per_iteration": 2.5229530334472656 + }, + { + "auxiliary_loss_clip": 0.01175216, + "auxiliary_loss_mlp": 0.01054766, + "balance_loss_clip": 1.06935275, + "balance_loss_mlp": 1.0317347, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 3.4399674580425756, + "language_loss": 0.68509996, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70739979, + "num_input_tokens_seen": 22776780, + "step": 1066, + "time_per_iteration": 2.7499170303344727 + }, + { + "auxiliary_loss_clip": 0.01219783, + "auxiliary_loss_mlp": 0.01072257, + "balance_loss_clip": 1.07190156, + "balance_loss_mlp": 1.04640031, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 3.88002229464114, + "language_loss": 0.90236521, + "learning_rate": 3.987778532894181e-06, + "loss": 0.92528564, + "num_input_tokens_seen": 22793915, + "step": 1067, + "time_per_iteration": 2.5137240886688232 + }, + { + "auxiliary_loss_clip": 0.0119281, + "auxiliary_loss_mlp": 0.01064272, + "balance_loss_clip": 1.06861818, + "balance_loss_mlp": 1.0413835, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.2452089177674397, + "language_loss": 0.83321106, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85578185, + "num_input_tokens_seen": 22812670, + "step": 1068, + "time_per_iteration": 2.559892177581787 + }, + { + "auxiliary_loss_clip": 0.01190157, + "auxiliary_loss_mlp": 0.01060091, + "balance_loss_clip": 1.0733515, + "balance_loss_mlp": 1.0370003, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 3.286705454194583, + "language_loss": 0.89712274, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91962522, + "num_input_tokens_seen": 22832440, + "step": 1069, + "time_per_iteration": 2.6122753620147705 + }, + { + "auxiliary_loss_clip": 0.01246881, + "auxiliary_loss_mlp": 0.01075869, + "balance_loss_clip": 1.16698861, + "balance_loss_mlp": 1.0506686, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.882622116157054, + "language_loss": 0.95839447, + "learning_rate": 3.987649225345056e-06, + "loss": 0.98162192, + "num_input_tokens_seen": 22845495, + "step": 1070, + "time_per_iteration": 2.5294783115386963 + }, + { + "auxiliary_loss_clip": 0.01148763, + "auxiliary_loss_mlp": 0.01055262, + "balance_loss_clip": 1.06830287, + "balance_loss_mlp": 1.02982247, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.8550227522505478, + "language_loss": 0.88202214, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90406233, + "num_input_tokens_seen": 22865390, + "step": 1071, + "time_per_iteration": 2.6722991466522217 + }, + { + "auxiliary_loss_clip": 0.01160253, + "auxiliary_loss_mlp": 0.01052592, + "balance_loss_clip": 1.06161022, + "balance_loss_mlp": 1.02858365, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.7050556204732423, + "language_loss": 0.75824261, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78037107, + "num_input_tokens_seen": 22885495, + "step": 1072, + "time_per_iteration": 2.589935064315796 + }, + { + "auxiliary_loss_clip": 0.01178975, + "auxiliary_loss_mlp": 0.01064652, + "balance_loss_clip": 1.06774855, + "balance_loss_mlp": 1.03840184, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 1.9482781333518797, + "language_loss": 0.80642629, + "learning_rate": 3.987519239449226e-06, + "loss": 0.82886261, + "num_input_tokens_seen": 22904845, + "step": 1073, + "time_per_iteration": 2.6098790168762207 + }, + { + "auxiliary_loss_clip": 0.01197722, + "auxiliary_loss_mlp": 0.01056891, + "balance_loss_clip": 1.06617081, + "balance_loss_mlp": 1.03389502, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.751247334258551, + "language_loss": 0.80249214, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82503831, + "num_input_tokens_seen": 22925940, + "step": 1074, + "time_per_iteration": 3.993777275085449 + }, + { + "auxiliary_loss_clip": 0.01177586, + "auxiliary_loss_mlp": 0.01063461, + "balance_loss_clip": 1.06528497, + "balance_loss_mlp": 1.03914261, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 2.156404738549543, + "language_loss": 0.79318714, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81559765, + "num_input_tokens_seen": 22944375, + "step": 1075, + "time_per_iteration": 3.9172990322113037 + }, + { + "auxiliary_loss_clip": 0.01167003, + "auxiliary_loss_mlp": 0.01061363, + "balance_loss_clip": 1.05438328, + "balance_loss_mlp": 1.03814137, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 2.467688460909924, + "language_loss": 0.8743701, + "learning_rate": 3.987388575251055e-06, + "loss": 0.89665377, + "num_input_tokens_seen": 22959145, + "step": 1076, + "time_per_iteration": 2.571411609649658 + }, + { + "auxiliary_loss_clip": 0.01196198, + "auxiliary_loss_mlp": 0.01055286, + "balance_loss_clip": 1.07145369, + "balance_loss_mlp": 1.03234971, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 1.9575562840587992, + "language_loss": 0.81004441, + "learning_rate": 3.98734486979218e-06, + "loss": 0.83255935, + "num_input_tokens_seen": 22978100, + "step": 1077, + "time_per_iteration": 2.522670030593872 + }, + { + "auxiliary_loss_clip": 0.01200342, + "auxiliary_loss_mlp": 0.01063584, + "balance_loss_clip": 1.06635761, + "balance_loss_mlp": 1.03888345, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.674507698973867, + "language_loss": 0.91557676, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93821603, + "num_input_tokens_seen": 22997285, + "step": 1078, + "time_per_iteration": 2.5893747806549072 + }, + { + "auxiliary_loss_clip": 0.01225489, + "auxiliary_loss_mlp": 0.0106041, + "balance_loss_clip": 1.07109666, + "balance_loss_mlp": 1.03641284, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.7377853121534392, + "language_loss": 0.79086232, + "learning_rate": 3.987257232795137e-06, + "loss": 0.8137213, + "num_input_tokens_seen": 23016285, + "step": 1079, + "time_per_iteration": 2.487137794494629 + }, + { + "auxiliary_loss_clip": 0.01148927, + "auxiliary_loss_mlp": 0.01059818, + "balance_loss_clip": 1.05222917, + "balance_loss_mlp": 1.03619123, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.3035097649758867, + "language_loss": 0.69664478, + "learning_rate": 3.987213301260294e-06, + "loss": 0.71873224, + "num_input_tokens_seen": 23036420, + "step": 1080, + "time_per_iteration": 4.006195783615112 + }, + { + "auxiliary_loss_clip": 0.01174828, + "auxiliary_loss_mlp": 0.0106691, + "balance_loss_clip": 1.06694889, + "balance_loss_mlp": 1.04072022, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.8816511155085698, + "language_loss": 0.7250495, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74746692, + "num_input_tokens_seen": 23056945, + "step": 1081, + "time_per_iteration": 4.078016996383667 + }, + { + "auxiliary_loss_clip": 0.01191914, + "auxiliary_loss_mlp": 0.01060806, + "balance_loss_clip": 1.15285718, + "balance_loss_mlp": 1.03577244, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.976536428341487, + "language_loss": 0.84134829, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86387551, + "num_input_tokens_seen": 23074940, + "step": 1082, + "time_per_iteration": 2.611605405807495 + }, + { + "auxiliary_loss_clip": 0.01207984, + "auxiliary_loss_mlp": 0.01059528, + "balance_loss_clip": 1.07319331, + "balance_loss_mlp": 1.03489923, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.3317014200784376, + "language_loss": 0.82401091, + "learning_rate": 3.987081054530478e-06, + "loss": 0.84668601, + "num_input_tokens_seen": 23093420, + "step": 1083, + "time_per_iteration": 2.5405869483947754 + }, + { + "auxiliary_loss_clip": 0.01174195, + "auxiliary_loss_mlp": 0.01060445, + "balance_loss_clip": 1.07356143, + "balance_loss_mlp": 1.03513706, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.9325057626298663, + "language_loss": 0.79011863, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81246501, + "num_input_tokens_seen": 23111550, + "step": 1084, + "time_per_iteration": 2.633962392807007 + }, + { + "auxiliary_loss_clip": 0.01176586, + "auxiliary_loss_mlp": 0.01061196, + "balance_loss_clip": 1.06325483, + "balance_loss_mlp": 1.03722298, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 1.9807720031509701, + "language_loss": 0.66130853, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68368638, + "num_input_tokens_seen": 23130335, + "step": 1085, + "time_per_iteration": 2.623504638671875 + }, + { + "auxiliary_loss_clip": 0.0117506, + "auxiliary_loss_mlp": 0.0106282, + "balance_loss_clip": 1.06515741, + "balance_loss_mlp": 1.03928792, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 2.093824201519614, + "language_loss": 0.76590407, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.78828281, + "num_input_tokens_seen": 23152380, + "step": 1086, + "time_per_iteration": 2.5879526138305664 + }, + { + "auxiliary_loss_clip": 0.01199623, + "auxiliary_loss_mlp": 0.01055929, + "balance_loss_clip": 1.06513119, + "balance_loss_mlp": 1.03124118, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.3264679911228514, + "language_loss": 0.85129821, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87385368, + "num_input_tokens_seen": 23171630, + "step": 1087, + "time_per_iteration": 2.511742353439331 + }, + { + "auxiliary_loss_clip": 0.01188605, + "auxiliary_loss_mlp": 0.01057494, + "balance_loss_clip": 1.06811059, + "balance_loss_mlp": 1.03389096, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 1.9627707746288217, + "language_loss": 0.77970618, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80216712, + "num_input_tokens_seen": 23192520, + "step": 1088, + "time_per_iteration": 2.616218328475952 + }, + { + "auxiliary_loss_clip": 0.01195006, + "auxiliary_loss_mlp": 0.01062739, + "balance_loss_clip": 1.06818581, + "balance_loss_mlp": 1.03980327, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 2.4399243989732837, + "language_loss": 0.71390957, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73648703, + "num_input_tokens_seen": 23210710, + "step": 1089, + "time_per_iteration": 2.5650622844696045 + }, + { + "auxiliary_loss_clip": 0.0117592, + "auxiliary_loss_mlp": 0.00807978, + "balance_loss_clip": 1.06647468, + "balance_loss_mlp": 1.04776788, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 2.000081489051539, + "language_loss": 0.85323596, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.87307489, + "num_input_tokens_seen": 23230305, + "step": 1090, + "time_per_iteration": 2.5791780948638916 + }, + { + "auxiliary_loss_clip": 0.01214988, + "auxiliary_loss_mlp": 0.01053428, + "balance_loss_clip": 1.06873822, + "balance_loss_mlp": 1.03012216, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 1.692122282020335, + "language_loss": 0.71675372, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.73943788, + "num_input_tokens_seen": 23249015, + "step": 1091, + "time_per_iteration": 2.524648427963257 + }, + { + "auxiliary_loss_clip": 0.0112718, + "auxiliary_loss_mlp": 0.01059641, + "balance_loss_clip": 1.06338203, + "balance_loss_mlp": 1.03488088, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.378314415057797, + "language_loss": 0.82636607, + "learning_rate": 3.986680245605936e-06, + "loss": 0.84823424, + "num_input_tokens_seen": 23265105, + "step": 1092, + "time_per_iteration": 2.8152034282684326 + }, + { + "auxiliary_loss_clip": 0.01217887, + "auxiliary_loss_mlp": 0.01059727, + "balance_loss_clip": 1.06653273, + "balance_loss_mlp": 1.03435969, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 1.9884923854961465, + "language_loss": 0.7141096, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73688567, + "num_input_tokens_seen": 23283950, + "step": 1093, + "time_per_iteration": 2.648366928100586 + }, + { + "auxiliary_loss_clip": 0.01190877, + "auxiliary_loss_mlp": 0.01062309, + "balance_loss_clip": 1.06941783, + "balance_loss_mlp": 1.03684616, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.629856260676979, + "language_loss": 0.87775266, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90028453, + "num_input_tokens_seen": 23305005, + "step": 1094, + "time_per_iteration": 2.611783504486084 + }, + { + "auxiliary_loss_clip": 0.01196522, + "auxiliary_loss_mlp": 0.01065223, + "balance_loss_clip": 1.07000995, + "balance_loss_mlp": 1.03853226, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.4986106042245428, + "language_loss": 0.81367749, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83629489, + "num_input_tokens_seen": 23323220, + "step": 1095, + "time_per_iteration": 2.6105403900146484 + }, + { + "auxiliary_loss_clip": 0.01171226, + "auxiliary_loss_mlp": 0.01057199, + "balance_loss_clip": 1.0698247, + "balance_loss_mlp": 1.03503776, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 1.9463469050724667, + "language_loss": 0.70004129, + "learning_rate": 3.986500149519811e-06, + "loss": 0.7223255, + "num_input_tokens_seen": 23342235, + "step": 1096, + "time_per_iteration": 2.6628947257995605 + }, + { + "auxiliary_loss_clip": 0.01206153, + "auxiliary_loss_mlp": 0.01072204, + "balance_loss_clip": 1.07087755, + "balance_loss_mlp": 1.04739714, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 1.95808333764464, + "language_loss": 0.77587497, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79865849, + "num_input_tokens_seen": 23363680, + "step": 1097, + "time_per_iteration": 2.5664379596710205 + }, + { + "auxiliary_loss_clip": 0.0121717, + "auxiliary_loss_mlp": 0.01063793, + "balance_loss_clip": 1.06849146, + "balance_loss_mlp": 1.04049933, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 1.9999713666033208, + "language_loss": 0.78775114, + "learning_rate": 3.986409649500203e-06, + "loss": 0.81056082, + "num_input_tokens_seen": 23385590, + "step": 1098, + "time_per_iteration": 2.6263554096221924 + }, + { + "auxiliary_loss_clip": 0.01254995, + "auxiliary_loss_mlp": 0.01072959, + "balance_loss_clip": 1.16318274, + "balance_loss_mlp": 1.04735315, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 2.8455302558143747, + "language_loss": 0.81899405, + "learning_rate": 3.986364286502261e-06, + "loss": 0.84227365, + "num_input_tokens_seen": 23402945, + "step": 1099, + "time_per_iteration": 2.5424156188964844 + }, + { + "auxiliary_loss_clip": 0.01183987, + "auxiliary_loss_mlp": 0.01055729, + "balance_loss_clip": 1.06312323, + "balance_loss_mlp": 1.03059983, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 1.940237122032918, + "language_loss": 0.82582057, + "learning_rate": 3.986318848181186e-06, + "loss": 0.84821767, + "num_input_tokens_seen": 23421410, + "step": 1100, + "time_per_iteration": 2.5277209281921387 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.01060321, + "balance_loss_clip": 1.07192683, + "balance_loss_mlp": 1.03614509, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.3698108715995128, + "language_loss": 0.73673934, + "learning_rate": 3.986273334538702e-06, + "loss": 0.75926256, + "num_input_tokens_seen": 23438870, + "step": 1101, + "time_per_iteration": 2.545546770095825 + }, + { + "auxiliary_loss_clip": 0.01201792, + "auxiliary_loss_mlp": 0.01062904, + "balance_loss_clip": 1.06455195, + "balance_loss_mlp": 1.03851366, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 4.564811369163246, + "language_loss": 0.86143339, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88408029, + "num_input_tokens_seen": 23456975, + "step": 1102, + "time_per_iteration": 2.5051393508911133 + }, + { + "auxiliary_loss_clip": 0.01189575, + "auxiliary_loss_mlp": 0.0105959, + "balance_loss_clip": 1.06624293, + "balance_loss_mlp": 1.03508103, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.3838972733759025, + "language_loss": 0.81932425, + "learning_rate": 3.98618208129641e-06, + "loss": 0.84181589, + "num_input_tokens_seen": 23473440, + "step": 1103, + "time_per_iteration": 2.5489401817321777 + }, + { + "auxiliary_loss_clip": 0.01204955, + "auxiliary_loss_mlp": 0.00821134, + "balance_loss_clip": 1.07047033, + "balance_loss_mlp": 1.07102799, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 1.8494668546361632, + "language_loss": 0.82212621, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84238708, + "num_input_tokens_seen": 23493880, + "step": 1104, + "time_per_iteration": 2.569284677505493 + }, + { + "auxiliary_loss_clip": 0.0117171, + "auxiliary_loss_mlp": 0.01050948, + "balance_loss_clip": 1.06535995, + "balance_loss_mlp": 1.02618861, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.674453414127921, + "language_loss": 0.80616009, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82838666, + "num_input_tokens_seen": 23514920, + "step": 1105, + "time_per_iteration": 2.650836229324341 + }, + { + "auxiliary_loss_clip": 0.01185983, + "auxiliary_loss_mlp": 0.01061431, + "balance_loss_clip": 1.07025373, + "balance_loss_mlp": 1.03833985, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 2.0041114872029078, + "language_loss": 0.96829402, + "learning_rate": 3.986044636565639e-06, + "loss": 0.99076807, + "num_input_tokens_seen": 23531635, + "step": 1106, + "time_per_iteration": 2.545681953430176 + }, + { + "auxiliary_loss_clip": 0.0120748, + "auxiliary_loss_mlp": 0.01060144, + "balance_loss_clip": 1.06974435, + "balance_loss_mlp": 1.03524113, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 2.077196130817546, + "language_loss": 0.82625329, + "learning_rate": 3.985998671031039e-06, + "loss": 0.84892952, + "num_input_tokens_seen": 23551020, + "step": 1107, + "time_per_iteration": 2.55208683013916 + }, + { + "auxiliary_loss_clip": 0.01147928, + "auxiliary_loss_mlp": 0.01014782, + "balance_loss_clip": 1.13096881, + "balance_loss_mlp": 1.01056218, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.7992800992168448, + "language_loss": 0.56762016, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58924723, + "num_input_tokens_seen": 23610675, + "step": 1108, + "time_per_iteration": 3.0996246337890625 + }, + { + "auxiliary_loss_clip": 0.01190544, + "auxiliary_loss_mlp": 0.01060948, + "balance_loss_clip": 1.06359744, + "balance_loss_mlp": 1.03536594, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 2.792563791525604, + "language_loss": 0.72755802, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.75007296, + "num_input_tokens_seen": 23628710, + "step": 1109, + "time_per_iteration": 2.570469379425049 + }, + { + "auxiliary_loss_clip": 0.01159147, + "auxiliary_loss_mlp": 0.01064347, + "balance_loss_clip": 1.0659076, + "balance_loss_mlp": 1.03905106, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 2.1865432226292354, + "language_loss": 0.78031182, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80254674, + "num_input_tokens_seen": 23649160, + "step": 1110, + "time_per_iteration": 2.6239068508148193 + }, + { + "auxiliary_loss_clip": 0.01163757, + "auxiliary_loss_mlp": 0.01056316, + "balance_loss_clip": 1.06802034, + "balance_loss_mlp": 1.03258193, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.0186322653832978, + "language_loss": 0.71698946, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73919022, + "num_input_tokens_seen": 23671995, + "step": 1111, + "time_per_iteration": 2.7007100582122803 + }, + { + "auxiliary_loss_clip": 0.01177219, + "auxiliary_loss_mlp": 0.01069141, + "balance_loss_clip": 1.06778455, + "balance_loss_mlp": 1.04513264, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 2.174539858357906, + "language_loss": 0.78634948, + "learning_rate": 3.985767713753971e-06, + "loss": 0.8088131, + "num_input_tokens_seen": 23690705, + "step": 1112, + "time_per_iteration": 2.665090560913086 + }, + { + "auxiliary_loss_clip": 0.01150751, + "auxiliary_loss_mlp": 0.0106445, + "balance_loss_clip": 1.05291378, + "balance_loss_mlp": 1.0402627, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.400443235003305, + "language_loss": 0.79269671, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81484878, + "num_input_tokens_seen": 23709990, + "step": 1113, + "time_per_iteration": 4.017433166503906 + }, + { + "auxiliary_loss_clip": 0.01157597, + "auxiliary_loss_mlp": 0.01061791, + "balance_loss_clip": 1.06595159, + "balance_loss_mlp": 1.03847361, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 1.7761953581480479, + "language_loss": 0.82650596, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84869981, + "num_input_tokens_seen": 23728485, + "step": 1114, + "time_per_iteration": 2.66770601272583 + }, + { + "auxiliary_loss_clip": 0.01069168, + "auxiliary_loss_mlp": 0.01016101, + "balance_loss_clip": 1.0449748, + "balance_loss_mlp": 1.01166618, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8409293515507801, + "language_loss": 0.58100414, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60185683, + "num_input_tokens_seen": 23786650, + "step": 1115, + "time_per_iteration": 4.631579637527466 + }, + { + "auxiliary_loss_clip": 0.01187378, + "auxiliary_loss_mlp": 0.01064851, + "balance_loss_clip": 1.07080984, + "balance_loss_mlp": 1.03887546, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.8642662220929167, + "language_loss": 0.91588414, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93840653, + "num_input_tokens_seen": 23802555, + "step": 1116, + "time_per_iteration": 2.5949795246124268 + }, + { + "auxiliary_loss_clip": 0.01177749, + "auxiliary_loss_mlp": 0.00816988, + "balance_loss_clip": 1.07026303, + "balance_loss_mlp": 1.06006992, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 1.8080041763185188, + "language_loss": 0.87371922, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89366663, + "num_input_tokens_seen": 23822945, + "step": 1117, + "time_per_iteration": 2.703951358795166 + }, + { + "auxiliary_loss_clip": 0.01093954, + "auxiliary_loss_mlp": 0.01005615, + "balance_loss_clip": 1.03779864, + "balance_loss_mlp": 1.00132322, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.8827841041776882, + "language_loss": 0.59844798, + "learning_rate": 3.985488080124218e-06, + "loss": 0.61944371, + "num_input_tokens_seen": 23874075, + "step": 1118, + "time_per_iteration": 4.475669860839844 + }, + { + "auxiliary_loss_clip": 0.01186515, + "auxiliary_loss_mlp": 0.01056698, + "balance_loss_clip": 1.0621165, + "balance_loss_mlp": 1.03265321, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.4774748039627883, + "language_loss": 0.8342942, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85672629, + "num_input_tokens_seen": 23889720, + "step": 1119, + "time_per_iteration": 2.535940170288086 + }, + { + "auxiliary_loss_clip": 0.01190778, + "auxiliary_loss_mlp": 0.01057783, + "balance_loss_clip": 1.06962109, + "balance_loss_mlp": 1.03512156, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 2.211687750310762, + "language_loss": 0.84851825, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87100393, + "num_input_tokens_seen": 23909385, + "step": 1120, + "time_per_iteration": 4.017683267593384 + }, + { + "auxiliary_loss_clip": 0.01220746, + "auxiliary_loss_mlp": 0.01066115, + "balance_loss_clip": 1.07265472, + "balance_loss_mlp": 1.04162955, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 1.9199342718461165, + "language_loss": 0.78822112, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81108975, + "num_input_tokens_seen": 23926830, + "step": 1121, + "time_per_iteration": 2.4743101596832275 + }, + { + "auxiliary_loss_clip": 0.01089512, + "auxiliary_loss_mlp": 0.01007833, + "balance_loss_clip": 1.0395987, + "balance_loss_mlp": 1.00409031, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7557446687332641, + "language_loss": 0.58347619, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60444963, + "num_input_tokens_seen": 23992640, + "step": 1122, + "time_per_iteration": 3.2536373138427734 + }, + { + "auxiliary_loss_clip": 0.01158287, + "auxiliary_loss_mlp": 0.01064791, + "balance_loss_clip": 1.06976271, + "balance_loss_mlp": 1.04041219, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 1.9169359993617905, + "language_loss": 0.71557915, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73780996, + "num_input_tokens_seen": 24011135, + "step": 1123, + "time_per_iteration": 2.642385721206665 + }, + { + "auxiliary_loss_clip": 0.01155322, + "auxiliary_loss_mlp": 0.01062979, + "balance_loss_clip": 1.06660748, + "balance_loss_mlp": 1.03478563, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 1.8112590089396507, + "language_loss": 0.78821993, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81040293, + "num_input_tokens_seen": 24030695, + "step": 1124, + "time_per_iteration": 2.625953435897827 + }, + { + "auxiliary_loss_clip": 0.01199398, + "auxiliary_loss_mlp": 0.01053047, + "balance_loss_clip": 1.06761551, + "balance_loss_mlp": 1.03032601, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.3483394470155887, + "language_loss": 0.71561515, + "learning_rate": 3.985158415226128e-06, + "loss": 0.73813963, + "num_input_tokens_seen": 24050680, + "step": 1125, + "time_per_iteration": 2.5361976623535156 + }, + { + "auxiliary_loss_clip": 0.01175773, + "auxiliary_loss_mlp": 0.01071265, + "balance_loss_clip": 1.07004786, + "balance_loss_mlp": 1.04571807, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.7506463679843507, + "language_loss": 0.81051481, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83298516, + "num_input_tokens_seen": 24067205, + "step": 1126, + "time_per_iteration": 2.596774101257324 + }, + { + "auxiliary_loss_clip": 0.01081984, + "auxiliary_loss_mlp": 0.01006116, + "balance_loss_clip": 1.03780389, + "balance_loss_mlp": 1.00213432, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.7974349759856977, + "language_loss": 0.59766507, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61854607, + "num_input_tokens_seen": 24131320, + "step": 1127, + "time_per_iteration": 3.1434102058410645 + }, + { + "auxiliary_loss_clip": 0.01212644, + "auxiliary_loss_mlp": 0.01054407, + "balance_loss_clip": 1.06904054, + "balance_loss_mlp": 1.03068447, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.1091829590563007, + "language_loss": 0.81434685, + "learning_rate": 3.985016001072925e-06, + "loss": 0.8370173, + "num_input_tokens_seen": 24149930, + "step": 1128, + "time_per_iteration": 2.5007717609405518 + }, + { + "auxiliary_loss_clip": 0.01171031, + "auxiliary_loss_mlp": 0.01049269, + "balance_loss_clip": 1.07150316, + "balance_loss_mlp": 1.02345979, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 2.0353338945221733, + "language_loss": 0.75527853, + "learning_rate": 3.984968379142109e-06, + "loss": 0.77748156, + "num_input_tokens_seen": 24169590, + "step": 1129, + "time_per_iteration": 2.5951085090637207 + }, + { + "auxiliary_loss_clip": 0.0113329, + "auxiliary_loss_mlp": 0.01060341, + "balance_loss_clip": 1.06761754, + "balance_loss_mlp": 1.03530741, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.9157595062371864, + "language_loss": 0.72491419, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74685049, + "num_input_tokens_seen": 24189965, + "step": 1130, + "time_per_iteration": 2.8679919242858887 + }, + { + "auxiliary_loss_clip": 0.01166104, + "auxiliary_loss_mlp": 0.01059441, + "balance_loss_clip": 1.06505978, + "balance_loss_mlp": 1.03489542, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.607843312225482, + "language_loss": 0.80783767, + "learning_rate": 3.984872909471688e-06, + "loss": 0.83009315, + "num_input_tokens_seen": 24208045, + "step": 1131, + "time_per_iteration": 2.7554352283477783 + }, + { + "auxiliary_loss_clip": 0.01200847, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_clip": 1.06567776, + "balance_loss_mlp": 1.04184222, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.09101309082411, + "language_loss": 0.80399406, + "learning_rate": 3.984825061735701e-06, + "loss": 0.82666326, + "num_input_tokens_seen": 24223805, + "step": 1132, + "time_per_iteration": 2.486359119415283 + }, + { + "auxiliary_loss_clip": 0.01184655, + "auxiliary_loss_mlp": 0.01063268, + "balance_loss_clip": 1.07271254, + "balance_loss_mlp": 1.03897333, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.5192230891698713, + "language_loss": 0.63448668, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65696585, + "num_input_tokens_seen": 24249475, + "step": 1133, + "time_per_iteration": 2.8064792156219482 + }, + { + "auxiliary_loss_clip": 0.01125387, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_clip": 1.04968143, + "balance_loss_mlp": 1.03256381, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.003233265821166, + "language_loss": 0.7454046, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.76725817, + "num_input_tokens_seen": 24267980, + "step": 1134, + "time_per_iteration": 2.624812364578247 + }, + { + "auxiliary_loss_clip": 0.01179735, + "auxiliary_loss_mlp": 0.00799421, + "balance_loss_clip": 1.06837308, + "balance_loss_mlp": 1.02759528, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.1688591520864264, + "language_loss": 0.87023634, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89002788, + "num_input_tokens_seen": 24286805, + "step": 1135, + "time_per_iteration": 2.585642099380493 + }, + { + "auxiliary_loss_clip": 0.01183329, + "auxiliary_loss_mlp": 0.00796706, + "balance_loss_clip": 1.06317592, + "balance_loss_mlp": 1.02166605, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.6761458311863247, + "language_loss": 0.78343618, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80323648, + "num_input_tokens_seen": 24305855, + "step": 1136, + "time_per_iteration": 2.566164493560791 + }, + { + "auxiliary_loss_clip": 0.01193156, + "auxiliary_loss_mlp": 0.01069832, + "balance_loss_clip": 1.06738663, + "balance_loss_mlp": 1.04467845, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 2.340356846484295, + "language_loss": 0.84292316, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86555302, + "num_input_tokens_seen": 24326535, + "step": 1137, + "time_per_iteration": 2.6787359714508057 + }, + { + "auxiliary_loss_clip": 0.01165026, + "auxiliary_loss_mlp": 0.01058841, + "balance_loss_clip": 1.06723642, + "balance_loss_mlp": 1.03470111, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.0964565607321455, + "language_loss": 0.78649384, + "learning_rate": 3.984536394823418e-06, + "loss": 0.80873251, + "num_input_tokens_seen": 24345810, + "step": 1138, + "time_per_iteration": 2.6176974773406982 + }, + { + "auxiliary_loss_clip": 0.01213634, + "auxiliary_loss_mlp": 0.01056506, + "balance_loss_clip": 1.06674552, + "balance_loss_mlp": 1.03179431, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 3.13388214743193, + "language_loss": 0.85622025, + "learning_rate": 3.984488020272336e-06, + "loss": 0.87892163, + "num_input_tokens_seen": 24366095, + "step": 1139, + "time_per_iteration": 2.508000135421753 + }, + { + "auxiliary_loss_clip": 0.01163463, + "auxiliary_loss_mlp": 0.01066501, + "balance_loss_clip": 1.06544924, + "balance_loss_mlp": 1.04006052, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 1.7204646785224804, + "language_loss": 0.74888623, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77118576, + "num_input_tokens_seen": 24388665, + "step": 1140, + "time_per_iteration": 2.7564125061035156 + }, + { + "auxiliary_loss_clip": 0.0118914, + "auxiliary_loss_mlp": 0.00800851, + "balance_loss_clip": 1.06498921, + "balance_loss_mlp": 1.03154659, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.45191140560547, + "language_loss": 0.68056405, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70046401, + "num_input_tokens_seen": 24407705, + "step": 1141, + "time_per_iteration": 2.640537977218628 + }, + { + "auxiliary_loss_clip": 0.01204433, + "auxiliary_loss_mlp": 0.01066097, + "balance_loss_clip": 1.06606078, + "balance_loss_mlp": 1.04024053, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 1.9656488939580798, + "language_loss": 0.79275274, + "learning_rate": 3.984342445114538e-06, + "loss": 0.81545806, + "num_input_tokens_seen": 24428390, + "step": 1142, + "time_per_iteration": 2.570099353790283 + }, + { + "auxiliary_loss_clip": 0.01189556, + "auxiliary_loss_mlp": 0.01060456, + "balance_loss_clip": 1.0652585, + "balance_loss_mlp": 1.03663826, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 2.125905014405525, + "language_loss": 0.69020647, + "learning_rate": 3.984293769566553e-06, + "loss": 0.71270663, + "num_input_tokens_seen": 24450810, + "step": 1143, + "time_per_iteration": 2.620107650756836 + }, + { + "auxiliary_loss_clip": 0.0117239, + "auxiliary_loss_mlp": 0.01059341, + "balance_loss_clip": 1.05389881, + "balance_loss_mlp": 1.03746605, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 1.6917533243342024, + "language_loss": 0.74627882, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76859611, + "num_input_tokens_seen": 24469965, + "step": 1144, + "time_per_iteration": 2.562654495239258 + }, + { + "auxiliary_loss_clip": 0.0119792, + "auxiliary_loss_mlp": 0.01061626, + "balance_loss_clip": 1.0696485, + "balance_loss_mlp": 1.03625786, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.984439777550312, + "language_loss": 0.92093068, + "learning_rate": 3.984196192738577e-06, + "loss": 0.94352621, + "num_input_tokens_seen": 24486370, + "step": 1145, + "time_per_iteration": 2.471287250518799 + }, + { + "auxiliary_loss_clip": 0.01214304, + "auxiliary_loss_mlp": 0.01066132, + "balance_loss_clip": 1.0649333, + "balance_loss_mlp": 1.04107475, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.4757197178980532, + "language_loss": 0.8224296, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84523398, + "num_input_tokens_seen": 24503780, + "step": 1146, + "time_per_iteration": 2.5051491260528564 + }, + { + "auxiliary_loss_clip": 0.01209052, + "auxiliary_loss_mlp": 0.01063667, + "balance_loss_clip": 1.06731439, + "balance_loss_mlp": 1.04087377, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.5202236995326737, + "language_loss": 0.85243917, + "learning_rate": 3.98409831494693e-06, + "loss": 0.8751663, + "num_input_tokens_seen": 24522320, + "step": 1147, + "time_per_iteration": 2.4951117038726807 + }, + { + "auxiliary_loss_clip": 0.01162497, + "auxiliary_loss_mlp": 0.01064921, + "balance_loss_clip": 1.06436813, + "balance_loss_mlp": 1.04087651, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 2.3113134841473237, + "language_loss": 0.8587389, + "learning_rate": 3.984049263194367e-06, + "loss": 0.8810131, + "num_input_tokens_seen": 24540445, + "step": 1148, + "time_per_iteration": 2.592421531677246 + }, + { + "auxiliary_loss_clip": 0.01180267, + "auxiliary_loss_mlp": 0.01064534, + "balance_loss_clip": 1.06626916, + "balance_loss_mlp": 1.03891587, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.3341387100873874, + "language_loss": 0.69488913, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.71733713, + "num_input_tokens_seen": 24557105, + "step": 1149, + "time_per_iteration": 2.554088830947876 + }, + { + "auxiliary_loss_clip": 0.01214463, + "auxiliary_loss_mlp": 0.01053825, + "balance_loss_clip": 1.06594753, + "balance_loss_mlp": 1.02907753, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 1.8541888159764939, + "language_loss": 0.83762074, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86030358, + "num_input_tokens_seen": 24578240, + "step": 1150, + "time_per_iteration": 2.526684045791626 + }, + { + "auxiliary_loss_clip": 0.0118867, + "auxiliary_loss_mlp": 0.0105958, + "balance_loss_clip": 1.06788731, + "balance_loss_mlp": 1.03521347, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 4.312248316102627, + "language_loss": 0.81371701, + "learning_rate": 3.983901656532052e-06, + "loss": 0.83619946, + "num_input_tokens_seen": 24593585, + "step": 1151, + "time_per_iteration": 3.8767220973968506 + }, + { + "auxiliary_loss_clip": 0.01210657, + "auxiliary_loss_mlp": 0.01061027, + "balance_loss_clip": 1.06765771, + "balance_loss_mlp": 1.03767347, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.8404893434885063, + "language_loss": 0.85450852, + "learning_rate": 3.983852303849291e-06, + "loss": 0.8772254, + "num_input_tokens_seen": 24613110, + "step": 1152, + "time_per_iteration": 2.513389825820923 + }, + { + "auxiliary_loss_clip": 0.01194226, + "auxiliary_loss_mlp": 0.01058253, + "balance_loss_clip": 1.06703782, + "balance_loss_mlp": 1.03559148, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.442297144011295, + "language_loss": 0.90685719, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92938197, + "num_input_tokens_seen": 24628795, + "step": 1153, + "time_per_iteration": 3.843212842941284 + }, + { + "auxiliary_loss_clip": 0.0116571, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_clip": 1.05471826, + "balance_loss_mlp": 1.03109074, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.5465244461108454, + "language_loss": 0.81399202, + "learning_rate": 3.983753372802008e-06, + "loss": 0.83620059, + "num_input_tokens_seen": 24645480, + "step": 1154, + "time_per_iteration": 2.5371267795562744 + }, + { + "auxiliary_loss_clip": 0.01191813, + "auxiliary_loss_mlp": 0.01059657, + "balance_loss_clip": 1.07322764, + "balance_loss_mlp": 1.03625607, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 1.964007526828095, + "language_loss": 0.75378811, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77630281, + "num_input_tokens_seen": 24664630, + "step": 1155, + "time_per_iteration": 2.61171293258667 + }, + { + "auxiliary_loss_clip": 0.0118704, + "auxiliary_loss_mlp": 0.00793952, + "balance_loss_clip": 1.06438863, + "balance_loss_mlp": 1.01722181, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.920563873994528, + "language_loss": 0.70790964, + "learning_rate": 3.98365414085822e-06, + "loss": 0.72771955, + "num_input_tokens_seen": 24684210, + "step": 1156, + "time_per_iteration": 2.5296638011932373 + }, + { + "auxiliary_loss_clip": 0.01231957, + "auxiliary_loss_mlp": 0.00798384, + "balance_loss_clip": 1.15258408, + "balance_loss_mlp": 1.02437449, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 1.9913972250886258, + "language_loss": 0.7471891, + "learning_rate": 3.98360441205484e-06, + "loss": 0.76749253, + "num_input_tokens_seen": 24702490, + "step": 1157, + "time_per_iteration": 3.9423065185546875 + }, + { + "auxiliary_loss_clip": 0.01182633, + "auxiliary_loss_mlp": 0.01056211, + "balance_loss_clip": 1.06340563, + "balance_loss_mlp": 1.03170133, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.7397935482058873, + "language_loss": 0.71791697, + "learning_rate": 3.983554608032982e-06, + "loss": 0.74030542, + "num_input_tokens_seen": 24724340, + "step": 1158, + "time_per_iteration": 2.6307575702667236 + }, + { + "auxiliary_loss_clip": 0.01213456, + "auxiliary_loss_mlp": 0.01060266, + "balance_loss_clip": 1.06703401, + "balance_loss_mlp": 1.0359236, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 2.089709290236777, + "language_loss": 0.79619133, + "learning_rate": 3.983504728794533e-06, + "loss": 0.81892854, + "num_input_tokens_seen": 24745550, + "step": 1159, + "time_per_iteration": 3.899813652038574 + }, + { + "auxiliary_loss_clip": 0.01214749, + "auxiliary_loss_mlp": 0.01059208, + "balance_loss_clip": 1.06830204, + "balance_loss_mlp": 1.03302956, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 11.298384426823219, + "language_loss": 0.80917019, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83190978, + "num_input_tokens_seen": 24762575, + "step": 1160, + "time_per_iteration": 2.4611637592315674 + }, + { + "auxiliary_loss_clip": 0.01244288, + "auxiliary_loss_mlp": 0.01055671, + "balance_loss_clip": 1.14829993, + "balance_loss_mlp": 1.03106594, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.7873573907411655, + "language_loss": 0.75794333, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78094292, + "num_input_tokens_seen": 24782605, + "step": 1161, + "time_per_iteration": 2.5486936569213867 + }, + { + "auxiliary_loss_clip": 0.01182771, + "auxiliary_loss_mlp": 0.01062082, + "balance_loss_clip": 1.06590068, + "balance_loss_mlp": 1.03692842, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 2.2738955936392897, + "language_loss": 0.82700789, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.84945643, + "num_input_tokens_seen": 24802910, + "step": 1162, + "time_per_iteration": 2.5651886463165283 + }, + { + "auxiliary_loss_clip": 0.01188518, + "auxiliary_loss_mlp": 0.01062209, + "balance_loss_clip": 1.066378, + "balance_loss_mlp": 1.03635275, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 2.1809663904259793, + "language_loss": 0.79496068, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81746793, + "num_input_tokens_seen": 24823305, + "step": 1163, + "time_per_iteration": 2.575932502746582 + }, + { + "auxiliary_loss_clip": 0.01195685, + "auxiliary_loss_mlp": 0.01060575, + "balance_loss_clip": 1.06514144, + "balance_loss_mlp": 1.03495646, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.0650118353054636, + "language_loss": 0.78857553, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81113809, + "num_input_tokens_seen": 24842155, + "step": 1164, + "time_per_iteration": 2.5194385051727295 + }, + { + "auxiliary_loss_clip": 0.01154248, + "auxiliary_loss_mlp": 0.01074027, + "balance_loss_clip": 1.06568289, + "balance_loss_mlp": 1.04647803, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.6865995448939972, + "language_loss": 0.73009568, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75237846, + "num_input_tokens_seen": 24862080, + "step": 1165, + "time_per_iteration": 2.6458218097686768 + }, + { + "auxiliary_loss_clip": 0.01186724, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_clip": 1.06491661, + "balance_loss_mlp": 1.03281081, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 2.451425667487029, + "language_loss": 0.80968428, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83211887, + "num_input_tokens_seen": 24886165, + "step": 1166, + "time_per_iteration": 2.6297972202301025 + }, + { + "auxiliary_loss_clip": 0.01183632, + "auxiliary_loss_mlp": 0.01047168, + "balance_loss_clip": 1.0699507, + "balance_loss_mlp": 1.02225327, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 2.408447230800185, + "language_loss": 0.84560871, + "learning_rate": 3.983102987317295e-06, + "loss": 0.8679167, + "num_input_tokens_seen": 24905775, + "step": 1167, + "time_per_iteration": 2.6032495498657227 + }, + { + "auxiliary_loss_clip": 0.01200387, + "auxiliary_loss_mlp": 0.01055876, + "balance_loss_clip": 1.06852269, + "balance_loss_mlp": 1.03098464, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 2.2498279048232885, + "language_loss": 0.89519179, + "learning_rate": 3.983052431214997e-06, + "loss": 0.91775441, + "num_input_tokens_seen": 24924295, + "step": 1168, + "time_per_iteration": 2.5281834602355957 + }, + { + "auxiliary_loss_clip": 0.01193339, + "auxiliary_loss_mlp": 0.01065073, + "balance_loss_clip": 1.06564808, + "balance_loss_mlp": 1.03724909, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.0912175778985413, + "language_loss": 0.88953197, + "learning_rate": 3.983001799915153e-06, + "loss": 0.91211617, + "num_input_tokens_seen": 24943210, + "step": 1169, + "time_per_iteration": 2.5864615440368652 + }, + { + "auxiliary_loss_clip": 0.01215195, + "auxiliary_loss_mlp": 0.01065213, + "balance_loss_clip": 1.06798065, + "balance_loss_mlp": 1.03945184, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.379615026133279, + "language_loss": 0.83910972, + "learning_rate": 3.982951093419681e-06, + "loss": 0.8619138, + "num_input_tokens_seen": 24960360, + "step": 1170, + "time_per_iteration": 2.5139658451080322 + }, + { + "auxiliary_loss_clip": 0.01230706, + "auxiliary_loss_mlp": 0.0081491, + "balance_loss_clip": 1.14877701, + "balance_loss_mlp": 1.05708098, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 2.0276202940663968, + "language_loss": 0.75698322, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77743936, + "num_input_tokens_seen": 24978290, + "step": 1171, + "time_per_iteration": 2.604780435562134 + }, + { + "auxiliary_loss_clip": 0.01183345, + "auxiliary_loss_mlp": 0.01058747, + "balance_loss_clip": 1.06594253, + "balance_loss_mlp": 1.03451133, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 1.9942595762859727, + "language_loss": 0.89049661, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91291749, + "num_input_tokens_seen": 24997055, + "step": 1172, + "time_per_iteration": 2.5661892890930176 + }, + { + "auxiliary_loss_clip": 0.01193993, + "auxiliary_loss_mlp": 0.01054256, + "balance_loss_clip": 1.06297743, + "balance_loss_mlp": 1.02863765, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 4.853086079042627, + "language_loss": 0.82086641, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84334892, + "num_input_tokens_seen": 25017490, + "step": 1173, + "time_per_iteration": 2.6214759349823 + }, + { + "auxiliary_loss_clip": 0.01204165, + "auxiliary_loss_mlp": 0.01058064, + "balance_loss_clip": 1.06553257, + "balance_loss_mlp": 1.03211236, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.118853023502844, + "language_loss": 0.82527643, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.84789872, + "num_input_tokens_seen": 25035660, + "step": 1174, + "time_per_iteration": 2.4745283126831055 + }, + { + "auxiliary_loss_clip": 0.01186692, + "auxiliary_loss_mlp": 0.01058767, + "balance_loss_clip": 1.06431007, + "balance_loss_mlp": 1.0335784, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.7783316230677921, + "language_loss": 0.8485356, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87099028, + "num_input_tokens_seen": 25054785, + "step": 1175, + "time_per_iteration": 2.603943109512329 + }, + { + "auxiliary_loss_clip": 0.01203466, + "auxiliary_loss_mlp": 0.01069554, + "balance_loss_clip": 1.06915951, + "balance_loss_mlp": 1.04498458, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 1.7956910741854903, + "language_loss": 0.83424342, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85697365, + "num_input_tokens_seen": 25075180, + "step": 1176, + "time_per_iteration": 2.562504529953003 + }, + { + "auxiliary_loss_clip": 0.01149565, + "auxiliary_loss_mlp": 0.01070728, + "balance_loss_clip": 1.05205953, + "balance_loss_mlp": 1.04398918, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.503694822698019, + "language_loss": 0.74494392, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76714683, + "num_input_tokens_seen": 25093035, + "step": 1177, + "time_per_iteration": 2.5684187412261963 + }, + { + "auxiliary_loss_clip": 0.01193759, + "auxiliary_loss_mlp": 0.01066199, + "balance_loss_clip": 1.06666112, + "balance_loss_mlp": 1.04003274, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 1.9208109525459056, + "language_loss": 0.85823941, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88083899, + "num_input_tokens_seen": 25112520, + "step": 1178, + "time_per_iteration": 2.542872428894043 + }, + { + "auxiliary_loss_clip": 0.01083283, + "auxiliary_loss_mlp": 0.01008745, + "balance_loss_clip": 1.03682506, + "balance_loss_mlp": 1.00409603, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.91247882605911, + "language_loss": 0.63208199, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65300226, + "num_input_tokens_seen": 25177760, + "step": 1179, + "time_per_iteration": 3.2518677711486816 + }, + { + "auxiliary_loss_clip": 0.01211694, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_clip": 1.06961322, + "balance_loss_mlp": 1.04131114, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 5.037731513842757, + "language_loss": 0.83243799, + "learning_rate": 3.98243989312991e-06, + "loss": 0.85521317, + "num_input_tokens_seen": 25195260, + "step": 1180, + "time_per_iteration": 2.525660991668701 + }, + { + "auxiliary_loss_clip": 0.01184167, + "auxiliary_loss_mlp": 0.01063706, + "balance_loss_clip": 1.06460118, + "balance_loss_mlp": 1.038064, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.696060999376245, + "language_loss": 0.88489223, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90737098, + "num_input_tokens_seen": 25212740, + "step": 1181, + "time_per_iteration": 2.545790672302246 + }, + { + "auxiliary_loss_clip": 0.01189952, + "auxiliary_loss_mlp": 0.01069703, + "balance_loss_clip": 1.07089353, + "balance_loss_mlp": 1.04452562, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 2.6002387288313242, + "language_loss": 0.83758378, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.86018038, + "num_input_tokens_seen": 25236420, + "step": 1182, + "time_per_iteration": 2.766767740249634 + }, + { + "auxiliary_loss_clip": 0.01204165, + "auxiliary_loss_mlp": 0.01059387, + "balance_loss_clip": 1.07266378, + "balance_loss_mlp": 1.03262496, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 3.1273364546181304, + "language_loss": 0.79313695, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81577241, + "num_input_tokens_seen": 25255120, + "step": 1183, + "time_per_iteration": 2.5836963653564453 + }, + { + "auxiliary_loss_clip": 0.01216311, + "auxiliary_loss_mlp": 0.01064705, + "balance_loss_clip": 1.0660007, + "balance_loss_mlp": 1.0386939, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.185604274808392, + "language_loss": 0.79295337, + "learning_rate": 3.982233308024204e-06, + "loss": 0.81576347, + "num_input_tokens_seen": 25275150, + "step": 1184, + "time_per_iteration": 2.5465188026428223 + }, + { + "auxiliary_loss_clip": 0.01160314, + "auxiliary_loss_mlp": 0.01067199, + "balance_loss_clip": 1.06793237, + "balance_loss_mlp": 1.04209352, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.2792059595597585, + "language_loss": 0.76754522, + "learning_rate": 3.98218147382666e-06, + "loss": 0.78982031, + "num_input_tokens_seen": 25293680, + "step": 1185, + "time_per_iteration": 2.615647554397583 + }, + { + "auxiliary_loss_clip": 0.01214563, + "auxiliary_loss_mlp": 0.01067227, + "balance_loss_clip": 1.06807399, + "balance_loss_mlp": 1.04143071, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.997435203268458, + "language_loss": 0.65342814, + "learning_rate": 3.982129564464596e-06, + "loss": 0.67624605, + "num_input_tokens_seen": 25310050, + "step": 1186, + "time_per_iteration": 2.4411468505859375 + }, + { + "auxiliary_loss_clip": 0.01201222, + "auxiliary_loss_mlp": 0.01058876, + "balance_loss_clip": 1.07055187, + "balance_loss_mlp": 1.03328156, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 2.181661753030015, + "language_loss": 0.70020473, + "learning_rate": 3.98207757993998e-06, + "loss": 0.72280574, + "num_input_tokens_seen": 25331020, + "step": 1187, + "time_per_iteration": 2.5722804069519043 + }, + { + "auxiliary_loss_clip": 0.01164949, + "auxiliary_loss_mlp": 0.0105858, + "balance_loss_clip": 1.06971073, + "balance_loss_mlp": 1.03479755, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.759748590360436, + "language_loss": 0.79341507, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.81565034, + "num_input_tokens_seen": 25347875, + "step": 1188, + "time_per_iteration": 2.5465219020843506 + }, + { + "auxiliary_loss_clip": 0.01215599, + "auxiliary_loss_mlp": 0.01062855, + "balance_loss_clip": 1.07120919, + "balance_loss_mlp": 1.03721356, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 1.902467531249299, + "language_loss": 0.84652185, + "learning_rate": 3.981973385410981e-06, + "loss": 0.86930645, + "num_input_tokens_seen": 25366715, + "step": 1189, + "time_per_iteration": 2.499077320098877 + }, + { + "auxiliary_loss_clip": 0.0117672, + "auxiliary_loss_mlp": 0.00799794, + "balance_loss_clip": 1.06718588, + "balance_loss_mlp": 1.02726746, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 2.7426786993965138, + "language_loss": 0.76727188, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.78703701, + "num_input_tokens_seen": 25385450, + "step": 1190, + "time_per_iteration": 3.9924471378326416 + }, + { + "auxiliary_loss_clip": 0.01215541, + "auxiliary_loss_mlp": 0.01073321, + "balance_loss_clip": 1.0683856, + "balance_loss_mlp": 1.04587901, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.0581370917741344, + "language_loss": 0.75650471, + "learning_rate": 3.981868890255468e-06, + "loss": 0.77939332, + "num_input_tokens_seen": 25403940, + "step": 1191, + "time_per_iteration": 2.4657485485076904 + }, + { + "auxiliary_loss_clip": 0.01174098, + "auxiliary_loss_mlp": 0.01057043, + "balance_loss_clip": 1.06450653, + "balance_loss_mlp": 1.02950549, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 2.4257656857668546, + "language_loss": 0.73760068, + "learning_rate": 3.981816529947719e-06, + "loss": 0.75991213, + "num_input_tokens_seen": 25420410, + "step": 1192, + "time_per_iteration": 3.9080147743225098 + }, + { + "auxiliary_loss_clip": 0.0120921, + "auxiliary_loss_mlp": 0.01054245, + "balance_loss_clip": 1.06358385, + "balance_loss_mlp": 1.0299263, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.432473034795723, + "language_loss": 0.78139824, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.8040328, + "num_input_tokens_seen": 25439415, + "step": 1193, + "time_per_iteration": 2.478437662124634 + }, + { + "auxiliary_loss_clip": 0.01193303, + "auxiliary_loss_mlp": 0.01058219, + "balance_loss_clip": 1.07088089, + "balance_loss_mlp": 1.03238606, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 1.9912876299860613, + "language_loss": 0.85382998, + "learning_rate": 3.981711583882166e-06, + "loss": 0.87634522, + "num_input_tokens_seen": 25458715, + "step": 1194, + "time_per_iteration": 2.5872788429260254 + }, + { + "auxiliary_loss_clip": 0.0119767, + "auxiliary_loss_mlp": 0.01066727, + "balance_loss_clip": 1.0713203, + "balance_loss_mlp": 1.04137158, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 2.49217318708141, + "language_loss": 0.81845641, + "learning_rate": 3.981658998128341e-06, + "loss": 0.84110039, + "num_input_tokens_seen": 25477985, + "step": 1195, + "time_per_iteration": 3.937192678451538 + }, + { + "auxiliary_loss_clip": 0.01176478, + "auxiliary_loss_mlp": 0.01057096, + "balance_loss_clip": 1.06785274, + "balance_loss_mlp": 1.03293252, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 1.937818594780431, + "language_loss": 0.79854977, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82088548, + "num_input_tokens_seen": 25497110, + "step": 1196, + "time_per_iteration": 2.607423782348633 + }, + { + "auxiliary_loss_clip": 0.01184528, + "auxiliary_loss_mlp": 0.00797301, + "balance_loss_clip": 1.06800556, + "balance_loss_mlp": 1.01831508, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 2.787793865224825, + "language_loss": 0.70888603, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.72870427, + "num_input_tokens_seen": 25516555, + "step": 1197, + "time_per_iteration": 2.641493082046509 + }, + { + "auxiliary_loss_clip": 0.01162825, + "auxiliary_loss_mlp": 0.01056886, + "balance_loss_clip": 1.0651269, + "balance_loss_mlp": 1.03154182, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.0350242378403762, + "language_loss": 0.86015046, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88234758, + "num_input_tokens_seen": 25533895, + "step": 1198, + "time_per_iteration": 4.077868461608887 + }, + { + "auxiliary_loss_clip": 0.01167777, + "auxiliary_loss_mlp": 0.01065492, + "balance_loss_clip": 1.0653677, + "balance_loss_mlp": 1.04007626, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.231543778523985, + "language_loss": 0.83816004, + "learning_rate": 3.981447903685947e-06, + "loss": 0.86049277, + "num_input_tokens_seen": 25554195, + "step": 1199, + "time_per_iteration": 2.6173348426818848 + }, + { + "auxiliary_loss_clip": 0.01218952, + "auxiliary_loss_mlp": 0.01057267, + "balance_loss_clip": 1.07312489, + "balance_loss_mlp": 1.03307986, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.1877017054245136, + "language_loss": 0.76951146, + "learning_rate": 3.981394942228581e-06, + "loss": 0.79227364, + "num_input_tokens_seen": 25574155, + "step": 1200, + "time_per_iteration": 2.555222511291504 + }, + { + "auxiliary_loss_clip": 0.01194527, + "auxiliary_loss_mlp": 0.01070903, + "balance_loss_clip": 1.06888199, + "balance_loss_mlp": 1.04510629, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 2.398704174967709, + "language_loss": 0.82372916, + "learning_rate": 3.98134190563652e-06, + "loss": 0.84638345, + "num_input_tokens_seen": 25592735, + "step": 1201, + "time_per_iteration": 2.540445327758789 + }, + { + "auxiliary_loss_clip": 0.01202862, + "auxiliary_loss_mlp": 0.01060337, + "balance_loss_clip": 1.06654203, + "balance_loss_mlp": 1.03307366, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 2.4844168720150637, + "language_loss": 0.68989384, + "learning_rate": 3.981288793911775e-06, + "loss": 0.71252578, + "num_input_tokens_seen": 25611510, + "step": 1202, + "time_per_iteration": 2.5117533206939697 + }, + { + "auxiliary_loss_clip": 0.01190827, + "auxiliary_loss_mlp": 0.00797477, + "balance_loss_clip": 1.07042778, + "balance_loss_mlp": 1.02247047, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 2.1738820401864185, + "language_loss": 0.87562704, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89551008, + "num_input_tokens_seen": 25629560, + "step": 1203, + "time_per_iteration": 2.5440328121185303 + }, + { + "auxiliary_loss_clip": 0.01165123, + "auxiliary_loss_mlp": 0.01069527, + "balance_loss_clip": 1.06198359, + "balance_loss_mlp": 1.0437181, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.9823673486944118, + "language_loss": 0.78466356, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80701005, + "num_input_tokens_seen": 25648330, + "step": 1204, + "time_per_iteration": 2.6112170219421387 + }, + { + "auxiliary_loss_clip": 0.01200281, + "auxiliary_loss_mlp": 0.0107023, + "balance_loss_clip": 1.06807721, + "balance_loss_mlp": 1.0449934, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.8275814698892219, + "language_loss": 0.8218677, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84457278, + "num_input_tokens_seen": 25669470, + "step": 1205, + "time_per_iteration": 2.573889970779419 + }, + { + "auxiliary_loss_clip": 0.01187683, + "auxiliary_loss_mlp": 0.0079391, + "balance_loss_clip": 1.06963205, + "balance_loss_mlp": 1.01633668, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 2.3330723865503264, + "language_loss": 0.76684058, + "learning_rate": 3.981075595726283e-06, + "loss": 0.7866565, + "num_input_tokens_seen": 25690470, + "step": 1206, + "time_per_iteration": 2.5994017124176025 + }, + { + "auxiliary_loss_clip": 0.01195918, + "auxiliary_loss_mlp": 0.0106011, + "balance_loss_clip": 1.0697, + "balance_loss_mlp": 1.03450418, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.9244264359581478, + "language_loss": 0.77764928, + "learning_rate": 3.981022108368387e-06, + "loss": 0.80020952, + "num_input_tokens_seen": 25709205, + "step": 1207, + "time_per_iteration": 2.506878137588501 + }, + { + "auxiliary_loss_clip": 0.01194729, + "auxiliary_loss_mlp": 0.01055464, + "balance_loss_clip": 1.06556308, + "balance_loss_mlp": 1.03160977, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 3.543086582320235, + "language_loss": 0.7955451, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81804699, + "num_input_tokens_seen": 25728485, + "step": 1208, + "time_per_iteration": 2.5797102451324463 + }, + { + "auxiliary_loss_clip": 0.01193765, + "auxiliary_loss_mlp": 0.0105732, + "balance_loss_clip": 1.06537342, + "balance_loss_mlp": 1.03382349, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 2.5006084242123596, + "language_loss": 0.7888763, + "learning_rate": 3.980914908292955e-06, + "loss": 0.81138718, + "num_input_tokens_seen": 25747730, + "step": 1209, + "time_per_iteration": 2.5232326984405518 + }, + { + "auxiliary_loss_clip": 0.01199913, + "auxiliary_loss_mlp": 0.01063797, + "balance_loss_clip": 1.06651545, + "balance_loss_mlp": 1.03964496, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.357422485467626, + "language_loss": 0.81396508, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83660215, + "num_input_tokens_seen": 25768050, + "step": 1210, + "time_per_iteration": 2.5782530307769775 + }, + { + "auxiliary_loss_clip": 0.01185982, + "auxiliary_loss_mlp": 0.01068143, + "balance_loss_clip": 1.06947231, + "balance_loss_mlp": 1.04267967, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 2.7616279220963627, + "language_loss": 0.84537047, + "learning_rate": 3.98080740775156e-06, + "loss": 0.8679117, + "num_input_tokens_seen": 25787985, + "step": 1211, + "time_per_iteration": 2.5680668354034424 + }, + { + "auxiliary_loss_clip": 0.01173146, + "auxiliary_loss_mlp": 0.01055217, + "balance_loss_clip": 1.06400347, + "balance_loss_mlp": 1.03091002, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.7774349724346505, + "language_loss": 0.90878206, + "learning_rate": 3.98075354481122e-06, + "loss": 0.93106568, + "num_input_tokens_seen": 25803620, + "step": 1212, + "time_per_iteration": 2.5289597511291504 + }, + { + "auxiliary_loss_clip": 0.01212581, + "auxiliary_loss_mlp": 0.01053667, + "balance_loss_clip": 1.06973803, + "balance_loss_mlp": 1.02957487, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 1.8046083526528385, + "language_loss": 0.72789836, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.75056082, + "num_input_tokens_seen": 25823315, + "step": 1213, + "time_per_iteration": 2.4908602237701416 + }, + { + "auxiliary_loss_clip": 0.01162268, + "auxiliary_loss_mlp": 0.01056773, + "balance_loss_clip": 1.06477261, + "balance_loss_mlp": 1.0320375, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.9692764154646545, + "language_loss": 0.84317774, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86536813, + "num_input_tokens_seen": 25842605, + "step": 1214, + "time_per_iteration": 2.6685433387756348 + }, + { + "auxiliary_loss_clip": 0.01216763, + "auxiliary_loss_mlp": 0.01059925, + "balance_loss_clip": 1.06965017, + "balance_loss_mlp": 1.03437805, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.1411961938525756, + "language_loss": 0.84169424, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86446112, + "num_input_tokens_seen": 25863030, + "step": 1215, + "time_per_iteration": 2.529304265975952 + }, + { + "auxiliary_loss_clip": 0.01199013, + "auxiliary_loss_mlp": 0.01061761, + "balance_loss_clip": 1.1310364, + "balance_loss_mlp": 1.03638148, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 3.5015031401045085, + "language_loss": 0.81053972, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83314741, + "num_input_tokens_seen": 25888015, + "step": 1216, + "time_per_iteration": 2.708005428314209 + }, + { + "auxiliary_loss_clip": 0.01167847, + "auxiliary_loss_mlp": 0.01057722, + "balance_loss_clip": 1.05554605, + "balance_loss_mlp": 1.03402328, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.473932988038879, + "language_loss": 0.76125169, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78350741, + "num_input_tokens_seen": 25908660, + "step": 1217, + "time_per_iteration": 2.588402509689331 + }, + { + "auxiliary_loss_clip": 0.01176315, + "auxiliary_loss_mlp": 0.01061517, + "balance_loss_clip": 1.06679177, + "balance_loss_mlp": 1.03876019, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.1724715232538485, + "language_loss": 0.8690449, + "learning_rate": 3.98042878992303e-06, + "loss": 0.89142323, + "num_input_tokens_seen": 25927215, + "step": 1218, + "time_per_iteration": 2.593597888946533 + }, + { + "auxiliary_loss_clip": 0.01200673, + "auxiliary_loss_mlp": 0.01065401, + "balance_loss_clip": 1.06702137, + "balance_loss_mlp": 1.04122555, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 2.1426433930107502, + "language_loss": 0.86631531, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.88897604, + "num_input_tokens_seen": 25945500, + "step": 1219, + "time_per_iteration": 2.52473783493042 + }, + { + "auxiliary_loss_clip": 0.0121, + "auxiliary_loss_mlp": 0.0105692, + "balance_loss_clip": 1.06746864, + "balance_loss_mlp": 1.03410363, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 3.305342997540625, + "language_loss": 0.8493489, + "learning_rate": 3.980319937487235e-06, + "loss": 0.87201804, + "num_input_tokens_seen": 25963105, + "step": 1220, + "time_per_iteration": 2.46592378616333 + }, + { + "auxiliary_loss_clip": 0.01165873, + "auxiliary_loss_mlp": 0.01061011, + "balance_loss_clip": 1.05964828, + "balance_loss_mlp": 1.03679991, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 3.8939684076398953, + "language_loss": 0.7698487, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79211754, + "num_input_tokens_seen": 25981690, + "step": 1221, + "time_per_iteration": 2.571197748184204 + }, + { + "auxiliary_loss_clip": 0.01169132, + "auxiliary_loss_mlp": 0.01062984, + "balance_loss_clip": 1.06757903, + "balance_loss_mlp": 1.03905916, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 1.8133769186156972, + "language_loss": 0.9182173, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94053853, + "num_input_tokens_seen": 25999890, + "step": 1222, + "time_per_iteration": 2.628995895385742 + }, + { + "auxiliary_loss_clip": 0.01146531, + "auxiliary_loss_mlp": 0.01057682, + "balance_loss_clip": 1.06302798, + "balance_loss_mlp": 1.0343051, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.462969248388403, + "language_loss": 0.905487, + "learning_rate": 3.980156095634242e-06, + "loss": 0.9275291, + "num_input_tokens_seen": 26016445, + "step": 1223, + "time_per_iteration": 2.6526036262512207 + }, + { + "auxiliary_loss_clip": 0.01212245, + "auxiliary_loss_mlp": 0.01070113, + "balance_loss_clip": 1.06858075, + "balance_loss_mlp": 1.04604506, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 1.9685783344553158, + "language_loss": 0.82043421, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84325778, + "num_input_tokens_seen": 26036080, + "step": 1224, + "time_per_iteration": 2.5358564853668213 + }, + { + "auxiliary_loss_clip": 0.01207861, + "auxiliary_loss_mlp": 0.01058427, + "balance_loss_clip": 1.06556463, + "balance_loss_mlp": 1.03249931, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.512858573433487, + "language_loss": 0.83136499, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.85402787, + "num_input_tokens_seen": 26055805, + "step": 1225, + "time_per_iteration": 2.538318395614624 + }, + { + "auxiliary_loss_clip": 0.01176354, + "auxiliary_loss_mlp": 0.01058111, + "balance_loss_clip": 1.0642333, + "balance_loss_mlp": 1.03337502, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 1.8229889990168386, + "language_loss": 0.90514988, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92749459, + "num_input_tokens_seen": 26073905, + "step": 1226, + "time_per_iteration": 2.5840744972229004 + }, + { + "auxiliary_loss_clip": 0.01218461, + "auxiliary_loss_mlp": 0.01051263, + "balance_loss_clip": 1.06613278, + "balance_loss_mlp": 1.0262413, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 2.8602489682009664, + "language_loss": 0.76921117, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79190838, + "num_input_tokens_seen": 26091700, + "step": 1227, + "time_per_iteration": 2.491011381149292 + }, + { + "auxiliary_loss_clip": 0.01194869, + "auxiliary_loss_mlp": 0.01052756, + "balance_loss_clip": 1.06692696, + "balance_loss_mlp": 1.02868795, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.4652449490589148, + "language_loss": 0.85645741, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.87893367, + "num_input_tokens_seen": 26114105, + "step": 1228, + "time_per_iteration": 3.9786975383758545 + }, + { + "auxiliary_loss_clip": 0.01195063, + "auxiliary_loss_mlp": 0.01054569, + "balance_loss_clip": 1.06334829, + "balance_loss_mlp": 1.03103662, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 2.193714897879391, + "language_loss": 0.79796851, + "learning_rate": 3.97982638461608e-06, + "loss": 0.82046485, + "num_input_tokens_seen": 26131165, + "step": 1229, + "time_per_iteration": 2.5239813327789307 + }, + { + "auxiliary_loss_clip": 0.01198655, + "auxiliary_loss_mlp": 0.00803293, + "balance_loss_clip": 1.06507528, + "balance_loss_mlp": 1.03532076, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.0434423696237207, + "language_loss": 0.78048635, + "learning_rate": 3.979771170004287e-06, + "loss": 0.80050582, + "num_input_tokens_seen": 26150040, + "step": 1230, + "time_per_iteration": 3.881772756576538 + }, + { + "auxiliary_loss_clip": 0.0120977, + "auxiliary_loss_mlp": 0.0105056, + "balance_loss_clip": 1.06776762, + "balance_loss_mlp": 1.02575254, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 1.970803162220986, + "language_loss": 0.81341505, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83601832, + "num_input_tokens_seen": 26169380, + "step": 1231, + "time_per_iteration": 2.4915425777435303 + }, + { + "auxiliary_loss_clip": 0.01225826, + "auxiliary_loss_mlp": 0.01067904, + "balance_loss_clip": 1.13030195, + "balance_loss_mlp": 1.04322755, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.1041716610920305, + "language_loss": 0.95045739, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97339469, + "num_input_tokens_seen": 26189420, + "step": 1232, + "time_per_iteration": 2.5925018787384033 + }, + { + "auxiliary_loss_clip": 0.01193464, + "auxiliary_loss_mlp": 0.01065024, + "balance_loss_clip": 1.06739712, + "balance_loss_mlp": 1.04175484, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 1.7772434679745015, + "language_loss": 0.80863416, + "learning_rate": 3.979605075738569e-06, + "loss": 0.83121908, + "num_input_tokens_seen": 26209300, + "step": 1233, + "time_per_iteration": 2.5042660236358643 + }, + { + "auxiliary_loss_clip": 0.01213845, + "auxiliary_loss_mlp": 0.01063909, + "balance_loss_clip": 1.065925, + "balance_loss_mlp": 1.03662229, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.401168986623431, + "language_loss": 0.70857382, + "learning_rate": 3.979549560846883e-06, + "loss": 0.73135138, + "num_input_tokens_seen": 26228110, + "step": 1234, + "time_per_iteration": 4.008370637893677 + }, + { + "auxiliary_loss_clip": 0.01179001, + "auxiliary_loss_mlp": 0.01072177, + "balance_loss_clip": 1.06553078, + "balance_loss_mlp": 1.04536641, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 1.9245137227351239, + "language_loss": 0.77483892, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79735065, + "num_input_tokens_seen": 26247020, + "step": 1235, + "time_per_iteration": 2.544532299041748 + }, + { + "auxiliary_loss_clip": 0.01206293, + "auxiliary_loss_mlp": 0.01055388, + "balance_loss_clip": 1.0651567, + "balance_loss_mlp": 1.03159392, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 1.9890696111345716, + "language_loss": 0.83020198, + "learning_rate": 3.979438305871464e-06, + "loss": 0.85281885, + "num_input_tokens_seen": 26265750, + "step": 1236, + "time_per_iteration": 2.4901630878448486 + }, + { + "auxiliary_loss_clip": 0.01160047, + "auxiliary_loss_mlp": 0.00794154, + "balance_loss_clip": 1.06171668, + "balance_loss_mlp": 1.01765919, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 1.7067638454671585, + "language_loss": 0.75762439, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77716649, + "num_input_tokens_seen": 26287905, + "step": 1237, + "time_per_iteration": 4.071123838424683 + }, + { + "auxiliary_loss_clip": 0.01145521, + "auxiliary_loss_mlp": 0.00793981, + "balance_loss_clip": 1.06203604, + "balance_loss_mlp": 1.01531601, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.6931481053439517, + "language_loss": 0.77491069, + "learning_rate": 3.979326750654053e-06, + "loss": 0.79430574, + "num_input_tokens_seen": 26311795, + "step": 1238, + "time_per_iteration": 2.696892261505127 + }, + { + "auxiliary_loss_clip": 0.01182517, + "auxiliary_loss_mlp": 0.01059301, + "balance_loss_clip": 1.06205595, + "balance_loss_mlp": 1.03489912, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 1.9605511041995878, + "language_loss": 0.86468959, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88710779, + "num_input_tokens_seen": 26330330, + "step": 1239, + "time_per_iteration": 2.5642828941345215 + }, + { + "auxiliary_loss_clip": 0.01169271, + "auxiliary_loss_mlp": 0.01052527, + "balance_loss_clip": 1.06347513, + "balance_loss_mlp": 1.02701616, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 2.0159037462177753, + "language_loss": 0.8854124, + "learning_rate": 3.979214895211569e-06, + "loss": 0.90763044, + "num_input_tokens_seen": 26348865, + "step": 1240, + "time_per_iteration": 2.5696630477905273 + }, + { + "auxiliary_loss_clip": 0.01179907, + "auxiliary_loss_mlp": 0.01063872, + "balance_loss_clip": 1.06437361, + "balance_loss_mlp": 1.03743172, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 1.7217903520696973, + "language_loss": 0.8870734, + "learning_rate": 3.979158854911225e-06, + "loss": 0.90951121, + "num_input_tokens_seen": 26368210, + "step": 1241, + "time_per_iteration": 2.591825008392334 + }, + { + "auxiliary_loss_clip": 0.01074126, + "auxiliary_loss_mlp": 0.01003846, + "balance_loss_clip": 1.03139472, + "balance_loss_mlp": 0.99962628, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.8892070443971893, + "language_loss": 0.63118911, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65196884, + "num_input_tokens_seen": 26424890, + "step": 1242, + "time_per_iteration": 3.1899678707122803 + }, + { + "auxiliary_loss_clip": 0.01170987, + "auxiliary_loss_mlp": 0.01065727, + "balance_loss_clip": 1.06281996, + "balance_loss_mlp": 1.03673518, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.209222373428118, + "language_loss": 0.62898684, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65135396, + "num_input_tokens_seen": 26446405, + "step": 1243, + "time_per_iteration": 2.6175901889801025 + }, + { + "auxiliary_loss_clip": 0.01193865, + "auxiliary_loss_mlp": 0.01058556, + "balance_loss_clip": 1.06227231, + "balance_loss_mlp": 1.03329515, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 1.8121215080235986, + "language_loss": 0.76420856, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78673279, + "num_input_tokens_seen": 26466070, + "step": 1244, + "time_per_iteration": 2.573676347732544 + }, + { + "auxiliary_loss_clip": 0.01186545, + "auxiliary_loss_mlp": 0.00790778, + "balance_loss_clip": 1.0632453, + "balance_loss_mlp": 1.01155066, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 3.2470050273282554, + "language_loss": 0.68727005, + "learning_rate": 3.978933943232123e-06, + "loss": 0.70704323, + "num_input_tokens_seen": 26479350, + "step": 1245, + "time_per_iteration": 2.501337766647339 + }, + { + "auxiliary_loss_clip": 0.01206211, + "auxiliary_loss_mlp": 0.01058363, + "balance_loss_clip": 1.06255531, + "balance_loss_mlp": 1.03323388, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.8220562544163381, + "language_loss": 0.88734066, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90998638, + "num_input_tokens_seen": 26498255, + "step": 1246, + "time_per_iteration": 2.528078556060791 + }, + { + "auxiliary_loss_clip": 0.01215589, + "auxiliary_loss_mlp": 0.01075493, + "balance_loss_clip": 1.06375861, + "balance_loss_mlp": 1.04848003, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 4.4635089513940915, + "language_loss": 0.88105309, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90396392, + "num_input_tokens_seen": 26515375, + "step": 1247, + "time_per_iteration": 2.4396097660064697 + }, + { + "auxiliary_loss_clip": 0.01184817, + "auxiliary_loss_mlp": 0.01066776, + "balance_loss_clip": 1.06071091, + "balance_loss_mlp": 1.04151547, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.281722735345676, + "language_loss": 0.64747941, + "learning_rate": 3.978764471530921e-06, + "loss": 0.66999531, + "num_input_tokens_seen": 26533595, + "step": 1248, + "time_per_iteration": 2.520212411880493 + }, + { + "auxiliary_loss_clip": 0.01189207, + "auxiliary_loss_mlp": 0.00792377, + "balance_loss_clip": 1.06806946, + "balance_loss_mlp": 1.01520908, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 2.1356445220973868, + "language_loss": 0.74403358, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76384938, + "num_input_tokens_seen": 26549405, + "step": 1249, + "time_per_iteration": 2.4669113159179688 + }, + { + "auxiliary_loss_clip": 0.01171327, + "auxiliary_loss_mlp": 0.010716, + "balance_loss_clip": 1.06289768, + "balance_loss_mlp": 1.04577899, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 4.311755211924707, + "language_loss": 0.81867629, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84110558, + "num_input_tokens_seen": 26567200, + "step": 1250, + "time_per_iteration": 2.566483736038208 + }, + { + "auxiliary_loss_clip": 0.01150385, + "auxiliary_loss_mlp": 0.0106958, + "balance_loss_clip": 1.05922806, + "balance_loss_mlp": 1.04499888, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.7253448028688187, + "language_loss": 0.6666007, + "learning_rate": 3.978594324515215e-06, + "loss": 0.68880033, + "num_input_tokens_seen": 26586190, + "step": 1251, + "time_per_iteration": 2.633263111114502 + }, + { + "auxiliary_loss_clip": 0.01058067, + "auxiliary_loss_mlp": 0.01012627, + "balance_loss_clip": 1.02996564, + "balance_loss_mlp": 1.00871706, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.8924714101643109, + "language_loss": 0.7034409, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72414792, + "num_input_tokens_seen": 26650710, + "step": 1252, + "time_per_iteration": 3.2008719444274902 + }, + { + "auxiliary_loss_clip": 0.01204259, + "auxiliary_loss_mlp": 0.01064079, + "balance_loss_clip": 1.06131053, + "balance_loss_mlp": 1.03995109, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.2476254377378786, + "language_loss": 0.79695463, + "learning_rate": 3.97848051802535e-06, + "loss": 0.81963801, + "num_input_tokens_seen": 26669000, + "step": 1253, + "time_per_iteration": 2.4874281883239746 + }, + { + "auxiliary_loss_clip": 0.01169721, + "auxiliary_loss_mlp": 0.01061326, + "balance_loss_clip": 1.06683898, + "balance_loss_mlp": 1.03697181, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 5.690615884276562, + "language_loss": 0.93358594, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95589644, + "num_input_tokens_seen": 26683075, + "step": 1254, + "time_per_iteration": 2.565964698791504 + }, + { + "auxiliary_loss_clip": 0.01173593, + "auxiliary_loss_mlp": 0.01058288, + "balance_loss_clip": 1.06195998, + "balance_loss_mlp": 1.03450584, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 2.1966610481206414, + "language_loss": 0.88229012, + "learning_rate": 3.97836641143877e-06, + "loss": 0.90460891, + "num_input_tokens_seen": 26701875, + "step": 1255, + "time_per_iteration": 2.6177263259887695 + }, + { + "auxiliary_loss_clip": 0.01202587, + "auxiliary_loss_mlp": 0.01063878, + "balance_loss_clip": 1.06171119, + "balance_loss_mlp": 1.03871322, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.8715698154690987, + "language_loss": 0.79628718, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81895185, + "num_input_tokens_seen": 26719050, + "step": 1256, + "time_per_iteration": 2.4832043647766113 + }, + { + "auxiliary_loss_clip": 0.01065496, + "auxiliary_loss_mlp": 0.01005113, + "balance_loss_clip": 1.03181362, + "balance_loss_mlp": 1.00134647, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.7721144620458134, + "language_loss": 0.57998478, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.6006909, + "num_input_tokens_seen": 26780650, + "step": 1257, + "time_per_iteration": 3.2268896102905273 + }, + { + "auxiliary_loss_clip": 0.01153431, + "auxiliary_loss_mlp": 0.01062086, + "balance_loss_clip": 1.06216741, + "balance_loss_mlp": 1.03781509, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 2.1376427709870556, + "language_loss": 0.89918262, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92133778, + "num_input_tokens_seen": 26798725, + "step": 1258, + "time_per_iteration": 2.636545181274414 + }, + { + "auxiliary_loss_clip": 0.01169214, + "auxiliary_loss_mlp": 0.01057191, + "balance_loss_clip": 1.06371605, + "balance_loss_mlp": 1.03251505, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 1.8561083712582425, + "language_loss": 0.81233096, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83459508, + "num_input_tokens_seen": 26817005, + "step": 1259, + "time_per_iteration": 2.5479156970977783 + }, + { + "auxiliary_loss_clip": 0.01194896, + "auxiliary_loss_mlp": 0.01056252, + "balance_loss_clip": 1.06431675, + "balance_loss_mlp": 1.03300643, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.9950165252349916, + "language_loss": 0.75836444, + "learning_rate": 3.978079832162885e-06, + "loss": 0.78087592, + "num_input_tokens_seen": 26836655, + "step": 1260, + "time_per_iteration": 2.5715529918670654 + }, + { + "auxiliary_loss_clip": 0.011937, + "auxiliary_loss_mlp": 0.01065433, + "balance_loss_clip": 1.11412454, + "balance_loss_mlp": 1.04054201, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 2.168828247531321, + "language_loss": 0.84623969, + "learning_rate": 3.978022291272044e-06, + "loss": 0.86883104, + "num_input_tokens_seen": 26854925, + "step": 1261, + "time_per_iteration": 2.5466198921203613 + }, + { + "auxiliary_loss_clip": 0.01209735, + "auxiliary_loss_mlp": 0.01061643, + "balance_loss_clip": 1.06570005, + "balance_loss_mlp": 1.03845716, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 2.592489498431488, + "language_loss": 0.82536894, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84808272, + "num_input_tokens_seen": 26876170, + "step": 1262, + "time_per_iteration": 2.5429248809814453 + }, + { + "auxiliary_loss_clip": 0.0120218, + "auxiliary_loss_mlp": 0.01059533, + "balance_loss_clip": 1.05982685, + "balance_loss_mlp": 1.03477371, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.9610122487166897, + "language_loss": 0.82413363, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84675074, + "num_input_tokens_seen": 26895005, + "step": 1263, + "time_per_iteration": 2.4843833446502686 + }, + { + "auxiliary_loss_clip": 0.01159708, + "auxiliary_loss_mlp": 0.01059023, + "balance_loss_clip": 1.06179988, + "balance_loss_mlp": 1.03550327, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.979874282031962, + "language_loss": 0.76107848, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78326577, + "num_input_tokens_seen": 26913930, + "step": 1264, + "time_per_iteration": 2.649636745452881 + }, + { + "auxiliary_loss_clip": 0.01210203, + "auxiliary_loss_mlp": 0.01058491, + "balance_loss_clip": 1.11172533, + "balance_loss_mlp": 1.03489995, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 3.877292587258297, + "language_loss": 0.80940992, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83209687, + "num_input_tokens_seen": 26931485, + "step": 1265, + "time_per_iteration": 2.5221712589263916 + }, + { + "auxiliary_loss_clip": 0.01146886, + "auxiliary_loss_mlp": 0.01062752, + "balance_loss_clip": 1.06094623, + "balance_loss_mlp": 1.03643048, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.3510964044771936, + "language_loss": 0.65532875, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67742515, + "num_input_tokens_seen": 26951670, + "step": 1266, + "time_per_iteration": 2.6282331943511963 + }, + { + "auxiliary_loss_clip": 0.0116, + "auxiliary_loss_mlp": 0.01061693, + "balance_loss_clip": 1.05770111, + "balance_loss_mlp": 1.03739786, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.241382006311566, + "language_loss": 0.79583824, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81805515, + "num_input_tokens_seen": 26970335, + "step": 1267, + "time_per_iteration": 3.945256233215332 + }, + { + "auxiliary_loss_clip": 0.0117997, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_clip": 1.06036615, + "balance_loss_mlp": 1.02961493, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 4.251490918519658, + "language_loss": 0.73491597, + "learning_rate": 3.977617404968205e-06, + "loss": 0.75723779, + "num_input_tokens_seen": 26986025, + "step": 1268, + "time_per_iteration": 2.5150680541992188 + }, + { + "auxiliary_loss_clip": 0.01189119, + "auxiliary_loss_mlp": 0.01052798, + "balance_loss_clip": 1.06004155, + "balance_loss_mlp": 1.02901566, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.9854896167729645, + "language_loss": 0.82232034, + "learning_rate": 3.977559264084269e-06, + "loss": 0.8447395, + "num_input_tokens_seen": 27004045, + "step": 1269, + "time_per_iteration": 3.8626296520233154 + }, + { + "auxiliary_loss_clip": 0.01193153, + "auxiliary_loss_mlp": 0.01060527, + "balance_loss_clip": 1.06224036, + "balance_loss_mlp": 1.03611302, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 2.3788313749918792, + "language_loss": 0.88725567, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90979242, + "num_input_tokens_seen": 27022070, + "step": 1270, + "time_per_iteration": 2.4969308376312256 + }, + { + "auxiliary_loss_clip": 0.01190557, + "auxiliary_loss_mlp": 0.01058427, + "balance_loss_clip": 1.06118488, + "balance_loss_mlp": 1.03422761, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 4.034692484542531, + "language_loss": 0.71129233, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73378217, + "num_input_tokens_seen": 27041755, + "step": 1271, + "time_per_iteration": 2.5552656650543213 + }, + { + "auxiliary_loss_clip": 0.0115789, + "auxiliary_loss_mlp": 0.01067158, + "balance_loss_clip": 1.0614779, + "balance_loss_mlp": 1.04348326, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.6975590448899378, + "language_loss": 0.82622957, + "learning_rate": 3.977384391505823e-06, + "loss": 0.84847999, + "num_input_tokens_seen": 27061540, + "step": 1272, + "time_per_iteration": 2.6253504753112793 + }, + { + "auxiliary_loss_clip": 0.01173707, + "auxiliary_loss_mlp": 0.00829344, + "balance_loss_clip": 1.05758429, + "balance_loss_mlp": 1.08582973, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 1.667615180357115, + "language_loss": 0.80246925, + "learning_rate": 3.977325950678162e-06, + "loss": 0.82249975, + "num_input_tokens_seen": 27081395, + "step": 1273, + "time_per_iteration": 3.9171383380889893 + }, + { + "auxiliary_loss_clip": 0.01179059, + "auxiliary_loss_mlp": 0.0106415, + "balance_loss_clip": 1.06146085, + "balance_loss_mlp": 1.03921187, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 1.9677418411818408, + "language_loss": 0.81379402, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83622611, + "num_input_tokens_seen": 27101175, + "step": 1274, + "time_per_iteration": 2.5613925457000732 + }, + { + "auxiliary_loss_clip": 0.01178902, + "auxiliary_loss_mlp": 0.01068299, + "balance_loss_clip": 1.05823231, + "balance_loss_mlp": 1.04262137, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 2.0095454542086224, + "language_loss": 0.73129481, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75376683, + "num_input_tokens_seen": 27124505, + "step": 1275, + "time_per_iteration": 4.009122848510742 + }, + { + "auxiliary_loss_clip": 0.01204654, + "auxiliary_loss_mlp": 0.01063474, + "balance_loss_clip": 1.06163502, + "balance_loss_mlp": 1.03755808, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.3013042357854943, + "language_loss": 0.79700208, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81968337, + "num_input_tokens_seen": 27140960, + "step": 1276, + "time_per_iteration": 2.446730613708496 + }, + { + "auxiliary_loss_clip": 0.01217226, + "auxiliary_loss_mlp": 0.01054173, + "balance_loss_clip": 1.10234904, + "balance_loss_mlp": 1.030164, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.4586467129555527, + "language_loss": 0.59358799, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61630201, + "num_input_tokens_seen": 27160985, + "step": 1277, + "time_per_iteration": 2.580177068710327 + }, + { + "auxiliary_loss_clip": 0.01196268, + "auxiliary_loss_mlp": 0.01055851, + "balance_loss_clip": 1.06126285, + "balance_loss_mlp": 1.03200912, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 2.6395948800344025, + "language_loss": 0.74656469, + "learning_rate": 3.977032621878305e-06, + "loss": 0.76908588, + "num_input_tokens_seen": 27178390, + "step": 1278, + "time_per_iteration": 2.4842662811279297 + }, + { + "auxiliary_loss_clip": 0.01157661, + "auxiliary_loss_mlp": 0.01060497, + "balance_loss_clip": 1.0576911, + "balance_loss_mlp": 1.03645205, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 3.045077566881005, + "language_loss": 0.88511616, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90729773, + "num_input_tokens_seen": 27197505, + "step": 1279, + "time_per_iteration": 2.550074815750122 + }, + { + "auxiliary_loss_clip": 0.01164891, + "auxiliary_loss_mlp": 0.01058741, + "balance_loss_clip": 1.05634952, + "balance_loss_mlp": 1.03422022, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 2.5726771041571905, + "language_loss": 0.82817745, + "learning_rate": 3.976914765557845e-06, + "loss": 0.8504138, + "num_input_tokens_seen": 27214260, + "step": 1280, + "time_per_iteration": 2.529118776321411 + }, + { + "auxiliary_loss_clip": 0.01187206, + "auxiliary_loss_mlp": 0.01061051, + "balance_loss_clip": 1.05984366, + "balance_loss_mlp": 1.03695917, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 2.0807081573788127, + "language_loss": 0.76081622, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78329879, + "num_input_tokens_seen": 27232525, + "step": 1281, + "time_per_iteration": 2.4998531341552734 + }, + { + "auxiliary_loss_clip": 0.01165001, + "auxiliary_loss_mlp": 0.01058488, + "balance_loss_clip": 1.05557799, + "balance_loss_mlp": 1.03405023, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.941160384270207, + "language_loss": 0.75216877, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77440369, + "num_input_tokens_seen": 27249800, + "step": 1282, + "time_per_iteration": 2.5537612438201904 + }, + { + "auxiliary_loss_clip": 0.01199182, + "auxiliary_loss_mlp": 0.01068291, + "balance_loss_clip": 1.0577662, + "balance_loss_mlp": 1.04347134, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 2.2407635074516348, + "language_loss": 0.84083015, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86350489, + "num_input_tokens_seen": 27268895, + "step": 1283, + "time_per_iteration": 2.4931952953338623 + }, + { + "auxiliary_loss_clip": 0.01187636, + "auxiliary_loss_mlp": 0.01065511, + "balance_loss_clip": 1.05854023, + "balance_loss_mlp": 1.03903425, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.476431088103227, + "language_loss": 0.75026894, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77280033, + "num_input_tokens_seen": 27288180, + "step": 1284, + "time_per_iteration": 2.4868905544281006 + }, + { + "auxiliary_loss_clip": 0.01171935, + "auxiliary_loss_mlp": 0.01072715, + "balance_loss_clip": 1.0558964, + "balance_loss_mlp": 1.04899263, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 1.7664813412507139, + "language_loss": 0.76049483, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78294134, + "num_input_tokens_seen": 27311815, + "step": 1285, + "time_per_iteration": 2.707158088684082 + }, + { + "auxiliary_loss_clip": 0.0120348, + "auxiliary_loss_mlp": 0.01065498, + "balance_loss_clip": 1.06172025, + "balance_loss_mlp": 1.04243112, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 2.0635069572162545, + "language_loss": 0.83756602, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86025578, + "num_input_tokens_seen": 27331890, + "step": 1286, + "time_per_iteration": 2.501821517944336 + }, + { + "auxiliary_loss_clip": 0.01169881, + "auxiliary_loss_mlp": 0.0106127, + "balance_loss_clip": 1.05504346, + "balance_loss_mlp": 1.03682005, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 2.6480422458879445, + "language_loss": 0.76859426, + "learning_rate": 3.97649990716259e-06, + "loss": 0.79090571, + "num_input_tokens_seen": 27348320, + "step": 1287, + "time_per_iteration": 2.4943323135375977 + }, + { + "auxiliary_loss_clip": 0.01170238, + "auxiliary_loss_mlp": 0.0105872, + "balance_loss_clip": 1.05258465, + "balance_loss_mlp": 1.03543818, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.6881683759824808, + "language_loss": 0.84483165, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86712122, + "num_input_tokens_seen": 27367670, + "step": 1288, + "time_per_iteration": 2.563023805618286 + }, + { + "auxiliary_loss_clip": 0.01197817, + "auxiliary_loss_mlp": 0.01059949, + "balance_loss_clip": 1.05485392, + "balance_loss_mlp": 1.03707314, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.164736729580838, + "language_loss": 0.85310096, + "learning_rate": 3.976380701617068e-06, + "loss": 0.8756786, + "num_input_tokens_seen": 27385485, + "step": 1289, + "time_per_iteration": 2.4442203044891357 + }, + { + "auxiliary_loss_clip": 0.0119558, + "auxiliary_loss_mlp": 0.01049891, + "balance_loss_clip": 1.0550034, + "balance_loss_mlp": 1.02671695, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.9355346363803563, + "language_loss": 0.85395646, + "learning_rate": 3.976320986426344e-06, + "loss": 0.8764112, + "num_input_tokens_seen": 27405110, + "step": 1290, + "time_per_iteration": 2.491056442260742 + }, + { + "auxiliary_loss_clip": 0.01165431, + "auxiliary_loss_mlp": 0.01063664, + "balance_loss_clip": 1.0583775, + "balance_loss_mlp": 1.03876114, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.048544371838898, + "language_loss": 0.90912133, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.93141234, + "num_input_tokens_seen": 27422855, + "step": 1291, + "time_per_iteration": 2.5296120643615723 + }, + { + "auxiliary_loss_clip": 0.01063723, + "auxiliary_loss_mlp": 0.01011043, + "balance_loss_clip": 1.02685416, + "balance_loss_mlp": 1.00627446, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.8874479904709917, + "language_loss": 0.65095544, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.6717031, + "num_input_tokens_seen": 27487190, + "step": 1292, + "time_per_iteration": 3.230682134628296 + }, + { + "auxiliary_loss_clip": 0.01185415, + "auxiliary_loss_mlp": 0.01058336, + "balance_loss_clip": 1.05759096, + "balance_loss_mlp": 1.03503036, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.8783728786898657, + "language_loss": 0.87705308, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.89949059, + "num_input_tokens_seen": 27510465, + "step": 1293, + "time_per_iteration": 2.599426507949829 + }, + { + "auxiliary_loss_clip": 0.01116893, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_clip": 1.05588436, + "balance_loss_mlp": 1.05012131, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.103355026491555, + "language_loss": 0.85120755, + "learning_rate": 3.976081376263239e-06, + "loss": 0.87316972, + "num_input_tokens_seen": 27528645, + "step": 1294, + "time_per_iteration": 2.6664233207702637 + }, + { + "auxiliary_loss_clip": 0.01152312, + "auxiliary_loss_mlp": 0.01058779, + "balance_loss_clip": 1.05731416, + "balance_loss_mlp": 1.03369713, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.5106051730394525, + "language_loss": 0.79053247, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81264341, + "num_input_tokens_seen": 27546165, + "step": 1295, + "time_per_iteration": 2.5529909133911133 + }, + { + "auxiliary_loss_clip": 0.01145645, + "auxiliary_loss_mlp": 0.01059502, + "balance_loss_clip": 1.052472, + "balance_loss_mlp": 1.03509998, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.4770748893685397, + "language_loss": 0.87928391, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90133536, + "num_input_tokens_seen": 27566520, + "step": 1296, + "time_per_iteration": 2.6050217151641846 + }, + { + "auxiliary_loss_clip": 0.01199516, + "auxiliary_loss_mlp": 0.01063421, + "balance_loss_clip": 1.0574789, + "balance_loss_mlp": 1.03891194, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 2.9589175995729646, + "language_loss": 0.96104366, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.9836731, + "num_input_tokens_seen": 27581960, + "step": 1297, + "time_per_iteration": 2.442082405090332 + }, + { + "auxiliary_loss_clip": 0.01169086, + "auxiliary_loss_mlp": 0.01059244, + "balance_loss_clip": 1.05804729, + "balance_loss_mlp": 1.0358429, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.2838807321652306, + "language_loss": 0.75959408, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78187734, + "num_input_tokens_seen": 27601415, + "step": 1298, + "time_per_iteration": 2.6006946563720703 + }, + { + "auxiliary_loss_clip": 0.01139115, + "auxiliary_loss_mlp": 0.00819846, + "balance_loss_clip": 1.05488229, + "balance_loss_mlp": 1.0644865, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 4.0639267411131135, + "language_loss": 0.80770528, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.82729489, + "num_input_tokens_seen": 27621490, + "step": 1299, + "time_per_iteration": 2.6070175170898438 + }, + { + "auxiliary_loss_clip": 0.01153521, + "auxiliary_loss_mlp": 0.0106363, + "balance_loss_clip": 1.05366075, + "balance_loss_mlp": 1.03909695, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 2.08973995769173, + "language_loss": 0.86592889, + "learning_rate": 3.975719713068202e-06, + "loss": 0.88810039, + "num_input_tokens_seen": 27640600, + "step": 1300, + "time_per_iteration": 2.607984781265259 + }, + { + "auxiliary_loss_clip": 0.01195339, + "auxiliary_loss_mlp": 0.01055136, + "balance_loss_clip": 1.05512214, + "balance_loss_mlp": 1.03043616, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 2.133961906184151, + "language_loss": 0.72056824, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74307299, + "num_input_tokens_seen": 27663070, + "step": 1301, + "time_per_iteration": 2.6316583156585693 + }, + { + "auxiliary_loss_clip": 0.01188846, + "auxiliary_loss_mlp": 0.01068177, + "balance_loss_clip": 1.05700922, + "balance_loss_mlp": 1.04425216, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.8635475486364876, + "language_loss": 0.70935726, + "learning_rate": 3.97559855928952e-06, + "loss": 0.73192751, + "num_input_tokens_seen": 27686425, + "step": 1302, + "time_per_iteration": 2.694078207015991 + }, + { + "auxiliary_loss_clip": 0.01157026, + "auxiliary_loss_mlp": 0.00807689, + "balance_loss_clip": 1.05854809, + "balance_loss_mlp": 1.04317915, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.1171932592363976, + "language_loss": 0.82126296, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.8409102, + "num_input_tokens_seen": 27704900, + "step": 1303, + "time_per_iteration": 2.5692477226257324 + }, + { + "auxiliary_loss_clip": 0.0117742, + "auxiliary_loss_mlp": 0.01064393, + "balance_loss_clip": 1.05251122, + "balance_loss_mlp": 1.0399313, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 1.6571278494302157, + "language_loss": 0.75208318, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.7745012, + "num_input_tokens_seen": 27724890, + "step": 1304, + "time_per_iteration": 2.5045342445373535 + }, + { + "auxiliary_loss_clip": 0.0119823, + "auxiliary_loss_mlp": 0.0106105, + "balance_loss_clip": 1.0592494, + "balance_loss_mlp": 1.03702903, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.975064385580126, + "language_loss": 0.76174819, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78434104, + "num_input_tokens_seen": 27743115, + "step": 1305, + "time_per_iteration": 2.4738821983337402 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.01066026, + "balance_loss_clip": 1.05631757, + "balance_loss_mlp": 1.04167151, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 1.7003105257272415, + "language_loss": 0.84975249, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87174678, + "num_input_tokens_seen": 27763570, + "step": 1306, + "time_per_iteration": 4.060633182525635 + }, + { + "auxiliary_loss_clip": 0.01186259, + "auxiliary_loss_mlp": 0.01045235, + "balance_loss_clip": 1.05862713, + "balance_loss_mlp": 1.02304983, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 2.4065625511026596, + "language_loss": 0.90150112, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92381608, + "num_input_tokens_seen": 27780030, + "step": 1307, + "time_per_iteration": 3.8742942810058594 + }, + { + "auxiliary_loss_clip": 0.0114008, + "auxiliary_loss_mlp": 0.01058473, + "balance_loss_clip": 1.0535022, + "balance_loss_mlp": 1.03386796, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.8778080999027962, + "language_loss": 0.83412665, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85611212, + "num_input_tokens_seen": 27796225, + "step": 1308, + "time_per_iteration": 2.5883257389068604 + }, + { + "auxiliary_loss_clip": 0.01170854, + "auxiliary_loss_mlp": 0.0105575, + "balance_loss_clip": 1.09223533, + "balance_loss_mlp": 1.03251672, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.479312861942881, + "language_loss": 0.77480745, + "learning_rate": 3.975172161365958e-06, + "loss": 0.79707348, + "num_input_tokens_seen": 27815975, + "step": 1309, + "time_per_iteration": 2.5950517654418945 + }, + { + "auxiliary_loss_clip": 0.01191854, + "auxiliary_loss_mlp": 0.01069343, + "balance_loss_clip": 1.05746651, + "balance_loss_mlp": 1.04341531, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 1.9267887761688451, + "language_loss": 0.80338287, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82599473, + "num_input_tokens_seen": 27832255, + "step": 1310, + "time_per_iteration": 2.488211154937744 + }, + { + "auxiliary_loss_clip": 0.0116691, + "auxiliary_loss_mlp": 0.0080258, + "balance_loss_clip": 1.05800986, + "balance_loss_mlp": 1.03434253, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 2.1714460809035345, + "language_loss": 0.73188102, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75157589, + "num_input_tokens_seen": 27852180, + "step": 1311, + "time_per_iteration": 3.9456276893615723 + }, + { + "auxiliary_loss_clip": 0.01182738, + "auxiliary_loss_mlp": 0.01076606, + "balance_loss_clip": 1.05830264, + "balance_loss_mlp": 1.05182242, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 2.0226517736299297, + "language_loss": 0.85988206, + "learning_rate": 3.974988295871553e-06, + "loss": 0.8824755, + "num_input_tokens_seen": 27871435, + "step": 1312, + "time_per_iteration": 2.546945333480835 + }, + { + "auxiliary_loss_clip": 0.01169489, + "auxiliary_loss_mlp": 0.0106227, + "balance_loss_clip": 1.05543721, + "balance_loss_mlp": 1.03998983, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.9637712943324916, + "language_loss": 0.82224369, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84456128, + "num_input_tokens_seen": 27890625, + "step": 1313, + "time_per_iteration": 2.5260608196258545 + }, + { + "auxiliary_loss_clip": 0.01177841, + "auxiliary_loss_mlp": 0.00802716, + "balance_loss_clip": 1.05751276, + "balance_loss_mlp": 1.03046298, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 2.9558166374315182, + "language_loss": 0.73633945, + "learning_rate": 3.97486534441264e-06, + "loss": 0.75614506, + "num_input_tokens_seen": 27906530, + "step": 1314, + "time_per_iteration": 3.8893513679504395 + }, + { + "auxiliary_loss_clip": 0.01173597, + "auxiliary_loss_mlp": 0.00800991, + "balance_loss_clip": 1.09289122, + "balance_loss_mlp": 1.02948976, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.632907592563888, + "language_loss": 0.79647273, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81621861, + "num_input_tokens_seen": 27926725, + "step": 1315, + "time_per_iteration": 2.6121950149536133 + }, + { + "auxiliary_loss_clip": 0.01182759, + "auxiliary_loss_mlp": 0.01064599, + "balance_loss_clip": 1.05315781, + "balance_loss_mlp": 1.03952909, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.9863534812643666, + "language_loss": 0.73712748, + "learning_rate": 3.974742093405362e-06, + "loss": 0.75960106, + "num_input_tokens_seen": 27947875, + "step": 1316, + "time_per_iteration": 2.5542471408843994 + }, + { + "auxiliary_loss_clip": 0.01158585, + "auxiliary_loss_mlp": 0.01064344, + "balance_loss_clip": 1.05928206, + "balance_loss_mlp": 1.0398823, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.987674638518574, + "language_loss": 0.65224314, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67447245, + "num_input_tokens_seen": 27965040, + "step": 1317, + "time_per_iteration": 2.5529956817626953 + }, + { + "auxiliary_loss_clip": 0.01190753, + "auxiliary_loss_mlp": 0.01064959, + "balance_loss_clip": 1.09238696, + "balance_loss_mlp": 1.04027045, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.10885541310027, + "language_loss": 0.73085082, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75340796, + "num_input_tokens_seen": 27985330, + "step": 1318, + "time_per_iteration": 2.6214659214019775 + }, + { + "auxiliary_loss_clip": 0.0113609, + "auxiliary_loss_mlp": 0.01059206, + "balance_loss_clip": 1.05669296, + "balance_loss_mlp": 1.03666377, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.6435995105208747, + "language_loss": 0.90417397, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92612696, + "num_input_tokens_seen": 28007615, + "step": 1319, + "time_per_iteration": 2.6516027450561523 + }, + { + "auxiliary_loss_clip": 0.01172799, + "auxiliary_loss_mlp": 0.01054952, + "balance_loss_clip": 1.05552268, + "balance_loss_mlp": 1.03052628, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.718758878503946, + "language_loss": 0.80164897, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82392645, + "num_input_tokens_seen": 28027765, + "step": 1320, + "time_per_iteration": 2.5317859649658203 + }, + { + "auxiliary_loss_clip": 0.01173778, + "auxiliary_loss_mlp": 0.01058758, + "balance_loss_clip": 1.05645776, + "balance_loss_mlp": 1.03578663, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.6861214567157754, + "language_loss": 0.69065177, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71297717, + "num_input_tokens_seen": 28044225, + "step": 1321, + "time_per_iteration": 2.5478408336639404 + }, + { + "auxiliary_loss_clip": 0.01179854, + "auxiliary_loss_mlp": 0.01056748, + "balance_loss_clip": 1.05625021, + "balance_loss_mlp": 1.0327034, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 2.1474158004917916, + "language_loss": 0.83983111, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.86219716, + "num_input_tokens_seen": 28062915, + "step": 1322, + "time_per_iteration": 2.52219295501709 + }, + { + "auxiliary_loss_clip": 0.01194876, + "auxiliary_loss_mlp": 0.01054199, + "balance_loss_clip": 1.05403364, + "balance_loss_mlp": 1.03108406, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 2.1241999569277774, + "language_loss": 0.90565312, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92814386, + "num_input_tokens_seen": 28082175, + "step": 1323, + "time_per_iteration": 2.4719998836517334 + }, + { + "auxiliary_loss_clip": 0.01154877, + "auxiliary_loss_mlp": 0.01056666, + "balance_loss_clip": 1.05411124, + "balance_loss_mlp": 1.03344417, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 2.536179061922817, + "language_loss": 0.82231033, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84442574, + "num_input_tokens_seen": 28102645, + "step": 1324, + "time_per_iteration": 2.5817573070526123 + }, + { + "auxiliary_loss_clip": 0.01172159, + "auxiliary_loss_mlp": 0.01051899, + "balance_loss_clip": 1.05541742, + "balance_loss_mlp": 1.02710342, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 6.823954890189742, + "language_loss": 0.78952956, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81177008, + "num_input_tokens_seen": 28122805, + "step": 1325, + "time_per_iteration": 2.546828031539917 + }, + { + "auxiliary_loss_clip": 0.01118436, + "auxiliary_loss_mlp": 0.00824185, + "balance_loss_clip": 1.05249095, + "balance_loss_mlp": 1.07054067, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.0881667375617563, + "language_loss": 0.88152897, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90095514, + "num_input_tokens_seen": 28140530, + "step": 1326, + "time_per_iteration": 2.612086057662964 + }, + { + "auxiliary_loss_clip": 0.01191167, + "auxiliary_loss_mlp": 0.01050751, + "balance_loss_clip": 1.05429244, + "balance_loss_mlp": 1.0256927, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 2.0700527780954165, + "language_loss": 0.8311559, + "learning_rate": 3.974058859276032e-06, + "loss": 0.85357511, + "num_input_tokens_seen": 28159640, + "step": 1327, + "time_per_iteration": 2.4839165210723877 + }, + { + "auxiliary_loss_clip": 0.01198057, + "auxiliary_loss_mlp": 0.01052118, + "balance_loss_clip": 1.05804348, + "balance_loss_mlp": 1.02777576, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.649129925990115, + "language_loss": 0.79176372, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.81426549, + "num_input_tokens_seen": 28177050, + "step": 1328, + "time_per_iteration": 2.4634907245635986 + }, + { + "auxiliary_loss_clip": 0.01190816, + "auxiliary_loss_mlp": 0.01056077, + "balance_loss_clip": 1.05813503, + "balance_loss_mlp": 1.03064942, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 3.35893583109153, + "language_loss": 0.73646677, + "learning_rate": 3.973933661662101e-06, + "loss": 0.75893569, + "num_input_tokens_seen": 28193245, + "step": 1329, + "time_per_iteration": 2.4774179458618164 + }, + { + "auxiliary_loss_clip": 0.01163311, + "auxiliary_loss_mlp": 0.01062977, + "balance_loss_clip": 1.05622911, + "balance_loss_mlp": 1.03932631, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 2.3118984942611016, + "language_loss": 0.81353331, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83579618, + "num_input_tokens_seen": 28213570, + "step": 1330, + "time_per_iteration": 2.5662713050842285 + }, + { + "auxiliary_loss_clip": 0.0119718, + "auxiliary_loss_mlp": 0.0079722, + "balance_loss_clip": 1.05741024, + "balance_loss_mlp": 1.02525437, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 1.9860645284044605, + "language_loss": 0.88573027, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.90567428, + "num_input_tokens_seen": 28229980, + "step": 1331, + "time_per_iteration": 2.431732177734375 + }, + { + "auxiliary_loss_clip": 0.01190779, + "auxiliary_loss_mlp": 0.00795673, + "balance_loss_clip": 1.05585694, + "balance_loss_mlp": 1.02053428, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 3.154731185758471, + "language_loss": 0.7350865, + "learning_rate": 3.973745303858942e-06, + "loss": 0.754951, + "num_input_tokens_seen": 28253840, + "step": 1332, + "time_per_iteration": 2.67429518699646 + }, + { + "auxiliary_loss_clip": 0.01195874, + "auxiliary_loss_mlp": 0.01053874, + "balance_loss_clip": 1.09148037, + "balance_loss_mlp": 1.03067589, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 2.040073806978841, + "language_loss": 0.82675743, + "learning_rate": 3.973682368232138e-06, + "loss": 0.84925497, + "num_input_tokens_seen": 28271675, + "step": 1333, + "time_per_iteration": 2.516737222671509 + }, + { + "auxiliary_loss_clip": 0.01174358, + "auxiliary_loss_mlp": 0.01056122, + "balance_loss_clip": 1.08929944, + "balance_loss_mlp": 1.03237569, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.4278991120773297, + "language_loss": 0.75136638, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.77367121, + "num_input_tokens_seen": 28291850, + "step": 1334, + "time_per_iteration": 2.6159162521362305 + }, + { + "auxiliary_loss_clip": 0.01165321, + "auxiliary_loss_mlp": 0.01057155, + "balance_loss_clip": 1.05587149, + "balance_loss_mlp": 1.03381395, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 1.8897166213671848, + "language_loss": 0.80116844, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82339317, + "num_input_tokens_seen": 28310780, + "step": 1335, + "time_per_iteration": 2.591738224029541 + }, + { + "auxiliary_loss_clip": 0.0105923, + "auxiliary_loss_mlp": 0.01020418, + "balance_loss_clip": 1.03156483, + "balance_loss_mlp": 1.01681745, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7394834998864181, + "language_loss": 0.56052023, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58131665, + "num_input_tokens_seen": 28369985, + "step": 1336, + "time_per_iteration": 3.218465805053711 + }, + { + "auxiliary_loss_clip": 0.0117439, + "auxiliary_loss_mlp": 0.01059226, + "balance_loss_clip": 1.05666602, + "balance_loss_mlp": 1.03623092, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.2992934074015645, + "language_loss": 0.67511618, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.69745237, + "num_input_tokens_seen": 28388670, + "step": 1337, + "time_per_iteration": 2.5586206912994385 + }, + { + "auxiliary_loss_clip": 0.01165354, + "auxiliary_loss_mlp": 0.01072423, + "balance_loss_clip": 1.05749238, + "balance_loss_mlp": 1.04685235, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.8725330679223322, + "language_loss": 0.86730832, + "learning_rate": 3.973366567512453e-06, + "loss": 0.88968611, + "num_input_tokens_seen": 28411845, + "step": 1338, + "time_per_iteration": 2.639244794845581 + }, + { + "auxiliary_loss_clip": 0.01135332, + "auxiliary_loss_mlp": 0.01072382, + "balance_loss_clip": 1.05084527, + "balance_loss_mlp": 1.04496396, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.5213288721145526, + "language_loss": 0.87351358, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89559072, + "num_input_tokens_seen": 28427875, + "step": 1339, + "time_per_iteration": 2.601747512817383 + }, + { + "auxiliary_loss_clip": 0.01181457, + "auxiliary_loss_mlp": 0.01051846, + "balance_loss_clip": 1.05542326, + "balance_loss_mlp": 1.0293988, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 2.347119697257138, + "language_loss": 0.89481509, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91714811, + "num_input_tokens_seen": 28446615, + "step": 1340, + "time_per_iteration": 2.5073328018188477 + }, + { + "auxiliary_loss_clip": 0.01082345, + "auxiliary_loss_mlp": 0.01008388, + "balance_loss_clip": 1.03208733, + "balance_loss_mlp": 1.00443065, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8780791031969445, + "language_loss": 0.64887929, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66978663, + "num_input_tokens_seen": 28505290, + "step": 1341, + "time_per_iteration": 3.041718006134033 + }, + { + "auxiliary_loss_clip": 0.01191823, + "auxiliary_loss_mlp": 0.01058982, + "balance_loss_clip": 1.05658054, + "balance_loss_mlp": 1.03353107, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 1.95042885043803, + "language_loss": 0.89343381, + "learning_rate": 3.973112579977733e-06, + "loss": 0.91594189, + "num_input_tokens_seen": 28522735, + "step": 1342, + "time_per_iteration": 2.530014753341675 + }, + { + "auxiliary_loss_clip": 0.01174859, + "auxiliary_loss_mlp": 0.01062768, + "balance_loss_clip": 1.06562257, + "balance_loss_mlp": 1.03754377, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.561275773477413, + "language_loss": 0.762501, + "learning_rate": 3.973048896036459e-06, + "loss": 0.7848773, + "num_input_tokens_seen": 28539460, + "step": 1343, + "time_per_iteration": 2.54150128364563 + }, + { + "auxiliary_loss_clip": 0.01097797, + "auxiliary_loss_mlp": 0.01009369, + "balance_loss_clip": 1.07620847, + "balance_loss_mlp": 1.00524461, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.795330008709181, + "language_loss": 0.57427239, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59534407, + "num_input_tokens_seen": 28599855, + "step": 1344, + "time_per_iteration": 3.0688464641571045 + }, + { + "auxiliary_loss_clip": 0.01152022, + "auxiliary_loss_mlp": 0.01060404, + "balance_loss_clip": 1.05450237, + "balance_loss_mlp": 1.03587091, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.434786142918157, + "language_loss": 0.86915791, + "learning_rate": 3.972921303701695e-06, + "loss": 0.89128214, + "num_input_tokens_seen": 28617585, + "step": 1345, + "time_per_iteration": 3.9129714965820312 + }, + { + "auxiliary_loss_clip": 0.01196309, + "auxiliary_loss_mlp": 0.01059386, + "balance_loss_clip": 1.0585413, + "balance_loss_mlp": 1.03696275, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.7988402900729554, + "language_loss": 0.87749505, + "learning_rate": 3.972857395313042e-06, + "loss": 0.90005207, + "num_input_tokens_seen": 28636355, + "step": 1346, + "time_per_iteration": 3.8620026111602783 + }, + { + "auxiliary_loss_clip": 0.011804, + "auxiliary_loss_mlp": 0.01053055, + "balance_loss_clip": 1.05572617, + "balance_loss_mlp": 1.02974975, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.5713892466306822, + "language_loss": 0.92796767, + "learning_rate": 3.972793412113439e-06, + "loss": 0.9503023, + "num_input_tokens_seen": 28656260, + "step": 1347, + "time_per_iteration": 2.5278053283691406 + }, + { + "auxiliary_loss_clip": 0.01204254, + "auxiliary_loss_mlp": 0.0105952, + "balance_loss_clip": 1.09220338, + "balance_loss_mlp": 1.03436649, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 1.6864334237843444, + "language_loss": 0.8950876, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91772532, + "num_input_tokens_seen": 28675865, + "step": 1348, + "time_per_iteration": 2.501950740814209 + }, + { + "auxiliary_loss_clip": 0.01152564, + "auxiliary_loss_mlp": 0.01063474, + "balance_loss_clip": 1.09079063, + "balance_loss_mlp": 1.0403831, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 3.4467167325069368, + "language_loss": 0.76749754, + "learning_rate": 3.97266522129109e-06, + "loss": 0.78965795, + "num_input_tokens_seen": 28696255, + "step": 1349, + "time_per_iteration": 2.610936164855957 + }, + { + "auxiliary_loss_clip": 0.0119493, + "auxiliary_loss_mlp": 0.01058324, + "balance_loss_clip": 1.05528498, + "balance_loss_mlp": 1.03397, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 1.8989390577797505, + "language_loss": 0.88429129, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90682381, + "num_input_tokens_seen": 28713905, + "step": 1350, + "time_per_iteration": 3.824913740158081 + }, + { + "auxiliary_loss_clip": 0.01178826, + "auxiliary_loss_mlp": 0.00858069, + "balance_loss_clip": 1.08871794, + "balance_loss_mlp": 1.14298868, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.1761253749468312, + "language_loss": 0.82542384, + "learning_rate": 3.972536731254092e-06, + "loss": 0.84579277, + "num_input_tokens_seen": 28732075, + "step": 1351, + "time_per_iteration": 2.5552327632904053 + }, + { + "auxiliary_loss_clip": 0.01191306, + "auxiliary_loss_mlp": 0.01050706, + "balance_loss_clip": 1.05221069, + "balance_loss_mlp": 1.02551651, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 1.961345574255941, + "language_loss": 0.75411856, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77653861, + "num_input_tokens_seen": 28751150, + "step": 1352, + "time_per_iteration": 3.8863532543182373 + }, + { + "auxiliary_loss_clip": 0.01186822, + "auxiliary_loss_mlp": 0.00810722, + "balance_loss_clip": 1.05904818, + "balance_loss_mlp": 1.04813504, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 1.8251132912884378, + "language_loss": 0.82826072, + "learning_rate": 3.972407942021935e-06, + "loss": 0.84823614, + "num_input_tokens_seen": 28773360, + "step": 1353, + "time_per_iteration": 2.5281124114990234 + }, + { + "auxiliary_loss_clip": 0.01072565, + "auxiliary_loss_mlp": 0.01009286, + "balance_loss_clip": 1.02626729, + "balance_loss_mlp": 1.00544798, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8549421491455913, + "language_loss": 0.59737825, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61819685, + "num_input_tokens_seen": 28833390, + "step": 1354, + "time_per_iteration": 3.1323704719543457 + }, + { + "auxiliary_loss_clip": 0.01149119, + "auxiliary_loss_mlp": 0.01057509, + "balance_loss_clip": 1.05428398, + "balance_loss_mlp": 1.03501415, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 2.1241352541284133, + "language_loss": 0.82738227, + "learning_rate": 3.972278853614154e-06, + "loss": 0.84944856, + "num_input_tokens_seen": 28852430, + "step": 1355, + "time_per_iteration": 2.562041759490967 + }, + { + "auxiliary_loss_clip": 0.01203307, + "auxiliary_loss_mlp": 0.010584, + "balance_loss_clip": 1.09103155, + "balance_loss_mlp": 1.03371167, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.9701699135637438, + "language_loss": 0.71023792, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73285496, + "num_input_tokens_seen": 28870685, + "step": 1356, + "time_per_iteration": 2.5126326084136963 + }, + { + "auxiliary_loss_clip": 0.01186326, + "auxiliary_loss_mlp": 0.01057462, + "balance_loss_clip": 1.05405152, + "balance_loss_mlp": 1.03401375, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 2.730456872983281, + "language_loss": 0.70089281, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72333068, + "num_input_tokens_seen": 28889860, + "step": 1357, + "time_per_iteration": 2.4966936111450195 + }, + { + "auxiliary_loss_clip": 0.01182222, + "auxiliary_loss_mlp": 0.0105655, + "balance_loss_clip": 1.05668271, + "balance_loss_mlp": 1.03310156, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.652534457599916, + "language_loss": 0.8441186, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86650634, + "num_input_tokens_seen": 28905865, + "step": 1358, + "time_per_iteration": 2.517043352127075 + }, + { + "auxiliary_loss_clip": 0.01171836, + "auxiliary_loss_mlp": 0.01063684, + "balance_loss_clip": 1.0558331, + "balance_loss_mlp": 1.03782797, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 2.120111389558974, + "language_loss": 1.02303839, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04539359, + "num_input_tokens_seen": 28925250, + "step": 1359, + "time_per_iteration": 2.55159330368042 + }, + { + "auxiliary_loss_clip": 0.01127729, + "auxiliary_loss_mlp": 0.0105833, + "balance_loss_clip": 1.05472505, + "balance_loss_mlp": 1.03404701, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.4215780227663326, + "language_loss": 0.84065056, + "learning_rate": 3.971954823829951e-06, + "loss": 0.86251116, + "num_input_tokens_seen": 28943445, + "step": 1360, + "time_per_iteration": 2.690230131149292 + }, + { + "auxiliary_loss_clip": 0.01194892, + "auxiliary_loss_mlp": 0.01068033, + "balance_loss_clip": 1.05528533, + "balance_loss_mlp": 1.04410791, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.49016972531053, + "language_loss": 0.71722198, + "learning_rate": 3.971889793533093e-06, + "loss": 0.73985118, + "num_input_tokens_seen": 28962695, + "step": 1361, + "time_per_iteration": 2.4689581394195557 + }, + { + "auxiliary_loss_clip": 0.0115907, + "auxiliary_loss_mlp": 0.01057113, + "balance_loss_clip": 1.04698396, + "balance_loss_mlp": 1.03216243, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 2.200134914475846, + "language_loss": 0.76544893, + "learning_rate": 3.971824688461976e-06, + "loss": 0.78761077, + "num_input_tokens_seen": 28982120, + "step": 1362, + "time_per_iteration": 2.5434932708740234 + }, + { + "auxiliary_loss_clip": 0.01194065, + "auxiliary_loss_mlp": 0.01055553, + "balance_loss_clip": 1.05716801, + "balance_loss_mlp": 1.03339171, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.5264349234260406, + "language_loss": 0.72381639, + "learning_rate": 3.971759508619069e-06, + "loss": 0.7463125, + "num_input_tokens_seen": 28998100, + "step": 1363, + "time_per_iteration": 2.438025712966919 + }, + { + "auxiliary_loss_clip": 0.0119535, + "auxiliary_loss_mlp": 0.01066038, + "balance_loss_clip": 1.05911791, + "balance_loss_mlp": 1.04053903, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 2.9670142336255867, + "language_loss": 0.77221179, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79482567, + "num_input_tokens_seen": 29017095, + "step": 1364, + "time_per_iteration": 2.5069632530212402 + }, + { + "auxiliary_loss_clip": 0.01146967, + "auxiliary_loss_mlp": 0.01066932, + "balance_loss_clip": 1.07619596, + "balance_loss_mlp": 1.04002666, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 1.6891767437743876, + "language_loss": 0.82133698, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84347594, + "num_input_tokens_seen": 29037240, + "step": 1365, + "time_per_iteration": 2.6055777072906494 + }, + { + "auxiliary_loss_clip": 0.01184743, + "auxiliary_loss_mlp": 0.01059947, + "balance_loss_clip": 1.05738413, + "balance_loss_mlp": 1.03635502, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 1.7937527562322797, + "language_loss": 0.82316637, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84561336, + "num_input_tokens_seen": 29056250, + "step": 1366, + "time_per_iteration": 2.517164707183838 + }, + { + "auxiliary_loss_clip": 0.01151298, + "auxiliary_loss_mlp": 0.01076251, + "balance_loss_clip": 1.05310857, + "balance_loss_mlp": 1.05193257, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 2.2678581268030182, + "language_loss": 0.81732595, + "learning_rate": 3.97149804157902e-06, + "loss": 0.83960152, + "num_input_tokens_seen": 29073380, + "step": 1367, + "time_per_iteration": 2.5454294681549072 + }, + { + "auxiliary_loss_clip": 0.0119738, + "auxiliary_loss_mlp": 0.01062053, + "balance_loss_clip": 1.0557487, + "balance_loss_mlp": 1.03858113, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.267629418302392, + "language_loss": 0.83387166, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.856466, + "num_input_tokens_seen": 29091330, + "step": 1368, + "time_per_iteration": 2.453807830810547 + }, + { + "auxiliary_loss_clip": 0.01149537, + "auxiliary_loss_mlp": 0.0104851, + "balance_loss_clip": 1.05284393, + "balance_loss_mlp": 1.02639651, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.672830543746323, + "language_loss": 0.81268024, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83466071, + "num_input_tokens_seen": 29110375, + "step": 1369, + "time_per_iteration": 2.5987918376922607 + }, + { + "auxiliary_loss_clip": 0.01134819, + "auxiliary_loss_mlp": 0.00897687, + "balance_loss_clip": 1.05420494, + "balance_loss_mlp": 1.20826483, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.3569416676976394, + "language_loss": 0.75210464, + "learning_rate": 3.971301156316582e-06, + "loss": 0.77242965, + "num_input_tokens_seen": 29129395, + "step": 1370, + "time_per_iteration": 2.675539493560791 + }, + { + "auxiliary_loss_clip": 0.01145145, + "auxiliary_loss_mlp": 0.01061663, + "balance_loss_clip": 1.05532598, + "balance_loss_mlp": 1.03771377, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.7746662832576605, + "language_loss": 0.74430811, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76637614, + "num_input_tokens_seen": 29148650, + "step": 1371, + "time_per_iteration": 2.606079339981079 + }, + { + "auxiliary_loss_clip": 0.01093728, + "auxiliary_loss_mlp": 0.01061006, + "balance_loss_clip": 1.05149138, + "balance_loss_mlp": 1.0356741, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 2.191756877851688, + "language_loss": 0.70751309, + "learning_rate": 3.971169525711122e-06, + "loss": 0.72906041, + "num_input_tokens_seen": 29170785, + "step": 1372, + "time_per_iteration": 3.0113046169281006 + }, + { + "auxiliary_loss_clip": 0.01164975, + "auxiliary_loss_mlp": 0.0105387, + "balance_loss_clip": 1.0682075, + "balance_loss_mlp": 1.02953911, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 2.477101707785931, + "language_loss": 0.88175493, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90394342, + "num_input_tokens_seen": 29185210, + "step": 1373, + "time_per_iteration": 2.701233386993408 + }, + { + "auxiliary_loss_clip": 0.01146135, + "auxiliary_loss_mlp": 0.01061176, + "balance_loss_clip": 1.05686069, + "balance_loss_mlp": 1.03772759, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.7556608681570234, + "language_loss": 0.82056987, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84264302, + "num_input_tokens_seen": 29205210, + "step": 1374, + "time_per_iteration": 2.609203577041626 + }, + { + "auxiliary_loss_clip": 0.0104044, + "auxiliary_loss_mlp": 0.01022793, + "balance_loss_clip": 1.02812564, + "balance_loss_mlp": 1.01902616, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8224806421073857, + "language_loss": 0.60635924, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62699157, + "num_input_tokens_seen": 29265350, + "step": 1375, + "time_per_iteration": 3.2022194862365723 + }, + { + "auxiliary_loss_clip": 0.01058865, + "auxiliary_loss_mlp": 0.01009147, + "balance_loss_clip": 1.0207082, + "balance_loss_mlp": 1.00588059, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9075195824136949, + "language_loss": 0.62189412, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64257431, + "num_input_tokens_seen": 29321475, + "step": 1376, + "time_per_iteration": 3.061965227127075 + }, + { + "auxiliary_loss_clip": 0.011643, + "auxiliary_loss_mlp": 0.01069168, + "balance_loss_clip": 1.07208121, + "balance_loss_mlp": 1.0456841, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.8100377799768825, + "language_loss": 0.82557929, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84791398, + "num_input_tokens_seen": 29341405, + "step": 1377, + "time_per_iteration": 2.646111488342285 + }, + { + "auxiliary_loss_clip": 0.01167409, + "auxiliary_loss_mlp": 0.01055555, + "balance_loss_clip": 1.05490303, + "balance_loss_mlp": 1.03203547, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.7518277713017192, + "language_loss": 0.84448099, + "learning_rate": 3.970772840048147e-06, + "loss": 0.86671066, + "num_input_tokens_seen": 29361955, + "step": 1378, + "time_per_iteration": 2.589545249938965 + }, + { + "auxiliary_loss_clip": 0.01179713, + "auxiliary_loss_mlp": 0.01066904, + "balance_loss_clip": 1.05354333, + "balance_loss_mlp": 1.04240668, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 3.0575904561770804, + "language_loss": 0.87582791, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89829403, + "num_input_tokens_seen": 29382395, + "step": 1379, + "time_per_iteration": 2.603670358657837 + }, + { + "auxiliary_loss_clip": 0.01161965, + "auxiliary_loss_mlp": 0.01062357, + "balance_loss_clip": 1.0676018, + "balance_loss_mlp": 1.03919435, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 2.1479643117290066, + "language_loss": 0.78515524, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80739844, + "num_input_tokens_seen": 29404460, + "step": 1380, + "time_per_iteration": 2.7013583183288574 + }, + { + "auxiliary_loss_clip": 0.01177288, + "auxiliary_loss_mlp": 0.01057459, + "balance_loss_clip": 1.05616283, + "balance_loss_mlp": 1.03279459, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.938776720082317, + "language_loss": 0.86105061, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88339806, + "num_input_tokens_seen": 29422675, + "step": 1381, + "time_per_iteration": 2.5222864151000977 + }, + { + "auxiliary_loss_clip": 0.01190585, + "auxiliary_loss_mlp": 0.00890074, + "balance_loss_clip": 1.0583744, + "balance_loss_mlp": 1.19936597, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 3.7238317189119847, + "language_loss": 0.88423169, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90503824, + "num_input_tokens_seen": 29439840, + "step": 1382, + "time_per_iteration": 2.5308749675750732 + }, + { + "auxiliary_loss_clip": 0.01154684, + "auxiliary_loss_mlp": 0.01059978, + "balance_loss_clip": 1.05543566, + "balance_loss_mlp": 1.03745937, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.8361848316785485, + "language_loss": 0.77227545, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79442203, + "num_input_tokens_seen": 29457360, + "step": 1383, + "time_per_iteration": 3.9899027347564697 + }, + { + "auxiliary_loss_clip": 0.011873, + "auxiliary_loss_mlp": 0.01058517, + "balance_loss_clip": 1.05763626, + "balance_loss_mlp": 1.03459144, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 1.9280332702507494, + "language_loss": 0.82792896, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85038716, + "num_input_tokens_seen": 29477040, + "step": 1384, + "time_per_iteration": 2.5434463024139404 + }, + { + "auxiliary_loss_clip": 0.0114428, + "auxiliary_loss_mlp": 0.01057706, + "balance_loss_clip": 1.05584311, + "balance_loss_mlp": 1.02971518, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.445617375163999, + "language_loss": 0.84979594, + "learning_rate": 3.970306639845e-06, + "loss": 0.8718158, + "num_input_tokens_seen": 29492010, + "step": 1385, + "time_per_iteration": 3.997011184692383 + }, + { + "auxiliary_loss_clip": 0.01152675, + "auxiliary_loss_mlp": 0.01067989, + "balance_loss_clip": 1.05599785, + "balance_loss_mlp": 1.0429548, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 3.22368202664472, + "language_loss": 0.68799365, + "learning_rate": 3.970239740938835e-06, + "loss": 0.71020031, + "num_input_tokens_seen": 29511850, + "step": 1386, + "time_per_iteration": 2.5908398628234863 + }, + { + "auxiliary_loss_clip": 0.01170583, + "auxiliary_loss_mlp": 0.01056343, + "balance_loss_clip": 1.05424988, + "balance_loss_mlp": 1.03229821, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.7340584476387966, + "language_loss": 0.82052439, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84279364, + "num_input_tokens_seen": 29531415, + "step": 1387, + "time_per_iteration": 2.5233919620513916 + }, + { + "auxiliary_loss_clip": 0.01171544, + "auxiliary_loss_mlp": 0.01068417, + "balance_loss_clip": 1.05527532, + "balance_loss_mlp": 1.04350209, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 2.881857166179083, + "language_loss": 0.77649295, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79889262, + "num_input_tokens_seen": 29549525, + "step": 1388, + "time_per_iteration": 2.6173527240753174 + }, + { + "auxiliary_loss_clip": 0.0112539, + "auxiliary_loss_mlp": 0.01065496, + "balance_loss_clip": 1.05334783, + "balance_loss_mlp": 1.04036713, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.1308519509124317, + "language_loss": 0.7875731, + "learning_rate": 3.970038595960369e-06, + "loss": 0.80948198, + "num_input_tokens_seen": 29568705, + "step": 1389, + "time_per_iteration": 4.030388593673706 + }, + { + "auxiliary_loss_clip": 0.01169976, + "auxiliary_loss_mlp": 0.0105954, + "balance_loss_clip": 1.05550814, + "balance_loss_mlp": 1.03548348, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 2.2344612518923666, + "language_loss": 0.87197471, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89426982, + "num_input_tokens_seen": 29585855, + "step": 1390, + "time_per_iteration": 2.4936447143554688 + }, + { + "auxiliary_loss_clip": 0.01157074, + "auxiliary_loss_mlp": 0.01067847, + "balance_loss_clip": 1.05366492, + "balance_loss_mlp": 1.04129887, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.7341200012645293, + "language_loss": 0.86993837, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89218765, + "num_input_tokens_seen": 29607280, + "step": 1391, + "time_per_iteration": 3.943129062652588 + }, + { + "auxiliary_loss_clip": 0.01152783, + "auxiliary_loss_mlp": 0.01070827, + "balance_loss_clip": 1.05673289, + "balance_loss_mlp": 1.04697299, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 2.5061505314481667, + "language_loss": 0.87487209, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89710814, + "num_input_tokens_seen": 29624130, + "step": 1392, + "time_per_iteration": 2.5995640754699707 + }, + { + "auxiliary_loss_clip": 0.01180615, + "auxiliary_loss_mlp": 0.01061944, + "balance_loss_clip": 1.05436158, + "balance_loss_mlp": 1.03818607, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 4.552697192328562, + "language_loss": 0.79898894, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82141447, + "num_input_tokens_seen": 29643210, + "step": 1393, + "time_per_iteration": 2.5235164165496826 + }, + { + "auxiliary_loss_clip": 0.01195648, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_clip": 1.05996811, + "balance_loss_mlp": 1.03364515, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.7724223673837824, + "language_loss": 0.84837413, + "learning_rate": 3.969701860282415e-06, + "loss": 0.87089789, + "num_input_tokens_seen": 29663920, + "step": 1394, + "time_per_iteration": 2.5576984882354736 + }, + { + "auxiliary_loss_clip": 0.01143659, + "auxiliary_loss_mlp": 0.01056912, + "balance_loss_clip": 1.05838418, + "balance_loss_mlp": 1.03335643, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 2.204175162367542, + "language_loss": 0.83022642, + "learning_rate": 3.969634289062719e-06, + "loss": 0.8522321, + "num_input_tokens_seen": 29683825, + "step": 1395, + "time_per_iteration": 2.6070992946624756 + }, + { + "auxiliary_loss_clip": 0.01186113, + "auxiliary_loss_mlp": 0.00872722, + "balance_loss_clip": 1.05989683, + "balance_loss_mlp": 1.16321957, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 3.9227238215097096, + "language_loss": 0.8299917, + "learning_rate": 3.969566643154293e-06, + "loss": 0.8505801, + "num_input_tokens_seen": 29698775, + "step": 1396, + "time_per_iteration": 2.504648447036743 + }, + { + "auxiliary_loss_clip": 0.01183941, + "auxiliary_loss_mlp": 0.01062161, + "balance_loss_clip": 1.06043482, + "balance_loss_mlp": 1.03632832, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 2.161229699558928, + "language_loss": 0.76433504, + "learning_rate": 3.969498922559703e-06, + "loss": 0.78679609, + "num_input_tokens_seen": 29719430, + "step": 1397, + "time_per_iteration": 2.5364434719085693 + }, + { + "auxiliary_loss_clip": 0.0115222, + "auxiliary_loss_mlp": 0.01051812, + "balance_loss_clip": 1.05606222, + "balance_loss_mlp": 1.02684915, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.079525483133427, + "language_loss": 0.78191149, + "learning_rate": 3.969431127281516e-06, + "loss": 0.80395186, + "num_input_tokens_seen": 29739685, + "step": 1398, + "time_per_iteration": 2.612788677215576 + }, + { + "auxiliary_loss_clip": 0.01191011, + "auxiliary_loss_mlp": 0.01055553, + "balance_loss_clip": 1.05782962, + "balance_loss_mlp": 1.03184235, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 2.517325156913226, + "language_loss": 0.95005888, + "learning_rate": 3.969363257322304e-06, + "loss": 0.97252452, + "num_input_tokens_seen": 29756165, + "step": 1399, + "time_per_iteration": 2.446211814880371 + }, + { + "auxiliary_loss_clip": 0.01173696, + "auxiliary_loss_mlp": 0.01065278, + "balance_loss_clip": 1.05642569, + "balance_loss_mlp": 1.03859913, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 2.3651670784164684, + "language_loss": 0.81925595, + "learning_rate": 3.96929531268464e-06, + "loss": 0.8416456, + "num_input_tokens_seen": 29776425, + "step": 1400, + "time_per_iteration": 2.5432238578796387 + }, + { + "auxiliary_loss_clip": 0.01169142, + "auxiliary_loss_mlp": 0.01058607, + "balance_loss_clip": 1.05840206, + "balance_loss_mlp": 1.03462243, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 2.1079399510985963, + "language_loss": 0.86965173, + "learning_rate": 3.969227293371099e-06, + "loss": 0.89192921, + "num_input_tokens_seen": 29796440, + "step": 1401, + "time_per_iteration": 2.589743137359619 + }, + { + "auxiliary_loss_clip": 0.01194684, + "auxiliary_loss_mlp": 0.01061837, + "balance_loss_clip": 1.05581963, + "balance_loss_mlp": 1.03607583, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.162950752363109, + "language_loss": 0.87466592, + "learning_rate": 3.969159199384263e-06, + "loss": 0.8972311, + "num_input_tokens_seen": 29814755, + "step": 1402, + "time_per_iteration": 2.462029218673706 + }, + { + "auxiliary_loss_clip": 0.01153605, + "auxiliary_loss_mlp": 0.00846838, + "balance_loss_clip": 1.06595874, + "balance_loss_mlp": 1.11910236, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.345988116567741, + "language_loss": 0.88876134, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90876579, + "num_input_tokens_seen": 29834785, + "step": 1403, + "time_per_iteration": 2.7763559818267822 + }, + { + "auxiliary_loss_clip": 0.01165005, + "auxiliary_loss_mlp": 0.01057184, + "balance_loss_clip": 1.05639803, + "balance_loss_mlp": 1.03236449, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1.882429202391723, + "language_loss": 0.80095994, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82318187, + "num_input_tokens_seen": 29854695, + "step": 1404, + "time_per_iteration": 2.545522689819336 + }, + { + "auxiliary_loss_clip": 0.01178434, + "auxiliary_loss_mlp": 0.01068557, + "balance_loss_clip": 1.05901933, + "balance_loss_mlp": 1.04379702, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.188668829162983, + "language_loss": 0.83544254, + "learning_rate": 3.968954469409811e-06, + "loss": 0.85791242, + "num_input_tokens_seen": 29872180, + "step": 1405, + "time_per_iteration": 2.511488914489746 + }, + { + "auxiliary_loss_clip": 0.01178522, + "auxiliary_loss_mlp": 0.01056872, + "balance_loss_clip": 1.05285501, + "balance_loss_mlp": 1.03363848, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.5831089315003686, + "language_loss": 0.79751539, + "learning_rate": 3.968886076755639e-06, + "loss": 0.81986934, + "num_input_tokens_seen": 29893205, + "step": 1406, + "time_per_iteration": 2.5482656955718994 + }, + { + "auxiliary_loss_clip": 0.01178643, + "auxiliary_loss_mlp": 0.0106595, + "balance_loss_clip": 1.06923532, + "balance_loss_mlp": 1.04226279, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.8019563381331154, + "language_loss": 0.7957387, + "learning_rate": 3.96881760944111e-06, + "loss": 0.81818467, + "num_input_tokens_seen": 29911970, + "step": 1407, + "time_per_iteration": 2.542900323867798 + }, + { + "auxiliary_loss_clip": 0.01182629, + "auxiliary_loss_mlp": 0.01052793, + "balance_loss_clip": 1.05551589, + "balance_loss_mlp": 1.02984524, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 2.1497825349822746, + "language_loss": 0.91968518, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94203937, + "num_input_tokens_seen": 29929925, + "step": 1408, + "time_per_iteration": 2.5068957805633545 + }, + { + "auxiliary_loss_clip": 0.01069168, + "auxiliary_loss_mlp": 0.0101737, + "balance_loss_clip": 1.02955818, + "balance_loss_mlp": 1.01386535, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.88392969356341, + "language_loss": 0.61851841, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63938379, + "num_input_tokens_seen": 29985950, + "step": 1409, + "time_per_iteration": 3.221773147583008 + }, + { + "auxiliary_loss_clip": 0.01185434, + "auxiliary_loss_mlp": 0.01057048, + "balance_loss_clip": 1.05449247, + "balance_loss_mlp": 1.03417218, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 2.0555788572955818, + "language_loss": 0.86712021, + "learning_rate": 3.968611759561355e-06, + "loss": 0.88954502, + "num_input_tokens_seen": 30004330, + "step": 1410, + "time_per_iteration": 2.483055591583252 + }, + { + "auxiliary_loss_clip": 0.01176704, + "auxiliary_loss_mlp": 0.01052888, + "balance_loss_clip": 1.05510259, + "balance_loss_mlp": 1.02799749, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.7087830253917407, + "language_loss": 0.73945272, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76174873, + "num_input_tokens_seen": 30022555, + "step": 1411, + "time_per_iteration": 2.5148556232452393 + }, + { + "auxiliary_loss_clip": 0.01083948, + "auxiliary_loss_mlp": 0.01005256, + "balance_loss_clip": 1.02542758, + "balance_loss_mlp": 1.00163221, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9120626682348201, + "language_loss": 0.56755084, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58844286, + "num_input_tokens_seen": 30077220, + "step": 1412, + "time_per_iteration": 3.0098018646240234 + }, + { + "auxiliary_loss_clip": 0.01155241, + "auxiliary_loss_mlp": 0.01074551, + "balance_loss_clip": 1.05215609, + "balance_loss_mlp": 1.04878998, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.171377085053349, + "language_loss": 0.89338124, + "learning_rate": 3.96840523783202e-06, + "loss": 0.9156791, + "num_input_tokens_seen": 30094600, + "step": 1413, + "time_per_iteration": 2.5200605392456055 + }, + { + "auxiliary_loss_clip": 0.01163606, + "auxiliary_loss_mlp": 0.01053293, + "balance_loss_clip": 1.05668688, + "balance_loss_mlp": 1.02868867, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.1201041957451294, + "language_loss": 0.87991083, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90207976, + "num_input_tokens_seen": 30114475, + "step": 1414, + "time_per_iteration": 2.552064895629883 + }, + { + "auxiliary_loss_clip": 0.01167031, + "auxiliary_loss_mlp": 0.01065573, + "balance_loss_clip": 1.056283, + "balance_loss_mlp": 1.04337621, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.907492519910922, + "language_loss": 0.77443445, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79676056, + "num_input_tokens_seen": 30133350, + "step": 1415, + "time_per_iteration": 2.536937952041626 + }, + { + "auxiliary_loss_clip": 0.01177887, + "auxiliary_loss_mlp": 0.0106011, + "balance_loss_clip": 1.0552938, + "balance_loss_mlp": 1.03866386, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 2.330268511497652, + "language_loss": 0.70902288, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73140287, + "num_input_tokens_seen": 30159005, + "step": 1416, + "time_per_iteration": 2.8506417274475098 + }, + { + "auxiliary_loss_clip": 0.01172443, + "auxiliary_loss_mlp": 0.01065794, + "balance_loss_clip": 1.05826163, + "balance_loss_mlp": 1.04030681, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 2.161262581385308, + "language_loss": 0.75072491, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77310729, + "num_input_tokens_seen": 30179450, + "step": 1417, + "time_per_iteration": 2.6399874687194824 + }, + { + "auxiliary_loss_clip": 0.01163884, + "auxiliary_loss_mlp": 0.01053818, + "balance_loss_clip": 1.05194819, + "balance_loss_mlp": 1.03029823, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 3.7367646176802873, + "language_loss": 0.82233644, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84451348, + "num_input_tokens_seen": 30197235, + "step": 1418, + "time_per_iteration": 2.520747423171997 + }, + { + "auxiliary_loss_clip": 0.01049132, + "auxiliary_loss_mlp": 0.0102198, + "balance_loss_clip": 1.02819371, + "balance_loss_mlp": 1.0182128, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8681441192398974, + "language_loss": 0.5662781, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58698922, + "num_input_tokens_seen": 30257410, + "step": 1419, + "time_per_iteration": 3.1313247680664062 + }, + { + "auxiliary_loss_clip": 0.01187534, + "auxiliary_loss_mlp": 0.0105756, + "balance_loss_clip": 1.05239344, + "balance_loss_mlp": 1.03390884, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.289767681583507, + "language_loss": 0.70208764, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72453862, + "num_input_tokens_seen": 30277865, + "step": 1420, + "time_per_iteration": 2.517216444015503 + }, + { + "auxiliary_loss_clip": 0.01150014, + "auxiliary_loss_mlp": 0.01050475, + "balance_loss_clip": 1.05045009, + "balance_loss_mlp": 1.02665687, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.5635443028992941, + "language_loss": 0.87954116, + "learning_rate": 3.967851229159252e-06, + "loss": 0.901546, + "num_input_tokens_seen": 30298545, + "step": 1421, + "time_per_iteration": 4.0657830238342285 + }, + { + "auxiliary_loss_clip": 0.01083126, + "auxiliary_loss_mlp": 0.01007651, + "balance_loss_clip": 1.02647924, + "balance_loss_mlp": 1.00395584, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7913341862450514, + "language_loss": 0.63474226, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65565002, + "num_input_tokens_seen": 30361725, + "step": 1422, + "time_per_iteration": 3.0729610919952393 + }, + { + "auxiliary_loss_clip": 0.01145952, + "auxiliary_loss_mlp": 0.0105375, + "balance_loss_clip": 1.05371153, + "balance_loss_mlp": 1.03071904, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 2.816338005345657, + "language_loss": 0.82965112, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85164815, + "num_input_tokens_seen": 30382180, + "step": 1423, + "time_per_iteration": 4.000679969787598 + }, + { + "auxiliary_loss_clip": 0.01154985, + "auxiliary_loss_mlp": 0.01062477, + "balance_loss_clip": 1.0565834, + "balance_loss_mlp": 1.03837311, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.6839336331805932, + "language_loss": 0.75055611, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77273077, + "num_input_tokens_seen": 30402980, + "step": 1424, + "time_per_iteration": 2.6163113117218018 + }, + { + "auxiliary_loss_clip": 0.0113492, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.05402708, + "balance_loss_mlp": 1.0321734, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 2.0534475492613002, + "language_loss": 0.75968826, + "learning_rate": 3.96757243383196e-06, + "loss": 0.78158724, + "num_input_tokens_seen": 30420800, + "step": 1425, + "time_per_iteration": 2.5937554836273193 + }, + { + "auxiliary_loss_clip": 0.01187342, + "auxiliary_loss_mlp": 0.01048808, + "balance_loss_clip": 1.05553544, + "balance_loss_mlp": 1.02633739, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.144556785493343, + "language_loss": 0.93508077, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95744228, + "num_input_tokens_seen": 30439620, + "step": 1426, + "time_per_iteration": 2.482816219329834 + }, + { + "auxiliary_loss_clip": 0.01140865, + "auxiliary_loss_mlp": 0.0106059, + "balance_loss_clip": 1.05449331, + "balance_loss_mlp": 1.03492403, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.688516678536283, + "language_loss": 0.75635034, + "learning_rate": 3.967432588494471e-06, + "loss": 0.7783649, + "num_input_tokens_seen": 30457300, + "step": 1427, + "time_per_iteration": 2.62381649017334 + }, + { + "auxiliary_loss_clip": 0.01186955, + "auxiliary_loss_mlp": 0.01054606, + "balance_loss_clip": 1.05530071, + "balance_loss_mlp": 1.03229034, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 3.0212352255102766, + "language_loss": 0.82156587, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84398156, + "num_input_tokens_seen": 30471580, + "step": 1428, + "time_per_iteration": 3.8691062927246094 + }, + { + "auxiliary_loss_clip": 0.01173836, + "auxiliary_loss_mlp": 0.01059837, + "balance_loss_clip": 1.05977106, + "balance_loss_mlp": 1.03594768, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.2942504496676923, + "language_loss": 0.79669249, + "learning_rate": 3.967292444736023e-06, + "loss": 0.81902921, + "num_input_tokens_seen": 30492720, + "step": 1429, + "time_per_iteration": 2.5814480781555176 + }, + { + "auxiliary_loss_clip": 0.01165681, + "auxiliary_loss_mlp": 0.01061517, + "balance_loss_clip": 1.05703187, + "balance_loss_mlp": 1.03847396, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 2.1062116865199294, + "language_loss": 0.8821522, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90442419, + "num_input_tokens_seen": 30509535, + "step": 1430, + "time_per_iteration": 3.900022506713867 + }, + { + "auxiliary_loss_clip": 0.01141174, + "auxiliary_loss_mlp": 0.01070572, + "balance_loss_clip": 1.05605721, + "balance_loss_mlp": 1.04744518, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.648138255737804, + "language_loss": 0.82049704, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84261447, + "num_input_tokens_seen": 30529490, + "step": 1431, + "time_per_iteration": 2.582465171813965 + }, + { + "auxiliary_loss_clip": 0.01148417, + "auxiliary_loss_mlp": 0.01056838, + "balance_loss_clip": 1.05611229, + "balance_loss_mlp": 1.03285336, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.5753224400325532, + "language_loss": 0.7782678, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80032033, + "num_input_tokens_seen": 30550205, + "step": 1432, + "time_per_iteration": 2.6299450397491455 + }, + { + "auxiliary_loss_clip": 0.01164173, + "auxiliary_loss_mlp": 0.0105869, + "balance_loss_clip": 1.05182111, + "balance_loss_mlp": 1.03462112, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 2.122380661239903, + "language_loss": 0.72660923, + "learning_rate": 3.967011262041315e-06, + "loss": 0.74883783, + "num_input_tokens_seen": 30568830, + "step": 1433, + "time_per_iteration": 2.554478406906128 + }, + { + "auxiliary_loss_clip": 0.01149844, + "auxiliary_loss_mlp": 0.00800064, + "balance_loss_clip": 1.05399513, + "balance_loss_mlp": 1.03091598, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.7851188848530937, + "language_loss": 0.85714734, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87664646, + "num_input_tokens_seen": 30585730, + "step": 1434, + "time_per_iteration": 2.5908522605895996 + }, + { + "auxiliary_loss_clip": 0.01167269, + "auxiliary_loss_mlp": 0.01054149, + "balance_loss_clip": 1.05405867, + "balance_loss_mlp": 1.03080797, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 2.148692344025134, + "language_loss": 0.78694725, + "learning_rate": 3.966870223147707e-06, + "loss": 0.80916142, + "num_input_tokens_seen": 30603180, + "step": 1435, + "time_per_iteration": 2.5363612174987793 + }, + { + "auxiliary_loss_clip": 0.01074257, + "auxiliary_loss_mlp": 0.0100768, + "balance_loss_clip": 1.05984211, + "balance_loss_mlp": 1.00405622, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8873535031849531, + "language_loss": 0.57923287, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60005224, + "num_input_tokens_seen": 30668895, + "step": 1436, + "time_per_iteration": 3.2980265617370605 + }, + { + "auxiliary_loss_clip": 0.01172773, + "auxiliary_loss_mlp": 0.01052721, + "balance_loss_clip": 1.05625916, + "balance_loss_mlp": 1.02721071, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.3282229442786573, + "language_loss": 0.69479793, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71705294, + "num_input_tokens_seen": 30688955, + "step": 1437, + "time_per_iteration": 2.596550941467285 + }, + { + "auxiliary_loss_clip": 0.01119571, + "auxiliary_loss_mlp": 0.0105307, + "balance_loss_clip": 1.05314267, + "balance_loss_mlp": 1.03040862, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.026902945362301, + "language_loss": 0.723836, + "learning_rate": 3.966658105434627e-06, + "loss": 0.74556243, + "num_input_tokens_seen": 30706095, + "step": 1438, + "time_per_iteration": 2.6453452110290527 + }, + { + "auxiliary_loss_clip": 0.011752, + "auxiliary_loss_mlp": 0.0104966, + "balance_loss_clip": 1.05709982, + "balance_loss_mlp": 1.02629542, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.6839047953935795, + "language_loss": 0.64292943, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66517806, + "num_input_tokens_seen": 30729025, + "step": 1439, + "time_per_iteration": 2.64375638961792 + }, + { + "auxiliary_loss_clip": 0.0115247, + "auxiliary_loss_mlp": 0.01055511, + "balance_loss_clip": 1.05702353, + "balance_loss_mlp": 1.03138304, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 1.9747424452714684, + "language_loss": 0.8745411, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89662087, + "num_input_tokens_seen": 30746155, + "step": 1440, + "time_per_iteration": 2.5738162994384766 + }, + { + "auxiliary_loss_clip": 0.01153302, + "auxiliary_loss_mlp": 0.00793454, + "balance_loss_clip": 1.0578903, + "balance_loss_mlp": 1.01519179, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.3783300870478095, + "language_loss": 0.83693051, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85639805, + "num_input_tokens_seen": 30761410, + "step": 1441, + "time_per_iteration": 2.6063992977142334 + }, + { + "auxiliary_loss_clip": 0.01085319, + "auxiliary_loss_mlp": 0.0101356, + "balance_loss_clip": 1.03012192, + "balance_loss_mlp": 1.01041341, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8530046741951038, + "language_loss": 0.60457605, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62556487, + "num_input_tokens_seen": 30823010, + "step": 1442, + "time_per_iteration": 3.19811749458313 + }, + { + "auxiliary_loss_clip": 0.0116451, + "auxiliary_loss_mlp": 0.01051623, + "balance_loss_clip": 1.05652547, + "balance_loss_mlp": 1.02887809, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 2.4859857258956715, + "language_loss": 0.79005504, + "learning_rate": 3.96630308443127e-06, + "loss": 0.8122164, + "num_input_tokens_seen": 30841980, + "step": 1443, + "time_per_iteration": 2.5527591705322266 + }, + { + "auxiliary_loss_clip": 0.01178351, + "auxiliary_loss_mlp": 0.01049782, + "balance_loss_clip": 1.05524683, + "balance_loss_mlp": 1.02682245, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.551168130742296, + "language_loss": 0.82787728, + "learning_rate": 3.966231856532584e-06, + "loss": 0.85015857, + "num_input_tokens_seen": 30863280, + "step": 1444, + "time_per_iteration": 2.5685172080993652 + }, + { + "auxiliary_loss_clip": 0.01193986, + "auxiliary_loss_mlp": 0.0105288, + "balance_loss_clip": 1.05847955, + "balance_loss_mlp": 1.02998042, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 1.9912234202628565, + "language_loss": 0.86965901, + "learning_rate": 3.966160554074189e-06, + "loss": 0.89212763, + "num_input_tokens_seen": 30881710, + "step": 1445, + "time_per_iteration": 2.491346836090088 + }, + { + "auxiliary_loss_clip": 0.01179781, + "auxiliary_loss_mlp": 0.01054057, + "balance_loss_clip": 1.06129634, + "balance_loss_mlp": 1.03263497, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.9861155721774153, + "language_loss": 0.82124597, + "learning_rate": 3.96608917705879e-06, + "loss": 0.84358436, + "num_input_tokens_seen": 30900225, + "step": 1446, + "time_per_iteration": 2.5155255794525146 + }, + { + "auxiliary_loss_clip": 0.01069396, + "auxiliary_loss_mlp": 0.01004954, + "balance_loss_clip": 1.03010166, + "balance_loss_mlp": 1.00180721, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.723766240555001, + "language_loss": 0.54819649, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56894004, + "num_input_tokens_seen": 30959580, + "step": 1447, + "time_per_iteration": 3.1502277851104736 + }, + { + "auxiliary_loss_clip": 0.01150335, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.05708385, + "balance_loss_mlp": 1.03229451, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.992249319629708, + "language_loss": 0.8437885, + "learning_rate": 3.965946199367804e-06, + "loss": 0.86583614, + "num_input_tokens_seen": 30976775, + "step": 1448, + "time_per_iteration": 2.551645040512085 + }, + { + "auxiliary_loss_clip": 0.0119344, + "auxiliary_loss_mlp": 0.01054923, + "balance_loss_clip": 1.05936146, + "balance_loss_mlp": 1.03276229, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 4.210181490187109, + "language_loss": 0.80519062, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82767427, + "num_input_tokens_seen": 30990495, + "step": 1449, + "time_per_iteration": 2.451266050338745 + }, + { + "auxiliary_loss_clip": 0.01137404, + "auxiliary_loss_mlp": 0.01050557, + "balance_loss_clip": 1.0571816, + "balance_loss_mlp": 1.02809834, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 1.9595189644924056, + "language_loss": 0.71343768, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73531723, + "num_input_tokens_seen": 31014080, + "step": 1450, + "time_per_iteration": 2.7288060188293457 + }, + { + "auxiliary_loss_clip": 0.0112499, + "auxiliary_loss_mlp": 0.01057577, + "balance_loss_clip": 1.05714262, + "balance_loss_mlp": 1.0351181, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 2.1332848560021285, + "language_loss": 0.83552158, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85734719, + "num_input_tokens_seen": 31031210, + "step": 1451, + "time_per_iteration": 2.5917136669158936 + }, + { + "auxiliary_loss_clip": 0.01136661, + "auxiliary_loss_mlp": 0.00793719, + "balance_loss_clip": 1.05699229, + "balance_loss_mlp": 1.02087259, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 2.0310730835785735, + "language_loss": 0.74943852, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76874232, + "num_input_tokens_seen": 31049710, + "step": 1452, + "time_per_iteration": 2.6472387313842773 + }, + { + "auxiliary_loss_clip": 0.0115545, + "auxiliary_loss_mlp": 0.01056369, + "balance_loss_clip": 1.055058, + "balance_loss_mlp": 1.03185964, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 3.8617338582513687, + "language_loss": 0.80128908, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82340729, + "num_input_tokens_seen": 31066160, + "step": 1453, + "time_per_iteration": 2.5252108573913574 + }, + { + "auxiliary_loss_clip": 0.01163223, + "auxiliary_loss_mlp": 0.01055695, + "balance_loss_clip": 1.05562103, + "balance_loss_mlp": 1.03249693, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 2.0144094220252486, + "language_loss": 0.71426308, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73645222, + "num_input_tokens_seen": 31085270, + "step": 1454, + "time_per_iteration": 2.5762574672698975 + }, + { + "auxiliary_loss_clip": 0.0107413, + "auxiliary_loss_mlp": 0.01007969, + "balance_loss_clip": 1.02957368, + "balance_loss_mlp": 1.00465477, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7765726661350218, + "language_loss": 0.58604121, + "learning_rate": 3.96544342930248e-06, + "loss": 0.60686219, + "num_input_tokens_seen": 31148445, + "step": 1455, + "time_per_iteration": 3.138570785522461 + }, + { + "auxiliary_loss_clip": 0.01189513, + "auxiliary_loss_mlp": 0.01055131, + "balance_loss_clip": 1.05740499, + "balance_loss_mlp": 1.03243303, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 2.031102427098728, + "language_loss": 0.775038, + "learning_rate": 3.965371306866359e-06, + "loss": 0.7974844, + "num_input_tokens_seen": 31168770, + "step": 1456, + "time_per_iteration": 2.6047418117523193 + }, + { + "auxiliary_loss_clip": 0.01127646, + "auxiliary_loss_mlp": 0.01051387, + "balance_loss_clip": 1.05144501, + "balance_loss_mlp": 1.02870154, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 2.430167652290147, + "language_loss": 0.72266495, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74445522, + "num_input_tokens_seen": 31189270, + "step": 1457, + "time_per_iteration": 2.7274932861328125 + }, + { + "auxiliary_loss_clip": 0.01171924, + "auxiliary_loss_mlp": 0.01046058, + "balance_loss_clip": 1.05369461, + "balance_loss_mlp": 1.02438569, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.6206134596293675, + "language_loss": 0.86420918, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88638896, + "num_input_tokens_seen": 31210385, + "step": 1458, + "time_per_iteration": 2.5647056102752686 + }, + { + "auxiliary_loss_clip": 0.01167594, + "auxiliary_loss_mlp": 0.01054817, + "balance_loss_clip": 1.05929649, + "balance_loss_mlp": 1.03252482, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 2.0684349009193315, + "language_loss": 0.80577344, + "learning_rate": 3.965154492406486e-06, + "loss": 0.82799757, + "num_input_tokens_seen": 31229745, + "step": 1459, + "time_per_iteration": 2.534670829772949 + }, + { + "auxiliary_loss_clip": 0.0112896, + "auxiliary_loss_mlp": 0.01052615, + "balance_loss_clip": 1.0552963, + "balance_loss_mlp": 1.02889204, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.353208371814136, + "language_loss": 0.84149063, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.8633064, + "num_input_tokens_seen": 31248280, + "step": 1460, + "time_per_iteration": 4.252511024475098 + }, + { + "auxiliary_loss_clip": 0.01173025, + "auxiliary_loss_mlp": 0.01052468, + "balance_loss_clip": 1.05379105, + "balance_loss_mlp": 1.03117681, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 4.8754693487306255, + "language_loss": 0.80502784, + "learning_rate": 3.965009576834394e-06, + "loss": 0.82728279, + "num_input_tokens_seen": 31262190, + "step": 1461, + "time_per_iteration": 2.5258021354675293 + }, + { + "auxiliary_loss_clip": 0.01185506, + "auxiliary_loss_mlp": 0.01058105, + "balance_loss_clip": 1.08982849, + "balance_loss_mlp": 1.03608704, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.7226847751673817, + "language_loss": 0.76390898, + "learning_rate": 3.964937007276932e-06, + "loss": 0.78634512, + "num_input_tokens_seen": 31283690, + "step": 1462, + "time_per_iteration": 4.035697937011719 + }, + { + "auxiliary_loss_clip": 0.01172076, + "auxiliary_loss_mlp": 0.01057392, + "balance_loss_clip": 1.05871022, + "balance_loss_mlp": 1.03374052, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.0945109168234106, + "language_loss": 0.7455709, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.7678656, + "num_input_tokens_seen": 31302505, + "step": 1463, + "time_per_iteration": 2.545161485671997 + }, + { + "auxiliary_loss_clip": 0.01185757, + "auxiliary_loss_mlp": 0.01057302, + "balance_loss_clip": 1.05653811, + "balance_loss_mlp": 1.03329337, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 1.9194947950212204, + "language_loss": 0.83017516, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85260576, + "num_input_tokens_seen": 31323070, + "step": 1464, + "time_per_iteration": 2.565160036087036 + }, + { + "auxiliary_loss_clip": 0.01167899, + "auxiliary_loss_mlp": 0.01063378, + "balance_loss_clip": 1.05491424, + "balance_loss_mlp": 1.04096639, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 2.05675057302645, + "language_loss": 0.78117168, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80348444, + "num_input_tokens_seen": 31341880, + "step": 1465, + "time_per_iteration": 2.5701022148132324 + }, + { + "auxiliary_loss_clip": 0.0119436, + "auxiliary_loss_mlp": 0.01060358, + "balance_loss_clip": 1.05879271, + "balance_loss_mlp": 1.03918672, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 2.1301398677264185, + "language_loss": 0.85075903, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.87330616, + "num_input_tokens_seen": 31361995, + "step": 1466, + "time_per_iteration": 3.976207733154297 + }, + { + "auxiliary_loss_clip": 0.01119677, + "auxiliary_loss_mlp": 0.0079117, + "balance_loss_clip": 1.05345154, + "balance_loss_mlp": 1.01382589, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 2.1786291209668596, + "language_loss": 0.83956236, + "learning_rate": 3.964573041885641e-06, + "loss": 0.85867083, + "num_input_tokens_seen": 31381515, + "step": 1467, + "time_per_iteration": 2.6584904193878174 + }, + { + "auxiliary_loss_clip": 0.01173276, + "auxiliary_loss_mlp": 0.01058601, + "balance_loss_clip": 1.05626035, + "balance_loss_mlp": 1.03536677, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.793268306223792, + "language_loss": 0.75704187, + "learning_rate": 3.964500025305907e-06, + "loss": 0.77936065, + "num_input_tokens_seen": 31400345, + "step": 1468, + "time_per_iteration": 2.52948260307312 + }, + { + "auxiliary_loss_clip": 0.01173741, + "auxiliary_loss_mlp": 0.01058029, + "balance_loss_clip": 1.05695343, + "balance_loss_mlp": 1.03713155, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.8559897663152325, + "language_loss": 0.8039223, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82623994, + "num_input_tokens_seen": 31419620, + "step": 1469, + "time_per_iteration": 3.997688055038452 + }, + { + "auxiliary_loss_clip": 0.01192303, + "auxiliary_loss_mlp": 0.01060648, + "balance_loss_clip": 1.0578227, + "balance_loss_mlp": 1.03809404, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 2.1566482021590314, + "language_loss": 0.77848309, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.80101264, + "num_input_tokens_seen": 31437970, + "step": 1470, + "time_per_iteration": 2.4564836025238037 + }, + { + "auxiliary_loss_clip": 0.01184202, + "auxiliary_loss_mlp": 0.01066918, + "balance_loss_clip": 1.05512452, + "balance_loss_mlp": 1.04397058, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.8652659549643773, + "language_loss": 0.84152907, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86404026, + "num_input_tokens_seen": 31457040, + "step": 1471, + "time_per_iteration": 2.5191969871520996 + }, + { + "auxiliary_loss_clip": 0.01160087, + "auxiliary_loss_mlp": 0.01057659, + "balance_loss_clip": 1.06921208, + "balance_loss_mlp": 1.0378468, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.8061843909292565, + "language_loss": 0.83565956, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85783696, + "num_input_tokens_seen": 31477520, + "step": 1472, + "time_per_iteration": 2.609419107437134 + }, + { + "auxiliary_loss_clip": 0.01167856, + "auxiliary_loss_mlp": 0.0105972, + "balance_loss_clip": 1.05550039, + "balance_loss_mlp": 1.03707051, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.3285326427946127, + "language_loss": 0.82838386, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85065961, + "num_input_tokens_seen": 31495575, + "step": 1473, + "time_per_iteration": 2.5579936504364014 + }, + { + "auxiliary_loss_clip": 0.01122504, + "auxiliary_loss_mlp": 0.01060368, + "balance_loss_clip": 1.05263925, + "balance_loss_mlp": 1.03950691, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.6012922988826765, + "language_loss": 0.78777742, + "learning_rate": 3.964060361549816e-06, + "loss": 0.80960613, + "num_input_tokens_seen": 31520020, + "step": 1474, + "time_per_iteration": 2.721236228942871 + }, + { + "auxiliary_loss_clip": 0.01152612, + "auxiliary_loss_mlp": 0.01068983, + "balance_loss_clip": 1.07052612, + "balance_loss_mlp": 1.04319835, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.8004191272671695, + "language_loss": 0.78462052, + "learning_rate": 3.963986823570121e-06, + "loss": 0.80683649, + "num_input_tokens_seen": 31539265, + "step": 1475, + "time_per_iteration": 2.60508394241333 + }, + { + "auxiliary_loss_clip": 0.01187234, + "auxiliary_loss_mlp": 0.01050283, + "balance_loss_clip": 1.05627513, + "balance_loss_mlp": 1.0279547, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 1.6384832172061847, + "language_loss": 0.7396518, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76202691, + "num_input_tokens_seen": 31563425, + "step": 1476, + "time_per_iteration": 2.6666409969329834 + }, + { + "auxiliary_loss_clip": 0.01165178, + "auxiliary_loss_mlp": 0.01057523, + "balance_loss_clip": 1.0550977, + "balance_loss_mlp": 1.03436053, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.6086390125562438, + "language_loss": 0.74552554, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76775259, + "num_input_tokens_seen": 31584525, + "step": 1477, + "time_per_iteration": 2.653693675994873 + }, + { + "auxiliary_loss_clip": 0.01184569, + "auxiliary_loss_mlp": 0.0105138, + "balance_loss_clip": 1.05460894, + "balance_loss_mlp": 1.02784801, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 1.8862392302905042, + "language_loss": 0.86755311, + "learning_rate": 3.963765762794739e-06, + "loss": 0.88991261, + "num_input_tokens_seen": 31603325, + "step": 1478, + "time_per_iteration": 2.481678009033203 + }, + { + "auxiliary_loss_clip": 0.01172388, + "auxiliary_loss_mlp": 0.01055656, + "balance_loss_clip": 1.0547328, + "balance_loss_mlp": 1.03428185, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 1.6796737396519537, + "language_loss": 0.77599698, + "learning_rate": 3.963691926933495e-06, + "loss": 0.79827738, + "num_input_tokens_seen": 31624820, + "step": 1479, + "time_per_iteration": 2.5577874183654785 + }, + { + "auxiliary_loss_clip": 0.01169502, + "auxiliary_loss_mlp": 0.01051735, + "balance_loss_clip": 1.06883597, + "balance_loss_mlp": 1.02834582, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.6005372299476988, + "language_loss": 0.7806316, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80284399, + "num_input_tokens_seen": 31646080, + "step": 1480, + "time_per_iteration": 2.5667178630828857 + }, + { + "auxiliary_loss_clip": 0.01176094, + "auxiliary_loss_mlp": 0.01061731, + "balance_loss_clip": 1.05491555, + "balance_loss_mlp": 1.03747165, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 1.7230803744806364, + "language_loss": 0.66436011, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68673837, + "num_input_tokens_seen": 31665770, + "step": 1481, + "time_per_iteration": 2.5299625396728516 + }, + { + "auxiliary_loss_clip": 0.01137769, + "auxiliary_loss_mlp": 0.01052177, + "balance_loss_clip": 1.05413735, + "balance_loss_mlp": 1.03038597, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 1.8883646877639868, + "language_loss": 0.96223301, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98413253, + "num_input_tokens_seen": 31683805, + "step": 1482, + "time_per_iteration": 2.600769519805908 + }, + { + "auxiliary_loss_clip": 0.01152973, + "auxiliary_loss_mlp": 0.01061518, + "balance_loss_clip": 1.05664361, + "balance_loss_mlp": 1.03823686, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 2.965354733436817, + "language_loss": 0.7824477, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80459261, + "num_input_tokens_seen": 31704630, + "step": 1483, + "time_per_iteration": 2.6356382369995117 + }, + { + "auxiliary_loss_clip": 0.01167338, + "auxiliary_loss_mlp": 0.0107889, + "balance_loss_clip": 1.05649185, + "balance_loss_mlp": 1.0557636, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.775592602752793, + "language_loss": 0.85460865, + "learning_rate": 3.963321630732448e-06, + "loss": 0.87707096, + "num_input_tokens_seen": 31723255, + "step": 1484, + "time_per_iteration": 2.521491527557373 + }, + { + "auxiliary_loss_clip": 0.01195185, + "auxiliary_loss_mlp": 0.01061755, + "balance_loss_clip": 1.06053257, + "balance_loss_mlp": 1.03822339, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.7776800892137086, + "language_loss": 0.80499876, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82756811, + "num_input_tokens_seen": 31747045, + "step": 1485, + "time_per_iteration": 2.5922036170959473 + }, + { + "auxiliary_loss_clip": 0.01172634, + "auxiliary_loss_mlp": 0.01061864, + "balance_loss_clip": 1.05730295, + "balance_loss_mlp": 1.0383085, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 2.52907113862076, + "language_loss": 0.82853043, + "learning_rate": 3.96317299108688e-06, + "loss": 0.85087538, + "num_input_tokens_seen": 31766615, + "step": 1486, + "time_per_iteration": 2.5227699279785156 + }, + { + "auxiliary_loss_clip": 0.01144919, + "auxiliary_loss_mlp": 0.0106884, + "balance_loss_clip": 1.05871999, + "balance_loss_mlp": 1.04597604, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 3.645222599128206, + "language_loss": 0.76581693, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78795457, + "num_input_tokens_seen": 31785855, + "step": 1487, + "time_per_iteration": 2.585028886795044 + }, + { + "auxiliary_loss_clip": 0.01157187, + "auxiliary_loss_mlp": 0.01062588, + "balance_loss_clip": 1.05447996, + "balance_loss_mlp": 1.0380547, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.6502370406749445, + "language_loss": 0.83103651, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85323423, + "num_input_tokens_seen": 31804210, + "step": 1488, + "time_per_iteration": 2.5531671047210693 + }, + { + "auxiliary_loss_clip": 0.01172286, + "auxiliary_loss_mlp": 0.01049049, + "balance_loss_clip": 1.0566386, + "balance_loss_mlp": 1.02693582, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.7693566292890348, + "language_loss": 0.71657169, + "learning_rate": 3.962949473297718e-06, + "loss": 0.73878503, + "num_input_tokens_seen": 31826150, + "step": 1489, + "time_per_iteration": 2.748378276824951 + }, + { + "auxiliary_loss_clip": 0.01149268, + "auxiliary_loss_mlp": 0.0105354, + "balance_loss_clip": 1.05275655, + "balance_loss_mlp": 1.03054488, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 2.0582958044499793, + "language_loss": 0.89333296, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91536105, + "num_input_tokens_seen": 31848060, + "step": 1490, + "time_per_iteration": 2.7027411460876465 + }, + { + "auxiliary_loss_clip": 0.01184481, + "auxiliary_loss_mlp": 0.01065339, + "balance_loss_clip": 1.05729842, + "balance_loss_mlp": 1.04239166, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 2.5118489442322662, + "language_loss": 0.74041879, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.76291704, + "num_input_tokens_seen": 31870040, + "step": 1491, + "time_per_iteration": 2.5703282356262207 + }, + { + "auxiliary_loss_clip": 0.0118967, + "auxiliary_loss_mlp": 0.00806942, + "balance_loss_clip": 1.05985188, + "balance_loss_mlp": 1.04633868, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.8254582310338925, + "language_loss": 0.76950085, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.78946698, + "num_input_tokens_seen": 31890400, + "step": 1492, + "time_per_iteration": 2.514705181121826 + }, + { + "auxiliary_loss_clip": 0.01186607, + "auxiliary_loss_mlp": 0.0105474, + "balance_loss_clip": 1.05934525, + "balance_loss_mlp": 1.03261447, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.106506929682526, + "language_loss": 0.71034497, + "learning_rate": 3.962650407498707e-06, + "loss": 0.7327584, + "num_input_tokens_seen": 31913435, + "step": 1493, + "time_per_iteration": 2.6029624938964844 + }, + { + "auxiliary_loss_clip": 0.01188673, + "auxiliary_loss_mlp": 0.01056902, + "balance_loss_clip": 1.05794764, + "balance_loss_mlp": 1.03375113, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 2.076088876937707, + "language_loss": 0.86863351, + "learning_rate": 3.962575454982109e-06, + "loss": 0.8910892, + "num_input_tokens_seen": 31932435, + "step": 1494, + "time_per_iteration": 2.508139133453369 + }, + { + "auxiliary_loss_clip": 0.01086697, + "auxiliary_loss_mlp": 0.01063276, + "balance_loss_clip": 1.05280256, + "balance_loss_mlp": 1.03914821, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 1.8630122950744596, + "language_loss": 0.82844341, + "learning_rate": 3.962500428044454e-06, + "loss": 0.84994316, + "num_input_tokens_seen": 31950125, + "step": 1495, + "time_per_iteration": 2.683941602706909 + }, + { + "auxiliary_loss_clip": 0.01170157, + "auxiliary_loss_mlp": 0.01054693, + "balance_loss_clip": 1.05918872, + "balance_loss_mlp": 1.0321033, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 6.602765023709375, + "language_loss": 0.701684, + "learning_rate": 3.962425326688585e-06, + "loss": 0.7239325, + "num_input_tokens_seen": 31968050, + "step": 1496, + "time_per_iteration": 2.5291266441345215 + }, + { + "auxiliary_loss_clip": 0.01162834, + "auxiliary_loss_mlp": 0.01047548, + "balance_loss_clip": 1.05468535, + "balance_loss_mlp": 1.02693689, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.5963090340107997, + "language_loss": 0.79752511, + "learning_rate": 3.962350150917351e-06, + "loss": 0.81962895, + "num_input_tokens_seen": 31985675, + "step": 1497, + "time_per_iteration": 2.5172297954559326 + }, + { + "auxiliary_loss_clip": 0.01127397, + "auxiliary_loss_mlp": 0.0105524, + "balance_loss_clip": 1.05693519, + "balance_loss_mlp": 1.03193474, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 2.270428470659929, + "language_loss": 0.82687432, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84870064, + "num_input_tokens_seen": 32005180, + "step": 1498, + "time_per_iteration": 4.057194471359253 + }, + { + "auxiliary_loss_clip": 0.01168836, + "auxiliary_loss_mlp": 0.01060404, + "balance_loss_clip": 1.05768895, + "balance_loss_mlp": 1.03855276, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 4.0742591383826285, + "language_loss": 0.79041547, + "learning_rate": 3.962199576140195e-06, + "loss": 0.8127079, + "num_input_tokens_seen": 32022970, + "step": 1499, + "time_per_iteration": 2.527783155441284 + }, + { + "auxiliary_loss_clip": 0.01161343, + "auxiliary_loss_mlp": 0.00787912, + "balance_loss_clip": 1.05504107, + "balance_loss_mlp": 1.00948477, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.591349316446943, + "language_loss": 0.92961919, + "learning_rate": 3.962124177139981e-06, + "loss": 0.9491117, + "num_input_tokens_seen": 32043055, + "step": 1500, + "time_per_iteration": 2.5245378017425537 + }, + { + "auxiliary_loss_clip": 0.01148352, + "auxiliary_loss_mlp": 0.01056684, + "balance_loss_clip": 1.05602717, + "balance_loss_mlp": 1.03280616, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 3.585268945874999, + "language_loss": 0.74299836, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76504868, + "num_input_tokens_seen": 32061900, + "step": 1501, + "time_per_iteration": 3.9402801990509033 + }, + { + "auxiliary_loss_clip": 0.01066696, + "auxiliary_loss_mlp": 0.01016742, + "balance_loss_clip": 1.04115081, + "balance_loss_mlp": 1.01292706, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7346539458841886, + "language_loss": 0.58344895, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60428333, + "num_input_tokens_seen": 32122745, + "step": 1502, + "time_per_iteration": 3.178743839263916 + }, + { + "auxiliary_loss_clip": 0.01148019, + "auxiliary_loss_mlp": 0.01070823, + "balance_loss_clip": 1.05187154, + "balance_loss_mlp": 1.04710054, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.3388638137498536, + "language_loss": 0.69822526, + "learning_rate": 3.961897533727119e-06, + "loss": 0.72041368, + "num_input_tokens_seen": 32145125, + "step": 1503, + "time_per_iteration": 2.7136096954345703 + }, + { + "auxiliary_loss_clip": 0.01136416, + "auxiliary_loss_mlp": 0.01058915, + "balance_loss_clip": 1.05259335, + "balance_loss_mlp": 1.0368253, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.178340786075428, + "language_loss": 0.86266947, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88462281, + "num_input_tokens_seen": 32166255, + "step": 1504, + "time_per_iteration": 2.636479139328003 + }, + { + "auxiliary_loss_clip": 0.01145905, + "auxiliary_loss_mlp": 0.01069652, + "balance_loss_clip": 1.05633998, + "balance_loss_mlp": 1.04401016, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 2.3572676148247154, + "language_loss": 0.72550428, + "learning_rate": 3.961746066137014e-06, + "loss": 0.7476598, + "num_input_tokens_seen": 32184010, + "step": 1505, + "time_per_iteration": 3.9850544929504395 + }, + { + "auxiliary_loss_clip": 0.01134673, + "auxiliary_loss_mlp": 0.01052933, + "balance_loss_clip": 1.05248451, + "balance_loss_mlp": 1.02993798, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.166874227117722, + "language_loss": 0.80677038, + "learning_rate": 3.961670220756114e-06, + "loss": 0.82864648, + "num_input_tokens_seen": 32201635, + "step": 1506, + "time_per_iteration": 2.5869858264923096 + }, + { + "auxiliary_loss_clip": 0.01139233, + "auxiliary_loss_mlp": 0.01052137, + "balance_loss_clip": 1.05140781, + "balance_loss_mlp": 1.0306797, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 1.9244742593211628, + "language_loss": 0.76189566, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78380936, + "num_input_tokens_seen": 32221940, + "step": 1507, + "time_per_iteration": 4.141975402832031 + }, + { + "auxiliary_loss_clip": 0.01058796, + "auxiliary_loss_mlp": 0.01010573, + "balance_loss_clip": 1.02549386, + "balance_loss_mlp": 1.00661564, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7301178857652109, + "language_loss": 0.57632238, + "learning_rate": 3.961518306836998e-06, + "loss": 0.5970161, + "num_input_tokens_seen": 32276495, + "step": 1508, + "time_per_iteration": 2.9964821338653564 + }, + { + "auxiliary_loss_clip": 0.01161904, + "auxiliary_loss_mlp": 0.01055329, + "balance_loss_clip": 1.05364466, + "balance_loss_mlp": 1.03254771, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 1.946227083421863, + "language_loss": 0.85385394, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87602627, + "num_input_tokens_seen": 32294130, + "step": 1509, + "time_per_iteration": 2.5556492805480957 + }, + { + "auxiliary_loss_clip": 0.01164025, + "auxiliary_loss_mlp": 0.01064585, + "balance_loss_clip": 1.0559442, + "balance_loss_mlp": 1.04141045, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.2457622079531983, + "language_loss": 0.8450948, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86738086, + "num_input_tokens_seen": 32313555, + "step": 1510, + "time_per_iteration": 2.6287920475006104 + }, + { + "auxiliary_loss_clip": 0.01152068, + "auxiliary_loss_mlp": 0.01059978, + "balance_loss_clip": 1.05525887, + "balance_loss_mlp": 1.03723311, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 2.172176150358652, + "language_loss": 0.85411179, + "learning_rate": 3.961289878108262e-06, + "loss": 0.87623221, + "num_input_tokens_seen": 32331430, + "step": 1511, + "time_per_iteration": 2.5501222610473633 + }, + { + "auxiliary_loss_clip": 0.01144599, + "auxiliary_loss_mlp": 0.01050694, + "balance_loss_clip": 1.05410194, + "balance_loss_mlp": 1.02878332, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.4427513685903741, + "language_loss": 0.84912413, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87107706, + "num_input_tokens_seen": 32353705, + "step": 1512, + "time_per_iteration": 2.6442928314208984 + }, + { + "auxiliary_loss_clip": 0.01147996, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.05238926, + "balance_loss_mlp": 1.02721453, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.3280006614011888, + "language_loss": 0.87142682, + "learning_rate": 3.961137220422749e-06, + "loss": 0.89338923, + "num_input_tokens_seen": 32370520, + "step": 1513, + "time_per_iteration": 2.563004732131958 + }, + { + "auxiliary_loss_clip": 0.01168633, + "auxiliary_loss_mlp": 0.01052458, + "balance_loss_clip": 1.0570339, + "balance_loss_mlp": 1.03153706, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 2.1163989995864743, + "language_loss": 0.86534721, + "learning_rate": 3.961060780028764e-06, + "loss": 0.8875581, + "num_input_tokens_seen": 32389105, + "step": 1514, + "time_per_iteration": 2.5247397422790527 + }, + { + "auxiliary_loss_clip": 0.0112907, + "auxiliary_loss_mlp": 0.01054426, + "balance_loss_clip": 1.05622387, + "balance_loss_mlp": 1.03426814, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 2.0012730821916427, + "language_loss": 0.89925855, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92109352, + "num_input_tokens_seen": 32408065, + "step": 1515, + "time_per_iteration": 2.656680107116699 + }, + { + "auxiliary_loss_clip": 0.01158434, + "auxiliary_loss_mlp": 0.01052408, + "balance_loss_clip": 1.05478299, + "balance_loss_mlp": 1.03035426, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.124135782522118, + "language_loss": 0.85358018, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87568855, + "num_input_tokens_seen": 32427225, + "step": 1516, + "time_per_iteration": 2.5991122722625732 + }, + { + "auxiliary_loss_clip": 0.01159755, + "auxiliary_loss_mlp": 0.01055221, + "balance_loss_clip": 1.05800748, + "balance_loss_mlp": 1.03240478, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 2.05099865610027, + "language_loss": 0.81417572, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83632553, + "num_input_tokens_seen": 32450510, + "step": 1517, + "time_per_iteration": 2.652472496032715 + }, + { + "auxiliary_loss_clip": 0.01176377, + "auxiliary_loss_mlp": 0.01069365, + "balance_loss_clip": 1.05681431, + "balance_loss_mlp": 1.04675126, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.650330387910903, + "language_loss": 0.77734596, + "learning_rate": 3.960754274845642e-06, + "loss": 0.79980338, + "num_input_tokens_seen": 32468425, + "step": 1518, + "time_per_iteration": 2.508869171142578 + }, + { + "auxiliary_loss_clip": 0.01163094, + "auxiliary_loss_mlp": 0.01061503, + "balance_loss_clip": 1.05306602, + "balance_loss_mlp": 1.03971124, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.8746944278411497, + "language_loss": 0.86216724, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88441324, + "num_input_tokens_seen": 32487510, + "step": 1519, + "time_per_iteration": 2.5309321880340576 + }, + { + "auxiliary_loss_clip": 0.01159858, + "auxiliary_loss_mlp": 0.01054698, + "balance_loss_clip": 1.05261195, + "balance_loss_mlp": 1.03148818, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.0977056898079063, + "language_loss": 0.73361552, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75576109, + "num_input_tokens_seen": 32507250, + "step": 1520, + "time_per_iteration": 2.5505049228668213 + }, + { + "auxiliary_loss_clip": 0.01157324, + "auxiliary_loss_mlp": 0.01062146, + "balance_loss_clip": 1.05614948, + "balance_loss_mlp": 1.03978252, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 2.5836459409947605, + "language_loss": 0.84871328, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87090796, + "num_input_tokens_seen": 32526045, + "step": 1521, + "time_per_iteration": 2.531982898712158 + }, + { + "auxiliary_loss_clip": 0.01116375, + "auxiliary_loss_mlp": 0.01067953, + "balance_loss_clip": 1.05180883, + "balance_loss_mlp": 1.04421878, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 1.8460065580217138, + "language_loss": 0.8434211, + "learning_rate": 3.960446580030599e-06, + "loss": 0.8652643, + "num_input_tokens_seen": 32546575, + "step": 1522, + "time_per_iteration": 2.6617746353149414 + }, + { + "auxiliary_loss_clip": 0.0117907, + "auxiliary_loss_mlp": 0.01062049, + "balance_loss_clip": 1.05365157, + "balance_loss_mlp": 1.03992414, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 2.0010648389972676, + "language_loss": 0.8102994, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83271062, + "num_input_tokens_seen": 32568795, + "step": 1523, + "time_per_iteration": 2.5278143882751465 + }, + { + "auxiliary_loss_clip": 0.01157007, + "auxiliary_loss_mlp": 0.00795537, + "balance_loss_clip": 1.05854332, + "balance_loss_mlp": 1.02340651, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 3.247803748906381, + "language_loss": 0.74556911, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76509464, + "num_input_tokens_seen": 32587010, + "step": 1524, + "time_per_iteration": 2.533238649368286 + }, + { + "auxiliary_loss_clip": 0.0113401, + "auxiliary_loss_mlp": 0.01056572, + "balance_loss_clip": 1.0523926, + "balance_loss_mlp": 1.03416085, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.0255726076191825, + "language_loss": 0.86106497, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88297081, + "num_input_tokens_seen": 32602375, + "step": 1525, + "time_per_iteration": 2.5458011627197266 + }, + { + "auxiliary_loss_clip": 0.01160855, + "auxiliary_loss_mlp": 0.01045536, + "balance_loss_clip": 1.05560803, + "balance_loss_mlp": 1.02368498, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.2998286185292156, + "language_loss": 0.74329484, + "learning_rate": 3.96013769577032e-06, + "loss": 0.76535869, + "num_input_tokens_seen": 32621460, + "step": 1526, + "time_per_iteration": 2.6242287158966064 + }, + { + "auxiliary_loss_clip": 0.0118423, + "auxiliary_loss_mlp": 0.01058365, + "balance_loss_clip": 1.05610669, + "balance_loss_mlp": 1.03589368, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 1.926553207756451, + "language_loss": 0.77380705, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79623306, + "num_input_tokens_seen": 32640440, + "step": 1527, + "time_per_iteration": 2.4857473373413086 + }, + { + "auxiliary_loss_clip": 0.0117049, + "auxiliary_loss_mlp": 0.01052835, + "balance_loss_clip": 1.05101466, + "balance_loss_mlp": 1.02985168, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 1.7369661645416155, + "language_loss": 0.78218365, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80441689, + "num_input_tokens_seen": 32660020, + "step": 1528, + "time_per_iteration": 2.5360777378082275 + }, + { + "auxiliary_loss_clip": 0.01143211, + "auxiliary_loss_mlp": 0.01050171, + "balance_loss_clip": 1.0521872, + "balance_loss_mlp": 1.02886832, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 2.8216388968307333, + "language_loss": 0.76876384, + "learning_rate": 3.959905252114384e-06, + "loss": 0.79069763, + "num_input_tokens_seen": 32678170, + "step": 1529, + "time_per_iteration": 2.5686495304107666 + }, + { + "auxiliary_loss_clip": 0.01186314, + "auxiliary_loss_mlp": 0.0079079, + "balance_loss_clip": 1.05413091, + "balance_loss_mlp": 1.01402617, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 2.078637330458293, + "language_loss": 0.82868236, + "learning_rate": 3.959827622252211e-06, + "loss": 0.8484534, + "num_input_tokens_seen": 32697540, + "step": 1530, + "time_per_iteration": 2.528794527053833 + }, + { + "auxiliary_loss_clip": 0.01133386, + "auxiliary_loss_mlp": 0.01058511, + "balance_loss_clip": 1.06042874, + "balance_loss_mlp": 1.03611135, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.7867869229389717, + "language_loss": 0.83862007, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86053896, + "num_input_tokens_seen": 32716805, + "step": 1531, + "time_per_iteration": 2.5940892696380615 + }, + { + "auxiliary_loss_clip": 0.0113647, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.05203843, + "balance_loss_mlp": 1.02551198, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 2.40925776766462, + "language_loss": 0.81330812, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83515215, + "num_input_tokens_seen": 32736385, + "step": 1532, + "time_per_iteration": 2.577235221862793 + }, + { + "auxiliary_loss_clip": 0.01161101, + "auxiliary_loss_mlp": 0.01054051, + "balance_loss_clip": 1.05990183, + "balance_loss_mlp": 1.03099608, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 2.1129730529221784, + "language_loss": 0.83712405, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.85927558, + "num_input_tokens_seen": 32757140, + "step": 1533, + "time_per_iteration": 2.6085715293884277 + }, + { + "auxiliary_loss_clip": 0.01144966, + "auxiliary_loss_mlp": 0.0104919, + "balance_loss_clip": 1.05487084, + "balance_loss_mlp": 1.02674329, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 2.0316249191300093, + "language_loss": 0.89692444, + "learning_rate": 3.959516359664402e-06, + "loss": 0.91886592, + "num_input_tokens_seen": 32774860, + "step": 1534, + "time_per_iteration": 2.560976028442383 + }, + { + "auxiliary_loss_clip": 0.01150648, + "auxiliary_loss_mlp": 0.01060939, + "balance_loss_clip": 1.05135679, + "balance_loss_mlp": 1.03617966, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.275219924248099, + "language_loss": 0.75629532, + "learning_rate": 3.959438358247424e-06, + "loss": 0.77841121, + "num_input_tokens_seen": 32795250, + "step": 1535, + "time_per_iteration": 2.590768337249756 + }, + { + "auxiliary_loss_clip": 0.01165007, + "auxiliary_loss_mlp": 0.01045691, + "balance_loss_clip": 1.04877067, + "balance_loss_mlp": 1.02467477, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.8460175491267252, + "language_loss": 0.81477356, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83688056, + "num_input_tokens_seen": 32813805, + "step": 1536, + "time_per_iteration": 2.5068020820617676 + }, + { + "auxiliary_loss_clip": 0.01178701, + "auxiliary_loss_mlp": 0.01050358, + "balance_loss_clip": 1.05230856, + "balance_loss_mlp": 1.02973461, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.195315411546666, + "language_loss": 0.89235657, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91464716, + "num_input_tokens_seen": 32830960, + "step": 1537, + "time_per_iteration": 4.124530553817749 + }, + { + "auxiliary_loss_clip": 0.01156569, + "auxiliary_loss_mlp": 0.01060364, + "balance_loss_clip": 1.05115664, + "balance_loss_mlp": 1.03720188, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 2.419909202146827, + "language_loss": 0.80911404, + "learning_rate": 3.959203908195741e-06, + "loss": 0.83128333, + "num_input_tokens_seen": 32848275, + "step": 1538, + "time_per_iteration": 2.555480480194092 + }, + { + "auxiliary_loss_clip": 0.01057762, + "auxiliary_loss_mlp": 0.01017607, + "balance_loss_clip": 1.0289197, + "balance_loss_mlp": 1.0137924, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.7326619234038901, + "language_loss": 0.5741455, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59489912, + "num_input_tokens_seen": 32917730, + "step": 1539, + "time_per_iteration": 4.670522212982178 + }, + { + "auxiliary_loss_clip": 0.01154331, + "auxiliary_loss_mlp": 0.01049804, + "balance_loss_clip": 1.05379295, + "balance_loss_mlp": 1.02755964, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.5244353851334598, + "language_loss": 0.6706208, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69266206, + "num_input_tokens_seen": 32934910, + "step": 1540, + "time_per_iteration": 2.497007131576538 + }, + { + "auxiliary_loss_clip": 0.01143622, + "auxiliary_loss_mlp": 0.01046373, + "balance_loss_clip": 1.05054176, + "balance_loss_mlp": 1.02350879, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.745557933283031, + "language_loss": 0.83886689, + "learning_rate": 3.958968789505198e-06, + "loss": 0.86076683, + "num_input_tokens_seen": 32953840, + "step": 1541, + "time_per_iteration": 2.575521230697632 + }, + { + "auxiliary_loss_clip": 0.01078577, + "auxiliary_loss_mlp": 0.01014772, + "balance_loss_clip": 1.02356029, + "balance_loss_mlp": 1.01069498, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8887921319015558, + "language_loss": 0.61884367, + "learning_rate": 3.9588902680358e-06, + "loss": 0.63977718, + "num_input_tokens_seen": 33011410, + "step": 1542, + "time_per_iteration": 3.0866987705230713 + }, + { + "auxiliary_loss_clip": 0.0116073, + "auxiliary_loss_mlp": 0.01054241, + "balance_loss_clip": 1.05572999, + "balance_loss_mlp": 1.03290248, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.6421952065219911, + "language_loss": 0.82431793, + "learning_rate": 3.958811672285086e-06, + "loss": 0.84646761, + "num_input_tokens_seen": 33031675, + "step": 1543, + "time_per_iteration": 3.969069719314575 + }, + { + "auxiliary_loss_clip": 0.01134819, + "auxiliary_loss_mlp": 0.01059313, + "balance_loss_clip": 1.05058241, + "balance_loss_mlp": 1.03715229, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.8297290651731999, + "language_loss": 0.7179746, + "learning_rate": 3.958733002256038e-06, + "loss": 0.73991591, + "num_input_tokens_seen": 33056355, + "step": 1544, + "time_per_iteration": 2.905487537384033 + }, + { + "auxiliary_loss_clip": 0.01164396, + "auxiliary_loss_mlp": 0.01053977, + "balance_loss_clip": 1.0526042, + "balance_loss_mlp": 1.02913392, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.691556555425474, + "language_loss": 0.77237624, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79456002, + "num_input_tokens_seen": 33079520, + "step": 1545, + "time_per_iteration": 2.594414710998535 + }, + { + "auxiliary_loss_clip": 0.01138274, + "auxiliary_loss_mlp": 0.0104952, + "balance_loss_clip": 1.05402517, + "balance_loss_mlp": 1.02818108, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 2.959016047578255, + "language_loss": 0.74520642, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76708436, + "num_input_tokens_seen": 33096135, + "step": 1546, + "time_per_iteration": 3.9474639892578125 + }, + { + "auxiliary_loss_clip": 0.01161308, + "auxiliary_loss_mlp": 0.01052578, + "balance_loss_clip": 1.05402732, + "balance_loss_mlp": 1.02946365, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 2.026381835327108, + "language_loss": 0.84034383, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86248273, + "num_input_tokens_seen": 33115245, + "step": 1547, + "time_per_iteration": 2.5713911056518555 + }, + { + "auxiliary_loss_clip": 0.0114248, + "auxiliary_loss_mlp": 0.0105182, + "balance_loss_clip": 1.04959869, + "balance_loss_mlp": 1.02985024, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 2.0871849131709936, + "language_loss": 0.67568362, + "learning_rate": 3.958417579416199e-06, + "loss": 0.69762659, + "num_input_tokens_seen": 33136640, + "step": 1548, + "time_per_iteration": 2.629547357559204 + }, + { + "auxiliary_loss_clip": 0.01122767, + "auxiliary_loss_mlp": 0.01057982, + "balance_loss_clip": 1.05126691, + "balance_loss_mlp": 1.03545189, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 1.9046972450781892, + "language_loss": 0.83350712, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85531461, + "num_input_tokens_seen": 33155060, + "step": 1549, + "time_per_iteration": 2.6733181476593018 + }, + { + "auxiliary_loss_clip": 0.01173461, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.0569911, + "balance_loss_mlp": 1.02550602, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.7418675037786757, + "language_loss": 0.75808239, + "learning_rate": 3.958259422403966e-06, + "loss": 0.78028572, + "num_input_tokens_seen": 33175420, + "step": 1550, + "time_per_iteration": 2.5949177742004395 + }, + { + "auxiliary_loss_clip": 0.01150137, + "auxiliary_loss_mlp": 0.01071568, + "balance_loss_clip": 1.05701232, + "balance_loss_mlp": 1.04674852, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.2901676843496963, + "language_loss": 0.83363521, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85585225, + "num_input_tokens_seen": 33194120, + "step": 1551, + "time_per_iteration": 2.6354622840881348 + }, + { + "auxiliary_loss_clip": 0.01070022, + "auxiliary_loss_mlp": 0.01095191, + "balance_loss_clip": 1.02603698, + "balance_loss_mlp": 1.61678183, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.87965634133191, + "language_loss": 0.61862975, + "learning_rate": 3.958100968362163e-06, + "loss": 0.64028192, + "num_input_tokens_seen": 33261080, + "step": 1552, + "time_per_iteration": 3.2560019493103027 + }, + { + "auxiliary_loss_clip": 0.01067926, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.02976322, + "balance_loss_mlp": 1.02867341, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8516254911756155, + "language_loss": 0.58916771, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61018121, + "num_input_tokens_seen": 33330235, + "step": 1553, + "time_per_iteration": 3.25569748878479 + }, + { + "auxiliary_loss_clip": 0.0115257, + "auxiliary_loss_mlp": 0.01057298, + "balance_loss_clip": 1.06284237, + "balance_loss_mlp": 1.03425479, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 1.72815007581565, + "language_loss": 0.87467062, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89676929, + "num_input_tokens_seen": 33349035, + "step": 1554, + "time_per_iteration": 2.5997087955474854 + }, + { + "auxiliary_loss_clip": 0.01146938, + "auxiliary_loss_mlp": 0.01056765, + "balance_loss_clip": 1.0507946, + "balance_loss_mlp": 1.03406811, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 1.8801212045448559, + "language_loss": 0.81200826, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83404529, + "num_input_tokens_seen": 33368060, + "step": 1555, + "time_per_iteration": 2.5348825454711914 + }, + { + "auxiliary_loss_clip": 0.01061985, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.02095282, + "balance_loss_mlp": 1.03977823, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.997381150092209, + "language_loss": 0.59629101, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61734229, + "num_input_tokens_seen": 33430825, + "step": 1556, + "time_per_iteration": 3.1305603981018066 + }, + { + "auxiliary_loss_clip": 0.01167913, + "auxiliary_loss_mlp": 0.01065249, + "balance_loss_clip": 1.05246568, + "balance_loss_mlp": 1.04391003, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.8507075355396525, + "language_loss": 0.84420449, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86653608, + "num_input_tokens_seen": 33454855, + "step": 1557, + "time_per_iteration": 2.6349875926971436 + }, + { + "auxiliary_loss_clip": 0.01106851, + "auxiliary_loss_mlp": 0.01075589, + "balance_loss_clip": 1.04926777, + "balance_loss_mlp": 1.0516156, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.9344503149162864, + "language_loss": 0.78060937, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80243379, + "num_input_tokens_seen": 33476000, + "step": 1558, + "time_per_iteration": 2.71237850189209 + }, + { + "auxiliary_loss_clip": 0.0115758, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_clip": 1.0535996, + "balance_loss_mlp": 1.03336191, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 1.9910814817631814, + "language_loss": 0.79729378, + "learning_rate": 3.957544040455379e-06, + "loss": 0.81942379, + "num_input_tokens_seen": 33493845, + "step": 1559, + "time_per_iteration": 2.519101619720459 + }, + { + "auxiliary_loss_clip": 0.0113735, + "auxiliary_loss_mlp": 0.01067095, + "balance_loss_clip": 1.05141127, + "balance_loss_mlp": 1.0458281, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 2.2360972993240122, + "language_loss": 0.76485443, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78689885, + "num_input_tokens_seen": 33510850, + "step": 1560, + "time_per_iteration": 2.5743136405944824 + }, + { + "auxiliary_loss_clip": 0.01135336, + "auxiliary_loss_mlp": 0.01066319, + "balance_loss_clip": 1.05189443, + "balance_loss_mlp": 1.04390752, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.8218549194015512, + "language_loss": 0.81066704, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83268356, + "num_input_tokens_seen": 33530430, + "step": 1561, + "time_per_iteration": 2.6112961769104004 + }, + { + "auxiliary_loss_clip": 0.01169797, + "auxiliary_loss_mlp": 0.01055214, + "balance_loss_clip": 1.05206347, + "balance_loss_mlp": 1.03343451, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.5651116649916244, + "language_loss": 0.61222744, + "learning_rate": 3.957304243552354e-06, + "loss": 0.6344775, + "num_input_tokens_seen": 33551975, + "step": 1562, + "time_per_iteration": 2.626699447631836 + }, + { + "auxiliary_loss_clip": 0.01161295, + "auxiliary_loss_mlp": 0.01063531, + "balance_loss_clip": 1.05953634, + "balance_loss_mlp": 1.04275286, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.3201285129485134, + "language_loss": 0.85000223, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87225056, + "num_input_tokens_seen": 33569850, + "step": 1563, + "time_per_iteration": 2.537867546081543 + }, + { + "auxiliary_loss_clip": 0.01160679, + "auxiliary_loss_mlp": 0.01056693, + "balance_loss_clip": 1.05530453, + "balance_loss_mlp": 1.03580725, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 2.264358492601823, + "language_loss": 0.76249576, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78466946, + "num_input_tokens_seen": 33590510, + "step": 1564, + "time_per_iteration": 2.569422721862793 + }, + { + "auxiliary_loss_clip": 0.01155982, + "auxiliary_loss_mlp": 0.01055514, + "balance_loss_clip": 1.06261706, + "balance_loss_mlp": 1.03435445, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 2.2121213470529786, + "language_loss": 0.80234188, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.82445693, + "num_input_tokens_seen": 33608810, + "step": 1565, + "time_per_iteration": 2.555333375930786 + }, + { + "auxiliary_loss_clip": 0.01151983, + "auxiliary_loss_mlp": 0.0106829, + "balance_loss_clip": 1.05090725, + "balance_loss_mlp": 1.04748797, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 2.1695145049123563, + "language_loss": 0.75791264, + "learning_rate": 3.956983475266103e-06, + "loss": 0.78011543, + "num_input_tokens_seen": 33627265, + "step": 1566, + "time_per_iteration": 2.5790772438049316 + }, + { + "auxiliary_loss_clip": 0.01143436, + "auxiliary_loss_mlp": 0.01078395, + "balance_loss_clip": 1.05343819, + "balance_loss_mlp": 1.5408802, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.219322228617997, + "language_loss": 0.78245521, + "learning_rate": 3.956903097664407e-06, + "loss": 0.80467349, + "num_input_tokens_seen": 33644810, + "step": 1567, + "time_per_iteration": 2.593858242034912 + }, + { + "auxiliary_loss_clip": 0.01155362, + "auxiliary_loss_mlp": 0.01054355, + "balance_loss_clip": 1.05386257, + "balance_loss_mlp": 1.03393412, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 2.064782258190575, + "language_loss": 0.82442731, + "learning_rate": 3.956822645856749e-06, + "loss": 0.84652448, + "num_input_tokens_seen": 33665665, + "step": 1568, + "time_per_iteration": 2.5692715644836426 + }, + { + "auxiliary_loss_clip": 0.01186399, + "auxiliary_loss_mlp": 0.01050723, + "balance_loss_clip": 1.05698466, + "balance_loss_mlp": 1.02789426, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 2.2988660845124333, + "language_loss": 0.76480532, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78717649, + "num_input_tokens_seen": 33684760, + "step": 1569, + "time_per_iteration": 2.489922523498535 + }, + { + "auxiliary_loss_clip": 0.01118612, + "auxiliary_loss_mlp": 0.0105991, + "balance_loss_clip": 1.05112243, + "balance_loss_mlp": 1.03591323, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.219714831850327, + "language_loss": 0.85613203, + "learning_rate": 3.956661519635756e-06, + "loss": 0.87791729, + "num_input_tokens_seen": 33700750, + "step": 1570, + "time_per_iteration": 2.5774660110473633 + }, + { + "auxiliary_loss_clip": 0.01122979, + "auxiliary_loss_mlp": 0.0105643, + "balance_loss_clip": 1.05312991, + "balance_loss_mlp": 1.0323379, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.5804812266916313, + "language_loss": 0.76523054, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78702462, + "num_input_tokens_seen": 33724430, + "step": 1571, + "time_per_iteration": 2.6757442951202393 + }, + { + "auxiliary_loss_clip": 0.01133502, + "auxiliary_loss_mlp": 0.01053381, + "balance_loss_clip": 1.04924881, + "balance_loss_mlp": 1.03151822, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.6541776610975596, + "language_loss": 0.79374641, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81561524, + "num_input_tokens_seen": 33743455, + "step": 1572, + "time_per_iteration": 2.600696563720703 + }, + { + "auxiliary_loss_clip": 0.01143772, + "auxiliary_loss_mlp": 0.01064729, + "balance_loss_clip": 1.05482936, + "balance_loss_mlp": 1.04086351, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.7919013296474864, + "language_loss": 0.87676793, + "learning_rate": 3.956419273835913e-06, + "loss": 0.89885294, + "num_input_tokens_seen": 33763435, + "step": 1573, + "time_per_iteration": 2.6169488430023193 + }, + { + "auxiliary_loss_clip": 0.01157401, + "auxiliary_loss_mlp": 0.01060854, + "balance_loss_clip": 1.05487692, + "balance_loss_mlp": 1.03679776, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 4.115764256561912, + "language_loss": 0.81898856, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84117115, + "num_input_tokens_seen": 33784325, + "step": 1574, + "time_per_iteration": 2.5944738388061523 + }, + { + "auxiliary_loss_clip": 0.01163494, + "auxiliary_loss_mlp": 0.0105581, + "balance_loss_clip": 1.0651145, + "balance_loss_mlp": 1.03431666, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 1.7698475406768694, + "language_loss": 0.81076109, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83295411, + "num_input_tokens_seen": 33802510, + "step": 1575, + "time_per_iteration": 4.0059428215026855 + }, + { + "auxiliary_loss_clip": 0.01179561, + "auxiliary_loss_mlp": 0.01067018, + "balance_loss_clip": 1.05460763, + "balance_loss_mlp": 1.044595, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 2.139997361908705, + "language_loss": 0.86727715, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88974291, + "num_input_tokens_seen": 33819980, + "step": 1576, + "time_per_iteration": 2.4934327602386475 + }, + { + "auxiliary_loss_clip": 0.01066079, + "auxiliary_loss_mlp": 0.01040781, + "balance_loss_clip": 1.03184152, + "balance_loss_mlp": 1.03753877, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9983005684582642, + "language_loss": 0.65914381, + "learning_rate": 3.956095240823862e-06, + "loss": 0.68021238, + "num_input_tokens_seen": 33878925, + "step": 1577, + "time_per_iteration": 3.12056040763855 + }, + { + "auxiliary_loss_clip": 0.01146206, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.05278289, + "balance_loss_mlp": 1.02748787, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 2.06515791280812, + "language_loss": 0.79197598, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81391954, + "num_input_tokens_seen": 33897600, + "step": 1578, + "time_per_iteration": 4.015286445617676 + }, + { + "auxiliary_loss_clip": 0.01181216, + "auxiliary_loss_mlp": 0.01059665, + "balance_loss_clip": 1.0543437, + "balance_loss_mlp": 1.03744447, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.8451357841332325, + "language_loss": 0.77985489, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80226368, + "num_input_tokens_seen": 33917365, + "step": 1579, + "time_per_iteration": 2.513169288635254 + }, + { + "auxiliary_loss_clip": 0.01129192, + "auxiliary_loss_mlp": 0.01068118, + "balance_loss_clip": 1.05336118, + "balance_loss_mlp": 1.04277372, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 1.8742281381555126, + "language_loss": 0.7313627, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75333583, + "num_input_tokens_seen": 33936680, + "step": 1580, + "time_per_iteration": 2.59196138381958 + }, + { + "auxiliary_loss_clip": 0.01153607, + "auxiliary_loss_mlp": 0.01059287, + "balance_loss_clip": 1.05490541, + "balance_loss_mlp": 1.03790092, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 1.764348424770447, + "language_loss": 0.7758491, + "learning_rate": 3.955770021006627e-06, + "loss": 0.79797804, + "num_input_tokens_seen": 33960685, + "step": 1581, + "time_per_iteration": 2.6468141078948975 + }, + { + "auxiliary_loss_clip": 0.0114883, + "auxiliary_loss_mlp": 0.0106053, + "balance_loss_clip": 1.06200695, + "balance_loss_mlp": 1.03909636, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 1.939717258271165, + "language_loss": 0.87108123, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89317483, + "num_input_tokens_seen": 33980015, + "step": 1582, + "time_per_iteration": 3.943749189376831 + }, + { + "auxiliary_loss_clip": 0.01171473, + "auxiliary_loss_mlp": 0.01066541, + "balance_loss_clip": 1.05443192, + "balance_loss_mlp": 1.04309297, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 3.958624038837923, + "language_loss": 0.66848898, + "learning_rate": 3.955606966107699e-06, + "loss": 0.69086909, + "num_input_tokens_seen": 33997705, + "step": 1583, + "time_per_iteration": 2.528214931488037 + }, + { + "auxiliary_loss_clip": 0.01175074, + "auxiliary_loss_mlp": 0.01059559, + "balance_loss_clip": 1.05801415, + "balance_loss_mlp": 1.03595519, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.8156715554857168, + "language_loss": 0.70310569, + "learning_rate": 3.95552532742147e-06, + "loss": 0.72545207, + "num_input_tokens_seen": 34017465, + "step": 1584, + "time_per_iteration": 3.9374709129333496 + }, + { + "auxiliary_loss_clip": 0.01136259, + "auxiliary_loss_mlp": 0.01056116, + "balance_loss_clip": 1.05417848, + "balance_loss_mlp": 1.03556383, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.4653412588862806, + "language_loss": 0.8085202, + "learning_rate": 3.955443614581525e-06, + "loss": 0.83044392, + "num_input_tokens_seen": 34038550, + "step": 1585, + "time_per_iteration": 2.658764362335205 + }, + { + "auxiliary_loss_clip": 0.01156534, + "auxiliary_loss_mlp": 0.0105136, + "balance_loss_clip": 1.05471265, + "balance_loss_mlp": 1.02823377, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.888139747344243, + "language_loss": 0.71915269, + "learning_rate": 3.955361827590961e-06, + "loss": 0.74123162, + "num_input_tokens_seen": 34058665, + "step": 1586, + "time_per_iteration": 2.5734524726867676 + }, + { + "auxiliary_loss_clip": 0.01045144, + "auxiliary_loss_mlp": 0.01010412, + "balance_loss_clip": 1.02752435, + "balance_loss_mlp": 1.00674021, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8153150383920831, + "language_loss": 0.55521101, + "learning_rate": 3.955279966452883e-06, + "loss": 0.5757665, + "num_input_tokens_seen": 34109655, + "step": 1587, + "time_per_iteration": 2.9945757389068604 + }, + { + "auxiliary_loss_clip": 0.01133424, + "auxiliary_loss_mlp": 0.01059992, + "balance_loss_clip": 1.05920863, + "balance_loss_mlp": 1.03659153, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.82403793794127, + "language_loss": 0.81030267, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83223689, + "num_input_tokens_seen": 34131115, + "step": 1588, + "time_per_iteration": 2.650310754776001 + }, + { + "auxiliary_loss_clip": 0.0113642, + "auxiliary_loss_mlp": 0.01057681, + "balance_loss_clip": 1.05823517, + "balance_loss_mlp": 1.03552008, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 1.494204970556393, + "language_loss": 0.81835914, + "learning_rate": 3.955116021746594e-06, + "loss": 0.8403002, + "num_input_tokens_seen": 34151925, + "step": 1589, + "time_per_iteration": 2.6201870441436768 + }, + { + "auxiliary_loss_clip": 0.01120598, + "auxiliary_loss_mlp": 0.01180891, + "balance_loss_clip": 1.05364001, + "balance_loss_mlp": 1.72993553, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 2.2299908302934957, + "language_loss": 0.64793944, + "learning_rate": 3.955033938184601e-06, + "loss": 0.67095435, + "num_input_tokens_seen": 34175395, + "step": 1590, + "time_per_iteration": 2.8149635791778564 + }, + { + "auxiliary_loss_clip": 0.01141017, + "auxiliary_loss_mlp": 0.01054142, + "balance_loss_clip": 1.05025387, + "balance_loss_mlp": 1.03231502, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.7344772996572568, + "language_loss": 0.83031487, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85226643, + "num_input_tokens_seen": 34197760, + "step": 1591, + "time_per_iteration": 2.6802310943603516 + }, + { + "auxiliary_loss_clip": 0.01161069, + "auxiliary_loss_mlp": 0.0106088, + "balance_loss_clip": 1.053231, + "balance_loss_mlp": 1.03838539, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 3.0702287123028094, + "language_loss": 0.74089086, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76311034, + "num_input_tokens_seen": 34215330, + "step": 1592, + "time_per_iteration": 2.53536319732666 + }, + { + "auxiliary_loss_clip": 0.01165021, + "auxiliary_loss_mlp": 0.01054975, + "balance_loss_clip": 1.04969668, + "balance_loss_mlp": 1.03280163, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.8843232081617205, + "language_loss": 0.7426821, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76488203, + "num_input_tokens_seen": 34237745, + "step": 1593, + "time_per_iteration": 2.5781474113464355 + }, + { + "auxiliary_loss_clip": 0.01171833, + "auxiliary_loss_mlp": 0.01063316, + "balance_loss_clip": 1.05449164, + "balance_loss_mlp": 1.04221547, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.8146648384153161, + "language_loss": 0.69971657, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72206807, + "num_input_tokens_seen": 34256565, + "step": 1594, + "time_per_iteration": 2.5351474285125732 + }, + { + "auxiliary_loss_clip": 0.01169788, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_clip": 1.05502379, + "balance_loss_mlp": 1.03364682, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.287325745684747, + "language_loss": 0.8221311, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84437597, + "num_input_tokens_seen": 34275970, + "step": 1595, + "time_per_iteration": 2.5167031288146973 + }, + { + "auxiliary_loss_clip": 0.01151809, + "auxiliary_loss_mlp": 0.01053354, + "balance_loss_clip": 1.05359948, + "balance_loss_mlp": 1.03007281, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 1.9629308948334019, + "language_loss": 0.8493253, + "learning_rate": 3.954539880085045e-06, + "loss": 0.87137687, + "num_input_tokens_seen": 34295490, + "step": 1596, + "time_per_iteration": 2.5645833015441895 + }, + { + "auxiliary_loss_clip": 0.01165085, + "auxiliary_loss_mlp": 0.01059292, + "balance_loss_clip": 1.05610669, + "balance_loss_mlp": 1.03638041, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 1.9832210066634568, + "language_loss": 0.69044495, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71268868, + "num_input_tokens_seen": 34319990, + "step": 1597, + "time_per_iteration": 2.695521593093872 + }, + { + "auxiliary_loss_clip": 0.01165789, + "auxiliary_loss_mlp": 0.01074746, + "balance_loss_clip": 1.05419123, + "balance_loss_mlp": 1.54365134, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 1.9023679787666776, + "language_loss": 0.7486822, + "learning_rate": 3.954374601087729e-06, + "loss": 0.77108759, + "num_input_tokens_seen": 34339225, + "step": 1598, + "time_per_iteration": 2.5535457134246826 + }, + { + "auxiliary_loss_clip": 0.01175142, + "auxiliary_loss_mlp": 0.01061279, + "balance_loss_clip": 1.05916417, + "balance_loss_mlp": 1.03793812, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.6904192005081171, + "language_loss": 0.68824667, + "learning_rate": 3.954291850422382e-06, + "loss": 0.71061087, + "num_input_tokens_seen": 34361020, + "step": 1599, + "time_per_iteration": 2.614008665084839 + }, + { + "auxiliary_loss_clip": 0.01149529, + "auxiliary_loss_mlp": 0.01061083, + "balance_loss_clip": 1.05227804, + "balance_loss_mlp": 1.03987575, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.1423247440638526, + "language_loss": 0.83919674, + "learning_rate": 3.954209025650093e-06, + "loss": 0.86130285, + "num_input_tokens_seen": 34378630, + "step": 1600, + "time_per_iteration": 2.6057565212249756 + }, + { + "auxiliary_loss_clip": 0.01148865, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.05405331, + "balance_loss_mlp": 1.03501201, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 2.1783865083936593, + "language_loss": 0.80323339, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82528913, + "num_input_tokens_seen": 34397110, + "step": 1601, + "time_per_iteration": 2.532188653945923 + }, + { + "auxiliary_loss_clip": 0.01174786, + "auxiliary_loss_mlp": 0.01056368, + "balance_loss_clip": 1.05403721, + "balance_loss_mlp": 1.03412318, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.158997287225043, + "language_loss": 0.82530141, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84761298, + "num_input_tokens_seen": 34414165, + "step": 1602, + "time_per_iteration": 2.5283467769622803 + }, + { + "auxiliary_loss_clip": 0.01137055, + "auxiliary_loss_mlp": 0.01059697, + "balance_loss_clip": 1.05501747, + "balance_loss_mlp": 1.0367136, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 6.315069477407284, + "language_loss": 0.62665462, + "learning_rate": 3.953960106722989e-06, + "loss": 0.6486221, + "num_input_tokens_seen": 34434445, + "step": 1603, + "time_per_iteration": 2.602215528488159 + }, + { + "auxiliary_loss_clip": 0.01186595, + "auxiliary_loss_mlp": 0.01054838, + "balance_loss_clip": 1.0571804, + "balance_loss_mlp": 1.03071058, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.116229904256095, + "language_loss": 0.71335554, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73576987, + "num_input_tokens_seen": 34453095, + "step": 1604, + "time_per_iteration": 2.5427417755126953 + }, + { + "auxiliary_loss_clip": 0.01171401, + "auxiliary_loss_mlp": 0.01060388, + "balance_loss_clip": 1.05571413, + "balance_loss_mlp": 1.03951478, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.0722582744854856, + "language_loss": 0.80024838, + "learning_rate": 3.953793790294527e-06, + "loss": 0.82256627, + "num_input_tokens_seen": 34473680, + "step": 1605, + "time_per_iteration": 2.5889570713043213 + }, + { + "auxiliary_loss_clip": 0.0115893, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_clip": 1.05538893, + "balance_loss_mlp": 1.02639949, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 2.23233270344022, + "language_loss": 0.74762869, + "learning_rate": 3.953710520946634e-06, + "loss": 0.76970071, + "num_input_tokens_seen": 34492610, + "step": 1606, + "time_per_iteration": 2.5858347415924072 + }, + { + "auxiliary_loss_clip": 0.011657, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_clip": 1.05459559, + "balance_loss_mlp": 1.03000677, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 2.191371122763678, + "language_loss": 0.75749904, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77966481, + "num_input_tokens_seen": 34511855, + "step": 1607, + "time_per_iteration": 2.4936752319335938 + }, + { + "auxiliary_loss_clip": 0.01137694, + "auxiliary_loss_mlp": 0.01047739, + "balance_loss_clip": 1.05233049, + "balance_loss_mlp": 1.02626967, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 2.0697676439345405, + "language_loss": 0.86718303, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88903737, + "num_input_tokens_seen": 34528905, + "step": 1608, + "time_per_iteration": 2.5565075874328613 + }, + { + "auxiliary_loss_clip": 0.01123314, + "auxiliary_loss_mlp": 0.01062524, + "balance_loss_clip": 1.05569077, + "balance_loss_mlp": 1.03856277, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.420247121031451, + "language_loss": 0.71316564, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73502409, + "num_input_tokens_seen": 34548480, + "step": 1609, + "time_per_iteration": 2.7789041996002197 + }, + { + "auxiliary_loss_clip": 0.01145896, + "auxiliary_loss_mlp": 0.01055898, + "balance_loss_clip": 1.05397773, + "balance_loss_mlp": 1.03476238, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 2.333330129938826, + "language_loss": 0.84379005, + "learning_rate": 3.953376702737693e-06, + "loss": 0.86580789, + "num_input_tokens_seen": 34565410, + "step": 1610, + "time_per_iteration": 2.5723299980163574 + }, + { + "auxiliary_loss_clip": 0.01155982, + "auxiliary_loss_mlp": 0.01053808, + "balance_loss_clip": 1.05507839, + "balance_loss_mlp": 1.03132546, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 3.1224342164307877, + "language_loss": 0.6686992, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69079709, + "num_input_tokens_seen": 34584840, + "step": 1611, + "time_per_iteration": 2.566049337387085 + }, + { + "auxiliary_loss_clip": 0.01124173, + "auxiliary_loss_mlp": 0.01052381, + "balance_loss_clip": 1.05195558, + "balance_loss_mlp": 1.0310303, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 2.2670617070606705, + "language_loss": 0.81127125, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83303678, + "num_input_tokens_seen": 34603360, + "step": 1612, + "time_per_iteration": 2.6863596439361572 + }, + { + "auxiliary_loss_clip": 0.0117788, + "auxiliary_loss_mlp": 0.01066505, + "balance_loss_clip": 1.05754912, + "balance_loss_mlp": 1.04468942, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.1255997082134566, + "language_loss": 0.81140232, + "learning_rate": 3.953125561311398e-06, + "loss": 0.83384615, + "num_input_tokens_seen": 34620760, + "step": 1613, + "time_per_iteration": 3.894010543823242 + }, + { + "auxiliary_loss_clip": 0.01148982, + "auxiliary_loss_mlp": 0.01054779, + "balance_loss_clip": 1.0593195, + "balance_loss_mlp": 1.03185511, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 1.9118217132781186, + "language_loss": 0.84393263, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86597025, + "num_input_tokens_seen": 34640695, + "step": 1614, + "time_per_iteration": 2.607609510421753 + }, + { + "auxiliary_loss_clip": 0.01070613, + "auxiliary_loss_mlp": 0.02214933, + "balance_loss_clip": 1.02780175, + "balance_loss_mlp": 3.5985992, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7249257101515483, + "language_loss": 0.54638219, + "learning_rate": 3.952957763374992e-06, + "loss": 0.57923758, + "num_input_tokens_seen": 34702395, + "step": 1615, + "time_per_iteration": 3.0793280601501465 + }, + { + "auxiliary_loss_clip": 0.01030188, + "auxiliary_loss_mlp": 0.01012207, + "balance_loss_clip": 1.02454853, + "balance_loss_mlp": 1.00896442, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7640146911246476, + "language_loss": 0.58249009, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60291409, + "num_input_tokens_seen": 34768910, + "step": 1616, + "time_per_iteration": 4.721745491027832 + }, + { + "auxiliary_loss_clip": 0.01151384, + "auxiliary_loss_mlp": 0.01057428, + "balance_loss_clip": 1.05397773, + "balance_loss_mlp": 1.03403914, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.7186798916040393, + "language_loss": 0.68864793, + "learning_rate": 3.952789669213172e-06, + "loss": 0.7107361, + "num_input_tokens_seen": 34787680, + "step": 1617, + "time_per_iteration": 2.5400750637054443 + }, + { + "auxiliary_loss_clip": 0.0114929, + "auxiliary_loss_mlp": 0.01056019, + "balance_loss_clip": 1.05281448, + "balance_loss_mlp": 1.03066313, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.727278229135788, + "language_loss": 0.80796432, + "learning_rate": 3.952705511055698e-06, + "loss": 0.83001745, + "num_input_tokens_seen": 34808330, + "step": 1618, + "time_per_iteration": 2.623757839202881 + }, + { + "auxiliary_loss_clip": 0.0115949, + "auxiliary_loss_mlp": 0.01050074, + "balance_loss_clip": 1.05306888, + "balance_loss_mlp": 1.02928352, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 2.1935990150563245, + "language_loss": 0.93155861, + "learning_rate": 3.952621278851435e-06, + "loss": 0.95365417, + "num_input_tokens_seen": 34830020, + "step": 1619, + "time_per_iteration": 2.5793087482452393 + }, + { + "auxiliary_loss_clip": 0.01169499, + "auxiliary_loss_mlp": 0.01052788, + "balance_loss_clip": 1.05496585, + "balance_loss_mlp": 1.03147376, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 1.8676591550611403, + "language_loss": 0.88680565, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.90902853, + "num_input_tokens_seen": 34850330, + "step": 1620, + "time_per_iteration": 4.057949066162109 + }, + { + "auxiliary_loss_clip": 0.01150341, + "auxiliary_loss_mlp": 0.01058617, + "balance_loss_clip": 1.05212879, + "balance_loss_mlp": 1.03391731, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 2.191948545924734, + "language_loss": 0.77567935, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79776895, + "num_input_tokens_seen": 34871640, + "step": 1621, + "time_per_iteration": 2.5788774490356445 + }, + { + "auxiliary_loss_clip": 0.01128641, + "auxiliary_loss_mlp": 0.01065113, + "balance_loss_clip": 1.04922485, + "balance_loss_mlp": 1.03981721, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 1.9612774912704087, + "language_loss": 0.77619147, + "learning_rate": 3.952368137989871e-06, + "loss": 0.79812902, + "num_input_tokens_seen": 34888100, + "step": 1622, + "time_per_iteration": 2.541710138320923 + }, + { + "auxiliary_loss_clip": 0.01147763, + "auxiliary_loss_mlp": 0.01059617, + "balance_loss_clip": 1.05404973, + "balance_loss_mlp": 1.03686035, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 3.179270059110057, + "language_loss": 0.85670602, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.87877989, + "num_input_tokens_seen": 34910485, + "step": 1623, + "time_per_iteration": 3.9655256271362305 + }, + { + "auxiliary_loss_clip": 0.01170271, + "auxiliary_loss_mlp": 0.01060101, + "balance_loss_clip": 1.05371916, + "balance_loss_mlp": 1.03765368, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.4996693519326283, + "language_loss": 0.80522782, + "learning_rate": 3.952199007240184e-06, + "loss": 0.82753158, + "num_input_tokens_seen": 34928615, + "step": 1624, + "time_per_iteration": 2.488090991973877 + }, + { + "auxiliary_loss_clip": 0.01168107, + "auxiliary_loss_mlp": 0.01047627, + "balance_loss_clip": 1.04990554, + "balance_loss_mlp": 1.02650356, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.738018976605643, + "language_loss": 0.85630202, + "learning_rate": 3.952114330822364e-06, + "loss": 0.87845939, + "num_input_tokens_seen": 34946045, + "step": 1625, + "time_per_iteration": 2.4655022621154785 + }, + { + "auxiliary_loss_clip": 0.01173414, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_clip": 1.05291343, + "balance_loss_mlp": 1.0308249, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 2.660910616132785, + "language_loss": 0.85332584, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87558353, + "num_input_tokens_seen": 34962865, + "step": 1626, + "time_per_iteration": 2.5363078117370605 + }, + { + "auxiliary_loss_clip": 0.01162048, + "auxiliary_loss_mlp": 0.01501221, + "balance_loss_clip": 1.05466771, + "balance_loss_mlp": 2.35174513, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 2.009905442177808, + "language_loss": 0.83585185, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.86248451, + "num_input_tokens_seen": 34983505, + "step": 1627, + "time_per_iteration": 2.539125919342041 + }, + { + "auxiliary_loss_clip": 0.01162899, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.05166316, + "balance_loss_mlp": 1.03059149, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 2.6464265337645347, + "language_loss": 0.8433156, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86546409, + "num_input_tokens_seen": 35001825, + "step": 1628, + "time_per_iteration": 2.5112733840942383 + }, + { + "auxiliary_loss_clip": 0.01165165, + "auxiliary_loss_mlp": 0.01053007, + "balance_loss_clip": 1.05013406, + "balance_loss_mlp": 1.03120339, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.8655836334929567, + "language_loss": 0.75802571, + "learning_rate": 3.951774884939523e-06, + "loss": 0.7802074, + "num_input_tokens_seen": 35023075, + "step": 1629, + "time_per_iteration": 2.515502691268921 + }, + { + "auxiliary_loss_clip": 0.01125973, + "auxiliary_loss_mlp": 0.01055712, + "balance_loss_clip": 1.05820298, + "balance_loss_mlp": 1.03258514, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.7160384589155977, + "language_loss": 0.78478312, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80659997, + "num_input_tokens_seen": 35043480, + "step": 1630, + "time_per_iteration": 2.6961770057678223 + }, + { + "auxiliary_loss_clip": 0.01162611, + "auxiliary_loss_mlp": 0.01051676, + "balance_loss_clip": 1.05648804, + "balance_loss_mlp": 1.02873993, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.8993806658781305, + "language_loss": 0.86484474, + "learning_rate": 3.951604717916228e-06, + "loss": 0.88698757, + "num_input_tokens_seen": 35061490, + "step": 1631, + "time_per_iteration": 2.4997169971466064 + }, + { + "auxiliary_loss_clip": 0.01162967, + "auxiliary_loss_mlp": 0.01055001, + "balance_loss_clip": 1.05410099, + "balance_loss_mlp": 1.03421044, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 5.373839608041589, + "language_loss": 0.83362824, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85580796, + "num_input_tokens_seen": 35079670, + "step": 1632, + "time_per_iteration": 2.569882392883301 + }, + { + "auxiliary_loss_clip": 0.01144978, + "auxiliary_loss_mlp": 0.01055043, + "balance_loss_clip": 1.05601931, + "balance_loss_mlp": 1.03475344, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.7080570668151474, + "language_loss": 0.78590512, + "learning_rate": 3.951434254872751e-06, + "loss": 0.80790532, + "num_input_tokens_seen": 35099205, + "step": 1633, + "time_per_iteration": 2.575016736984253 + }, + { + "auxiliary_loss_clip": 0.01162457, + "auxiliary_loss_mlp": 0.01056445, + "balance_loss_clip": 1.04965949, + "balance_loss_mlp": 1.03429639, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.024162239931351, + "language_loss": 0.72985095, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75203991, + "num_input_tokens_seen": 35115270, + "step": 1634, + "time_per_iteration": 2.4762630462646484 + }, + { + "auxiliary_loss_clip": 0.0115223, + "auxiliary_loss_mlp": 0.01058461, + "balance_loss_clip": 1.05130434, + "balance_loss_mlp": 1.0361805, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.581790637599988, + "language_loss": 0.73380458, + "learning_rate": 3.951263495834947e-06, + "loss": 0.75591147, + "num_input_tokens_seen": 35134065, + "step": 1635, + "time_per_iteration": 2.5592756271362305 + }, + { + "auxiliary_loss_clip": 0.01150834, + "auxiliary_loss_mlp": 0.01065018, + "balance_loss_clip": 1.05477858, + "balance_loss_mlp": 1.04034221, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 1.8301700929490785, + "language_loss": 0.77896667, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80112517, + "num_input_tokens_seen": 35154870, + "step": 1636, + "time_per_iteration": 2.6120121479034424 + }, + { + "auxiliary_loss_clip": 0.01159528, + "auxiliary_loss_mlp": 0.01056802, + "balance_loss_clip": 1.05581856, + "balance_loss_mlp": 1.03565395, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 1.92287549066641, + "language_loss": 0.69749123, + "learning_rate": 3.951092440828715e-06, + "loss": 0.71965456, + "num_input_tokens_seen": 35171850, + "step": 1637, + "time_per_iteration": 2.5223934650421143 + }, + { + "auxiliary_loss_clip": 0.01181545, + "auxiliary_loss_mlp": 0.01054405, + "balance_loss_clip": 1.05375409, + "balance_loss_mlp": 1.03284025, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.756839637545218, + "language_loss": 0.77671051, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79907012, + "num_input_tokens_seen": 35188795, + "step": 1638, + "time_per_iteration": 2.496187210083008 + }, + { + "auxiliary_loss_clip": 0.0113475, + "auxiliary_loss_mlp": 0.01047383, + "balance_loss_clip": 1.05406177, + "balance_loss_mlp": 1.02644968, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.5656149296689226, + "language_loss": 0.72798473, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74980605, + "num_input_tokens_seen": 35212100, + "step": 1639, + "time_per_iteration": 2.6436333656311035 + }, + { + "auxiliary_loss_clip": 0.01166791, + "auxiliary_loss_mlp": 0.01048613, + "balance_loss_clip": 1.05338788, + "balance_loss_mlp": 1.02711964, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.9801089851135547, + "language_loss": 0.88515592, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90731001, + "num_input_tokens_seen": 35230390, + "step": 1640, + "time_per_iteration": 2.5419862270355225 + }, + { + "auxiliary_loss_clip": 0.01174093, + "auxiliary_loss_mlp": 0.01044783, + "balance_loss_clip": 1.05824256, + "balance_loss_mlp": 1.02444577, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 1.9201905273997841, + "language_loss": 0.80891514, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83110392, + "num_input_tokens_seen": 35250405, + "step": 1641, + "time_per_iteration": 2.4970133304595947 + }, + { + "auxiliary_loss_clip": 0.01166419, + "auxiliary_loss_mlp": 0.01061047, + "balance_loss_clip": 1.05172825, + "balance_loss_mlp": 1.03870702, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 2.6159265104882685, + "language_loss": 0.86033541, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88261008, + "num_input_tokens_seen": 35262820, + "step": 1642, + "time_per_iteration": 2.484950542449951 + }, + { + "auxiliary_loss_clip": 0.01142692, + "auxiliary_loss_mlp": 0.01054525, + "balance_loss_clip": 1.05469298, + "balance_loss_mlp": 1.03369951, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.7801416034561532, + "language_loss": 0.80831039, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83028257, + "num_input_tokens_seen": 35284490, + "step": 1643, + "time_per_iteration": 2.605327844619751 + }, + { + "auxiliary_loss_clip": 0.01171143, + "auxiliary_loss_mlp": 0.01073007, + "balance_loss_clip": 1.05660951, + "balance_loss_mlp": 1.05150211, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 1.9317255457843905, + "language_loss": 0.82200927, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84445071, + "num_input_tokens_seen": 35302815, + "step": 1644, + "time_per_iteration": 2.498096227645874 + }, + { + "auxiliary_loss_clip": 0.0115706, + "auxiliary_loss_mlp": 0.01107175, + "balance_loss_clip": 1.05251122, + "balance_loss_mlp": 1.59641171, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.8638243837790351, + "language_loss": 0.68059838, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70324075, + "num_input_tokens_seen": 35321175, + "step": 1645, + "time_per_iteration": 2.523047924041748 + }, + { + "auxiliary_loss_clip": 0.01056882, + "auxiliary_loss_mlp": 0.01014486, + "balance_loss_clip": 1.02419293, + "balance_loss_mlp": 1.01117182, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.855389406388609, + "language_loss": 0.60874653, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62946022, + "num_input_tokens_seen": 35381740, + "step": 1646, + "time_per_iteration": 3.0704548358917236 + }, + { + "auxiliary_loss_clip": 0.01143906, + "auxiliary_loss_mlp": 0.01055024, + "balance_loss_clip": 1.05667746, + "balance_loss_mlp": 1.03257728, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 2.3027520038816327, + "language_loss": 0.73075736, + "learning_rate": 3.950232727180833e-06, + "loss": 0.7527467, + "num_input_tokens_seen": 35403760, + "step": 1647, + "time_per_iteration": 2.628739833831787 + }, + { + "auxiliary_loss_clip": 0.01153838, + "auxiliary_loss_mlp": 0.01060916, + "balance_loss_clip": 1.05322778, + "balance_loss_mlp": 1.04082918, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 2.051447786257446, + "language_loss": 0.84169978, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86384732, + "num_input_tokens_seen": 35424050, + "step": 1648, + "time_per_iteration": 2.5864462852478027 + }, + { + "auxiliary_loss_clip": 0.0106881, + "auxiliary_loss_mlp": 0.01005105, + "balance_loss_clip": 1.03026688, + "balance_loss_mlp": 1.00176704, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.726670937568241, + "language_loss": 0.55669916, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57743835, + "num_input_tokens_seen": 35481690, + "step": 1649, + "time_per_iteration": 3.0419468879699707 + }, + { + "auxiliary_loss_clip": 0.01165643, + "auxiliary_loss_mlp": 0.01046315, + "balance_loss_clip": 1.05141282, + "balance_loss_mlp": 1.02547681, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.623191987484333, + "language_loss": 0.89675653, + "learning_rate": 3.949973370853954e-06, + "loss": 0.91887611, + "num_input_tokens_seen": 35498635, + "step": 1650, + "time_per_iteration": 2.588137149810791 + }, + { + "auxiliary_loss_clip": 0.01036032, + "auxiliary_loss_mlp": 0.0144039, + "balance_loss_clip": 1.03135657, + "balance_loss_mlp": 2.24221325, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.9358428949393979, + "language_loss": 0.63712549, + "learning_rate": 3.94988677085425e-06, + "loss": 0.66188967, + "num_input_tokens_seen": 35565720, + "step": 1651, + "time_per_iteration": 3.332594394683838 + }, + { + "auxiliary_loss_clip": 0.0116121, + "auxiliary_loss_mlp": 0.01059745, + "balance_loss_clip": 1.05166888, + "balance_loss_mlp": 1.03765523, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 2.1162862688045454, + "language_loss": 0.87910157, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90131104, + "num_input_tokens_seen": 35586000, + "step": 1652, + "time_per_iteration": 3.9281647205352783 + }, + { + "auxiliary_loss_clip": 0.01159769, + "auxiliary_loss_mlp": 0.01055081, + "balance_loss_clip": 1.05604231, + "balance_loss_mlp": 1.03423131, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 1.9661125049244077, + "language_loss": 0.82249516, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84464371, + "num_input_tokens_seen": 35604355, + "step": 1653, + "time_per_iteration": 2.53544282913208 + }, + { + "auxiliary_loss_clip": 0.01167622, + "auxiliary_loss_mlp": 0.01308306, + "balance_loss_clip": 1.05234385, + "balance_loss_mlp": 1.98688579, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.0037517690098094, + "language_loss": 0.7959438, + "learning_rate": 3.949626527228875e-06, + "loss": 0.82070303, + "num_input_tokens_seen": 35625495, + "step": 1654, + "time_per_iteration": 2.5702474117279053 + }, + { + "auxiliary_loss_clip": 0.01181917, + "auxiliary_loss_mlp": 0.01060069, + "balance_loss_clip": 1.05903685, + "balance_loss_mlp": 1.03956532, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 2.0621416000831823, + "language_loss": 0.80815011, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83057004, + "num_input_tokens_seen": 35645030, + "step": 1655, + "time_per_iteration": 3.868032932281494 + }, + { + "auxiliary_loss_clip": 0.01175547, + "auxiliary_loss_mlp": 0.01057891, + "balance_loss_clip": 1.05241048, + "balance_loss_mlp": 1.03714871, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.8608580094479896, + "language_loss": 0.80454743, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.82688177, + "num_input_tokens_seen": 35664305, + "step": 1656, + "time_per_iteration": 2.51297664642334 + }, + { + "auxiliary_loss_clip": 0.01166463, + "auxiliary_loss_mlp": 0.01057007, + "balance_loss_clip": 1.05555654, + "balance_loss_mlp": 1.03581166, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.770071720689501, + "language_loss": 0.89150345, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91373825, + "num_input_tokens_seen": 35684060, + "step": 1657, + "time_per_iteration": 2.4929065704345703 + }, + { + "auxiliary_loss_clip": 0.01160839, + "auxiliary_loss_mlp": 0.01058143, + "balance_loss_clip": 1.05551374, + "balance_loss_mlp": 1.03595853, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.4592786444855115, + "language_loss": 0.84841943, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.87060928, + "num_input_tokens_seen": 35703250, + "step": 1658, + "time_per_iteration": 2.545592784881592 + }, + { + "auxiliary_loss_clip": 0.01075334, + "auxiliary_loss_mlp": 0.01011324, + "balance_loss_clip": 1.02364349, + "balance_loss_mlp": 1.00805748, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9011613202575146, + "language_loss": 0.60811186, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62897837, + "num_input_tokens_seen": 35762165, + "step": 1659, + "time_per_iteration": 4.476860523223877 + }, + { + "auxiliary_loss_clip": 0.01152187, + "auxiliary_loss_mlp": 0.01056904, + "balance_loss_clip": 1.05162883, + "balance_loss_mlp": 1.03521955, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.8827729914234468, + "language_loss": 0.85034382, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87243474, + "num_input_tokens_seen": 35781520, + "step": 1660, + "time_per_iteration": 2.546563148498535 + }, + { + "auxiliary_loss_clip": 0.01148782, + "auxiliary_loss_mlp": 0.01057632, + "balance_loss_clip": 1.05674124, + "balance_loss_mlp": 1.03440952, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 1.9969897590365777, + "language_loss": 0.79797053, + "learning_rate": 3.949016704705836e-06, + "loss": 0.82003462, + "num_input_tokens_seen": 35799565, + "step": 1661, + "time_per_iteration": 2.5231924057006836 + }, + { + "auxiliary_loss_clip": 0.0116542, + "auxiliary_loss_mlp": 0.01054105, + "balance_loss_clip": 1.05111098, + "balance_loss_mlp": 1.03186059, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 2.1468896675873674, + "language_loss": 0.8371197, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85931492, + "num_input_tokens_seen": 35821085, + "step": 1662, + "time_per_iteration": 3.9578418731689453 + }, + { + "auxiliary_loss_clip": 0.01155283, + "auxiliary_loss_mlp": 0.01064081, + "balance_loss_clip": 1.0523982, + "balance_loss_mlp": 1.04015541, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 2.3259479624722097, + "language_loss": 0.89115012, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.91334379, + "num_input_tokens_seen": 35839840, + "step": 1663, + "time_per_iteration": 2.5194926261901855 + }, + { + "auxiliary_loss_clip": 0.01175404, + "auxiliary_loss_mlp": 0.01052199, + "balance_loss_clip": 1.05684125, + "balance_loss_mlp": 1.02998996, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.743401963197557, + "language_loss": 0.70255482, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72483081, + "num_input_tokens_seen": 35861545, + "step": 1664, + "time_per_iteration": 2.5678064823150635 + }, + { + "auxiliary_loss_clip": 0.01141779, + "auxiliary_loss_mlp": 0.01049941, + "balance_loss_clip": 1.05589592, + "balance_loss_mlp": 1.02834034, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.263127252197607, + "language_loss": 0.78512859, + "learning_rate": 3.94866660866797e-06, + "loss": 0.80704582, + "num_input_tokens_seen": 35878295, + "step": 1665, + "time_per_iteration": 2.544363021850586 + }, + { + "auxiliary_loss_clip": 0.01171428, + "auxiliary_loss_mlp": 0.01063369, + "balance_loss_clip": 1.06061125, + "balance_loss_mlp": 1.04150605, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 2.65350261847189, + "language_loss": 0.69971788, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72206593, + "num_input_tokens_seen": 35898990, + "step": 1666, + "time_per_iteration": 2.5455663204193115 + }, + { + "auxiliary_loss_clip": 0.01112771, + "auxiliary_loss_mlp": 0.01063019, + "balance_loss_clip": 1.05683374, + "balance_loss_mlp": 1.03947532, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.9353468030300898, + "language_loss": 0.78491497, + "learning_rate": 3.948491117273956e-06, + "loss": 0.80667293, + "num_input_tokens_seen": 35916225, + "step": 1667, + "time_per_iteration": 2.6437623500823975 + }, + { + "auxiliary_loss_clip": 0.01151876, + "auxiliary_loss_mlp": 0.01055557, + "balance_loss_clip": 1.05457985, + "balance_loss_mlp": 1.03071368, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.3563942944329352, + "language_loss": 0.76787591, + "learning_rate": 3.948403260744817e-06, + "loss": 0.78995025, + "num_input_tokens_seen": 35934630, + "step": 1668, + "time_per_iteration": 2.592874050140381 + }, + { + "auxiliary_loss_clip": 0.01182209, + "auxiliary_loss_mlp": 0.01053109, + "balance_loss_clip": 1.05670881, + "balance_loss_mlp": 1.0311383, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 2.7386390687248694, + "language_loss": 0.77872705, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80108023, + "num_input_tokens_seen": 35953855, + "step": 1669, + "time_per_iteration": 2.513627767562866 + }, + { + "auxiliary_loss_clip": 0.01190107, + "auxiliary_loss_mlp": 0.01057438, + "balance_loss_clip": 1.06029034, + "balance_loss_mlp": 1.03491926, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.0687039450440823, + "language_loss": 0.85414052, + "learning_rate": 3.948227326038933e-06, + "loss": 0.876616, + "num_input_tokens_seen": 35974555, + "step": 1670, + "time_per_iteration": 2.51472806930542 + }, + { + "auxiliary_loss_clip": 0.01177051, + "auxiliary_loss_mlp": 0.010486, + "balance_loss_clip": 1.05490303, + "balance_loss_mlp": 1.02771449, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.5443826482261862, + "language_loss": 0.76658237, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.7888388, + "num_input_tokens_seen": 35996830, + "step": 1671, + "time_per_iteration": 2.51322603225708 + }, + { + "auxiliary_loss_clip": 0.01065699, + "auxiliary_loss_mlp": 0.01003461, + "balance_loss_clip": 1.02352548, + "balance_loss_mlp": 1.0006355, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7823527675825388, + "language_loss": 0.60696256, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62765419, + "num_input_tokens_seen": 36054465, + "step": 1672, + "time_per_iteration": 3.0984489917755127 + }, + { + "auxiliary_loss_clip": 0.01141992, + "auxiliary_loss_mlp": 0.01057888, + "balance_loss_clip": 1.05360401, + "balance_loss_mlp": 1.03542852, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.653941530334597, + "language_loss": 0.76910186, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79110068, + "num_input_tokens_seen": 36073480, + "step": 1673, + "time_per_iteration": 2.54616117477417 + }, + { + "auxiliary_loss_clip": 0.01133007, + "auxiliary_loss_mlp": 0.01057777, + "balance_loss_clip": 1.05265832, + "balance_loss_mlp": 1.03467369, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.075012489307534, + "language_loss": 0.73531806, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75722593, + "num_input_tokens_seen": 36091830, + "step": 1674, + "time_per_iteration": 2.5548598766326904 + }, + { + "auxiliary_loss_clip": 0.01171794, + "auxiliary_loss_mlp": 0.01780695, + "balance_loss_clip": 1.05477774, + "balance_loss_mlp": 2.85056043, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 1.947592461691345, + "language_loss": 0.79441464, + "learning_rate": 3.947786196485649e-06, + "loss": 0.82393956, + "num_input_tokens_seen": 36111400, + "step": 1675, + "time_per_iteration": 2.533487319946289 + }, + { + "auxiliary_loss_clip": 0.01181756, + "auxiliary_loss_mlp": 0.01066175, + "balance_loss_clip": 1.05722833, + "balance_loss_mlp": 1.0460403, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.374048784936178, + "language_loss": 0.81413126, + "learning_rate": 3.947697748980853e-06, + "loss": 0.83661056, + "num_input_tokens_seen": 36129345, + "step": 1676, + "time_per_iteration": 2.4949231147766113 + }, + { + "auxiliary_loss_clip": 0.01173757, + "auxiliary_loss_mlp": 0.01058342, + "balance_loss_clip": 1.05773973, + "balance_loss_mlp": 1.03689623, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.1661904412477426, + "language_loss": 0.86184305, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88416409, + "num_input_tokens_seen": 36146255, + "step": 1677, + "time_per_iteration": 2.4750161170959473 + }, + { + "auxiliary_loss_clip": 0.0116078, + "auxiliary_loss_mlp": 0.01050818, + "balance_loss_clip": 1.05268204, + "balance_loss_mlp": 1.02905035, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 1.8735701724344236, + "language_loss": 0.86215031, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88426638, + "num_input_tokens_seen": 36164050, + "step": 1678, + "time_per_iteration": 2.5260531902313232 + }, + { + "auxiliary_loss_clip": 0.01161823, + "auxiliary_loss_mlp": 0.0105341, + "balance_loss_clip": 1.0594461, + "balance_loss_mlp": 1.03174949, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.2348908032406563, + "language_loss": 0.89885199, + "learning_rate": 3.947431963338532e-06, + "loss": 0.92100441, + "num_input_tokens_seen": 36183530, + "step": 1679, + "time_per_iteration": 2.578390121459961 + }, + { + "auxiliary_loss_clip": 0.01077165, + "auxiliary_loss_mlp": 0.01005154, + "balance_loss_clip": 1.02589488, + "balance_loss_mlp": 1.00214982, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7750616707489534, + "language_loss": 0.52971166, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55053484, + "num_input_tokens_seen": 36248550, + "step": 1680, + "time_per_iteration": 3.1289310455322266 + }, + { + "auxiliary_loss_clip": 0.01181018, + "auxiliary_loss_mlp": 0.01197711, + "balance_loss_clip": 1.05625939, + "balance_loss_mlp": 1.75721645, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.8552620588372701, + "language_loss": 0.76806682, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79185408, + "num_input_tokens_seen": 36266065, + "step": 1681, + "time_per_iteration": 2.461486577987671 + }, + { + "auxiliary_loss_clip": 0.01155376, + "auxiliary_loss_mlp": 0.01058062, + "balance_loss_clip": 1.0553968, + "balance_loss_mlp": 1.03257513, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 3.287273216525816, + "language_loss": 0.94134235, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96347672, + "num_input_tokens_seen": 36280960, + "step": 1682, + "time_per_iteration": 2.4972054958343506 + }, + { + "auxiliary_loss_clip": 0.01171807, + "auxiliary_loss_mlp": 0.01052214, + "balance_loss_clip": 1.05372882, + "balance_loss_mlp": 1.03089952, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 1.8762755942465152, + "language_loss": 0.87784034, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90008062, + "num_input_tokens_seen": 36299010, + "step": 1683, + "time_per_iteration": 2.457242965698242 + }, + { + "auxiliary_loss_clip": 0.01135138, + "auxiliary_loss_mlp": 0.01061408, + "balance_loss_clip": 1.05900049, + "balance_loss_mlp": 1.03917587, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 3.204158047910409, + "language_loss": 0.74946457, + "learning_rate": 3.946987510376624e-06, + "loss": 0.77143002, + "num_input_tokens_seen": 36318400, + "step": 1684, + "time_per_iteration": 2.593762159347534 + }, + { + "auxiliary_loss_clip": 0.01057541, + "auxiliary_loss_mlp": 0.01006962, + "balance_loss_clip": 1.02733672, + "balance_loss_mlp": 1.00348115, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.756496410994953, + "language_loss": 0.61084735, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.63149238, + "num_input_tokens_seen": 36381815, + "step": 1685, + "time_per_iteration": 3.2083797454833984 + }, + { + "auxiliary_loss_clip": 0.01157197, + "auxiliary_loss_mlp": 0.01055481, + "balance_loss_clip": 1.05331862, + "balance_loss_mlp": 1.03340316, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 2.9622982536318663, + "language_loss": 0.61394286, + "learning_rate": 3.946809212358516e-06, + "loss": 0.63606966, + "num_input_tokens_seen": 36404320, + "step": 1686, + "time_per_iteration": 2.629000186920166 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01057377, + "balance_loss_clip": 1.06210196, + "balance_loss_mlp": 1.03479874, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 2.4381170151817018, + "language_loss": 0.81129277, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83333999, + "num_input_tokens_seen": 36427510, + "step": 1687, + "time_per_iteration": 2.6599340438842773 + }, + { + "auxiliary_loss_clip": 0.01176007, + "auxiliary_loss_mlp": 0.01050807, + "balance_loss_clip": 1.05758405, + "balance_loss_mlp": 1.02928948, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 2.0939914167331377, + "language_loss": 0.71902454, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74129272, + "num_input_tokens_seen": 36448230, + "step": 1688, + "time_per_iteration": 2.5608816146850586 + }, + { + "auxiliary_loss_clip": 0.01154131, + "auxiliary_loss_mlp": 0.01059129, + "balance_loss_clip": 1.05744731, + "balance_loss_mlp": 1.03770745, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 2.0769117973926807, + "language_loss": 0.86740959, + "learning_rate": 3.94654121166582e-06, + "loss": 0.88954222, + "num_input_tokens_seen": 36464395, + "step": 1689, + "time_per_iteration": 2.5547194480895996 + }, + { + "auxiliary_loss_clip": 0.01168087, + "auxiliary_loss_mlp": 0.01053719, + "balance_loss_clip": 1.05032992, + "balance_loss_mlp": 1.03397799, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 1.8718334423079868, + "language_loss": 0.8813554, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90357345, + "num_input_tokens_seen": 36486475, + "step": 1690, + "time_per_iteration": 2.5791540145874023 + }, + { + "auxiliary_loss_clip": 0.01156331, + "auxiliary_loss_mlp": 0.01056095, + "balance_loss_clip": 1.05317354, + "balance_loss_mlp": 1.0339818, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 1.848734048949071, + "language_loss": 0.83459806, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85672235, + "num_input_tokens_seen": 36505310, + "step": 1691, + "time_per_iteration": 2.5394649505615234 + }, + { + "auxiliary_loss_clip": 0.01159971, + "auxiliary_loss_mlp": 0.01051899, + "balance_loss_clip": 1.05292833, + "balance_loss_mlp": 1.03040516, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.5554040917278282, + "language_loss": 0.66454363, + "learning_rate": 3.946272546655801e-06, + "loss": 0.68666232, + "num_input_tokens_seen": 36529820, + "step": 1692, + "time_per_iteration": 4.077995300292969 + }, + { + "auxiliary_loss_clip": 0.01146332, + "auxiliary_loss_mlp": 0.01072514, + "balance_loss_clip": 1.05594361, + "balance_loss_mlp": 1.0504241, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.6763555841319027, + "language_loss": 0.75622815, + "learning_rate": 3.94618284404223e-06, + "loss": 0.77841663, + "num_input_tokens_seen": 36549000, + "step": 1693, + "time_per_iteration": 2.6031036376953125 + }, + { + "auxiliary_loss_clip": 0.01136766, + "auxiliary_loss_mlp": 0.01054501, + "balance_loss_clip": 1.05954564, + "balance_loss_mlp": 1.03129101, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.7318061046272604, + "language_loss": 0.87409329, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89600599, + "num_input_tokens_seen": 36567515, + "step": 1694, + "time_per_iteration": 3.9995791912078857 + }, + { + "auxiliary_loss_clip": 0.01130341, + "auxiliary_loss_mlp": 0.01056828, + "balance_loss_clip": 1.05114627, + "balance_loss_mlp": 1.03284311, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 1.919646286561325, + "language_loss": 0.7953403, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81721199, + "num_input_tokens_seen": 36586190, + "step": 1695, + "time_per_iteration": 2.56355619430542 + }, + { + "auxiliary_loss_clip": 0.01128123, + "auxiliary_loss_mlp": 0.01057347, + "balance_loss_clip": 1.0502497, + "balance_loss_mlp": 1.03376698, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7253950315677058, + "language_loss": 0.86351335, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88536799, + "num_input_tokens_seen": 36607495, + "step": 1696, + "time_per_iteration": 2.6476330757141113 + }, + { + "auxiliary_loss_clip": 0.01162962, + "auxiliary_loss_mlp": 0.01052422, + "balance_loss_clip": 1.05220771, + "balance_loss_mlp": 1.03126228, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 2.7416056884323914, + "language_loss": 0.82340646, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84556031, + "num_input_tokens_seen": 36628555, + "step": 1697, + "time_per_iteration": 3.9938836097717285 + }, + { + "auxiliary_loss_clip": 0.01181714, + "auxiliary_loss_mlp": 0.01048634, + "balance_loss_clip": 1.05460072, + "balance_loss_mlp": 1.02700925, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 2.372492895130597, + "language_loss": 0.81229579, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.83459926, + "num_input_tokens_seen": 36646250, + "step": 1698, + "time_per_iteration": 2.474506139755249 + }, + { + "auxiliary_loss_clip": 0.01148845, + "auxiliary_loss_mlp": 0.01047703, + "balance_loss_clip": 1.05510545, + "balance_loss_mlp": 1.02686489, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 2.7362126731869414, + "language_loss": 0.75769895, + "learning_rate": 3.945643078691637e-06, + "loss": 0.7796644, + "num_input_tokens_seen": 36666675, + "step": 1699, + "time_per_iteration": 2.5797290802001953 + }, + { + "auxiliary_loss_clip": 0.01154056, + "auxiliary_loss_mlp": 0.01048007, + "balance_loss_clip": 1.05329156, + "balance_loss_mlp": 1.02691865, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.8431023863595613, + "language_loss": 0.80172378, + "learning_rate": 3.945552859553516e-06, + "loss": 0.82374442, + "num_input_tokens_seen": 36685225, + "step": 1700, + "time_per_iteration": 2.5174407958984375 + }, + { + "auxiliary_loss_clip": 0.0116711, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_clip": 1.05290627, + "balance_loss_mlp": 1.02494812, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.9666881978190862, + "language_loss": 0.77117586, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79330337, + "num_input_tokens_seen": 36705985, + "step": 1701, + "time_per_iteration": 3.937105178833008 + }, + { + "auxiliary_loss_clip": 0.01173961, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_clip": 1.05487573, + "balance_loss_mlp": 1.02832007, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.2635970448667817, + "language_loss": 0.78121394, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80345619, + "num_input_tokens_seen": 36725815, + "step": 1702, + "time_per_iteration": 2.541058301925659 + }, + { + "auxiliary_loss_clip": 0.01153432, + "auxiliary_loss_mlp": 0.01051298, + "balance_loss_clip": 1.05406308, + "balance_loss_mlp": 1.03135419, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.205698764785944, + "language_loss": 0.9453299, + "learning_rate": 3.945281759499494e-06, + "loss": 0.96737719, + "num_input_tokens_seen": 36742345, + "step": 1703, + "time_per_iteration": 2.531846523284912 + }, + { + "auxiliary_loss_clip": 0.01038489, + "auxiliary_loss_mlp": 0.01016016, + "balance_loss_clip": 1.04027462, + "balance_loss_mlp": 1.01220179, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8703133893021714, + "language_loss": 0.54981375, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57035875, + "num_input_tokens_seen": 36798775, + "step": 1704, + "time_per_iteration": 3.1068997383117676 + }, + { + "auxiliary_loss_clip": 0.01180324, + "auxiliary_loss_mlp": 0.01048115, + "balance_loss_clip": 1.0553019, + "balance_loss_mlp": 1.02650213, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 2.4984952322017975, + "language_loss": 0.83908957, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86137396, + "num_input_tokens_seen": 36816295, + "step": 1705, + "time_per_iteration": 2.4519641399383545 + }, + { + "auxiliary_loss_clip": 0.0105298, + "auxiliary_loss_mlp": 0.01019988, + "balance_loss_clip": 1.03344798, + "balance_loss_mlp": 1.01691234, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.772230297834778, + "language_loss": 0.60364008, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62436974, + "num_input_tokens_seen": 36882030, + "step": 1706, + "time_per_iteration": 3.1839938163757324 + }, + { + "auxiliary_loss_clip": 0.0114433, + "auxiliary_loss_mlp": 0.01048457, + "balance_loss_clip": 1.04970741, + "balance_loss_mlp": 1.02668953, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.546464925469809, + "language_loss": 0.8606329, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88256073, + "num_input_tokens_seen": 36899245, + "step": 1707, + "time_per_iteration": 2.5396816730499268 + }, + { + "auxiliary_loss_clip": 0.01164831, + "auxiliary_loss_mlp": 0.01047735, + "balance_loss_clip": 1.05649161, + "balance_loss_mlp": 1.02788687, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.5883761278519797, + "language_loss": 0.73039073, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75251639, + "num_input_tokens_seen": 36920950, + "step": 1708, + "time_per_iteration": 2.5603139400482178 + }, + { + "auxiliary_loss_clip": 0.0115009, + "auxiliary_loss_mlp": 0.0156235, + "balance_loss_clip": 1.05417132, + "balance_loss_mlp": 2.42871428, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.7575007822836726, + "language_loss": 0.91337562, + "learning_rate": 3.944737567821709e-06, + "loss": 0.94050002, + "num_input_tokens_seen": 36938900, + "step": 1709, + "time_per_iteration": 2.537498950958252 + }, + { + "auxiliary_loss_clip": 0.01129839, + "auxiliary_loss_mlp": 0.01054313, + "balance_loss_clip": 1.0622344, + "balance_loss_mlp": 1.03278422, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 2.2155538612842536, + "language_loss": 0.88064063, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90248215, + "num_input_tokens_seen": 36957010, + "step": 1710, + "time_per_iteration": 2.6491658687591553 + }, + { + "auxiliary_loss_clip": 0.01162923, + "auxiliary_loss_mlp": 0.01052097, + "balance_loss_clip": 1.05081046, + "balance_loss_mlp": 1.0315814, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 2.040467019812419, + "language_loss": 0.79383427, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81598449, + "num_input_tokens_seen": 36977690, + "step": 1711, + "time_per_iteration": 2.511686325073242 + }, + { + "auxiliary_loss_clip": 0.0114834, + "auxiliary_loss_mlp": 0.01054245, + "balance_loss_clip": 1.05249047, + "balance_loss_mlp": 1.03247786, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 2.1474448764818286, + "language_loss": 0.73989224, + "learning_rate": 3.944464476383668e-06, + "loss": 0.76191813, + "num_input_tokens_seen": 36997300, + "step": 1712, + "time_per_iteration": 2.6086416244506836 + }, + { + "auxiliary_loss_clip": 0.01127561, + "auxiliary_loss_mlp": 0.01063333, + "balance_loss_clip": 1.0524081, + "balance_loss_mlp": 1.04149342, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.7238740079466595, + "language_loss": 0.87099469, + "learning_rate": 3.94437329843114e-06, + "loss": 0.89290357, + "num_input_tokens_seen": 37016110, + "step": 1713, + "time_per_iteration": 2.5488805770874023 + }, + { + "auxiliary_loss_clip": 0.01162137, + "auxiliary_loss_mlp": 0.01059579, + "balance_loss_clip": 1.05320048, + "balance_loss_mlp": 1.03990972, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 1.6759586785187683, + "language_loss": 0.72233427, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74455142, + "num_input_tokens_seen": 37036405, + "step": 1714, + "time_per_iteration": 2.540839910507202 + }, + { + "auxiliary_loss_clip": 0.01167751, + "auxiliary_loss_mlp": 0.01058349, + "balance_loss_clip": 1.05415404, + "balance_loss_mlp": 1.03647387, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 2.093736037244032, + "language_loss": 0.90964705, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93190807, + "num_input_tokens_seen": 37057580, + "step": 1715, + "time_per_iteration": 2.558284044265747 + }, + { + "auxiliary_loss_clip": 0.01162621, + "auxiliary_loss_mlp": 0.01062693, + "balance_loss_clip": 1.05007958, + "balance_loss_mlp": 1.04184294, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.875706327962939, + "language_loss": 0.75722075, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77947384, + "num_input_tokens_seen": 37079120, + "step": 1716, + "time_per_iteration": 2.6406195163726807 + }, + { + "auxiliary_loss_clip": 0.01158983, + "auxiliary_loss_mlp": 0.01072647, + "balance_loss_clip": 1.05451608, + "balance_loss_mlp": 1.05019951, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 2.1749093302522073, + "language_loss": 0.85292661, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87524295, + "num_input_tokens_seen": 37099710, + "step": 1717, + "time_per_iteration": 2.5754611492156982 + }, + { + "auxiliary_loss_clip": 0.01127489, + "auxiliary_loss_mlp": 0.01070926, + "balance_loss_clip": 1.05767846, + "balance_loss_mlp": 1.0489434, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 2.108945013379632, + "language_loss": 0.8266778, + "learning_rate": 3.943916302775292e-06, + "loss": 0.8486619, + "num_input_tokens_seen": 37117775, + "step": 1718, + "time_per_iteration": 2.543630599975586 + }, + { + "auxiliary_loss_clip": 0.01164306, + "auxiliary_loss_mlp": 0.01054119, + "balance_loss_clip": 1.05519748, + "balance_loss_mlp": 1.03279221, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 2.2999681639479226, + "language_loss": 0.73060048, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75278473, + "num_input_tokens_seen": 37140280, + "step": 1719, + "time_per_iteration": 2.6379857063293457 + }, + { + "auxiliary_loss_clip": 0.01161706, + "auxiliary_loss_mlp": 0.01051204, + "balance_loss_clip": 1.05447817, + "balance_loss_mlp": 1.03131938, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 2.1092099486940343, + "language_loss": 0.9252882, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.94741726, + "num_input_tokens_seen": 37158350, + "step": 1720, + "time_per_iteration": 2.492255926132202 + }, + { + "auxiliary_loss_clip": 0.01134385, + "auxiliary_loss_mlp": 0.0105438, + "balance_loss_clip": 1.05007017, + "balance_loss_mlp": 1.03324366, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 1.732835571101408, + "language_loss": 0.79152822, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81341588, + "num_input_tokens_seen": 37177120, + "step": 1721, + "time_per_iteration": 2.5552775859832764 + }, + { + "auxiliary_loss_clip": 0.01126297, + "auxiliary_loss_mlp": 0.01060872, + "balance_loss_clip": 1.05196166, + "balance_loss_mlp": 1.03586149, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 2.733970394153954, + "language_loss": 0.81011301, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.8319847, + "num_input_tokens_seen": 37195895, + "step": 1722, + "time_per_iteration": 2.5907089710235596 + }, + { + "auxiliary_loss_clip": 0.01059375, + "auxiliary_loss_mlp": 0.01078303, + "balance_loss_clip": 1.02992845, + "balance_loss_mlp": 1.07540655, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9461026808466435, + "language_loss": 0.67147028, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69284701, + "num_input_tokens_seen": 37247270, + "step": 1723, + "time_per_iteration": 2.9297189712524414 + }, + { + "auxiliary_loss_clip": 0.01166948, + "auxiliary_loss_mlp": 0.01055123, + "balance_loss_clip": 1.05324006, + "balance_loss_mlp": 1.03505993, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 3.213318586721877, + "language_loss": 0.78115261, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80337334, + "num_input_tokens_seen": 37265595, + "step": 1724, + "time_per_iteration": 2.4845407009124756 + }, + { + "auxiliary_loss_clip": 0.01145895, + "auxiliary_loss_mlp": 0.01056077, + "balance_loss_clip": 1.05374432, + "balance_loss_mlp": 1.0349772, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 1.8318637742336048, + "language_loss": 0.7496568, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77167654, + "num_input_tokens_seen": 37286660, + "step": 1725, + "time_per_iteration": 2.8051393032073975 + }, + { + "auxiliary_loss_clip": 0.01132911, + "auxiliary_loss_mlp": 0.01056396, + "balance_loss_clip": 1.05178928, + "balance_loss_mlp": 1.03480732, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 2.0853435869543926, + "language_loss": 0.74944401, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77133703, + "num_input_tokens_seen": 37304915, + "step": 1726, + "time_per_iteration": 2.5633206367492676 + }, + { + "auxiliary_loss_clip": 0.01150464, + "auxiliary_loss_mlp": 0.01052084, + "balance_loss_clip": 1.05729389, + "balance_loss_mlp": 1.03067422, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 11.845370406244118, + "language_loss": 0.73643637, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.75846183, + "num_input_tokens_seen": 37325265, + "step": 1727, + "time_per_iteration": 2.5860443115234375 + }, + { + "auxiliary_loss_clip": 0.01157211, + "auxiliary_loss_mlp": 0.01056291, + "balance_loss_clip": 1.05561399, + "balance_loss_mlp": 1.03517938, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.259312824890131, + "language_loss": 0.84974486, + "learning_rate": 3.942996783386422e-06, + "loss": 0.87187982, + "num_input_tokens_seen": 37341650, + "step": 1728, + "time_per_iteration": 2.4807839393615723 + }, + { + "auxiliary_loss_clip": 0.01153481, + "auxiliary_loss_mlp": 0.01050793, + "balance_loss_clip": 1.0540278, + "balance_loss_mlp": 1.02988398, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.5216398377819798, + "language_loss": 0.70835495, + "learning_rate": 3.942904426157406e-06, + "loss": 0.7303977, + "num_input_tokens_seen": 37360270, + "step": 1729, + "time_per_iteration": 2.5593297481536865 + }, + { + "auxiliary_loss_clip": 0.01155551, + "auxiliary_loss_mlp": 0.01053764, + "balance_loss_clip": 1.05540073, + "balance_loss_mlp": 1.03103077, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.6513950258452765, + "language_loss": 0.81322199, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.83531517, + "num_input_tokens_seen": 37375225, + "step": 1730, + "time_per_iteration": 3.9438223838806152 + }, + { + "auxiliary_loss_clip": 0.01084529, + "auxiliary_loss_mlp": 0.01047079, + "balance_loss_clip": 1.05317879, + "balance_loss_mlp": 1.02678919, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 1.7240102171951683, + "language_loss": 0.75962621, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78094232, + "num_input_tokens_seen": 37395165, + "step": 1731, + "time_per_iteration": 2.814145803451538 + }, + { + "auxiliary_loss_clip": 0.01124171, + "auxiliary_loss_mlp": 0.01048416, + "balance_loss_clip": 1.04956865, + "balance_loss_mlp": 1.02874613, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.918544005759015, + "language_loss": 0.8289305, + "learning_rate": 3.9426269124336e-06, + "loss": 0.85065639, + "num_input_tokens_seen": 37414845, + "step": 1732, + "time_per_iteration": 2.947763681411743 + }, + { + "auxiliary_loss_clip": 0.01135663, + "auxiliary_loss_mlp": 0.01049166, + "balance_loss_clip": 1.05447412, + "balance_loss_mlp": 1.02954388, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.2252035222939455, + "language_loss": 0.83000064, + "learning_rate": 3.942534260525104e-06, + "loss": 0.8518489, + "num_input_tokens_seen": 37432490, + "step": 1733, + "time_per_iteration": 3.9466803073883057 + }, + { + "auxiliary_loss_clip": 0.01149287, + "auxiliary_loss_mlp": 0.0105177, + "balance_loss_clip": 1.05274796, + "balance_loss_mlp": 1.0317663, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.165286297854378, + "language_loss": 0.76285768, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78486824, + "num_input_tokens_seen": 37449435, + "step": 1734, + "time_per_iteration": 2.5231964588165283 + }, + { + "auxiliary_loss_clip": 0.01132421, + "auxiliary_loss_mlp": 0.01045411, + "balance_loss_clip": 1.04897571, + "balance_loss_mlp": 1.02575386, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 2.168417789257319, + "language_loss": 0.74906653, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.77084494, + "num_input_tokens_seen": 37469105, + "step": 1735, + "time_per_iteration": 2.5897128582000732 + }, + { + "auxiliary_loss_clip": 0.01160858, + "auxiliary_loss_mlp": 0.01051041, + "balance_loss_clip": 1.0521853, + "balance_loss_mlp": 1.03064489, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.8636968946175627, + "language_loss": 0.78428417, + "learning_rate": 3.94225586284712e-06, + "loss": 0.80640316, + "num_input_tokens_seen": 37490540, + "step": 1736, + "time_per_iteration": 3.9105682373046875 + }, + { + "auxiliary_loss_clip": 0.01161866, + "auxiliary_loss_mlp": 0.01057834, + "balance_loss_clip": 1.05446315, + "balance_loss_mlp": 1.03789067, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.9149999599365735, + "language_loss": 0.70723796, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72943497, + "num_input_tokens_seen": 37511905, + "step": 1737, + "time_per_iteration": 2.5707595348358154 + }, + { + "auxiliary_loss_clip": 0.01146523, + "auxiliary_loss_mlp": 0.01052125, + "balance_loss_clip": 1.04708982, + "balance_loss_mlp": 1.02889144, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.2712310612509525, + "language_loss": 0.81423521, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83622169, + "num_input_tokens_seen": 37533635, + "step": 1738, + "time_per_iteration": 2.5924885272979736 + }, + { + "auxiliary_loss_clip": 0.0117531, + "auxiliary_loss_mlp": 0.01057467, + "balance_loss_clip": 1.05036664, + "balance_loss_mlp": 1.03518713, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 1.8630843064450895, + "language_loss": 0.74904883, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77137661, + "num_input_tokens_seen": 37552035, + "step": 1739, + "time_per_iteration": 3.8633999824523926 + }, + { + "auxiliary_loss_clip": 0.01143867, + "auxiliary_loss_mlp": 0.01052484, + "balance_loss_clip": 1.05389833, + "balance_loss_mlp": 1.03219473, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.6212283588393184, + "language_loss": 0.7723701, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79433364, + "num_input_tokens_seen": 37571540, + "step": 1740, + "time_per_iteration": 2.565011978149414 + }, + { + "auxiliary_loss_clip": 0.01146818, + "auxiliary_loss_mlp": 0.01049176, + "balance_loss_clip": 1.05365014, + "balance_loss_mlp": 1.02881551, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.407472764127049, + "language_loss": 0.8584013, + "learning_rate": 3.941790393753467e-06, + "loss": 0.8803612, + "num_input_tokens_seen": 37588265, + "step": 1741, + "time_per_iteration": 2.5485949516296387 + }, + { + "auxiliary_loss_clip": 0.01151023, + "auxiliary_loss_mlp": 0.01053775, + "balance_loss_clip": 1.05233812, + "balance_loss_mlp": 1.03290153, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 2.623997973886344, + "language_loss": 0.75444317, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77649117, + "num_input_tokens_seen": 37606860, + "step": 1742, + "time_per_iteration": 2.538252353668213 + }, + { + "auxiliary_loss_clip": 0.01125131, + "auxiliary_loss_mlp": 0.01059896, + "balance_loss_clip": 1.05563211, + "balance_loss_mlp": 1.04016685, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 2.987555389720698, + "language_loss": 0.87398732, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89583766, + "num_input_tokens_seen": 37625210, + "step": 1743, + "time_per_iteration": 2.640143632888794 + }, + { + "auxiliary_loss_clip": 0.01132163, + "auxiliary_loss_mlp": 0.01048536, + "balance_loss_clip": 1.05117619, + "balance_loss_mlp": 1.02751994, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.065369371468023, + "language_loss": 0.75548315, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77729023, + "num_input_tokens_seen": 37644110, + "step": 1744, + "time_per_iteration": 2.587905168533325 + }, + { + "auxiliary_loss_clip": 0.01165242, + "auxiliary_loss_mlp": 0.01052565, + "balance_loss_clip": 1.05707669, + "balance_loss_mlp": 1.0333004, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.0435678240331194, + "language_loss": 0.79312658, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81530464, + "num_input_tokens_seen": 37665800, + "step": 1745, + "time_per_iteration": 2.649773359298706 + }, + { + "auxiliary_loss_clip": 0.0117486, + "auxiliary_loss_mlp": 0.01062079, + "balance_loss_clip": 1.05321646, + "balance_loss_mlp": 1.04193258, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.170149219786544, + "language_loss": 0.82769597, + "learning_rate": 3.941323083837794e-06, + "loss": 0.85006535, + "num_input_tokens_seen": 37685095, + "step": 1746, + "time_per_iteration": 2.5292367935180664 + }, + { + "auxiliary_loss_clip": 0.01156022, + "auxiliary_loss_mlp": 0.0106003, + "balance_loss_clip": 1.05622184, + "balance_loss_mlp": 1.04053915, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.7223830177455677, + "language_loss": 0.70099699, + "learning_rate": 3.941229400994971e-06, + "loss": 0.72315753, + "num_input_tokens_seen": 37707445, + "step": 1747, + "time_per_iteration": 2.711099863052368 + }, + { + "auxiliary_loss_clip": 0.01152918, + "auxiliary_loss_mlp": 0.01061526, + "balance_loss_clip": 1.05539751, + "balance_loss_mlp": 1.04097414, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.636072705669235, + "language_loss": 0.84105939, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86320388, + "num_input_tokens_seen": 37728325, + "step": 1748, + "time_per_iteration": 2.6599011421203613 + }, + { + "auxiliary_loss_clip": 0.01171076, + "auxiliary_loss_mlp": 0.0105079, + "balance_loss_clip": 1.0515753, + "balance_loss_mlp": 1.0298568, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.7328280465115748, + "language_loss": 0.7181977, + "learning_rate": 3.941041814478041e-06, + "loss": 0.74041641, + "num_input_tokens_seen": 37748910, + "step": 1749, + "time_per_iteration": 2.4981110095977783 + }, + { + "auxiliary_loss_clip": 0.01155722, + "auxiliary_loss_mlp": 0.0106304, + "balance_loss_clip": 1.05379081, + "balance_loss_mlp": 1.04201162, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 2.1780280474511593, + "language_loss": 0.8222928, + "learning_rate": 3.940947910811047e-06, + "loss": 0.8444804, + "num_input_tokens_seen": 37765745, + "step": 1750, + "time_per_iteration": 2.5448145866394043 + }, + { + "auxiliary_loss_clip": 0.01155562, + "auxiliary_loss_mlp": 0.01061524, + "balance_loss_clip": 1.06383872, + "balance_loss_mlp": 1.04104376, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.775513187735577, + "language_loss": 0.92179334, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94396418, + "num_input_tokens_seen": 37780520, + "step": 1751, + "time_per_iteration": 2.5375444889068604 + }, + { + "auxiliary_loss_clip": 0.0116326, + "auxiliary_loss_mlp": 0.01049581, + "balance_loss_clip": 1.05527735, + "balance_loss_mlp": 1.02929175, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 1.9873912552102726, + "language_loss": 0.79567432, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81780273, + "num_input_tokens_seen": 37799515, + "step": 1752, + "time_per_iteration": 2.510979413986206 + }, + { + "auxiliary_loss_clip": 0.01113114, + "auxiliary_loss_mlp": 0.01068994, + "balance_loss_clip": 1.0522809, + "balance_loss_mlp": 1.04649925, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 1.8895087992437076, + "language_loss": 0.75686044, + "learning_rate": 3.940665758218686e-06, + "loss": 0.77868152, + "num_input_tokens_seen": 37818695, + "step": 1753, + "time_per_iteration": 2.6459739208221436 + }, + { + "auxiliary_loss_clip": 0.01134791, + "auxiliary_loss_mlp": 0.0105846, + "balance_loss_clip": 1.05337143, + "balance_loss_mlp": 1.03646576, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 3.033317272160769, + "language_loss": 0.83883905, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86077154, + "num_input_tokens_seen": 37837860, + "step": 1754, + "time_per_iteration": 2.579953193664551 + }, + { + "auxiliary_loss_clip": 0.0113494, + "auxiliary_loss_mlp": 0.01057352, + "balance_loss_clip": 1.05794764, + "balance_loss_mlp": 1.03541732, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.775360992915459, + "language_loss": 0.68719, + "learning_rate": 3.940477288533302e-06, + "loss": 0.70911288, + "num_input_tokens_seen": 37856260, + "step": 1755, + "time_per_iteration": 2.584129571914673 + }, + { + "auxiliary_loss_clip": 0.0115885, + "auxiliary_loss_mlp": 0.01067593, + "balance_loss_clip": 1.05414724, + "balance_loss_mlp": 1.04613495, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 3.9945891738566868, + "language_loss": 0.76852429, + "learning_rate": 3.940382943314182e-06, + "loss": 0.79078877, + "num_input_tokens_seen": 37876960, + "step": 1756, + "time_per_iteration": 2.5326125621795654 + }, + { + "auxiliary_loss_clip": 0.01178649, + "auxiliary_loss_mlp": 0.01068982, + "balance_loss_clip": 1.05550528, + "balance_loss_mlp": 1.04847801, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.764921852785999, + "language_loss": 0.80173922, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82421553, + "num_input_tokens_seen": 37897070, + "step": 1757, + "time_per_iteration": 2.476378917694092 + }, + { + "auxiliary_loss_clip": 0.01146942, + "auxiliary_loss_mlp": 0.01060747, + "balance_loss_clip": 1.05320334, + "balance_loss_mlp": 1.04004073, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.6585433724236767, + "language_loss": 0.78746974, + "learning_rate": 3.940194032140976e-06, + "loss": 0.80954671, + "num_input_tokens_seen": 37923635, + "step": 1758, + "time_per_iteration": 2.8608648777008057 + }, + { + "auxiliary_loss_clip": 0.01162769, + "auxiliary_loss_mlp": 0.01053312, + "balance_loss_clip": 1.0574826, + "balance_loss_mlp": 1.03198564, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 1.8110435378713954, + "language_loss": 0.91936111, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94152188, + "num_input_tokens_seen": 37942650, + "step": 1759, + "time_per_iteration": 2.544886589050293 + }, + { + "auxiliary_loss_clip": 0.01154576, + "auxiliary_loss_mlp": 0.01060638, + "balance_loss_clip": 1.05604362, + "balance_loss_mlp": 1.03797674, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 3.9262514160582693, + "language_loss": 0.77181423, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79396641, + "num_input_tokens_seen": 37960660, + "step": 1760, + "time_per_iteration": 2.530640125274658 + }, + { + "auxiliary_loss_clip": 0.01158429, + "auxiliary_loss_mlp": 0.01059902, + "balance_loss_clip": 1.05340552, + "balance_loss_mlp": 1.03718102, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.326696521662957, + "language_loss": 0.88962048, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91180384, + "num_input_tokens_seen": 37978625, + "step": 1761, + "time_per_iteration": 2.5215654373168945 + }, + { + "auxiliary_loss_clip": 0.01108339, + "auxiliary_loss_mlp": 0.0105114, + "balance_loss_clip": 1.0517664, + "balance_loss_mlp": 1.45875061, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.0151983851540787, + "language_loss": 0.78292489, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.80451965, + "num_input_tokens_seen": 38000005, + "step": 1762, + "time_per_iteration": 2.696902275085449 + }, + { + "auxiliary_loss_clip": 0.01068063, + "auxiliary_loss_mlp": 0.01010323, + "balance_loss_clip": 1.04267836, + "balance_loss_mlp": 1.00703323, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 1.3483662875667615, + "language_loss": 0.60488391, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62566769, + "num_input_tokens_seen": 38066165, + "step": 1763, + "time_per_iteration": 3.306469202041626 + }, + { + "auxiliary_loss_clip": 0.0115544, + "auxiliary_loss_mlp": 0.0104598, + "balance_loss_clip": 1.05570102, + "balance_loss_mlp": 1.02528536, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 1.7054940994568681, + "language_loss": 0.80317086, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82518506, + "num_input_tokens_seen": 38086150, + "step": 1764, + "time_per_iteration": 2.552823781967163 + }, + { + "auxiliary_loss_clip": 0.01136143, + "auxiliary_loss_mlp": 0.01060363, + "balance_loss_clip": 1.0511446, + "balance_loss_mlp": 1.03663993, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.6537239392430785, + "language_loss": 0.80056667, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.8225317, + "num_input_tokens_seen": 38104205, + "step": 1765, + "time_per_iteration": 2.5708858966827393 + }, + { + "auxiliary_loss_clip": 0.01159118, + "auxiliary_loss_mlp": 0.01053861, + "balance_loss_clip": 1.05231571, + "balance_loss_mlp": 1.03228474, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.984565709032615, + "language_loss": 0.76713896, + "learning_rate": 3.939435444841306e-06, + "loss": 0.78926879, + "num_input_tokens_seen": 38122005, + "step": 1766, + "time_per_iteration": 2.503915548324585 + }, + { + "auxiliary_loss_clip": 0.01176376, + "auxiliary_loss_mlp": 0.01058708, + "balance_loss_clip": 1.05312979, + "balance_loss_mlp": 1.03639162, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 2.088663084327038, + "language_loss": 0.77353531, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79588616, + "num_input_tokens_seen": 38143365, + "step": 1767, + "time_per_iteration": 2.5584492683410645 + }, + { + "auxiliary_loss_clip": 0.01003662, + "auxiliary_loss_mlp": 0.01009336, + "balance_loss_clip": 1.03118181, + "balance_loss_mlp": 1.00604594, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6808700566882854, + "language_loss": 0.57865024, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59878027, + "num_input_tokens_seen": 38210035, + "step": 1768, + "time_per_iteration": 4.984228134155273 + }, + { + "auxiliary_loss_clip": 0.01138102, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_clip": 1.05171156, + "balance_loss_mlp": 1.02105808, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 2.0203134394279396, + "language_loss": 0.8646363, + "learning_rate": 3.939149761035749e-06, + "loss": 0.88642883, + "num_input_tokens_seen": 38231230, + "step": 1769, + "time_per_iteration": 3.202700614929199 + }, + { + "auxiliary_loss_clip": 0.01135906, + "auxiliary_loss_mlp": 0.00974813, + "balance_loss_clip": 1.05130911, + "balance_loss_mlp": 1.31261635, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.818069600040015, + "language_loss": 0.61748302, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.63859022, + "num_input_tokens_seen": 38253890, + "step": 1770, + "time_per_iteration": 2.7175233364105225 + }, + { + "auxiliary_loss_clip": 0.01075149, + "auxiliary_loss_mlp": 0.01004877, + "balance_loss_clip": 1.03438854, + "balance_loss_mlp": 1.00177717, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8822162008919421, + "language_loss": 0.57055509, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59135532, + "num_input_tokens_seen": 38304290, + "step": 1771, + "time_per_iteration": 3.024432897567749 + }, + { + "auxiliary_loss_clip": 0.01142526, + "auxiliary_loss_mlp": 0.01058026, + "balance_loss_clip": 1.05510736, + "balance_loss_mlp": 1.03724825, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.8842535519936046, + "language_loss": 0.88483286, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90683842, + "num_input_tokens_seen": 38324725, + "step": 1772, + "time_per_iteration": 4.080848455429077 + }, + { + "auxiliary_loss_clip": 0.01180133, + "auxiliary_loss_mlp": 0.01055365, + "balance_loss_clip": 1.05284476, + "balance_loss_mlp": 1.03238189, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.4944421194177697, + "language_loss": 0.7587527, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78110766, + "num_input_tokens_seen": 38340735, + "step": 1773, + "time_per_iteration": 2.4679439067840576 + }, + { + "auxiliary_loss_clip": 0.01125118, + "auxiliary_loss_mlp": 0.01062614, + "balance_loss_clip": 1.0542767, + "balance_loss_mlp": 1.03858113, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.1824496372538023, + "language_loss": 0.82977128, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85164857, + "num_input_tokens_seen": 38361315, + "step": 1774, + "time_per_iteration": 4.039280414581299 + }, + { + "auxiliary_loss_clip": 0.01149524, + "auxiliary_loss_mlp": 0.00953471, + "balance_loss_clip": 1.05328441, + "balance_loss_mlp": 1.28024817, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.8293273150900915, + "language_loss": 0.76345521, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78448522, + "num_input_tokens_seen": 38377425, + "step": 1775, + "time_per_iteration": 2.5170490741729736 + }, + { + "auxiliary_loss_clip": 0.01080124, + "auxiliary_loss_mlp": 0.01008592, + "balance_loss_clip": 1.0308938, + "balance_loss_mlp": 1.00549257, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8322799037707013, + "language_loss": 0.57461029, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59549749, + "num_input_tokens_seen": 38440275, + "step": 1776, + "time_per_iteration": 3.1108837127685547 + }, + { + "auxiliary_loss_clip": 0.01147282, + "auxiliary_loss_mlp": 0.01066076, + "balance_loss_clip": 1.0519371, + "balance_loss_mlp": 1.04153109, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.8658450700407851, + "language_loss": 0.83312041, + "learning_rate": 3.938384702378727e-06, + "loss": 0.85525399, + "num_input_tokens_seen": 38461820, + "step": 1777, + "time_per_iteration": 4.040153741836548 + }, + { + "auxiliary_loss_clip": 0.01114431, + "auxiliary_loss_mlp": 0.0093014, + "balance_loss_clip": 1.05465508, + "balance_loss_mlp": 1.23516345, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 1.8128607428067909, + "language_loss": 0.87356329, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89400899, + "num_input_tokens_seen": 38482235, + "step": 1778, + "time_per_iteration": 2.647101879119873 + }, + { + "auxiliary_loss_clip": 0.01134409, + "auxiliary_loss_mlp": 0.00932054, + "balance_loss_clip": 1.05395174, + "balance_loss_mlp": 1.24022281, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 1.9887797655935426, + "language_loss": 0.84043854, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86110318, + "num_input_tokens_seen": 38500690, + "step": 1779, + "time_per_iteration": 2.6101789474487305 + }, + { + "auxiliary_loss_clip": 0.01138533, + "auxiliary_loss_mlp": 0.00925509, + "balance_loss_clip": 1.05352974, + "balance_loss_mlp": 1.22474432, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 2.002449799199395, + "language_loss": 0.67376244, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69440281, + "num_input_tokens_seen": 38518405, + "step": 1780, + "time_per_iteration": 2.549173593521118 + }, + { + "auxiliary_loss_clip": 0.01162453, + "auxiliary_loss_mlp": 0.0104931, + "balance_loss_clip": 1.05594707, + "balance_loss_mlp": 1.02832913, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.8166631777892346, + "language_loss": 0.91754705, + "learning_rate": 3.938000408844265e-06, + "loss": 0.93966466, + "num_input_tokens_seen": 38535060, + "step": 1781, + "time_per_iteration": 2.4879205226898193 + }, + { + "auxiliary_loss_clip": 0.01125462, + "auxiliary_loss_mlp": 0.01054765, + "balance_loss_clip": 1.05047202, + "balance_loss_mlp": 1.03427243, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 1.936880045850461, + "language_loss": 0.79316783, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81497014, + "num_input_tokens_seen": 38552855, + "step": 1782, + "time_per_iteration": 2.5683634281158447 + }, + { + "auxiliary_loss_clip": 0.01156334, + "auxiliary_loss_mlp": 0.01052677, + "balance_loss_clip": 1.05509377, + "balance_loss_mlp": 1.03154135, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 2.103230605396249, + "language_loss": 0.78852177, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81061196, + "num_input_tokens_seen": 38570075, + "step": 1783, + "time_per_iteration": 2.5359129905700684 + }, + { + "auxiliary_loss_clip": 0.01157104, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_clip": 1.05217326, + "balance_loss_mlp": 1.03077102, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 2.014188304373719, + "language_loss": 0.86534739, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88744056, + "num_input_tokens_seen": 38587970, + "step": 1784, + "time_per_iteration": 2.548200845718384 + }, + { + "auxiliary_loss_clip": 0.01147063, + "auxiliary_loss_mlp": 0.01054039, + "balance_loss_clip": 1.05259371, + "balance_loss_mlp": 1.03253376, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 2.3463612640585962, + "language_loss": 1.00922799, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03123903, + "num_input_tokens_seen": 38605840, + "step": 1785, + "time_per_iteration": 2.5721778869628906 + }, + { + "auxiliary_loss_clip": 0.01164347, + "auxiliary_loss_mlp": 0.0105859, + "balance_loss_clip": 1.05583596, + "balance_loss_mlp": 1.03771615, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.4212215929391596, + "language_loss": 0.84679282, + "learning_rate": 3.937518388447339e-06, + "loss": 0.86902225, + "num_input_tokens_seen": 38627070, + "step": 1786, + "time_per_iteration": 2.542224168777466 + }, + { + "auxiliary_loss_clip": 0.0117655, + "auxiliary_loss_mlp": 0.01054945, + "balance_loss_clip": 1.05225635, + "balance_loss_mlp": 1.03240252, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.9138394717432452, + "language_loss": 0.78682321, + "learning_rate": 3.937421763940642e-06, + "loss": 0.80913818, + "num_input_tokens_seen": 38645840, + "step": 1787, + "time_per_iteration": 2.4811363220214844 + }, + { + "auxiliary_loss_clip": 0.01167691, + "auxiliary_loss_mlp": 0.01045219, + "balance_loss_clip": 1.05416906, + "balance_loss_mlp": 1.02411902, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 2.2144798946887603, + "language_loss": 0.82744288, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84957194, + "num_input_tokens_seen": 38664770, + "step": 1788, + "time_per_iteration": 2.486591339111328 + }, + { + "auxiliary_loss_clip": 0.01174604, + "auxiliary_loss_mlp": 0.0105649, + "balance_loss_clip": 1.05280089, + "balance_loss_mlp": 1.03646326, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 1.8472713126561915, + "language_loss": 0.77797037, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80028129, + "num_input_tokens_seen": 38683865, + "step": 1789, + "time_per_iteration": 2.4798715114593506 + }, + { + "auxiliary_loss_clip": 0.01177027, + "auxiliary_loss_mlp": 0.01057915, + "balance_loss_clip": 1.05381703, + "balance_loss_mlp": 1.03476453, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 3.184103868463773, + "language_loss": 0.74557352, + "learning_rate": 3.937131449631859e-06, + "loss": 0.767923, + "num_input_tokens_seen": 38702485, + "step": 1790, + "time_per_iteration": 2.511636257171631 + }, + { + "auxiliary_loss_clip": 0.01169726, + "auxiliary_loss_mlp": 0.00938638, + "balance_loss_clip": 1.05489826, + "balance_loss_mlp": 1.24680865, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.4968458708852888, + "language_loss": 0.787543, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80862665, + "num_input_tokens_seen": 38722475, + "step": 1791, + "time_per_iteration": 2.568056344985962 + }, + { + "auxiliary_loss_clip": 0.01133596, + "auxiliary_loss_mlp": 0.01063447, + "balance_loss_clip": 1.05383611, + "balance_loss_mlp": 1.04086852, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 2.200417121974988, + "language_loss": 0.70931971, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73129016, + "num_input_tokens_seen": 38743285, + "step": 1792, + "time_per_iteration": 2.600085735321045 + }, + { + "auxiliary_loss_clip": 0.01142262, + "auxiliary_loss_mlp": 0.01045445, + "balance_loss_clip": 1.04931021, + "balance_loss_mlp": 1.02299786, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 2.1000180608698455, + "language_loss": 0.76260114, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78447825, + "num_input_tokens_seen": 38763035, + "step": 1793, + "time_per_iteration": 2.572072744369507 + }, + { + "auxiliary_loss_clip": 0.01116791, + "auxiliary_loss_mlp": 0.01063843, + "balance_loss_clip": 1.05621743, + "balance_loss_mlp": 1.04195666, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.6480251302865685, + "language_loss": 0.84855539, + "learning_rate": 3.936743335516936e-06, + "loss": 0.8703618, + "num_input_tokens_seen": 38784900, + "step": 1794, + "time_per_iteration": 2.609758138656616 + }, + { + "auxiliary_loss_clip": 0.0111417, + "auxiliary_loss_mlp": 0.01050221, + "balance_loss_clip": 1.04893255, + "balance_loss_mlp": 1.0270226, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.7379188196477193, + "language_loss": 0.74893898, + "learning_rate": 3.936646123375246e-06, + "loss": 0.77058291, + "num_input_tokens_seen": 38804695, + "step": 1795, + "time_per_iteration": 2.638037919998169 + }, + { + "auxiliary_loss_clip": 0.01120903, + "auxiliary_loss_mlp": 0.01055481, + "balance_loss_clip": 1.04834831, + "balance_loss_mlp": 1.03340387, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.871285409630518, + "language_loss": 0.816194, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83795786, + "num_input_tokens_seen": 38822395, + "step": 1796, + "time_per_iteration": 2.5850188732147217 + }, + { + "auxiliary_loss_clip": 0.01141756, + "auxiliary_loss_mlp": 0.01079207, + "balance_loss_clip": 1.05691266, + "balance_loss_mlp": 1.05407786, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 3.419364175750862, + "language_loss": 0.73908269, + "learning_rate": 3.936451478782111e-06, + "loss": 0.76129234, + "num_input_tokens_seen": 38839865, + "step": 1797, + "time_per_iteration": 2.557633638381958 + }, + { + "auxiliary_loss_clip": 0.01154181, + "auxiliary_loss_mlp": 0.01048792, + "balance_loss_clip": 1.04969335, + "balance_loss_mlp": 1.02856231, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 2.0939263054375337, + "language_loss": 0.81682545, + "learning_rate": 3.936354046338046e-06, + "loss": 0.83885515, + "num_input_tokens_seen": 38857300, + "step": 1798, + "time_per_iteration": 2.513463020324707 + }, + { + "auxiliary_loss_clip": 0.01138673, + "auxiliary_loss_mlp": 0.01051201, + "balance_loss_clip": 1.05210626, + "balance_loss_mlp": 1.02864647, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 3.030944472805612, + "language_loss": 0.85908282, + "learning_rate": 3.936256540467242e-06, + "loss": 0.88098145, + "num_input_tokens_seen": 38874960, + "step": 1799, + "time_per_iteration": 2.5537757873535156 + }, + { + "auxiliary_loss_clip": 0.01150948, + "auxiliary_loss_mlp": 0.01060112, + "balance_loss_clip": 1.0533489, + "balance_loss_mlp": 1.03945303, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 2.288696189098336, + "language_loss": 0.77842802, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.80053866, + "num_input_tokens_seen": 38893610, + "step": 1800, + "time_per_iteration": 2.525588274002075 + }, + { + "auxiliary_loss_clip": 0.01174462, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_clip": 1.0539434, + "balance_loss_mlp": 1.02729225, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 1.8514088340532386, + "language_loss": 0.73017943, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.7523967, + "num_input_tokens_seen": 38913485, + "step": 1801, + "time_per_iteration": 2.525867462158203 + }, + { + "auxiliary_loss_clip": 0.0118406, + "auxiliary_loss_mlp": 0.01048449, + "balance_loss_clip": 1.05555606, + "balance_loss_mlp": 1.02819586, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 2.010276646442789, + "language_loss": 0.6619364, + "learning_rate": 3.935963582331381e-06, + "loss": 0.6842615, + "num_input_tokens_seen": 38935650, + "step": 1802, + "time_per_iteration": 2.537238121032715 + }, + { + "auxiliary_loss_clip": 0.01155606, + "auxiliary_loss_mlp": 0.01060518, + "balance_loss_clip": 1.05577672, + "balance_loss_mlp": 1.03880954, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.8289527022952365, + "language_loss": 0.81348848, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83564973, + "num_input_tokens_seen": 38954130, + "step": 1803, + "time_per_iteration": 2.522123098373413 + }, + { + "auxiliary_loss_clip": 0.01157875, + "auxiliary_loss_mlp": 0.0105661, + "balance_loss_clip": 1.05198944, + "balance_loss_mlp": 1.03433013, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 1.7210735345813057, + "language_loss": 0.91151452, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93365937, + "num_input_tokens_seen": 38972905, + "step": 1804, + "time_per_iteration": 2.494198799133301 + }, + { + "auxiliary_loss_clip": 0.01127104, + "auxiliary_loss_mlp": 0.01052659, + "balance_loss_clip": 1.04971588, + "balance_loss_mlp": 1.03033113, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 2.0311915269725773, + "language_loss": 0.76216924, + "learning_rate": 3.935669963488139e-06, + "loss": 0.7839669, + "num_input_tokens_seen": 38993255, + "step": 1805, + "time_per_iteration": 2.656996488571167 + }, + { + "auxiliary_loss_clip": 0.01146295, + "auxiliary_loss_mlp": 0.01046556, + "balance_loss_clip": 1.0540185, + "balance_loss_mlp": 1.02648091, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 1.8972499773626876, + "language_loss": 0.86001384, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88194239, + "num_input_tokens_seen": 39012610, + "step": 1806, + "time_per_iteration": 2.5956003665924072 + }, + { + "auxiliary_loss_clip": 0.01165673, + "auxiliary_loss_mlp": 0.00864615, + "balance_loss_clip": 1.05128169, + "balance_loss_mlp": 1.0969882, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 2.1070937393849083, + "language_loss": 0.81135738, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.83166021, + "num_input_tokens_seen": 39030120, + "step": 1807, + "time_per_iteration": 3.880457878112793 + }, + { + "auxiliary_loss_clip": 0.01135429, + "auxiliary_loss_mlp": 0.01050362, + "balance_loss_clip": 1.05027163, + "balance_loss_mlp": 1.03102648, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.8669969049392539, + "language_loss": 0.79192108, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.813779, + "num_input_tokens_seen": 39049875, + "step": 1808, + "time_per_iteration": 2.599863290786743 + }, + { + "auxiliary_loss_clip": 0.01152373, + "auxiliary_loss_mlp": 0.01052007, + "balance_loss_clip": 1.05400634, + "balance_loss_mlp": 1.03146768, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.7839947534501197, + "language_loss": 0.78929681, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81134057, + "num_input_tokens_seen": 39068935, + "step": 1809, + "time_per_iteration": 2.5424091815948486 + }, + { + "auxiliary_loss_clip": 0.01175535, + "auxiliary_loss_mlp": 0.01051487, + "balance_loss_clip": 1.05220628, + "balance_loss_mlp": 1.03074503, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.5145839146064843, + "language_loss": 0.85074115, + "learning_rate": 3.935179130783046e-06, + "loss": 0.87301135, + "num_input_tokens_seen": 39087370, + "step": 1810, + "time_per_iteration": 3.8687565326690674 + }, + { + "auxiliary_loss_clip": 0.01127238, + "auxiliary_loss_mlp": 0.01054796, + "balance_loss_clip": 1.05304861, + "balance_loss_mlp": 1.03158581, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.8160795807416954, + "language_loss": 0.63268286, + "learning_rate": 3.935080744080564e-06, + "loss": 0.65450323, + "num_input_tokens_seen": 39106635, + "step": 1811, + "time_per_iteration": 2.623990058898926 + }, + { + "auxiliary_loss_clip": 0.01150654, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.05331945, + "balance_loss_mlp": 1.02497077, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 2.0305890106447784, + "language_loss": 0.74137306, + "learning_rate": 3.934982283999626e-06, + "loss": 0.7633357, + "num_input_tokens_seen": 39126335, + "step": 1812, + "time_per_iteration": 2.580094814300537 + }, + { + "auxiliary_loss_clip": 0.0114117, + "auxiliary_loss_mlp": 0.0104843, + "balance_loss_clip": 1.0527339, + "balance_loss_mlp": 1.02694893, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 2.0903664712275773, + "language_loss": 0.72594631, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74784231, + "num_input_tokens_seen": 39144820, + "step": 1813, + "time_per_iteration": 3.9049789905548096 + }, + { + "auxiliary_loss_clip": 0.0114043, + "auxiliary_loss_mlp": 0.01047362, + "balance_loss_clip": 1.05502391, + "balance_loss_mlp": 1.02654862, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 1.8458425012945472, + "language_loss": 0.82677799, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84865588, + "num_input_tokens_seen": 39165945, + "step": 1814, + "time_per_iteration": 2.5530643463134766 + }, + { + "auxiliary_loss_clip": 0.01139016, + "auxiliary_loss_mlp": 0.0105091, + "balance_loss_clip": 1.05373335, + "balance_loss_mlp": 1.03052509, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.4636313301600823, + "language_loss": 0.83993071, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86183, + "num_input_tokens_seen": 39183520, + "step": 1815, + "time_per_iteration": 2.587502956390381 + }, + { + "auxiliary_loss_clip": 0.01142148, + "auxiliary_loss_mlp": 0.01050636, + "balance_loss_clip": 1.05219221, + "balance_loss_mlp": 1.02851069, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.5815783525169733, + "language_loss": 0.71825099, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.74017882, + "num_input_tokens_seen": 39201190, + "step": 1816, + "time_per_iteration": 2.5558204650878906 + }, + { + "auxiliary_loss_clip": 0.01163107, + "auxiliary_loss_mlp": 0.01057687, + "balance_loss_clip": 1.05236435, + "balance_loss_mlp": 1.0360986, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 3.6250761178373274, + "language_loss": 0.72785282, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75006074, + "num_input_tokens_seen": 39221210, + "step": 1817, + "time_per_iteration": 3.9970643520355225 + }, + { + "auxiliary_loss_clip": 0.0111492, + "auxiliary_loss_mlp": 0.01049322, + "balance_loss_clip": 1.05173159, + "balance_loss_mlp": 1.0277456, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.571266706376158, + "language_loss": 0.67180085, + "learning_rate": 3.934389982775706e-06, + "loss": 0.6934433, + "num_input_tokens_seen": 39242025, + "step": 1818, + "time_per_iteration": 2.7400951385498047 + }, + { + "auxiliary_loss_clip": 0.01151881, + "auxiliary_loss_mlp": 0.01058172, + "balance_loss_clip": 1.05681348, + "balance_loss_mlp": 1.03649926, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.20821593452066, + "language_loss": 0.73447675, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75657725, + "num_input_tokens_seen": 39259870, + "step": 1819, + "time_per_iteration": 2.6866824626922607 + }, + { + "auxiliary_loss_clip": 0.01141015, + "auxiliary_loss_mlp": 0.00914971, + "balance_loss_clip": 1.05416584, + "balance_loss_mlp": 1.19809675, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 2.6512341078368706, + "language_loss": 0.74268186, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76324171, + "num_input_tokens_seen": 39278500, + "step": 1820, + "time_per_iteration": 2.5806283950805664 + }, + { + "auxiliary_loss_clip": 0.01177392, + "auxiliary_loss_mlp": 0.01054841, + "balance_loss_clip": 1.05644917, + "balance_loss_mlp": 1.03300142, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.736113311684918, + "language_loss": 0.82928252, + "learning_rate": 3.934092841857642e-06, + "loss": 0.85160482, + "num_input_tokens_seen": 39294800, + "step": 1821, + "time_per_iteration": 2.451608419418335 + }, + { + "auxiliary_loss_clip": 0.01146086, + "auxiliary_loss_mlp": 0.01051777, + "balance_loss_clip": 1.05366707, + "balance_loss_mlp": 1.03183317, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 2.9186865837401084, + "language_loss": 0.75937796, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78135657, + "num_input_tokens_seen": 39314625, + "step": 1822, + "time_per_iteration": 2.6391220092773438 + }, + { + "auxiliary_loss_clip": 0.01141527, + "auxiliary_loss_mlp": 0.01050522, + "balance_loss_clip": 1.04971313, + "balance_loss_mlp": 1.0306859, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.7272316106548256, + "language_loss": 0.80167818, + "learning_rate": 3.933894381201034e-06, + "loss": 0.82359874, + "num_input_tokens_seen": 39336465, + "step": 1823, + "time_per_iteration": 2.645444393157959 + }, + { + "auxiliary_loss_clip": 0.01148588, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.05550957, + "balance_loss_mlp": 1.02567434, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.4515807839594805, + "language_loss": 0.79629862, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81824225, + "num_input_tokens_seen": 39357930, + "step": 1824, + "time_per_iteration": 2.5845563411712646 + }, + { + "auxiliary_loss_clip": 0.01140257, + "auxiliary_loss_mlp": 0.01058076, + "balance_loss_clip": 1.05330133, + "balance_loss_mlp": 1.03807282, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 1.887021416441878, + "language_loss": 0.878443, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90042633, + "num_input_tokens_seen": 39376380, + "step": 1825, + "time_per_iteration": 2.547545909881592 + }, + { + "auxiliary_loss_clip": 0.01126402, + "auxiliary_loss_mlp": 0.01055808, + "balance_loss_clip": 1.04952133, + "balance_loss_mlp": 1.03562605, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.82050709045123, + "language_loss": 0.76530933, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78713137, + "num_input_tokens_seen": 39399935, + "step": 1826, + "time_per_iteration": 2.747860908508301 + }, + { + "auxiliary_loss_clip": 0.01084699, + "auxiliary_loss_mlp": 0.01045711, + "balance_loss_clip": 1.04171455, + "balance_loss_mlp": 1.04203939, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8373417760297183, + "language_loss": 0.54928392, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57058799, + "num_input_tokens_seen": 39460685, + "step": 1827, + "time_per_iteration": 3.135824203491211 + }, + { + "auxiliary_loss_clip": 0.01096369, + "auxiliary_loss_mlp": 0.01024397, + "balance_loss_clip": 1.04514062, + "balance_loss_mlp": 1.02060568, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7343750495826956, + "language_loss": 0.55379772, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57500529, + "num_input_tokens_seen": 39524765, + "step": 1828, + "time_per_iteration": 3.1334316730499268 + }, + { + "auxiliary_loss_clip": 0.01154213, + "auxiliary_loss_mlp": 0.01051168, + "balance_loss_clip": 1.05446637, + "balance_loss_mlp": 1.0300082, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.383963141557744, + "language_loss": 0.840554, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86260784, + "num_input_tokens_seen": 39543640, + "step": 1829, + "time_per_iteration": 2.577261447906494 + }, + { + "auxiliary_loss_clip": 0.01124583, + "auxiliary_loss_mlp": 0.0104636, + "balance_loss_clip": 1.05349362, + "balance_loss_mlp": 1.02508163, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 1.968336757415494, + "language_loss": 0.88815951, + "learning_rate": 3.933197459096614e-06, + "loss": 0.90986896, + "num_input_tokens_seen": 39567525, + "step": 1830, + "time_per_iteration": 2.8070120811462402 + }, + { + "auxiliary_loss_clip": 0.0106435, + "auxiliary_loss_mlp": 0.01008486, + "balance_loss_clip": 1.03776455, + "balance_loss_mlp": 1.00474286, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6900487421878563, + "language_loss": 0.55497819, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57570654, + "num_input_tokens_seen": 39628470, + "step": 1831, + "time_per_iteration": 3.13627290725708 + }, + { + "auxiliary_loss_clip": 0.0115311, + "auxiliary_loss_mlp": 0.01065887, + "balance_loss_clip": 1.05626249, + "balance_loss_mlp": 1.04369032, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.3406376724747484, + "language_loss": 0.90758264, + "learning_rate": 3.932997678675282e-06, + "loss": 0.92977256, + "num_input_tokens_seen": 39646670, + "step": 1832, + "time_per_iteration": 2.5892183780670166 + }, + { + "auxiliary_loss_clip": 0.01088983, + "auxiliary_loss_mlp": 0.01015858, + "balance_loss_clip": 1.04798102, + "balance_loss_mlp": 1.01197195, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7161625606429809, + "language_loss": 0.59867024, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61971867, + "num_input_tokens_seen": 39712915, + "step": 1833, + "time_per_iteration": 3.1634912490844727 + }, + { + "auxiliary_loss_clip": 0.01163809, + "auxiliary_loss_mlp": 0.01049993, + "balance_loss_clip": 1.05105364, + "balance_loss_mlp": 1.02854776, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.595673732252315, + "language_loss": 0.80191374, + "learning_rate": 3.93279760505609e-06, + "loss": 0.8240518, + "num_input_tokens_seen": 39730650, + "step": 1834, + "time_per_iteration": 2.5062639713287354 + }, + { + "auxiliary_loss_clip": 0.01134198, + "auxiliary_loss_mlp": 0.01059757, + "balance_loss_clip": 1.05248809, + "balance_loss_mlp": 1.03659463, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.432153527348238, + "language_loss": 0.90856171, + "learning_rate": 3.932697458306779e-06, + "loss": 0.93050128, + "num_input_tokens_seen": 39751065, + "step": 1835, + "time_per_iteration": 2.6446104049682617 + }, + { + "auxiliary_loss_clip": 0.01127064, + "auxiliary_loss_mlp": 0.01062591, + "balance_loss_clip": 1.05323362, + "balance_loss_mlp": 1.03858209, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.2182426790260976, + "language_loss": 0.63614547, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65804201, + "num_input_tokens_seen": 39769245, + "step": 1836, + "time_per_iteration": 2.57279372215271 + }, + { + "auxiliary_loss_clip": 0.01135004, + "auxiliary_loss_mlp": 0.01054659, + "balance_loss_clip": 1.05108941, + "balance_loss_mlp": 1.03420293, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 2.1327394323874866, + "language_loss": 0.72571188, + "learning_rate": 3.932496944947711e-06, + "loss": 0.74760854, + "num_input_tokens_seen": 39790830, + "step": 1837, + "time_per_iteration": 2.700165271759033 + }, + { + "auxiliary_loss_clip": 0.01159941, + "auxiliary_loss_mlp": 0.01060546, + "balance_loss_clip": 1.05262589, + "balance_loss_mlp": 1.0398159, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.294818244377463, + "language_loss": 0.78771389, + "learning_rate": 3.93239657834556e-06, + "loss": 0.80991876, + "num_input_tokens_seen": 39809475, + "step": 1838, + "time_per_iteration": 2.5129714012145996 + }, + { + "auxiliary_loss_clip": 0.01149173, + "auxiliary_loss_mlp": 0.01062505, + "balance_loss_clip": 1.05377603, + "balance_loss_mlp": 1.04201281, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 2.2882681140596763, + "language_loss": 0.71975917, + "learning_rate": 3.932296138466736e-06, + "loss": 0.74187589, + "num_input_tokens_seen": 39826355, + "step": 1839, + "time_per_iteration": 2.5251216888427734 + }, + { + "auxiliary_loss_clip": 0.01181104, + "auxiliary_loss_mlp": 0.0089095, + "balance_loss_clip": 1.05805528, + "balance_loss_mlp": 1.14669549, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.1813690172982936, + "language_loss": 0.7860378, + "learning_rate": 3.93219562531505e-06, + "loss": 0.80675828, + "num_input_tokens_seen": 39845335, + "step": 1840, + "time_per_iteration": 2.4960312843322754 + }, + { + "auxiliary_loss_clip": 0.01152629, + "auxiliary_loss_mlp": 0.01051731, + "balance_loss_clip": 1.05122828, + "balance_loss_mlp": 1.02953422, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 1.6216683001745351, + "language_loss": 0.88125634, + "learning_rate": 3.932095038894311e-06, + "loss": 0.90329987, + "num_input_tokens_seen": 39865065, + "step": 1841, + "time_per_iteration": 2.554553747177124 + }, + { + "auxiliary_loss_clip": 0.0112823, + "auxiliary_loss_mlp": 0.01054659, + "balance_loss_clip": 1.05335593, + "balance_loss_mlp": 1.03348708, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 1.9926821948323934, + "language_loss": 0.90484089, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92666978, + "num_input_tokens_seen": 39882780, + "step": 1842, + "time_per_iteration": 2.540863513946533 + }, + { + "auxiliary_loss_clip": 0.01149516, + "auxiliary_loss_mlp": 0.01051408, + "balance_loss_clip": 1.04979372, + "balance_loss_mlp": 1.03201246, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.477460914049156, + "language_loss": 0.86271918, + "learning_rate": 3.931893646260937e-06, + "loss": 0.88472843, + "num_input_tokens_seen": 39900295, + "step": 1843, + "time_per_iteration": 2.529961585998535 + }, + { + "auxiliary_loss_clip": 0.01118983, + "auxiliary_loss_mlp": 0.00865283, + "balance_loss_clip": 1.05424786, + "balance_loss_mlp": 1.09868824, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.5962430450048317, + "language_loss": 0.74646568, + "learning_rate": 3.931792840055941e-06, + "loss": 0.76630831, + "num_input_tokens_seen": 39922075, + "step": 1844, + "time_per_iteration": 2.7359042167663574 + }, + { + "auxiliary_loss_clip": 0.01173984, + "auxiliary_loss_mlp": 0.01051315, + "balance_loss_clip": 1.05293655, + "balance_loss_mlp": 1.0289638, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 1.9911394909815205, + "language_loss": 0.75474167, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77699471, + "num_input_tokens_seen": 39940115, + "step": 1845, + "time_per_iteration": 3.963844060897827 + }, + { + "auxiliary_loss_clip": 0.01146522, + "auxiliary_loss_mlp": 0.01053341, + "balance_loss_clip": 1.05106926, + "balance_loss_mlp": 1.03247929, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.5547304070784076, + "language_loss": 0.76111203, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.78311068, + "num_input_tokens_seen": 39959920, + "step": 1846, + "time_per_iteration": 2.553382158279419 + }, + { + "auxiliary_loss_clip": 0.0116525, + "auxiliary_loss_mlp": 0.01051166, + "balance_loss_clip": 1.05264819, + "balance_loss_mlp": 1.03034043, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.6973114196975447, + "language_loss": 0.86378586, + "learning_rate": 3.931489981933584e-06, + "loss": 0.88594997, + "num_input_tokens_seen": 39974755, + "step": 1847, + "time_per_iteration": 2.4833598136901855 + }, + { + "auxiliary_loss_clip": 0.01172377, + "auxiliary_loss_mlp": 0.0104702, + "balance_loss_clip": 1.05012369, + "balance_loss_mlp": 1.0250622, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 2.0812836104282977, + "language_loss": 0.7696563, + "learning_rate": 3.931388882736438e-06, + "loss": 0.79185033, + "num_input_tokens_seen": 39993355, + "step": 1848, + "time_per_iteration": 2.5071895122528076 + }, + { + "auxiliary_loss_clip": 0.01165956, + "auxiliary_loss_mlp": 0.0105228, + "balance_loss_clip": 1.05775547, + "balance_loss_mlp": 1.03183603, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 1.8656214800392978, + "language_loss": 0.77769387, + "learning_rate": 3.931287710300832e-06, + "loss": 0.79987621, + "num_input_tokens_seen": 40012410, + "step": 1849, + "time_per_iteration": 3.884786605834961 + }, + { + "auxiliary_loss_clip": 0.01128372, + "auxiliary_loss_mlp": 0.00861065, + "balance_loss_clip": 1.04705453, + "balance_loss_mlp": 1.09269142, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 2.9254362634663096, + "language_loss": 0.71905291, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73894727, + "num_input_tokens_seen": 40029315, + "step": 1850, + "time_per_iteration": 2.515484571456909 + }, + { + "auxiliary_loss_clip": 0.01164432, + "auxiliary_loss_mlp": 0.01051492, + "balance_loss_clip": 1.05442584, + "balance_loss_mlp": 1.02961683, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.4179567592518967, + "language_loss": 0.81686932, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83902854, + "num_input_tokens_seen": 40045765, + "step": 1851, + "time_per_iteration": 3.8807077407836914 + }, + { + "auxiliary_loss_clip": 0.01157004, + "auxiliary_loss_mlp": 0.01052031, + "balance_loss_clip": 1.055107, + "balance_loss_mlp": 1.03157496, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 2.4260299337590587, + "language_loss": 0.8814429, + "learning_rate": 3.930983753601631e-06, + "loss": 0.90353322, + "num_input_tokens_seen": 40061660, + "step": 1852, + "time_per_iteration": 2.471973180770874 + }, + { + "auxiliary_loss_clip": 0.01161991, + "auxiliary_loss_mlp": 0.01054442, + "balance_loss_clip": 1.05421484, + "balance_loss_mlp": 1.03228092, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 2.027169406133809, + "language_loss": 0.7195285, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74169284, + "num_input_tokens_seen": 40080180, + "step": 1853, + "time_per_iteration": 2.525233268737793 + }, + { + "auxiliary_loss_clip": 0.01075803, + "auxiliary_loss_mlp": 0.01007345, + "balance_loss_clip": 1.02697611, + "balance_loss_mlp": 1.00329149, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7731089647395152, + "language_loss": 0.5361495, + "learning_rate": 3.930780749680273e-06, + "loss": 0.55698097, + "num_input_tokens_seen": 40138910, + "step": 1854, + "time_per_iteration": 3.053013563156128 + }, + { + "auxiliary_loss_clip": 0.01153558, + "auxiliary_loss_mlp": 0.01050058, + "balance_loss_clip": 1.05138135, + "balance_loss_mlp": 1.02771819, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.2827665931082293, + "language_loss": 0.84803677, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.87007296, + "num_input_tokens_seen": 40157745, + "step": 1855, + "time_per_iteration": 4.079181432723999 + }, + { + "auxiliary_loss_clip": 0.01143661, + "auxiliary_loss_mlp": 0.01057637, + "balance_loss_clip": 1.05315256, + "balance_loss_mlp": 1.03714514, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.914195552531238, + "language_loss": 0.81766975, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.8396827, + "num_input_tokens_seen": 40175375, + "step": 1856, + "time_per_iteration": 2.5623080730438232 + }, + { + "auxiliary_loss_clip": 0.01159363, + "auxiliary_loss_mlp": 0.01043203, + "balance_loss_clip": 1.05191338, + "balance_loss_mlp": 1.02155435, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 1.6935128136133162, + "language_loss": 0.82834935, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85037494, + "num_input_tokens_seen": 40195715, + "step": 1857, + "time_per_iteration": 2.5342061519622803 + }, + { + "auxiliary_loss_clip": 0.01137239, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.04906797, + "balance_loss_mlp": 1.02573371, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.439290762670628, + "language_loss": 0.82834053, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85017073, + "num_input_tokens_seen": 40213975, + "step": 1858, + "time_per_iteration": 2.54972243309021 + }, + { + "auxiliary_loss_clip": 0.01131711, + "auxiliary_loss_mlp": 0.01062186, + "balance_loss_clip": 1.04906988, + "balance_loss_mlp": 1.03896368, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.012051316304491, + "language_loss": 0.91546327, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93740225, + "num_input_tokens_seen": 40233905, + "step": 1859, + "time_per_iteration": 2.5711865425109863 + }, + { + "auxiliary_loss_clip": 0.01162583, + "auxiliary_loss_mlp": 0.01048627, + "balance_loss_clip": 1.05240035, + "balance_loss_mlp": 1.02724123, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.245113773054152, + "language_loss": 0.81779045, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83990252, + "num_input_tokens_seen": 40252810, + "step": 1860, + "time_per_iteration": 2.5366621017456055 + }, + { + "auxiliary_loss_clip": 0.01143181, + "auxiliary_loss_mlp": 0.01056441, + "balance_loss_clip": 1.04900789, + "balance_loss_mlp": 1.0349834, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 3.406932317504339, + "language_loss": 0.74914628, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77114248, + "num_input_tokens_seen": 40272000, + "step": 1861, + "time_per_iteration": 2.5295350551605225 + }, + { + "auxiliary_loss_clip": 0.01169574, + "auxiliary_loss_mlp": 0.01045156, + "balance_loss_clip": 1.05190647, + "balance_loss_mlp": 1.02593958, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 1.9079875536027526, + "language_loss": 0.88699341, + "learning_rate": 3.929965805687474e-06, + "loss": 0.90914059, + "num_input_tokens_seen": 40290660, + "step": 1862, + "time_per_iteration": 2.507788896560669 + }, + { + "auxiliary_loss_clip": 0.01157024, + "auxiliary_loss_mlp": 0.01056418, + "balance_loss_clip": 1.05598664, + "balance_loss_mlp": 1.03522301, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.2867914056401686, + "language_loss": 0.86597919, + "learning_rate": 3.92986360831752e-06, + "loss": 0.88811362, + "num_input_tokens_seen": 40307820, + "step": 1863, + "time_per_iteration": 2.551884889602661 + }, + { + "auxiliary_loss_clip": 0.01150367, + "auxiliary_loss_mlp": 0.01048621, + "balance_loss_clip": 1.04892993, + "balance_loss_mlp": 1.02543449, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 1.8327121190940634, + "language_loss": 0.64318842, + "learning_rate": 3.929761337766945e-06, + "loss": 0.6651783, + "num_input_tokens_seen": 40327430, + "step": 1864, + "time_per_iteration": 2.554988145828247 + }, + { + "auxiliary_loss_clip": 0.01108925, + "auxiliary_loss_mlp": 0.01048382, + "balance_loss_clip": 1.05502105, + "balance_loss_mlp": 1.02853322, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.04272452832174, + "language_loss": 0.74396062, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76553363, + "num_input_tokens_seen": 40344545, + "step": 1865, + "time_per_iteration": 2.604609966278076 + }, + { + "auxiliary_loss_clip": 0.01108673, + "auxiliary_loss_mlp": 0.01055251, + "balance_loss_clip": 1.05095124, + "balance_loss_mlp": 1.031636, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 3.0519852168631494, + "language_loss": 0.8436622, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86530149, + "num_input_tokens_seen": 40362300, + "step": 1866, + "time_per_iteration": 2.667299747467041 + }, + { + "auxiliary_loss_clip": 0.01092703, + "auxiliary_loss_mlp": 0.00858728, + "balance_loss_clip": 1.04663968, + "balance_loss_mlp": 1.08910871, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.5651177131079566, + "language_loss": 0.81154943, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83106375, + "num_input_tokens_seen": 40384720, + "step": 1867, + "time_per_iteration": 2.7144594192504883 + }, + { + "auxiliary_loss_clip": 0.01173465, + "auxiliary_loss_mlp": 0.01050659, + "balance_loss_clip": 1.05332088, + "balance_loss_mlp": 1.03057218, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.3372424844108455, + "language_loss": 0.87082124, + "learning_rate": 3.929351523836035e-06, + "loss": 0.89306247, + "num_input_tokens_seen": 40404000, + "step": 1868, + "time_per_iteration": 2.554851531982422 + }, + { + "auxiliary_loss_clip": 0.0115419, + "auxiliary_loss_mlp": 0.00882469, + "balance_loss_clip": 1.05532956, + "balance_loss_mlp": 1.13492608, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.364277927329223, + "language_loss": 0.67975354, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70012009, + "num_input_tokens_seen": 40418665, + "step": 1869, + "time_per_iteration": 2.5024349689483643 + }, + { + "auxiliary_loss_clip": 0.0113011, + "auxiliary_loss_mlp": 0.01051404, + "balance_loss_clip": 1.05061996, + "balance_loss_mlp": 1.02979124, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 1.9816918831467798, + "language_loss": 0.77294546, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79476058, + "num_input_tokens_seen": 40437870, + "step": 1870, + "time_per_iteration": 2.5979137420654297 + }, + { + "auxiliary_loss_clip": 0.01123907, + "auxiliary_loss_mlp": 0.0104765, + "balance_loss_clip": 1.05093932, + "balance_loss_mlp": 1.02595365, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.9142429927003874, + "language_loss": 0.7633574, + "learning_rate": 3.929043395181631e-06, + "loss": 0.78507304, + "num_input_tokens_seen": 40455570, + "step": 1871, + "time_per_iteration": 2.595543622970581 + }, + { + "auxiliary_loss_clip": 0.01106128, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.05405915, + "balance_loss_mlp": 1.02391183, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 3.6369397529343153, + "language_loss": 0.82348675, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84498739, + "num_input_tokens_seen": 40473600, + "step": 1872, + "time_per_iteration": 2.631897449493408 + }, + { + "auxiliary_loss_clip": 0.01174336, + "auxiliary_loss_mlp": 0.01053023, + "balance_loss_clip": 1.05394673, + "balance_loss_mlp": 1.03328228, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.459916987864868, + "language_loss": 0.83033681, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85261041, + "num_input_tokens_seen": 40490025, + "step": 1873, + "time_per_iteration": 2.4860427379608154 + }, + { + "auxiliary_loss_clip": 0.0113921, + "auxiliary_loss_mlp": 0.01048948, + "balance_loss_clip": 1.05363822, + "balance_loss_mlp": 1.02732348, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 2.383281029800944, + "language_loss": 0.92532277, + "learning_rate": 3.928734608181575e-06, + "loss": 0.94720435, + "num_input_tokens_seen": 40511580, + "step": 1874, + "time_per_iteration": 2.643134117126465 + }, + { + "auxiliary_loss_clip": 0.01139009, + "auxiliary_loss_mlp": 0.0105937, + "balance_loss_clip": 1.05332923, + "balance_loss_mlp": 1.03960478, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.4774661587835527, + "language_loss": 0.75254059, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77452439, + "num_input_tokens_seen": 40530155, + "step": 1875, + "time_per_iteration": 2.581851005554199 + }, + { + "auxiliary_loss_clip": 0.01161745, + "auxiliary_loss_mlp": 0.01052616, + "balance_loss_clip": 1.05576694, + "balance_loss_mlp": 1.03328085, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.950447294627082, + "language_loss": 0.71855521, + "learning_rate": 3.928528384485984e-06, + "loss": 0.74069881, + "num_input_tokens_seen": 40549500, + "step": 1876, + "time_per_iteration": 2.5400876998901367 + }, + { + "auxiliary_loss_clip": 0.01145523, + "auxiliary_loss_mlp": 0.01047697, + "balance_loss_clip": 1.0550108, + "balance_loss_mlp": 1.02706146, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 1.911808775601769, + "language_loss": 0.76934922, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.79128146, + "num_input_tokens_seen": 40567475, + "step": 1877, + "time_per_iteration": 2.561160087585449 + }, + { + "auxiliary_loss_clip": 0.01163647, + "auxiliary_loss_mlp": 0.01051709, + "balance_loss_clip": 1.05351222, + "balance_loss_mlp": 1.03002453, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.2675324319310435, + "language_loss": 0.87862074, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90077436, + "num_input_tokens_seen": 40583280, + "step": 1878, + "time_per_iteration": 2.4787580966949463 + }, + { + "auxiliary_loss_clip": 0.01138673, + "auxiliary_loss_mlp": 0.01045297, + "balance_loss_clip": 1.05512655, + "balance_loss_mlp": 1.02479339, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.621209764456612, + "language_loss": 0.81212544, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83396518, + "num_input_tokens_seen": 40603080, + "step": 1879, + "time_per_iteration": 2.629356861114502 + }, + { + "auxiliary_loss_clip": 0.0115032, + "auxiliary_loss_mlp": 0.01056682, + "balance_loss_clip": 1.05136013, + "balance_loss_mlp": 1.03527188, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 4.077688631626403, + "language_loss": 0.70445269, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72652268, + "num_input_tokens_seen": 40623255, + "step": 1880, + "time_per_iteration": 2.6035544872283936 + }, + { + "auxiliary_loss_clip": 0.01178225, + "auxiliary_loss_mlp": 0.01044918, + "balance_loss_clip": 1.1001091, + "balance_loss_mlp": 1.02425909, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 2.116783011241823, + "language_loss": 0.72416842, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74639988, + "num_input_tokens_seen": 40641570, + "step": 1881, + "time_per_iteration": 2.548734664916992 + }, + { + "auxiliary_loss_clip": 0.01145082, + "auxiliary_loss_mlp": 0.0085694, + "balance_loss_clip": 1.0570507, + "balance_loss_mlp": 1.09093809, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.9043577092074893, + "language_loss": 0.74206108, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76208133, + "num_input_tokens_seen": 40658775, + "step": 1882, + "time_per_iteration": 2.591176748275757 + }, + { + "auxiliary_loss_clip": 0.01173598, + "auxiliary_loss_mlp": 0.01051127, + "balance_loss_clip": 1.05602241, + "balance_loss_mlp": 1.03014612, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.469989626265307, + "language_loss": 0.79471856, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81696582, + "num_input_tokens_seen": 40679555, + "step": 1883, + "time_per_iteration": 2.5392062664031982 + }, + { + "auxiliary_loss_clip": 0.01134745, + "auxiliary_loss_mlp": 0.01052752, + "balance_loss_clip": 1.05141366, + "balance_loss_mlp": 1.03119886, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 2.1043459935630917, + "language_loss": 0.77605367, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79792869, + "num_input_tokens_seen": 40697295, + "step": 1884, + "time_per_iteration": 3.937434196472168 + }, + { + "auxiliary_loss_clip": 0.01083078, + "auxiliary_loss_mlp": 0.01004033, + "balance_loss_clip": 1.04387927, + "balance_loss_mlp": 1.00026608, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7919419566409772, + "language_loss": 0.55193293, + "learning_rate": 3.927596758374019e-06, + "loss": 0.57280409, + "num_input_tokens_seen": 40758095, + "step": 1885, + "time_per_iteration": 3.042633295059204 + }, + { + "auxiliary_loss_clip": 0.01089269, + "auxiliary_loss_mlp": 0.01044867, + "balance_loss_clip": 1.05306768, + "balance_loss_mlp": 1.02509034, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 2.38087563256885, + "language_loss": 0.90391439, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92525578, + "num_input_tokens_seen": 40777140, + "step": 1886, + "time_per_iteration": 2.8427810668945312 + }, + { + "auxiliary_loss_clip": 0.01114154, + "auxiliary_loss_mlp": 0.010516, + "balance_loss_clip": 1.05278146, + "balance_loss_mlp": 1.03156137, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.40273735281062, + "language_loss": 0.84933096, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87098849, + "num_input_tokens_seen": 40797505, + "step": 1887, + "time_per_iteration": 4.256206750869751 + }, + { + "auxiliary_loss_clip": 0.01138291, + "auxiliary_loss_mlp": 0.01051314, + "balance_loss_clip": 1.05652392, + "balance_loss_mlp": 1.03200173, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 3.468927976040216, + "language_loss": 0.76596081, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78785688, + "num_input_tokens_seen": 40812970, + "step": 1888, + "time_per_iteration": 2.5871238708496094 + }, + { + "auxiliary_loss_clip": 0.01133271, + "auxiliary_loss_mlp": 0.01053928, + "balance_loss_clip": 1.05932438, + "balance_loss_mlp": 1.03245866, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 1.9096095660155081, + "language_loss": 0.68174821, + "learning_rate": 3.927180801692764e-06, + "loss": 0.7036202, + "num_input_tokens_seen": 40837745, + "step": 1889, + "time_per_iteration": 2.814681053161621 + }, + { + "auxiliary_loss_clip": 0.01173829, + "auxiliary_loss_mlp": 0.01042354, + "balance_loss_clip": 1.0563519, + "balance_loss_mlp": 1.02282763, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 2.0413913939254233, + "language_loss": 0.84025353, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86241543, + "num_input_tokens_seen": 40856490, + "step": 1890, + "time_per_iteration": 3.8938915729522705 + }, + { + "auxiliary_loss_clip": 0.0115655, + "auxiliary_loss_mlp": 0.01051713, + "balance_loss_clip": 1.07090926, + "balance_loss_mlp": 1.03135228, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.658255470296402, + "language_loss": 0.6513992, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67348182, + "num_input_tokens_seen": 40874070, + "step": 1891, + "time_per_iteration": 2.561370611190796 + }, + { + "auxiliary_loss_clip": 0.0113965, + "auxiliary_loss_mlp": 0.01045898, + "balance_loss_clip": 1.05160582, + "balance_loss_mlp": 1.02583528, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.0573653359879183, + "language_loss": 0.8829596, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.9048152, + "num_input_tokens_seen": 40892425, + "step": 1892, + "time_per_iteration": 2.5908050537109375 + }, + { + "auxiliary_loss_clip": 0.01116635, + "auxiliary_loss_mlp": 0.01066349, + "balance_loss_clip": 1.05676436, + "balance_loss_mlp": 1.04349673, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.591574952980665, + "language_loss": 0.72671586, + "learning_rate": 3.926763675749339e-06, + "loss": 0.74854571, + "num_input_tokens_seen": 40912190, + "step": 1893, + "time_per_iteration": 4.0842084884643555 + }, + { + "auxiliary_loss_clip": 0.01169124, + "auxiliary_loss_mlp": 0.01055297, + "balance_loss_clip": 1.05184102, + "balance_loss_mlp": 1.03377938, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 2.3567219504341854, + "language_loss": 0.79576576, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81800997, + "num_input_tokens_seen": 40928395, + "step": 1894, + "time_per_iteration": 2.5028445720672607 + }, + { + "auxiliary_loss_clip": 0.0114787, + "auxiliary_loss_mlp": 0.01055345, + "balance_loss_clip": 1.0567795, + "balance_loss_mlp": 1.034567, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 4.042974892059261, + "language_loss": 0.79527926, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81731141, + "num_input_tokens_seen": 40946555, + "step": 1895, + "time_per_iteration": 2.6137096881866455 + }, + { + "auxiliary_loss_clip": 0.01084048, + "auxiliary_loss_mlp": 0.01016831, + "balance_loss_clip": 1.03435159, + "balance_loss_mlp": 1.01318324, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.801062419408952, + "language_loss": 0.6330744, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65408319, + "num_input_tokens_seen": 41004910, + "step": 1896, + "time_per_iteration": 3.180043935775757 + }, + { + "auxiliary_loss_clip": 0.01142867, + "auxiliary_loss_mlp": 0.01049838, + "balance_loss_clip": 1.05571103, + "balance_loss_mlp": 1.02814233, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.8667886133517009, + "language_loss": 0.85227072, + "learning_rate": 3.926345380796821e-06, + "loss": 0.87419772, + "num_input_tokens_seen": 41026385, + "step": 1897, + "time_per_iteration": 2.56990122795105 + }, + { + "auxiliary_loss_clip": 0.011744, + "auxiliary_loss_mlp": 0.00874748, + "balance_loss_clip": 1.05525458, + "balance_loss_mlp": 1.12527442, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.3031617954161625, + "language_loss": 0.7974872, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.81797868, + "num_input_tokens_seen": 41045315, + "step": 1898, + "time_per_iteration": 2.5475571155548096 + }, + { + "auxiliary_loss_clip": 0.01112383, + "auxiliary_loss_mlp": 0.01054104, + "balance_loss_clip": 1.04847193, + "balance_loss_mlp": 1.03119195, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 2.2355279805599024, + "language_loss": 0.73234856, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75401342, + "num_input_tokens_seen": 41063390, + "step": 1899, + "time_per_iteration": 2.574889659881592 + }, + { + "auxiliary_loss_clip": 0.0104308, + "auxiliary_loss_mlp": 0.0100227, + "balance_loss_clip": 1.03377366, + "balance_loss_mlp": 0.99824089, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.912580245704262, + "language_loss": 0.63439256, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65484607, + "num_input_tokens_seen": 41124180, + "step": 1900, + "time_per_iteration": 3.156296968460083 + }, + { + "auxiliary_loss_clip": 0.01105932, + "auxiliary_loss_mlp": 0.01049422, + "balance_loss_clip": 1.04873073, + "balance_loss_mlp": 1.02877545, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 4.850799468382208, + "language_loss": 0.78096414, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80251771, + "num_input_tokens_seen": 41143485, + "step": 1901, + "time_per_iteration": 2.6657254695892334 + }, + { + "auxiliary_loss_clip": 0.01164516, + "auxiliary_loss_mlp": 0.01048674, + "balance_loss_clip": 1.05489802, + "balance_loss_mlp": 1.02842045, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 2.198859597093439, + "language_loss": 0.8380692, + "learning_rate": 3.925820868573839e-06, + "loss": 0.86020112, + "num_input_tokens_seen": 41161695, + "step": 1902, + "time_per_iteration": 2.513610363006592 + }, + { + "auxiliary_loss_clip": 0.01152342, + "auxiliary_loss_mlp": 0.01050447, + "balance_loss_clip": 1.0504775, + "balance_loss_mlp": 1.02735567, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 2.729317549808627, + "language_loss": 0.77497834, + "learning_rate": 3.925715747031356e-06, + "loss": 0.79700619, + "num_input_tokens_seen": 41181715, + "step": 1903, + "time_per_iteration": 2.578197479248047 + }, + { + "auxiliary_loss_clip": 0.01140772, + "auxiliary_loss_mlp": 0.01037269, + "balance_loss_clip": 1.05385864, + "balance_loss_mlp": 1.0190537, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.1266560457897996, + "language_loss": 0.75657326, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77835369, + "num_input_tokens_seen": 41201770, + "step": 1904, + "time_per_iteration": 2.5906918048858643 + }, + { + "auxiliary_loss_clip": 0.01142985, + "auxiliary_loss_mlp": 0.0104608, + "balance_loss_clip": 1.05162358, + "balance_loss_mlp": 1.0240736, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.371010695521114, + "language_loss": 0.92593598, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.9478265, + "num_input_tokens_seen": 41220590, + "step": 1905, + "time_per_iteration": 2.5492703914642334 + }, + { + "auxiliary_loss_clip": 0.01159109, + "auxiliary_loss_mlp": 0.01044333, + "balance_loss_clip": 1.05255497, + "balance_loss_mlp": 1.02248168, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 5.679498144773597, + "language_loss": 0.77582419, + "learning_rate": 3.925399944279861e-06, + "loss": 0.79785854, + "num_input_tokens_seen": 41237250, + "step": 1906, + "time_per_iteration": 2.516207456588745 + }, + { + "auxiliary_loss_clip": 0.01171906, + "auxiliary_loss_mlp": 0.01048771, + "balance_loss_clip": 1.0538615, + "balance_loss_mlp": 1.02813601, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.7011501943811544, + "language_loss": 0.82136858, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84357536, + "num_input_tokens_seen": 41256680, + "step": 1907, + "time_per_iteration": 2.4873287677764893 + }, + { + "auxiliary_loss_clip": 0.01129454, + "auxiliary_loss_mlp": 0.01061923, + "balance_loss_clip": 1.05093229, + "balance_loss_mlp": 1.04037035, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 4.277214450906316, + "language_loss": 0.84924662, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87116033, + "num_input_tokens_seen": 41270955, + "step": 1908, + "time_per_iteration": 2.549497127532959 + }, + { + "auxiliary_loss_clip": 0.01030513, + "auxiliary_loss_mlp": 0.01011539, + "balance_loss_clip": 1.02806044, + "balance_loss_mlp": 1.0073185, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9354675897516539, + "language_loss": 0.60990548, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63032597, + "num_input_tokens_seen": 41319180, + "step": 1909, + "time_per_iteration": 2.9537484645843506 + }, + { + "auxiliary_loss_clip": 0.0117608, + "auxiliary_loss_mlp": 0.01046285, + "balance_loss_clip": 1.05737257, + "balance_loss_mlp": 1.02649641, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 2.5681538095874097, + "language_loss": 0.78672588, + "learning_rate": 3.924977851804197e-06, + "loss": 0.80894947, + "num_input_tokens_seen": 41337480, + "step": 1910, + "time_per_iteration": 2.4439966678619385 + }, + { + "auxiliary_loss_clip": 0.01153218, + "auxiliary_loss_mlp": 0.0104491, + "balance_loss_clip": 1.05934525, + "balance_loss_mlp": 1.02503753, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 2.80498823930451, + "language_loss": 0.767308, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.78928924, + "num_input_tokens_seen": 41354650, + "step": 1911, + "time_per_iteration": 2.5395491123199463 + }, + { + "auxiliary_loss_clip": 0.01145384, + "auxiliary_loss_mlp": 0.01047131, + "balance_loss_clip": 1.04978228, + "balance_loss_mlp": 1.02663922, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 2.4006628447791902, + "language_loss": 0.79143667, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81336188, + "num_input_tokens_seen": 41376935, + "step": 1912, + "time_per_iteration": 2.62797474861145 + }, + { + "auxiliary_loss_clip": 0.01169692, + "auxiliary_loss_mlp": 0.00864808, + "balance_loss_clip": 1.05306554, + "balance_loss_mlp": 1.11206329, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 2.247680567380297, + "language_loss": 0.77998257, + "learning_rate": 3.924660515982246e-06, + "loss": 0.8003276, + "num_input_tokens_seen": 41396105, + "step": 1913, + "time_per_iteration": 2.503399610519409 + }, + { + "auxiliary_loss_clip": 0.01157971, + "auxiliary_loss_mlp": 0.01052931, + "balance_loss_clip": 1.05105877, + "balance_loss_mlp": 1.03178322, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 2.257356375028276, + "language_loss": 0.70086765, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72297668, + "num_input_tokens_seen": 41415600, + "step": 1914, + "time_per_iteration": 2.498991012573242 + }, + { + "auxiliary_loss_clip": 0.01028196, + "auxiliary_loss_mlp": 0.01000571, + "balance_loss_clip": 1.05073166, + "balance_loss_mlp": 0.99766272, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7653795952186894, + "language_loss": 0.61031091, + "learning_rate": 3.92444859384433e-06, + "loss": 0.6305986, + "num_input_tokens_seen": 41478760, + "step": 1915, + "time_per_iteration": 3.4412107467651367 + }, + { + "auxiliary_loss_clip": 0.01155862, + "auxiliary_loss_mlp": 0.01047809, + "balance_loss_clip": 1.05486155, + "balance_loss_mlp": 1.02667284, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.4105403381936426, + "language_loss": 0.93077338, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95281017, + "num_input_tokens_seen": 41495720, + "step": 1916, + "time_per_iteration": 2.6327672004699707 + }, + { + "auxiliary_loss_clip": 0.01154159, + "auxiliary_loss_mlp": 0.01057648, + "balance_loss_clip": 1.05567956, + "balance_loss_mlp": 1.03518939, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 1.9511683175163712, + "language_loss": 0.72493082, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.74704885, + "num_input_tokens_seen": 41513585, + "step": 1917, + "time_per_iteration": 2.552410364151001 + }, + { + "auxiliary_loss_clip": 0.01131495, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.053792, + "balance_loss_mlp": 1.02239728, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.931427155215243, + "language_loss": 0.74340689, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76515436, + "num_input_tokens_seen": 41533390, + "step": 1918, + "time_per_iteration": 2.611922025680542 + }, + { + "auxiliary_loss_clip": 0.01137571, + "auxiliary_loss_mlp": 0.01043909, + "balance_loss_clip": 1.05085087, + "balance_loss_mlp": 1.02397752, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 3.127649392904145, + "language_loss": 0.86862481, + "learning_rate": 3.92402387389729e-06, + "loss": 0.89043963, + "num_input_tokens_seen": 41551015, + "step": 1919, + "time_per_iteration": 2.5762908458709717 + }, + { + "auxiliary_loss_clip": 0.01127315, + "auxiliary_loss_mlp": 0.01058071, + "balance_loss_clip": 1.04911613, + "balance_loss_mlp": 1.03427708, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 1.9583244932234056, + "language_loss": 0.86496717, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88682103, + "num_input_tokens_seen": 41568055, + "step": 1920, + "time_per_iteration": 2.5835633277893066 + }, + { + "auxiliary_loss_clip": 0.0115644, + "auxiliary_loss_mlp": 0.01050555, + "balance_loss_clip": 1.05324733, + "balance_loss_mlp": 1.03040838, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.058182846956204, + "language_loss": 0.79124546, + "learning_rate": 3.923811076152589e-06, + "loss": 0.81331539, + "num_input_tokens_seen": 41587435, + "step": 1921, + "time_per_iteration": 2.5367937088012695 + }, + { + "auxiliary_loss_clip": 0.01164213, + "auxiliary_loss_mlp": 0.01062121, + "balance_loss_clip": 1.05262995, + "balance_loss_mlp": 1.04042482, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 1.9085449911309864, + "language_loss": 0.78953028, + "learning_rate": 3.923704567851557e-06, + "loss": 0.81179357, + "num_input_tokens_seen": 41604975, + "step": 1922, + "time_per_iteration": 3.9209723472595215 + }, + { + "auxiliary_loss_clip": 0.01091897, + "auxiliary_loss_mlp": 0.01058493, + "balance_loss_clip": 1.05144727, + "balance_loss_mlp": 1.03787029, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 2.567780799768338, + "language_loss": 0.84174967, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86325359, + "num_input_tokens_seen": 41626155, + "step": 1923, + "time_per_iteration": 2.8341729640960693 + }, + { + "auxiliary_loss_clip": 0.01163888, + "auxiliary_loss_mlp": 0.01054135, + "balance_loss_clip": 1.05603969, + "balance_loss_mlp": 1.03288031, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.5796379672093064, + "language_loss": 0.81236786, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.83454806, + "num_input_tokens_seen": 41644805, + "step": 1924, + "time_per_iteration": 2.8505258560180664 + }, + { + "auxiliary_loss_clip": 0.01061979, + "auxiliary_loss_mlp": 0.01047805, + "balance_loss_clip": 1.03572011, + "balance_loss_mlp": 1.0438714, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.8470400578636588, + "language_loss": 0.61230803, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63340586, + "num_input_tokens_seen": 41709345, + "step": 1925, + "time_per_iteration": 4.646650791168213 + }, + { + "auxiliary_loss_clip": 0.01150377, + "auxiliary_loss_mlp": 0.01074973, + "balance_loss_clip": 1.05193686, + "balance_loss_mlp": 1.05210876, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 1.7840139497305811, + "language_loss": 0.75308484, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77533835, + "num_input_tokens_seen": 41730210, + "step": 1926, + "time_per_iteration": 2.5421175956726074 + }, + { + "auxiliary_loss_clip": 0.01109141, + "auxiliary_loss_mlp": 0.0086136, + "balance_loss_clip": 1.04434645, + "balance_loss_mlp": 1.1034379, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.572507425613488, + "language_loss": 0.72459483, + "learning_rate": 3.923170932221222e-06, + "loss": 0.74429977, + "num_input_tokens_seen": 41750270, + "step": 1927, + "time_per_iteration": 2.6635801792144775 + }, + { + "auxiliary_loss_clip": 0.01134854, + "auxiliary_loss_mlp": 0.01049437, + "balance_loss_clip": 1.05348957, + "balance_loss_mlp": 1.0272646, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 2.4232941566432555, + "language_loss": 0.86663133, + "learning_rate": 3.92306398629845e-06, + "loss": 0.88847423, + "num_input_tokens_seen": 41772975, + "step": 1928, + "time_per_iteration": 4.031736373901367 + }, + { + "auxiliary_loss_clip": 0.01125197, + "auxiliary_loss_mlp": 0.01053817, + "balance_loss_clip": 1.0527184, + "balance_loss_mlp": 1.03235936, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.7195344669927148, + "language_loss": 0.77452219, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79631233, + "num_input_tokens_seen": 41791765, + "step": 1929, + "time_per_iteration": 2.582120656967163 + }, + { + "auxiliary_loss_clip": 0.01169503, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_clip": 1.05431819, + "balance_loss_mlp": 1.03296709, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.7594767426171465, + "language_loss": 0.77109575, + "learning_rate": 3.922849875688626e-06, + "loss": 0.7933113, + "num_input_tokens_seen": 41815615, + "step": 1930, + "time_per_iteration": 2.5957276821136475 + }, + { + "auxiliary_loss_clip": 0.01138317, + "auxiliary_loss_mlp": 0.01050441, + "balance_loss_clip": 1.04940009, + "balance_loss_mlp": 1.02898276, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 2.0128243354079305, + "language_loss": 0.72119689, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74308443, + "num_input_tokens_seen": 41834810, + "step": 1931, + "time_per_iteration": 2.5467703342437744 + }, + { + "auxiliary_loss_clip": 0.01143522, + "auxiliary_loss_mlp": 0.01059991, + "balance_loss_clip": 1.05389714, + "balance_loss_mlp": 1.03664982, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.84162368696264, + "language_loss": 0.82275778, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84479284, + "num_input_tokens_seen": 41854975, + "step": 1932, + "time_per_iteration": 3.967134952545166 + }, + { + "auxiliary_loss_clip": 0.01054724, + "auxiliary_loss_mlp": 0.01010379, + "balance_loss_clip": 1.053774, + "balance_loss_mlp": 1.00603962, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7682730043955269, + "language_loss": 0.61015415, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63080513, + "num_input_tokens_seen": 41911105, + "step": 1933, + "time_per_iteration": 3.0733373165130615 + }, + { + "auxiliary_loss_clip": 0.01114106, + "auxiliary_loss_mlp": 0.00842354, + "balance_loss_clip": 1.06359649, + "balance_loss_mlp": 1.0669601, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.54754193513171, + "language_loss": 0.8592447, + "learning_rate": 3.922420779525586e-06, + "loss": 0.87880927, + "num_input_tokens_seen": 41931750, + "step": 1934, + "time_per_iteration": 2.6656596660614014 + }, + { + "auxiliary_loss_clip": 0.01115496, + "auxiliary_loss_mlp": 0.01059382, + "balance_loss_clip": 1.05196762, + "balance_loss_mlp": 1.03592157, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.3958511557226387, + "language_loss": 0.65897548, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.68072426, + "num_input_tokens_seen": 41949400, + "step": 1935, + "time_per_iteration": 2.6300792694091797 + }, + { + "auxiliary_loss_clip": 0.01175022, + "auxiliary_loss_mlp": 0.01050036, + "balance_loss_clip": 1.05524111, + "balance_loss_mlp": 1.03047371, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 2.504014383210827, + "language_loss": 0.75834501, + "learning_rate": 3.922205794037456e-06, + "loss": 0.78059554, + "num_input_tokens_seen": 41968100, + "step": 1936, + "time_per_iteration": 2.4842000007629395 + }, + { + "auxiliary_loss_clip": 0.01173349, + "auxiliary_loss_mlp": 0.01049694, + "balance_loss_clip": 1.05387664, + "balance_loss_mlp": 1.02773523, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 2.328378404355905, + "language_loss": 0.8405962, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86282659, + "num_input_tokens_seen": 41986375, + "step": 1937, + "time_per_iteration": 2.502117395401001 + }, + { + "auxiliary_loss_clip": 0.01145243, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.05093718, + "balance_loss_mlp": 1.02647436, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 4.9746264553029516, + "language_loss": 0.7628144, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78474295, + "num_input_tokens_seen": 42006055, + "step": 1938, + "time_per_iteration": 2.609222650527954 + }, + { + "auxiliary_loss_clip": 0.01175088, + "auxiliary_loss_mlp": 0.0105754, + "balance_loss_clip": 1.05450058, + "balance_loss_mlp": 1.0365355, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 1.7591232114281288, + "language_loss": 0.79424649, + "learning_rate": 3.921882769138696e-06, + "loss": 0.81657279, + "num_input_tokens_seen": 42024995, + "step": 1939, + "time_per_iteration": 2.546771287918091 + }, + { + "auxiliary_loss_clip": 0.01143275, + "auxiliary_loss_mlp": 0.01053818, + "balance_loss_clip": 1.05222821, + "balance_loss_mlp": 1.03195524, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 2.7337634309664693, + "language_loss": 0.86572385, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88769484, + "num_input_tokens_seen": 42042640, + "step": 1940, + "time_per_iteration": 2.587029218673706 + }, + { + "auxiliary_loss_clip": 0.0114771, + "auxiliary_loss_mlp": 0.01054296, + "balance_loss_clip": 1.05461681, + "balance_loss_mlp": 1.03410244, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.4395700694582279, + "language_loss": 0.75498748, + "learning_rate": 3.921667054809449e-06, + "loss": 0.77700752, + "num_input_tokens_seen": 42067005, + "step": 1941, + "time_per_iteration": 2.756819725036621 + }, + { + "auxiliary_loss_clip": 0.01142764, + "auxiliary_loss_mlp": 0.00857947, + "balance_loss_clip": 1.05036712, + "balance_loss_mlp": 1.09793282, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.238744590874912, + "language_loss": 0.89193738, + "learning_rate": 3.921559088338068e-06, + "loss": 0.91194445, + "num_input_tokens_seen": 42082295, + "step": 1942, + "time_per_iteration": 2.533463716506958 + }, + { + "auxiliary_loss_clip": 0.01157602, + "auxiliary_loss_mlp": 0.0104903, + "balance_loss_clip": 1.05344713, + "balance_loss_mlp": 1.02959871, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.7985649532044958, + "language_loss": 0.68047285, + "learning_rate": 3.921451049000975e-06, + "loss": 0.70253915, + "num_input_tokens_seen": 42105295, + "step": 1943, + "time_per_iteration": 2.6642913818359375 + }, + { + "auxiliary_loss_clip": 0.0114482, + "auxiliary_loss_mlp": 0.01047674, + "balance_loss_clip": 1.05336821, + "balance_loss_mlp": 1.02700353, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 1.8204696420808466, + "language_loss": 0.6912185, + "learning_rate": 3.921342936802265e-06, + "loss": 0.71314341, + "num_input_tokens_seen": 42125520, + "step": 1944, + "time_per_iteration": 2.677407741546631 + }, + { + "auxiliary_loss_clip": 0.01152352, + "auxiliary_loss_mlp": 0.01042135, + "balance_loss_clip": 1.0514226, + "balance_loss_mlp": 1.02260852, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 1.587655881321349, + "language_loss": 0.82824177, + "learning_rate": 3.921234751746038e-06, + "loss": 0.85018665, + "num_input_tokens_seen": 42146335, + "step": 1945, + "time_per_iteration": 2.554234743118286 + }, + { + "auxiliary_loss_clip": 0.01131029, + "auxiliary_loss_mlp": 0.01048336, + "balance_loss_clip": 1.04711962, + "balance_loss_mlp": 1.02786767, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.803313029658502, + "language_loss": 0.75780559, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.77959919, + "num_input_tokens_seen": 42165320, + "step": 1946, + "time_per_iteration": 2.5951461791992188 + }, + { + "auxiliary_loss_clip": 0.01126981, + "auxiliary_loss_mlp": 0.01042634, + "balance_loss_clip": 1.04801786, + "balance_loss_mlp": 1.02347732, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 1.9327939025602758, + "language_loss": 0.69170189, + "learning_rate": 3.921018163077448e-06, + "loss": 0.7133981, + "num_input_tokens_seen": 42182955, + "step": 1947, + "time_per_iteration": 2.575732469558716 + }, + { + "auxiliary_loss_clip": 0.01143114, + "auxiliary_loss_mlp": 0.01055993, + "balance_loss_clip": 1.05300927, + "balance_loss_mlp": 1.03553629, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.8421452006453982, + "language_loss": 0.85038567, + "learning_rate": 3.920909759473295e-06, + "loss": 0.87237668, + "num_input_tokens_seen": 42200760, + "step": 1948, + "time_per_iteration": 2.525824785232544 + }, + { + "auxiliary_loss_clip": 0.01071888, + "auxiliary_loss_mlp": 0.01327662, + "balance_loss_clip": 1.04258156, + "balance_loss_mlp": 2.0077157, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8626807356051814, + "language_loss": 0.65186858, + "learning_rate": 3.920801283028054e-06, + "loss": 0.6758641, + "num_input_tokens_seen": 42265745, + "step": 1949, + "time_per_iteration": 3.1817641258239746 + }, + { + "auxiliary_loss_clip": 0.0114883, + "auxiliary_loss_mlp": 0.01051341, + "balance_loss_clip": 1.05234671, + "balance_loss_mlp": 1.03182614, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.6086578534386813, + "language_loss": 0.72137278, + "learning_rate": 3.920692733745835e-06, + "loss": 0.74337447, + "num_input_tokens_seen": 42286245, + "step": 1950, + "time_per_iteration": 2.5708179473876953 + }, + { + "auxiliary_loss_clip": 0.01161787, + "auxiliary_loss_mlp": 0.01055764, + "balance_loss_clip": 1.05380809, + "balance_loss_mlp": 1.03573656, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 2.6186429736162853, + "language_loss": 0.76524949, + "learning_rate": 3.920584111630755e-06, + "loss": 0.78742504, + "num_input_tokens_seen": 42302710, + "step": 1951, + "time_per_iteration": 2.497272253036499 + }, + { + "auxiliary_loss_clip": 0.01118101, + "auxiliary_loss_mlp": 0.01061959, + "balance_loss_clip": 1.05036366, + "balance_loss_mlp": 1.04194379, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 1.9848867179370924, + "language_loss": 0.76375544, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78555608, + "num_input_tokens_seen": 42324115, + "step": 1952, + "time_per_iteration": 2.6761531829833984 + }, + { + "auxiliary_loss_clip": 0.01117236, + "auxiliary_loss_mlp": 0.01067686, + "balance_loss_clip": 1.0459168, + "balance_loss_mlp": 1.04728913, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 2.2331394545968597, + "language_loss": 0.72275239, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74460161, + "num_input_tokens_seen": 42342505, + "step": 1953, + "time_per_iteration": 2.58205246925354 + }, + { + "auxiliary_loss_clip": 0.011388, + "auxiliary_loss_mlp": 0.00965308, + "balance_loss_clip": 1.05053067, + "balance_loss_mlp": 1.29889035, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.7332661403031526, + "language_loss": 0.79257196, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81361306, + "num_input_tokens_seen": 42360525, + "step": 1954, + "time_per_iteration": 2.546494722366333 + }, + { + "auxiliary_loss_clip": 0.01112452, + "auxiliary_loss_mlp": 0.01064413, + "balance_loss_clip": 1.05890346, + "balance_loss_mlp": 1.04305041, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 2.267735624052953, + "language_loss": 0.85795456, + "learning_rate": 3.920148894924246e-06, + "loss": 0.87972319, + "num_input_tokens_seen": 42377045, + "step": 1955, + "time_per_iteration": 2.6169769763946533 + }, + { + "auxiliary_loss_clip": 0.01152582, + "auxiliary_loss_mlp": 0.00955751, + "balance_loss_clip": 1.04714417, + "balance_loss_mlp": 1.28870428, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 2.53502835899001, + "language_loss": 0.77986956, + "learning_rate": 3.920039908706701e-06, + "loss": 0.80095291, + "num_input_tokens_seen": 42393960, + "step": 1956, + "time_per_iteration": 2.538344383239746 + }, + { + "auxiliary_loss_clip": 0.01147018, + "auxiliary_loss_mlp": 0.0106205, + "balance_loss_clip": 1.05163908, + "balance_loss_mlp": 1.04208207, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 2.0007919168406443, + "language_loss": 0.80456066, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82665128, + "num_input_tokens_seen": 42413160, + "step": 1957, + "time_per_iteration": 2.5484495162963867 + }, + { + "auxiliary_loss_clip": 0.01155837, + "auxiliary_loss_mlp": 0.01056486, + "balance_loss_clip": 1.05143189, + "balance_loss_mlp": 1.03692412, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 2.253676816155962, + "language_loss": 0.78356004, + "learning_rate": 3.919821717851428e-06, + "loss": 0.8056832, + "num_input_tokens_seen": 42432590, + "step": 1958, + "time_per_iteration": 2.5432913303375244 + }, + { + "auxiliary_loss_clip": 0.01137817, + "auxiliary_loss_mlp": 0.01054987, + "balance_loss_clip": 1.05194342, + "balance_loss_mlp": 1.03391123, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 1.8892732562270742, + "language_loss": 0.76747322, + "learning_rate": 3.919712513221976e-06, + "loss": 0.78940117, + "num_input_tokens_seen": 42450135, + "step": 1959, + "time_per_iteration": 2.5276200771331787 + }, + { + "auxiliary_loss_clip": 0.01144867, + "auxiliary_loss_mlp": 0.01060032, + "balance_loss_clip": 1.04876947, + "balance_loss_mlp": 1.04018402, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 2.295346807349742, + "language_loss": 0.70227647, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72432542, + "num_input_tokens_seen": 42470050, + "step": 1960, + "time_per_iteration": 2.6025986671447754 + }, + { + "auxiliary_loss_clip": 0.01146344, + "auxiliary_loss_mlp": 0.01059941, + "balance_loss_clip": 1.05103159, + "balance_loss_mlp": 1.03963923, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 3.5303481738363094, + "language_loss": 0.81307012, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83513296, + "num_input_tokens_seen": 42484335, + "step": 1961, + "time_per_iteration": 4.004866361618042 + }, + { + "auxiliary_loss_clip": 0.01148194, + "auxiliary_loss_mlp": 0.00887008, + "balance_loss_clip": 1.04900384, + "balance_loss_mlp": 1.15622234, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 1.882794442844709, + "language_loss": 0.9247185, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94507056, + "num_input_tokens_seen": 42502720, + "step": 1962, + "time_per_iteration": 2.529153823852539 + }, + { + "auxiliary_loss_clip": 0.01130924, + "auxiliary_loss_mlp": 0.01057578, + "balance_loss_clip": 1.05040145, + "balance_loss_mlp": 1.03803921, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.160887962706216, + "language_loss": 0.87861317, + "learning_rate": 3.919274966788707e-06, + "loss": 0.90049815, + "num_input_tokens_seen": 42519460, + "step": 1963, + "time_per_iteration": 2.582219362258911 + }, + { + "auxiliary_loss_clip": 0.01143996, + "auxiliary_loss_mlp": 0.00862324, + "balance_loss_clip": 1.04984224, + "balance_loss_mlp": 1.11297774, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 1.8964746199548845, + "language_loss": 0.83975017, + "learning_rate": 3.919165398222265e-06, + "loss": 0.85981333, + "num_input_tokens_seen": 42539420, + "step": 1964, + "time_per_iteration": 3.9737255573272705 + }, + { + "auxiliary_loss_clip": 0.01115898, + "auxiliary_loss_mlp": 0.01064274, + "balance_loss_clip": 1.05253851, + "balance_loss_mlp": 1.04433072, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.8318993761861306, + "language_loss": 0.83347541, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85527706, + "num_input_tokens_seen": 42558225, + "step": 1965, + "time_per_iteration": 2.6023194789886475 + }, + { + "auxiliary_loss_clip": 0.01164829, + "auxiliary_loss_mlp": 0.01053371, + "balance_loss_clip": 1.04963541, + "balance_loss_mlp": 1.03345108, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.6328952453432575, + "language_loss": 0.74504232, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76722431, + "num_input_tokens_seen": 42580790, + "step": 1966, + "time_per_iteration": 2.704552412033081 + }, + { + "auxiliary_loss_clip": 0.0114806, + "auxiliary_loss_mlp": 0.01054517, + "balance_loss_clip": 1.05682302, + "balance_loss_mlp": 1.03465712, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 5.638078656269434, + "language_loss": 0.7306813, + "learning_rate": 3.918836255889908e-06, + "loss": 0.75270706, + "num_input_tokens_seen": 42597355, + "step": 1967, + "time_per_iteration": 3.896916389465332 + }, + { + "auxiliary_loss_clip": 0.01150651, + "auxiliary_loss_mlp": 0.01054334, + "balance_loss_clip": 1.05035746, + "balance_loss_mlp": 1.03461695, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.2987294741986934, + "language_loss": 0.88442409, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90647399, + "num_input_tokens_seen": 42616060, + "step": 1968, + "time_per_iteration": 2.516496181488037 + }, + { + "auxiliary_loss_clip": 0.01148844, + "auxiliary_loss_mlp": 0.01049827, + "balance_loss_clip": 1.05349922, + "balance_loss_mlp": 1.0296452, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 1.8660465561385662, + "language_loss": 0.66790843, + "learning_rate": 3.918616463849087e-06, + "loss": 0.68989515, + "num_input_tokens_seen": 42636285, + "step": 1969, + "time_per_iteration": 2.551422595977783 + }, + { + "auxiliary_loss_clip": 0.01118416, + "auxiliary_loss_mlp": 0.01053008, + "balance_loss_clip": 1.04925656, + "balance_loss_mlp": 1.03170514, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.119612902792752, + "language_loss": 0.80773586, + "learning_rate": 3.918506458695399e-06, + "loss": 0.82945007, + "num_input_tokens_seen": 42658320, + "step": 1970, + "time_per_iteration": 2.6891331672668457 + }, + { + "auxiliary_loss_clip": 0.01077612, + "auxiliary_loss_mlp": 0.01005543, + "balance_loss_clip": 1.03888416, + "balance_loss_mlp": 1.00264633, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.7981344684973836, + "language_loss": 0.66166365, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68249524, + "num_input_tokens_seen": 42721500, + "step": 1971, + "time_per_iteration": 4.485416889190674 + }, + { + "auxiliary_loss_clip": 0.0113963, + "auxiliary_loss_mlp": 0.01048706, + "balance_loss_clip": 1.05109584, + "balance_loss_mlp": 1.02910781, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 1.9768675251710752, + "language_loss": 0.79809463, + "learning_rate": 3.918286230142327e-06, + "loss": 0.819978, + "num_input_tokens_seen": 42739825, + "step": 1972, + "time_per_iteration": 2.5836904048919678 + }, + { + "auxiliary_loss_clip": 0.01121804, + "auxiliary_loss_mlp": 0.00929307, + "balance_loss_clip": 1.05085468, + "balance_loss_mlp": 1.22933614, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.4421955258219517, + "language_loss": 0.72831452, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74882561, + "num_input_tokens_seen": 42758695, + "step": 1973, + "time_per_iteration": 2.616840362548828 + }, + { + "auxiliary_loss_clip": 0.01126376, + "auxiliary_loss_mlp": 0.01045729, + "balance_loss_clip": 1.05222845, + "balance_loss_mlp": 1.02566659, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 2.46406893503361, + "language_loss": 0.72072142, + "learning_rate": 3.918065710622832e-06, + "loss": 0.74244249, + "num_input_tokens_seen": 42778510, + "step": 1974, + "time_per_iteration": 2.6020305156707764 + }, + { + "auxiliary_loss_clip": 0.01125692, + "auxiliary_loss_mlp": 0.01038947, + "balance_loss_clip": 1.06100917, + "balance_loss_mlp": 1.01928985, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.754204847086295, + "language_loss": 0.78464663, + "learning_rate": 3.917955341761128e-06, + "loss": 0.80629301, + "num_input_tokens_seen": 42793995, + "step": 1975, + "time_per_iteration": 2.599945068359375 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_clip": 1.0528934, + "balance_loss_mlp": 1.03281081, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.430186128702226, + "language_loss": 0.75286448, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77454937, + "num_input_tokens_seen": 42809000, + "step": 1976, + "time_per_iteration": 2.5963006019592285 + }, + { + "auxiliary_loss_clip": 0.0115275, + "auxiliary_loss_mlp": 0.01046629, + "balance_loss_clip": 1.05091858, + "balance_loss_mlp": 1.02759171, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.5094173278239351, + "language_loss": 0.75177848, + "learning_rate": 3.91773438585473e-06, + "loss": 0.7737723, + "num_input_tokens_seen": 42831585, + "step": 1977, + "time_per_iteration": 2.591916561126709 + }, + { + "auxiliary_loss_clip": 0.01165803, + "auxiliary_loss_mlp": 0.01052229, + "balance_loss_clip": 1.0515132, + "balance_loss_mlp": 1.03295279, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 2.1774149384139125, + "language_loss": 0.73740005, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.75958037, + "num_input_tokens_seen": 42848420, + "step": 1978, + "time_per_iteration": 2.4662814140319824 + }, + { + "auxiliary_loss_clip": 0.01128392, + "auxiliary_loss_mlp": 0.0105097, + "balance_loss_clip": 1.05591011, + "balance_loss_mlp": 1.03145552, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.7395930735324432, + "language_loss": 0.73378623, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75557983, + "num_input_tokens_seen": 42866645, + "step": 1979, + "time_per_iteration": 2.5810251235961914 + }, + { + "auxiliary_loss_clip": 0.01122549, + "auxiliary_loss_mlp": 0.01046205, + "balance_loss_clip": 1.05192518, + "balance_loss_mlp": 1.02744186, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 2.2129578082359354, + "language_loss": 0.98480397, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00649142, + "num_input_tokens_seen": 42888515, + "step": 1980, + "time_per_iteration": 2.6522841453552246 + }, + { + "auxiliary_loss_clip": 0.01143052, + "auxiliary_loss_mlp": 0.01051718, + "balance_loss_clip": 1.05186379, + "balance_loss_mlp": 1.03128517, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 2.0845675367680827, + "language_loss": 0.85961068, + "learning_rate": 3.917291601427342e-06, + "loss": 0.8815583, + "num_input_tokens_seen": 42909035, + "step": 1981, + "time_per_iteration": 2.569117784500122 + }, + { + "auxiliary_loss_clip": 0.01143128, + "auxiliary_loss_mlp": 0.01059039, + "balance_loss_clip": 1.0519383, + "balance_loss_mlp": 1.03826094, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.9283616913701151, + "language_loss": 0.85265642, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87467808, + "num_input_tokens_seen": 42927555, + "step": 1982, + "time_per_iteration": 2.5797674655914307 + }, + { + "auxiliary_loss_clip": 0.01133814, + "auxiliary_loss_mlp": 0.01046581, + "balance_loss_clip": 1.05236936, + "balance_loss_mlp": 1.02756715, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 2.437038735480931, + "language_loss": 0.84974194, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87154585, + "num_input_tokens_seen": 42945300, + "step": 1983, + "time_per_iteration": 2.5384044647216797 + }, + { + "auxiliary_loss_clip": 0.01115105, + "auxiliary_loss_mlp": 0.01049909, + "balance_loss_clip": 1.05551076, + "balance_loss_mlp": 1.03040636, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.5431422392311105, + "language_loss": 0.77307051, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79472065, + "num_input_tokens_seen": 42961295, + "step": 1984, + "time_per_iteration": 2.5942463874816895 + }, + { + "auxiliary_loss_clip": 0.01147611, + "auxiliary_loss_mlp": 0.0104995, + "balance_loss_clip": 1.05121756, + "balance_loss_mlp": 1.03123462, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.8131537365760881, + "language_loss": 0.83009195, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85206747, + "num_input_tokens_seen": 42980330, + "step": 1985, + "time_per_iteration": 2.5186705589294434 + }, + { + "auxiliary_loss_clip": 0.0114665, + "auxiliary_loss_mlp": 0.01042697, + "balance_loss_clip": 1.05732918, + "balance_loss_mlp": 1.02324188, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 2.162651561195976, + "language_loss": 0.74525714, + "learning_rate": 3.916736485087216e-06, + "loss": 0.76715064, + "num_input_tokens_seen": 42996125, + "step": 1986, + "time_per_iteration": 2.501781940460205 + }, + { + "auxiliary_loss_clip": 0.01133479, + "auxiliary_loss_mlp": 0.01040915, + "balance_loss_clip": 1.04895854, + "balance_loss_mlp": 1.0223304, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 1.9351143957200532, + "language_loss": 0.72062194, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74236584, + "num_input_tokens_seen": 43014180, + "step": 1987, + "time_per_iteration": 2.5986461639404297 + }, + { + "auxiliary_loss_clip": 0.01142069, + "auxiliary_loss_mlp": 0.01047694, + "balance_loss_clip": 1.04907751, + "balance_loss_mlp": 1.02751207, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 2.6499638490411925, + "language_loss": 0.71660423, + "learning_rate": 3.916513929741799e-06, + "loss": 0.73850179, + "num_input_tokens_seen": 43032120, + "step": 1988, + "time_per_iteration": 2.5340828895568848 + }, + { + "auxiliary_loss_clip": 0.0115255, + "auxiliary_loss_mlp": 0.01052999, + "balance_loss_clip": 1.05189526, + "balance_loss_mlp": 1.03189933, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 1.8207622465953963, + "language_loss": 0.80927891, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83133435, + "num_input_tokens_seen": 43052215, + "step": 1989, + "time_per_iteration": 2.548616886138916 + }, + { + "auxiliary_loss_clip": 0.01123967, + "auxiliary_loss_mlp": 0.01055039, + "balance_loss_clip": 1.04762232, + "balance_loss_mlp": 1.03345013, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.6812235165936897, + "language_loss": 0.75735188, + "learning_rate": 3.916291083698784e-06, + "loss": 0.77914196, + "num_input_tokens_seen": 43069720, + "step": 1990, + "time_per_iteration": 2.5588481426239014 + }, + { + "auxiliary_loss_clip": 0.01070776, + "auxiliary_loss_mlp": 0.01011814, + "balance_loss_clip": 1.04346037, + "balance_loss_mlp": 1.00857151, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.85324638721916, + "language_loss": 0.55302262, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57384849, + "num_input_tokens_seen": 43123130, + "step": 1991, + "time_per_iteration": 3.1593527793884277 + }, + { + "auxiliary_loss_clip": 0.01127255, + "auxiliary_loss_mlp": 0.01049894, + "balance_loss_clip": 1.05571568, + "balance_loss_mlp": 1.03102255, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 3.788502171727554, + "language_loss": 0.78429937, + "learning_rate": 3.916067946991971e-06, + "loss": 0.80607086, + "num_input_tokens_seen": 43140015, + "step": 1992, + "time_per_iteration": 2.6007132530212402 + }, + { + "auxiliary_loss_clip": 0.01166055, + "auxiliary_loss_mlp": 0.01049861, + "balance_loss_clip": 1.05031168, + "balance_loss_mlp": 1.03012013, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 2.592766573836646, + "language_loss": 0.7918514, + "learning_rate": 3.915956269650216e-06, + "loss": 0.8140105, + "num_input_tokens_seen": 43160105, + "step": 1993, + "time_per_iteration": 2.5283327102661133 + }, + { + "auxiliary_loss_clip": 0.01117372, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_clip": 1.04796672, + "balance_loss_mlp": 1.02809572, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 3.2616393357395785, + "language_loss": 0.82154298, + "learning_rate": 3.915844519655208e-06, + "loss": 0.84319097, + "num_input_tokens_seen": 43179835, + "step": 1994, + "time_per_iteration": 2.604741096496582 + }, + { + "auxiliary_loss_clip": 0.0114549, + "auxiliary_loss_mlp": 0.01045819, + "balance_loss_clip": 1.05759418, + "balance_loss_mlp": 1.0280447, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.5427180354856422, + "language_loss": 0.89008641, + "learning_rate": 3.915732697011183e-06, + "loss": 0.91199952, + "num_input_tokens_seen": 43197210, + "step": 1995, + "time_per_iteration": 2.523365020751953 + }, + { + "auxiliary_loss_clip": 0.01139889, + "auxiliary_loss_mlp": 0.0104831, + "balance_loss_clip": 1.0534637, + "balance_loss_mlp": 1.02873611, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 1.9398591662631763, + "language_loss": 0.74066138, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76254338, + "num_input_tokens_seen": 43215050, + "step": 1996, + "time_per_iteration": 2.5914840698242188 + }, + { + "auxiliary_loss_clip": 0.01126063, + "auxiliary_loss_mlp": 0.0104736, + "balance_loss_clip": 1.04856801, + "balance_loss_mlp": 1.02749944, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 2.0618777042747203, + "language_loss": 0.8791343, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90086854, + "num_input_tokens_seen": 43233900, + "step": 1997, + "time_per_iteration": 2.5554914474487305 + }, + { + "auxiliary_loss_clip": 0.01155183, + "auxiliary_loss_mlp": 0.00886712, + "balance_loss_clip": 1.05134785, + "balance_loss_mlp": 1.15757585, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 2.136047879464062, + "language_loss": 0.79063749, + "learning_rate": 3.915396793227428e-06, + "loss": 0.81105644, + "num_input_tokens_seen": 43252105, + "step": 1998, + "time_per_iteration": 2.533999443054199 + }, + { + "auxiliary_loss_clip": 0.01152608, + "auxiliary_loss_mlp": 0.00899942, + "balance_loss_clip": 1.05016279, + "balance_loss_mlp": 1.18143344, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 2.0798024135985878, + "language_loss": 0.73231286, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75283831, + "num_input_tokens_seen": 43270315, + "step": 1999, + "time_per_iteration": 2.528099775314331 + }, + { + "auxiliary_loss_clip": 0.01165019, + "auxiliary_loss_mlp": 0.01058324, + "balance_loss_clip": 1.05215883, + "balance_loss_mlp": 1.03915489, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 2.5027874956362854, + "language_loss": 0.75097722, + "learning_rate": 3.915172494204323e-06, + "loss": 0.7732107, + "num_input_tokens_seen": 43289935, + "step": 2000, + "time_per_iteration": 3.8933932781219482 + }, + { + "auxiliary_loss_clip": 0.01141305, + "auxiliary_loss_mlp": 0.01053046, + "balance_loss_clip": 1.052248, + "balance_loss_mlp": 1.03347158, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.6104929415315563, + "language_loss": 0.84737599, + "learning_rate": 3.915060235755344e-06, + "loss": 0.8693195, + "num_input_tokens_seen": 43309325, + "step": 2001, + "time_per_iteration": 2.555734157562256 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.01053195, + "balance_loss_clip": 1.0482688, + "balance_loss_mlp": 1.03513527, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.380684526008672, + "language_loss": 0.73744106, + "learning_rate": 3.91494790468709e-06, + "loss": 0.75936371, + "num_input_tokens_seen": 43327010, + "step": 2002, + "time_per_iteration": 3.9716906547546387 + }, + { + "auxiliary_loss_clip": 0.01122159, + "auxiliary_loss_mlp": 0.01059501, + "balance_loss_clip": 1.05417776, + "balance_loss_mlp": 1.03844833, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 2.207588136758955, + "language_loss": 0.77753758, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.7993542, + "num_input_tokens_seen": 43345650, + "step": 2003, + "time_per_iteration": 2.5920002460479736 + }, + { + "auxiliary_loss_clip": 0.01148759, + "auxiliary_loss_mlp": 0.01058923, + "balance_loss_clip": 1.04940081, + "balance_loss_mlp": 1.0393374, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.5901874930960935, + "language_loss": 0.72107381, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74315059, + "num_input_tokens_seen": 43365555, + "step": 2004, + "time_per_iteration": 2.527144432067871 + }, + { + "auxiliary_loss_clip": 0.01146303, + "auxiliary_loss_mlp": 0.01062034, + "balance_loss_clip": 1.05354643, + "balance_loss_mlp": 1.04150629, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 2.0999419591616433, + "language_loss": 0.78534287, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80742621, + "num_input_tokens_seen": 43384990, + "step": 2005, + "time_per_iteration": 3.9148948192596436 + }, + { + "auxiliary_loss_clip": 0.01104684, + "auxiliary_loss_mlp": 0.01848327, + "balance_loss_clip": 1.04250169, + "balance_loss_mlp": 2.9925499, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9968664815838961, + "language_loss": 0.58042544, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60995555, + "num_input_tokens_seen": 43436335, + "step": 2006, + "time_per_iteration": 2.8950796127319336 + }, + { + "auxiliary_loss_clip": 0.01145724, + "auxiliary_loss_mlp": 0.01052454, + "balance_loss_clip": 1.05312788, + "balance_loss_mlp": 1.03388143, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 2.569168696998359, + "language_loss": 0.76575094, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78773272, + "num_input_tokens_seen": 43456495, + "step": 2007, + "time_per_iteration": 2.550290822982788 + }, + { + "auxiliary_loss_clip": 0.01135056, + "auxiliary_loss_mlp": 0.01059835, + "balance_loss_clip": 1.05314851, + "balance_loss_mlp": 1.04001057, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 2.6940490896643396, + "language_loss": 0.82868421, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85063308, + "num_input_tokens_seen": 43473085, + "step": 2008, + "time_per_iteration": 2.554098606109619 + }, + { + "auxiliary_loss_clip": 0.01164579, + "auxiliary_loss_mlp": 0.01052384, + "balance_loss_clip": 1.05145442, + "balance_loss_mlp": 1.03292906, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.2226199483690934, + "language_loss": 0.83845901, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86062872, + "num_input_tokens_seen": 43491135, + "step": 2009, + "time_per_iteration": 2.4581258296966553 + }, + { + "auxiliary_loss_clip": 0.01166548, + "auxiliary_loss_mlp": 0.01053268, + "balance_loss_clip": 1.05425644, + "balance_loss_mlp": 1.03217983, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.7177718484415285, + "language_loss": 0.84070957, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86290777, + "num_input_tokens_seen": 43510440, + "step": 2010, + "time_per_iteration": 3.87064528465271 + }, + { + "auxiliary_loss_clip": 0.01138612, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.05504799, + "balance_loss_mlp": 1.41387534, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.7532307548796595, + "language_loss": 0.83881837, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.86050117, + "num_input_tokens_seen": 43530145, + "step": 2011, + "time_per_iteration": 2.5405757427215576 + }, + { + "auxiliary_loss_clip": 0.01137946, + "auxiliary_loss_mlp": 0.01056826, + "balance_loss_clip": 1.05030715, + "balance_loss_mlp": 1.03748977, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 2.2144079040968156, + "language_loss": 0.95837605, + "learning_rate": 3.913820600882834e-06, + "loss": 0.98032367, + "num_input_tokens_seen": 43549315, + "step": 2012, + "time_per_iteration": 2.6060707569122314 + }, + { + "auxiliary_loss_clip": 0.01135881, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_clip": 1.05440927, + "balance_loss_mlp": 1.0259819, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.1417363623006276, + "language_loss": 0.80387414, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82569349, + "num_input_tokens_seen": 43569240, + "step": 2013, + "time_per_iteration": 2.6297709941864014 + }, + { + "auxiliary_loss_clip": 0.0111938, + "auxiliary_loss_mlp": 0.0104825, + "balance_loss_clip": 1.05389166, + "balance_loss_mlp": 1.02757943, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 4.844003660159937, + "language_loss": 0.77253908, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79421532, + "num_input_tokens_seen": 43587710, + "step": 2014, + "time_per_iteration": 2.6008076667785645 + }, + { + "auxiliary_loss_clip": 0.01155933, + "auxiliary_loss_mlp": 0.01043335, + "balance_loss_clip": 1.05310178, + "balance_loss_mlp": 1.02368963, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 5.295026242165726, + "language_loss": 0.87172776, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89372045, + "num_input_tokens_seen": 43606000, + "step": 2015, + "time_per_iteration": 2.546238660812378 + }, + { + "auxiliary_loss_clip": 0.011604, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.05028927, + "balance_loss_mlp": 1.02610123, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 3.7349479964449728, + "language_loss": 0.69643474, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71848822, + "num_input_tokens_seen": 43624815, + "step": 2016, + "time_per_iteration": 2.4829280376434326 + }, + { + "auxiliary_loss_clip": 0.01146932, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.05671573, + "balance_loss_mlp": 1.01837039, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 2.6979832366887955, + "language_loss": 0.8057881, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82765305, + "num_input_tokens_seen": 43643960, + "step": 2017, + "time_per_iteration": 2.5871660709381104 + }, + { + "auxiliary_loss_clip": 0.01149195, + "auxiliary_loss_mlp": 0.01041434, + "balance_loss_clip": 1.05301881, + "balance_loss_mlp": 1.0208106, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.3934647816737575, + "language_loss": 0.69396234, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71586859, + "num_input_tokens_seen": 43662650, + "step": 2018, + "time_per_iteration": 2.500706195831299 + }, + { + "auxiliary_loss_clip": 0.01135758, + "auxiliary_loss_mlp": 0.01047637, + "balance_loss_clip": 1.06123447, + "balance_loss_mlp": 1.02858734, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 1.7490491637023857, + "language_loss": 0.72828591, + "learning_rate": 3.91302716991575e-06, + "loss": 0.75011981, + "num_input_tokens_seen": 43684205, + "step": 2019, + "time_per_iteration": 2.651951313018799 + }, + { + "auxiliary_loss_clip": 0.01105969, + "auxiliary_loss_mlp": 0.01051963, + "balance_loss_clip": 1.06014538, + "balance_loss_mlp": 1.03184021, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.90653374969136, + "language_loss": 0.91982889, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94140828, + "num_input_tokens_seen": 43706320, + "step": 2020, + "time_per_iteration": 2.6866672039031982 + }, + { + "auxiliary_loss_clip": 0.01139428, + "auxiliary_loss_mlp": 0.01047248, + "balance_loss_clip": 1.05560911, + "balance_loss_mlp": 1.02759027, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 2.203851537745432, + "language_loss": 0.77801257, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79987931, + "num_input_tokens_seen": 43724805, + "step": 2021, + "time_per_iteration": 2.5806519985198975 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01045934, + "balance_loss_clip": 1.05442858, + "balance_loss_mlp": 1.02689672, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.7776771253090553, + "language_loss": 0.80865026, + "learning_rate": 3.912686039853952e-06, + "loss": 0.83076894, + "num_input_tokens_seen": 43742320, + "step": 2022, + "time_per_iteration": 2.4969534873962402 + }, + { + "auxiliary_loss_clip": 0.01142376, + "auxiliary_loss_mlp": 0.01058877, + "balance_loss_clip": 1.05785167, + "balance_loss_mlp": 1.0372045, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.944793233016679, + "language_loss": 0.8533293, + "learning_rate": 3.912572184769108e-06, + "loss": 0.87534183, + "num_input_tokens_seen": 43760665, + "step": 2023, + "time_per_iteration": 2.5711355209350586 + }, + { + "auxiliary_loss_clip": 0.0113488, + "auxiliary_loss_mlp": 0.01049064, + "balance_loss_clip": 1.05145597, + "balance_loss_mlp": 1.02866697, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.199542277006388, + "language_loss": 0.85357249, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87541187, + "num_input_tokens_seen": 43779020, + "step": 2024, + "time_per_iteration": 2.5757813453674316 + }, + { + "auxiliary_loss_clip": 0.01164089, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.05038691, + "balance_loss_mlp": 1.02835703, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 2.5427062198664836, + "language_loss": 0.72016573, + "learning_rate": 3.912344257028954e-06, + "loss": 0.74228334, + "num_input_tokens_seen": 43798850, + "step": 2025, + "time_per_iteration": 2.5489659309387207 + }, + { + "auxiliary_loss_clip": 0.0114262, + "auxiliary_loss_mlp": 0.01044947, + "balance_loss_clip": 1.05232847, + "balance_loss_mlp": 1.02551651, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 1.6221642400268326, + "language_loss": 0.76204312, + "learning_rate": 3.912230184382286e-06, + "loss": 0.78391886, + "num_input_tokens_seen": 43820130, + "step": 2026, + "time_per_iteration": 2.6011173725128174 + }, + { + "auxiliary_loss_clip": 0.01143951, + "auxiliary_loss_mlp": 0.01040691, + "balance_loss_clip": 1.0517664, + "balance_loss_mlp": 1.02110493, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.5369581138655373, + "language_loss": 0.89190024, + "learning_rate": 3.912116039223659e-06, + "loss": 0.91374665, + "num_input_tokens_seen": 43838485, + "step": 2027, + "time_per_iteration": 2.546464204788208 + }, + { + "auxiliary_loss_clip": 0.01139269, + "auxiliary_loss_mlp": 0.01043713, + "balance_loss_clip": 1.05189896, + "balance_loss_mlp": 1.02559352, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 1.9321298905541324, + "language_loss": 0.75699538, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77882516, + "num_input_tokens_seen": 43859080, + "step": 2028, + "time_per_iteration": 2.5826075077056885 + }, + { + "auxiliary_loss_clip": 0.01122344, + "auxiliary_loss_mlp": 0.01050271, + "balance_loss_clip": 1.05485964, + "balance_loss_mlp": 1.03056598, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.5361943394995876, + "language_loss": 0.76769638, + "learning_rate": 3.911887531387839e-06, + "loss": 0.78942251, + "num_input_tokens_seen": 43879030, + "step": 2029, + "time_per_iteration": 2.6352481842041016 + }, + { + "auxiliary_loss_clip": 0.01153738, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_clip": 1.0537082, + "balance_loss_mlp": 1.02462959, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 2.509319266169487, + "language_loss": 0.79294682, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81492287, + "num_input_tokens_seen": 43898505, + "step": 2030, + "time_per_iteration": 2.530721426010132 + }, + { + "auxiliary_loss_clip": 0.01164905, + "auxiliary_loss_mlp": 0.01041792, + "balance_loss_clip": 1.05478275, + "balance_loss_mlp": 1.02155006, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 6.679680701791425, + "language_loss": 0.74412334, + "learning_rate": 3.911658733556155e-06, + "loss": 0.76619029, + "num_input_tokens_seen": 43917945, + "step": 2031, + "time_per_iteration": 2.535252571105957 + }, + { + "auxiliary_loss_clip": 0.01171382, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.05961561, + "balance_loss_mlp": 1.02220356, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.8191351815347432, + "language_loss": 0.75424182, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77635658, + "num_input_tokens_seen": 43937385, + "step": 2032, + "time_per_iteration": 2.4712228775024414 + }, + { + "auxiliary_loss_clip": 0.01152064, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.05475378, + "balance_loss_mlp": 1.02122688, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.8251130328847562, + "language_loss": 0.89156616, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91348064, + "num_input_tokens_seen": 43958130, + "step": 2033, + "time_per_iteration": 2.6003806591033936 + }, + { + "auxiliary_loss_clip": 0.01152469, + "auxiliary_loss_mlp": 0.01043322, + "balance_loss_clip": 1.05758834, + "balance_loss_mlp": 1.02474928, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.124365429327312, + "language_loss": 0.65382642, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67578429, + "num_input_tokens_seen": 43976800, + "step": 2034, + "time_per_iteration": 2.5123090744018555 + }, + { + "auxiliary_loss_clip": 0.01145604, + "auxiliary_loss_mlp": 0.01046437, + "balance_loss_clip": 1.05338168, + "balance_loss_mlp": 1.02614737, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.6292373445009074, + "language_loss": 0.76602161, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78794199, + "num_input_tokens_seen": 43996620, + "step": 2035, + "time_per_iteration": 2.55924391746521 + }, + { + "auxiliary_loss_clip": 0.01169022, + "auxiliary_loss_mlp": 0.01042946, + "balance_loss_clip": 1.05481398, + "balance_loss_mlp": 1.02310944, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.9166182489866401, + "language_loss": 0.71420723, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73632693, + "num_input_tokens_seen": 44016175, + "step": 2036, + "time_per_iteration": 2.469261646270752 + }, + { + "auxiliary_loss_clip": 0.0113895, + "auxiliary_loss_mlp": 0.01049968, + "balance_loss_clip": 1.05719376, + "balance_loss_mlp": 1.029881, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.6165526719515169, + "language_loss": 0.83160383, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85349298, + "num_input_tokens_seen": 44035060, + "step": 2037, + "time_per_iteration": 2.556770086288452 + }, + { + "auxiliary_loss_clip": 0.01148078, + "auxiliary_loss_mlp": 0.01050501, + "balance_loss_clip": 1.05506635, + "balance_loss_mlp": 1.02925777, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 3.314865382894265, + "language_loss": 0.80046469, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82245046, + "num_input_tokens_seen": 44053330, + "step": 2038, + "time_per_iteration": 2.574760913848877 + }, + { + "auxiliary_loss_clip": 0.0109981, + "auxiliary_loss_mlp": 0.02297184, + "balance_loss_clip": 1.04865754, + "balance_loss_mlp": 3.76881576, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.8298302125395872, + "language_loss": 0.58661234, + "learning_rate": 3.910740642965518e-06, + "loss": 0.62058234, + "num_input_tokens_seen": 44107575, + "step": 2039, + "time_per_iteration": 4.402273178100586 + }, + { + "auxiliary_loss_clip": 0.01122483, + "auxiliary_loss_mlp": 0.01060125, + "balance_loss_clip": 1.05077159, + "balance_loss_mlp": 1.03739202, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.028054528905554, + "language_loss": 0.8052513, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82707739, + "num_input_tokens_seen": 44126075, + "step": 2040, + "time_per_iteration": 2.565009355545044 + }, + { + "auxiliary_loss_clip": 0.01140837, + "auxiliary_loss_mlp": 0.01049336, + "balance_loss_clip": 1.05085492, + "balance_loss_mlp": 1.02972603, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 2.1998711227052645, + "language_loss": 0.82817101, + "learning_rate": 3.910510395675953e-06, + "loss": 0.8500728, + "num_input_tokens_seen": 44145605, + "step": 2041, + "time_per_iteration": 3.923802137374878 + }, + { + "auxiliary_loss_clip": 0.01132261, + "auxiliary_loss_mlp": 0.01049405, + "balance_loss_clip": 1.05308533, + "balance_loss_mlp": 1.02789927, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.602312885762978, + "language_loss": 0.67034459, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69216132, + "num_input_tokens_seen": 44164770, + "step": 2042, + "time_per_iteration": 2.540523052215576 + }, + { + "auxiliary_loss_clip": 0.01132894, + "auxiliary_loss_mlp": 0.01046926, + "balance_loss_clip": 1.0547266, + "balance_loss_mlp": 1.02716136, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 3.151065764697575, + "language_loss": 0.81705689, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83885503, + "num_input_tokens_seen": 44184025, + "step": 2043, + "time_per_iteration": 3.9773476123809814 + }, + { + "auxiliary_loss_clip": 0.01148156, + "auxiliary_loss_mlp": 0.01045878, + "balance_loss_clip": 1.05534959, + "balance_loss_mlp": 1.02523088, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 2.2910606504174034, + "language_loss": 0.80318296, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82512325, + "num_input_tokens_seen": 44202950, + "step": 2044, + "time_per_iteration": 2.516981363296509 + }, + { + "auxiliary_loss_clip": 0.01115025, + "auxiliary_loss_mlp": 0.01048872, + "balance_loss_clip": 1.05652118, + "balance_loss_mlp": 1.02928591, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.8823251832660108, + "language_loss": 0.78177607, + "learning_rate": 3.910049031770853e-06, + "loss": 0.80341506, + "num_input_tokens_seen": 44221115, + "step": 2045, + "time_per_iteration": 2.672640085220337 + }, + { + "auxiliary_loss_clip": 0.01160149, + "auxiliary_loss_mlp": 0.01056657, + "balance_loss_clip": 1.05975103, + "balance_loss_mlp": 1.03611755, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 2.838984674348378, + "language_loss": 0.67325813, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69542623, + "num_input_tokens_seen": 44240575, + "step": 2046, + "time_per_iteration": 2.52974796295166 + }, + { + "auxiliary_loss_clip": 0.01171241, + "auxiliary_loss_mlp": 0.01050857, + "balance_loss_clip": 1.06041861, + "balance_loss_mlp": 1.03186655, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.3305262084138203, + "language_loss": 0.72482026, + "learning_rate": 3.909817915225297e-06, + "loss": 0.74704123, + "num_input_tokens_seen": 44257145, + "step": 2047, + "time_per_iteration": 2.5194969177246094 + }, + { + "auxiliary_loss_clip": 0.01152372, + "auxiliary_loss_mlp": 0.0105406, + "balance_loss_clip": 1.05485559, + "balance_loss_mlp": 1.03276896, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 2.084616334700215, + "language_loss": 0.76772887, + "learning_rate": 3.909702248319597e-06, + "loss": 0.78979313, + "num_input_tokens_seen": 44278035, + "step": 2048, + "time_per_iteration": 3.958188056945801 + }, + { + "auxiliary_loss_clip": 0.01141879, + "auxiliary_loss_mlp": 0.01041745, + "balance_loss_clip": 1.05402565, + "balance_loss_mlp": 1.02445328, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 2.1918369811742435, + "language_loss": 0.8533715, + "learning_rate": 3.909586508997797e-06, + "loss": 0.87520778, + "num_input_tokens_seen": 44296980, + "step": 2049, + "time_per_iteration": 2.571837902069092 + }, + { + "auxiliary_loss_clip": 0.01118971, + "auxiliary_loss_mlp": 0.01048179, + "balance_loss_clip": 1.05910885, + "balance_loss_mlp": 1.02890253, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 1.7493574162631031, + "language_loss": 0.75288624, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77455777, + "num_input_tokens_seen": 44318005, + "step": 2050, + "time_per_iteration": 2.675042152404785 + }, + { + "auxiliary_loss_clip": 0.01132009, + "auxiliary_loss_mlp": 0.01044045, + "balance_loss_clip": 1.05339479, + "balance_loss_mlp": 1.02435136, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 2.68311152473532, + "language_loss": 0.80793393, + "learning_rate": 3.909354813123452e-06, + "loss": 0.82969439, + "num_input_tokens_seen": 44335260, + "step": 2051, + "time_per_iteration": 2.602168083190918 + }, + { + "auxiliary_loss_clip": 0.01169126, + "auxiliary_loss_mlp": 0.010546, + "balance_loss_clip": 1.0594542, + "balance_loss_mlp": 1.4473834, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 2.175772641455349, + "language_loss": 0.80037475, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82261199, + "num_input_tokens_seen": 44355315, + "step": 2052, + "time_per_iteration": 2.545330762863159 + }, + { + "auxiliary_loss_clip": 0.01157949, + "auxiliary_loss_mlp": 0.01049516, + "balance_loss_clip": 1.0543642, + "balance_loss_mlp": 1.02904773, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.51064261979418, + "language_loss": 0.73530626, + "learning_rate": 3.909122827637406e-06, + "loss": 0.75738096, + "num_input_tokens_seen": 44373020, + "step": 2053, + "time_per_iteration": 2.5315303802490234 + }, + { + "auxiliary_loss_clip": 0.01168074, + "auxiliary_loss_mlp": 0.01048336, + "balance_loss_clip": 1.05406356, + "balance_loss_mlp": 1.43379021, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 1.546647524278449, + "language_loss": 0.7417475, + "learning_rate": 3.909006726300991e-06, + "loss": 0.7639116, + "num_input_tokens_seen": 44397525, + "step": 2054, + "time_per_iteration": 2.733426809310913 + }, + { + "auxiliary_loss_clip": 0.01142511, + "auxiliary_loss_mlp": 0.01038066, + "balance_loss_clip": 1.05202544, + "balance_loss_mlp": 1.02039886, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 2.2338720709342867, + "language_loss": 0.84679157, + "learning_rate": 3.908890552574849e-06, + "loss": 0.86859727, + "num_input_tokens_seen": 44415890, + "step": 2055, + "time_per_iteration": 2.553285837173462 + }, + { + "auxiliary_loss_clip": 0.01127021, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_clip": 1.05633116, + "balance_loss_mlp": 1.02625632, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 2.199578139863847, + "language_loss": 0.77617639, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79788959, + "num_input_tokens_seen": 44436625, + "step": 2056, + "time_per_iteration": 2.6780505180358887 + }, + { + "auxiliary_loss_clip": 0.01154444, + "auxiliary_loss_mlp": 0.01046755, + "balance_loss_clip": 1.05323362, + "balance_loss_mlp": 1.02750301, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 2.1076683935805205, + "language_loss": 0.82466829, + "learning_rate": 3.908657987971009e-06, + "loss": 0.84668034, + "num_input_tokens_seen": 44455265, + "step": 2057, + "time_per_iteration": 2.5323843955993652 + }, + { + "auxiliary_loss_clip": 0.01142672, + "auxiliary_loss_mlp": 0.01047011, + "balance_loss_clip": 1.05445826, + "balance_loss_mlp": 1.02718616, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 2.671671913417777, + "language_loss": 0.78128791, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80318475, + "num_input_tokens_seen": 44475815, + "step": 2058, + "time_per_iteration": 2.5779802799224854 + }, + { + "auxiliary_loss_clip": 0.01136224, + "auxiliary_loss_mlp": 0.01050054, + "balance_loss_clip": 1.05464125, + "balance_loss_mlp": 1.02920473, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.3898816353296444, + "language_loss": 0.83420825, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85607105, + "num_input_tokens_seen": 44494045, + "step": 2059, + "time_per_iteration": 2.513643264770508 + }, + { + "auxiliary_loss_clip": 0.01132335, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.05538714, + "balance_loss_mlp": 1.0299077, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.336095641790558, + "language_loss": 0.8126924, + "learning_rate": 3.908308598252523e-06, + "loss": 0.8345257, + "num_input_tokens_seen": 44509120, + "step": 2060, + "time_per_iteration": 2.5674915313720703 + }, + { + "auxiliary_loss_clip": 0.01148402, + "auxiliary_loss_mlp": 0.01046752, + "balance_loss_clip": 1.05321133, + "balance_loss_mlp": 1.02646303, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 2.821716728813867, + "language_loss": 0.86156619, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88351774, + "num_input_tokens_seen": 44525780, + "step": 2061, + "time_per_iteration": 2.5107247829437256 + }, + { + "auxiliary_loss_clip": 0.01151831, + "auxiliary_loss_mlp": 0.01041773, + "balance_loss_clip": 1.05409372, + "balance_loss_mlp": 1.02334356, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 1.9692461060430528, + "language_loss": 0.85043037, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87236643, + "num_input_tokens_seen": 44543125, + "step": 2062, + "time_per_iteration": 2.505032539367676 + }, + { + "auxiliary_loss_clip": 0.01119588, + "auxiliary_loss_mlp": 0.01051273, + "balance_loss_clip": 1.05261898, + "balance_loss_mlp": 1.03072178, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 1.8902696648397572, + "language_loss": 0.78696734, + "learning_rate": 3.907958557264774e-06, + "loss": 0.80867594, + "num_input_tokens_seen": 44560275, + "step": 2063, + "time_per_iteration": 2.5467782020568848 + }, + { + "auxiliary_loss_clip": 0.01119857, + "auxiliary_loss_mlp": 0.01047845, + "balance_loss_clip": 1.05567157, + "balance_loss_mlp": 1.02728176, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.231831592999023, + "language_loss": 0.79203665, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81371373, + "num_input_tokens_seen": 44577640, + "step": 2064, + "time_per_iteration": 2.6105082035064697 + }, + { + "auxiliary_loss_clip": 0.01141314, + "auxiliary_loss_mlp": 0.01047336, + "balance_loss_clip": 1.05447388, + "balance_loss_mlp": 1.02870321, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.3362225499568727, + "language_loss": 0.92701232, + "learning_rate": 3.907724834849002e-06, + "loss": 0.94889879, + "num_input_tokens_seen": 44594860, + "step": 2065, + "time_per_iteration": 2.561166286468506 + }, + { + "auxiliary_loss_clip": 0.01145169, + "auxiliary_loss_mlp": 0.01044162, + "balance_loss_clip": 1.05176854, + "balance_loss_mlp": 1.0242182, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.8006732441293682, + "language_loss": 0.80863726, + "learning_rate": 3.907607865127225e-06, + "loss": 0.83053052, + "num_input_tokens_seen": 44614780, + "step": 2066, + "time_per_iteration": 2.5636234283447266 + }, + { + "auxiliary_loss_clip": 0.01058435, + "auxiliary_loss_mlp": 0.01012389, + "balance_loss_clip": 1.04296637, + "balance_loss_mlp": 1.009027, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8683677350965583, + "language_loss": 0.63308573, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65379393, + "num_input_tokens_seen": 44671240, + "step": 2067, + "time_per_iteration": 3.1505072116851807 + }, + { + "auxiliary_loss_clip": 0.01112822, + "auxiliary_loss_mlp": 0.01055078, + "balance_loss_clip": 1.04787898, + "balance_loss_mlp": 1.03382301, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 2.5337523285753782, + "language_loss": 0.93499005, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95666903, + "num_input_tokens_seen": 44691050, + "step": 2068, + "time_per_iteration": 2.630073308944702 + }, + { + "auxiliary_loss_clip": 0.01158964, + "auxiliary_loss_mlp": 0.01045415, + "balance_loss_clip": 1.05563402, + "balance_loss_mlp": 1.02755725, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.997201416403602, + "language_loss": 0.80782008, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.82986391, + "num_input_tokens_seen": 44709850, + "step": 2069, + "time_per_iteration": 2.542318344116211 + }, + { + "auxiliary_loss_clip": 0.01115218, + "auxiliary_loss_mlp": 0.01051681, + "balance_loss_clip": 1.05384183, + "balance_loss_mlp": 1.03059244, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.704206229940813, + "language_loss": 0.77611697, + "learning_rate": 3.907139262917696e-06, + "loss": 0.797786, + "num_input_tokens_seen": 44731475, + "step": 2070, + "time_per_iteration": 2.6355955600738525 + }, + { + "auxiliary_loss_clip": 0.01152462, + "auxiliary_loss_mlp": 0.01045458, + "balance_loss_clip": 1.05397284, + "balance_loss_mlp": 1.02576458, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.317248202191959, + "language_loss": 0.80648828, + "learning_rate": 3.907021931556922e-06, + "loss": 0.82846749, + "num_input_tokens_seen": 44749685, + "step": 2071, + "time_per_iteration": 2.5311942100524902 + }, + { + "auxiliary_loss_clip": 0.01154299, + "auxiliary_loss_mlp": 0.01049257, + "balance_loss_clip": 1.05731249, + "balance_loss_mlp": 1.028705, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.8675846256044368, + "language_loss": 0.78215098, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80418658, + "num_input_tokens_seen": 44772165, + "step": 2072, + "time_per_iteration": 2.643401861190796 + }, + { + "auxiliary_loss_clip": 0.01145701, + "auxiliary_loss_mlp": 0.01049578, + "balance_loss_clip": 1.06064439, + "balance_loss_mlp": 1.03096914, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 1.8229890460224134, + "language_loss": 0.75211215, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.7740649, + "num_input_tokens_seen": 44790580, + "step": 2073, + "time_per_iteration": 2.563952684402466 + }, + { + "auxiliary_loss_clip": 0.01095515, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.04759598, + "balance_loss_mlp": 1.02779913, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 2.0871559588633, + "language_loss": 0.90724468, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92868221, + "num_input_tokens_seen": 44806730, + "step": 2074, + "time_per_iteration": 2.5923633575439453 + }, + { + "auxiliary_loss_clip": 0.01102493, + "auxiliary_loss_mlp": 0.01052313, + "balance_loss_clip": 1.05211163, + "balance_loss_mlp": 1.0304023, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.5231633831015308, + "language_loss": 0.84147346, + "learning_rate": 3.906551883013728e-06, + "loss": 0.86302149, + "num_input_tokens_seen": 44825550, + "step": 2075, + "time_per_iteration": 2.6978607177734375 + }, + { + "auxiliary_loss_clip": 0.0110991, + "auxiliary_loss_mlp": 0.01055844, + "balance_loss_clip": 1.05163026, + "balance_loss_mlp": 1.03436279, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 2.1216772921760656, + "language_loss": 0.73464394, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.7563014, + "num_input_tokens_seen": 44844155, + "step": 2076, + "time_per_iteration": 4.018374681472778 + }, + { + "auxiliary_loss_clip": 0.01100131, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.05178523, + "balance_loss_mlp": 1.02491426, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 2.0500246983375154, + "language_loss": 0.75818098, + "learning_rate": 3.906316424944469e-06, + "loss": 0.77961993, + "num_input_tokens_seen": 44863780, + "step": 2077, + "time_per_iteration": 2.6102442741394043 + }, + { + "auxiliary_loss_clip": 0.01151145, + "auxiliary_loss_mlp": 0.01049329, + "balance_loss_clip": 1.05195582, + "balance_loss_mlp": 1.0290637, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 2.0416758803695014, + "language_loss": 0.82537973, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84738445, + "num_input_tokens_seen": 44881480, + "step": 2078, + "time_per_iteration": 2.4960215091705322 + }, + { + "auxiliary_loss_clip": 0.01140174, + "auxiliary_loss_mlp": 0.01052223, + "balance_loss_clip": 1.05389214, + "balance_loss_mlp": 1.0323745, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 1.827059966121622, + "language_loss": 0.75101686, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77294081, + "num_input_tokens_seen": 44900390, + "step": 2079, + "time_per_iteration": 3.9292311668395996 + }, + { + "auxiliary_loss_clip": 0.01163984, + "auxiliary_loss_mlp": 0.01053664, + "balance_loss_clip": 1.0576272, + "balance_loss_mlp": 1.03357697, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.1393901123622223, + "language_loss": 0.83601749, + "learning_rate": 3.905962695693935e-06, + "loss": 0.85819393, + "num_input_tokens_seen": 44920375, + "step": 2080, + "time_per_iteration": 2.548893451690674 + }, + { + "auxiliary_loss_clip": 0.01148899, + "auxiliary_loss_mlp": 0.01053071, + "balance_loss_clip": 1.05153704, + "balance_loss_mlp": 1.0335927, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.3155418447759217, + "language_loss": 0.85098481, + "learning_rate": 3.9058446413892e-06, + "loss": 0.8730045, + "num_input_tokens_seen": 44938415, + "step": 2081, + "time_per_iteration": 2.471673011779785 + }, + { + "auxiliary_loss_clip": 0.01147152, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.04788697, + "balance_loss_mlp": 1.02361798, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 8.934326014745363, + "language_loss": 0.77032465, + "learning_rate": 3.905726514814646e-06, + "loss": 0.7922225, + "num_input_tokens_seen": 44957135, + "step": 2082, + "time_per_iteration": 3.9454596042633057 + }, + { + "auxiliary_loss_clip": 0.01156564, + "auxiliary_loss_mlp": 0.01052557, + "balance_loss_clip": 1.0634383, + "balance_loss_mlp": 1.03146935, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 6.908561753227237, + "language_loss": 0.79476643, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81685764, + "num_input_tokens_seen": 44974480, + "step": 2083, + "time_per_iteration": 2.5277655124664307 + }, + { + "auxiliary_loss_clip": 0.01140811, + "auxiliary_loss_mlp": 0.01049152, + "balance_loss_clip": 1.05097342, + "balance_loss_mlp": 1.02715778, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.379155325094044, + "language_loss": 0.90332973, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.92522931, + "num_input_tokens_seen": 44990310, + "step": 2084, + "time_per_iteration": 2.501002788543701 + }, + { + "auxiliary_loss_clip": 0.01127345, + "auxiliary_loss_mlp": 0.0104866, + "balance_loss_clip": 1.05328441, + "balance_loss_mlp": 1.02882373, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 2.2239083366700547, + "language_loss": 0.80655241, + "learning_rate": 3.905371701516869e-06, + "loss": 0.8283124, + "num_input_tokens_seen": 45010720, + "step": 2085, + "time_per_iteration": 2.635072946548462 + }, + { + "auxiliary_loss_clip": 0.0116349, + "auxiliary_loss_mlp": 0.01042309, + "balance_loss_clip": 1.05333424, + "balance_loss_mlp": 1.02234137, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 3.7634063220440583, + "language_loss": 0.88145858, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90351653, + "num_input_tokens_seen": 45030360, + "step": 2086, + "time_per_iteration": 3.827904462814331 + }, + { + "auxiliary_loss_clip": 0.01130891, + "auxiliary_loss_mlp": 0.01045604, + "balance_loss_clip": 1.05413651, + "balance_loss_mlp": 1.02667332, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.1954657711089705, + "language_loss": 0.87132895, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89309394, + "num_input_tokens_seen": 45045085, + "step": 2087, + "time_per_iteration": 2.5272021293640137 + }, + { + "auxiliary_loss_clip": 0.01144125, + "auxiliary_loss_mlp": 0.01053559, + "balance_loss_clip": 1.05715442, + "balance_loss_mlp": 1.03230381, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 1.9239123910592149, + "language_loss": 0.73555183, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75752866, + "num_input_tokens_seen": 45065145, + "step": 2088, + "time_per_iteration": 2.5579521656036377 + }, + { + "auxiliary_loss_clip": 0.0107455, + "auxiliary_loss_mlp": 0.01016813, + "balance_loss_clip": 1.03255272, + "balance_loss_mlp": 1.01384473, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.7661677034651387, + "language_loss": 0.61760056, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63851422, + "num_input_tokens_seen": 45126230, + "step": 2089, + "time_per_iteration": 3.0992448329925537 + }, + { + "auxiliary_loss_clip": 0.01143473, + "auxiliary_loss_mlp": 0.01054248, + "balance_loss_clip": 1.05491865, + "balance_loss_mlp": 1.03385139, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 2.581448376273402, + "language_loss": 0.77511358, + "learning_rate": 3.904778901042793e-06, + "loss": 0.79709077, + "num_input_tokens_seen": 45145545, + "step": 2090, + "time_per_iteration": 2.572330951690674 + }, + { + "auxiliary_loss_clip": 0.01063986, + "auxiliary_loss_mlp": 0.01010217, + "balance_loss_clip": 1.04171848, + "balance_loss_mlp": 1.00732005, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.7671400189645787, + "language_loss": 0.5931133, + "learning_rate": 3.90466012424176e-06, + "loss": 0.6138553, + "num_input_tokens_seen": 45206845, + "step": 2091, + "time_per_iteration": 3.042355537414551 + }, + { + "auxiliary_loss_clip": 0.01151657, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_clip": 1.05338073, + "balance_loss_mlp": 1.026968, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.9242185867673192, + "language_loss": 0.63356638, + "learning_rate": 3.904541275215825e-06, + "loss": 0.6555419, + "num_input_tokens_seen": 45228495, + "step": 2092, + "time_per_iteration": 2.6864523887634277 + }, + { + "auxiliary_loss_clip": 0.01146589, + "auxiliary_loss_mlp": 0.01064023, + "balance_loss_clip": 1.05683959, + "balance_loss_mlp": 1.04236293, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 1.9437579895585686, + "language_loss": 0.80578882, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82789499, + "num_input_tokens_seen": 45245720, + "step": 2093, + "time_per_iteration": 2.5314667224884033 + }, + { + "auxiliary_loss_clip": 0.01143077, + "auxiliary_loss_mlp": 0.01056379, + "balance_loss_clip": 1.05196929, + "balance_loss_mlp": 1.03622079, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 2.048947932805511, + "language_loss": 0.75677121, + "learning_rate": 3.904303360507276e-06, + "loss": 0.7787658, + "num_input_tokens_seen": 45265650, + "step": 2094, + "time_per_iteration": 2.5267324447631836 + }, + { + "auxiliary_loss_clip": 0.01114022, + "auxiliary_loss_mlp": 0.01053012, + "balance_loss_clip": 1.05050802, + "balance_loss_mlp": 1.0335927, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.7120973124373875, + "language_loss": 0.77101839, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79268873, + "num_input_tokens_seen": 45287790, + "step": 2095, + "time_per_iteration": 2.7804951667785645 + }, + { + "auxiliary_loss_clip": 0.01142785, + "auxiliary_loss_mlp": 0.01053303, + "balance_loss_clip": 1.05342913, + "balance_loss_mlp": 1.03291798, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.6369227180684423, + "language_loss": 0.83098644, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85294729, + "num_input_tokens_seen": 45305720, + "step": 2096, + "time_per_iteration": 2.546971321105957 + }, + { + "auxiliary_loss_clip": 0.01156374, + "auxiliary_loss_mlp": 0.01057355, + "balance_loss_clip": 1.05493915, + "balance_loss_mlp": 1.03801894, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.7969668076072398, + "language_loss": 0.75599492, + "learning_rate": 3.903945946870439e-06, + "loss": 0.7781322, + "num_input_tokens_seen": 45325290, + "step": 2097, + "time_per_iteration": 2.5111801624298096 + }, + { + "auxiliary_loss_clip": 0.01147395, + "auxiliary_loss_mlp": 0.01060255, + "balance_loss_clip": 1.05237889, + "balance_loss_mlp": 1.04209983, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 2.1113063430883336, + "language_loss": 0.87022901, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89230549, + "num_input_tokens_seen": 45344465, + "step": 2098, + "time_per_iteration": 2.557436466217041 + }, + { + "auxiliary_loss_clip": 0.01117843, + "auxiliary_loss_mlp": 0.01059941, + "balance_loss_clip": 1.05579495, + "balance_loss_mlp": 1.037637, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 6.748844620920046, + "language_loss": 0.69801199, + "learning_rate": 3.903707310115912e-06, + "loss": 0.7197898, + "num_input_tokens_seen": 45362465, + "step": 2099, + "time_per_iteration": 2.606476068496704 + }, + { + "auxiliary_loss_clip": 0.01140503, + "auxiliary_loss_mlp": 0.01064493, + "balance_loss_clip": 1.04974389, + "balance_loss_mlp": 1.04280901, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 2.0597270704323707, + "language_loss": 0.82081306, + "learning_rate": 3.903587883453228e-06, + "loss": 0.84286302, + "num_input_tokens_seen": 45382700, + "step": 2100, + "time_per_iteration": 2.576624631881714 + }, + { + "auxiliary_loss_clip": 0.01147738, + "auxiliary_loss_mlp": 0.01055058, + "balance_loss_clip": 1.05217898, + "balance_loss_mlp": 1.03435111, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 2.1689774318871433, + "language_loss": 0.8025766, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82460451, + "num_input_tokens_seen": 45401005, + "step": 2101, + "time_per_iteration": 2.559868335723877 + }, + { + "auxiliary_loss_clip": 0.01088573, + "auxiliary_loss_mlp": 0.01064528, + "balance_loss_clip": 1.03378069, + "balance_loss_mlp": 1.06076145, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7147294253138414, + "language_loss": 0.57111919, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59265018, + "num_input_tokens_seen": 45466555, + "step": 2102, + "time_per_iteration": 3.131472587585449 + }, + { + "auxiliary_loss_clip": 0.01135201, + "auxiliary_loss_mlp": 0.01051772, + "balance_loss_clip": 1.0519712, + "balance_loss_mlp": 1.03170872, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 3.129208852872319, + "language_loss": 0.93597353, + "learning_rate": 3.903229170377845e-06, + "loss": 0.9578433, + "num_input_tokens_seen": 45485165, + "step": 2103, + "time_per_iteration": 2.564546585083008 + }, + { + "auxiliary_loss_clip": 0.01141654, + "auxiliary_loss_mlp": 0.01037378, + "balance_loss_clip": 1.05018306, + "balance_loss_mlp": 1.0186857, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 2.071818606101232, + "language_loss": 0.78215814, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80394852, + "num_input_tokens_seen": 45504630, + "step": 2104, + "time_per_iteration": 2.5594379901885986 + }, + { + "auxiliary_loss_clip": 0.01130369, + "auxiliary_loss_mlp": 0.01049038, + "balance_loss_clip": 1.05392241, + "balance_loss_mlp": 1.03062057, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.708487066566394, + "language_loss": 0.8126899, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83448398, + "num_input_tokens_seen": 45524885, + "step": 2105, + "time_per_iteration": 2.617420196533203 + }, + { + "auxiliary_loss_clip": 0.01163684, + "auxiliary_loss_mlp": 0.01050858, + "balance_loss_clip": 1.05561745, + "balance_loss_mlp": 1.02982986, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 3.349103037679372, + "language_loss": 0.83813316, + "learning_rate": 3.90286980776671e-06, + "loss": 0.86027855, + "num_input_tokens_seen": 45545000, + "step": 2106, + "time_per_iteration": 2.5255205631256104 + }, + { + "auxiliary_loss_clip": 0.01122078, + "auxiliary_loss_mlp": 0.01043973, + "balance_loss_clip": 1.05415726, + "balance_loss_mlp": 1.02427924, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 1.820627967007029, + "language_loss": 0.73757148, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75923198, + "num_input_tokens_seen": 45564210, + "step": 2107, + "time_per_iteration": 2.6430203914642334 + }, + { + "auxiliary_loss_clip": 0.01165685, + "auxiliary_loss_mlp": 0.01050436, + "balance_loss_clip": 1.05534863, + "balance_loss_mlp": 1.03132713, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 1.9955672243692584, + "language_loss": 0.79420471, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81636596, + "num_input_tokens_seen": 45583030, + "step": 2108, + "time_per_iteration": 2.507863759994507 + }, + { + "auxiliary_loss_clip": 0.01165743, + "auxiliary_loss_mlp": 0.01043587, + "balance_loss_clip": 1.05282593, + "balance_loss_mlp": 1.02350008, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 1.9866891798789112, + "language_loss": 0.7635386, + "learning_rate": 3.902509795742467e-06, + "loss": 0.7856319, + "num_input_tokens_seen": 45602265, + "step": 2109, + "time_per_iteration": 2.475743055343628 + }, + { + "auxiliary_loss_clip": 0.01111647, + "auxiliary_loss_mlp": 0.01058203, + "balance_loss_clip": 1.0493989, + "balance_loss_mlp": 1.03779411, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 2.221441469484212, + "language_loss": 0.8272413, + "learning_rate": 3.902389647441592e-06, + "loss": 0.84893978, + "num_input_tokens_seen": 45620595, + "step": 2110, + "time_per_iteration": 2.555478811264038 + }, + { + "auxiliary_loss_clip": 0.01143084, + "auxiliary_loss_mlp": 0.00979418, + "balance_loss_clip": 1.05402446, + "balance_loss_mlp": 1.29373991, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 2.450585125732985, + "language_loss": 0.78596592, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80719095, + "num_input_tokens_seen": 45641140, + "step": 2111, + "time_per_iteration": 2.5817017555236816 + }, + { + "auxiliary_loss_clip": 0.01128189, + "auxiliary_loss_mlp": 0.01066807, + "balance_loss_clip": 1.05406606, + "balance_loss_mlp": 1.04406166, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.462588267953572, + "language_loss": 0.77281225, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79476213, + "num_input_tokens_seen": 45662315, + "step": 2112, + "time_per_iteration": 2.681741237640381 + }, + { + "auxiliary_loss_clip": 0.01124859, + "auxiliary_loss_mlp": 0.01051951, + "balance_loss_clip": 1.05219257, + "balance_loss_mlp": 1.0308392, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 2.4462191951567522, + "language_loss": 0.85403985, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87580794, + "num_input_tokens_seen": 45680335, + "step": 2113, + "time_per_iteration": 2.5981204509735107 + }, + { + "auxiliary_loss_clip": 0.01128877, + "auxiliary_loss_mlp": 0.01048358, + "balance_loss_clip": 1.04941344, + "balance_loss_mlp": 1.02766299, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.369586098920705, + "language_loss": 0.7448976, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76666999, + "num_input_tokens_seen": 45696240, + "step": 2114, + "time_per_iteration": 2.5360043048858643 + }, + { + "auxiliary_loss_clip": 0.01152893, + "auxiliary_loss_mlp": 0.01055848, + "balance_loss_clip": 1.05621231, + "balance_loss_mlp": 1.03475988, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 2.821696915438223, + "language_loss": 0.83184218, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85392964, + "num_input_tokens_seen": 45713695, + "step": 2115, + "time_per_iteration": 3.8670804500579834 + }, + { + "auxiliary_loss_clip": 0.01146418, + "auxiliary_loss_mlp": 0.01049393, + "balance_loss_clip": 1.0546757, + "balance_loss_mlp": 1.02921057, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 2.0976600125182436, + "language_loss": 0.86779702, + "learning_rate": 3.901667242881065e-06, + "loss": 0.88975513, + "num_input_tokens_seen": 45736655, + "step": 2116, + "time_per_iteration": 2.61495304107666 + }, + { + "auxiliary_loss_clip": 0.01132067, + "auxiliary_loss_mlp": 0.00969611, + "balance_loss_clip": 1.04917526, + "balance_loss_mlp": 1.27845955, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 2.0081224224122627, + "language_loss": 0.7042737, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72529048, + "num_input_tokens_seen": 45758195, + "step": 2117, + "time_per_iteration": 2.6310253143310547 + }, + { + "auxiliary_loss_clip": 0.01125715, + "auxiliary_loss_mlp": 0.01052655, + "balance_loss_clip": 1.05090117, + "balance_loss_mlp": 1.02967143, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.2573735597765436, + "language_loss": 0.86778629, + "learning_rate": 3.901425864420852e-06, + "loss": 0.88957, + "num_input_tokens_seen": 45774280, + "step": 2118, + "time_per_iteration": 3.945040702819824 + }, + { + "auxiliary_loss_clip": 0.01150453, + "auxiliary_loss_mlp": 0.01046701, + "balance_loss_clip": 1.05088794, + "balance_loss_mlp": 1.02735376, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.892355164566308, + "language_loss": 0.87145126, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89342284, + "num_input_tokens_seen": 45792760, + "step": 2119, + "time_per_iteration": 2.5300886631011963 + }, + { + "auxiliary_loss_clip": 0.01144184, + "auxiliary_loss_mlp": 0.00928415, + "balance_loss_clip": 1.0526706, + "balance_loss_mlp": 1.20534086, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.161859134602549, + "language_loss": 0.87498701, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89571297, + "num_input_tokens_seen": 45804300, + "step": 2120, + "time_per_iteration": 2.469304084777832 + }, + { + "auxiliary_loss_clip": 0.01163227, + "auxiliary_loss_mlp": 0.0104043, + "balance_loss_clip": 1.05259347, + "balance_loss_mlp": 1.0211184, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 3.0201181968507833, + "language_loss": 0.75503194, + "learning_rate": 3.901063255975046e-06, + "loss": 0.7770685, + "num_input_tokens_seen": 45823780, + "step": 2121, + "time_per_iteration": 3.8766956329345703 + }, + { + "auxiliary_loss_clip": 0.01110492, + "auxiliary_loss_mlp": 0.01046142, + "balance_loss_clip": 1.04793835, + "balance_loss_mlp": 1.02593589, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 10.249906193211675, + "language_loss": 0.83058739, + "learning_rate": 3.900942242309978e-06, + "loss": 0.85215372, + "num_input_tokens_seen": 45840495, + "step": 2122, + "time_per_iteration": 2.577893018722534 + }, + { + "auxiliary_loss_clip": 0.01147898, + "auxiliary_loss_mlp": 0.0105129, + "balance_loss_clip": 1.05724728, + "balance_loss_mlp": 1.03122711, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 1.8756384973293219, + "language_loss": 0.78799856, + "learning_rate": 3.90082115656099e-06, + "loss": 0.80999047, + "num_input_tokens_seen": 45857735, + "step": 2123, + "time_per_iteration": 2.5404915809631348 + }, + { + "auxiliary_loss_clip": 0.01166016, + "auxiliary_loss_mlp": 0.01049498, + "balance_loss_clip": 1.05500889, + "balance_loss_mlp": 1.02918482, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 2.8535545380220277, + "language_loss": 0.7921719, + "learning_rate": 3.900699998732673e-06, + "loss": 0.814327, + "num_input_tokens_seen": 45876485, + "step": 2124, + "time_per_iteration": 2.477965831756592 + }, + { + "auxiliary_loss_clip": 0.01157771, + "auxiliary_loss_mlp": 0.00915889, + "balance_loss_clip": 1.05397081, + "balance_loss_mlp": 1.19300056, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 1.9454626683393996, + "language_loss": 0.75339675, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77413332, + "num_input_tokens_seen": 45894645, + "step": 2125, + "time_per_iteration": 3.912152051925659 + }, + { + "auxiliary_loss_clip": 0.01157484, + "auxiliary_loss_mlp": 0.00910421, + "balance_loss_clip": 1.05601883, + "balance_loss_mlp": 1.19101906, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.299275066377786, + "language_loss": 0.77680981, + "learning_rate": 3.900457466856434e-06, + "loss": 0.79748881, + "num_input_tokens_seen": 45913755, + "step": 2126, + "time_per_iteration": 2.548546075820923 + }, + { + "auxiliary_loss_clip": 0.01124445, + "auxiliary_loss_mlp": 0.01045847, + "balance_loss_clip": 1.05738878, + "balance_loss_mlp": 1.02672625, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.5030248867320313, + "language_loss": 0.69325984, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71496272, + "num_input_tokens_seen": 45936095, + "step": 2127, + "time_per_iteration": 2.7434451580047607 + }, + { + "auxiliary_loss_clip": 0.01044959, + "auxiliary_loss_mlp": 0.01600361, + "balance_loss_clip": 1.03847456, + "balance_loss_mlp": 2.52254343, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.9633337272223885, + "language_loss": 0.62765646, + "learning_rate": 3.900214646718047e-06, + "loss": 0.6541096, + "num_input_tokens_seen": 46004655, + "step": 2128, + "time_per_iteration": 3.2330756187438965 + }, + { + "auxiliary_loss_clip": 0.01144488, + "auxiliary_loss_mlp": 0.01045499, + "balance_loss_clip": 1.05185652, + "balance_loss_mlp": 1.0245899, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 4.406400778939859, + "language_loss": 0.77194238, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79384226, + "num_input_tokens_seen": 46023610, + "step": 2129, + "time_per_iteration": 2.49585223197937 + }, + { + "auxiliary_loss_clip": 0.01120201, + "auxiliary_loss_mlp": 0.01057829, + "balance_loss_clip": 1.05194116, + "balance_loss_mlp": 1.03579938, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.321244752293836, + "language_loss": 0.79784781, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81962812, + "num_input_tokens_seen": 46041725, + "step": 2130, + "time_per_iteration": 2.6086061000823975 + }, + { + "auxiliary_loss_clip": 0.01141912, + "auxiliary_loss_mlp": 0.01054821, + "balance_loss_clip": 1.05541718, + "balance_loss_mlp": 1.03368485, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 2.0333279291806465, + "language_loss": 0.71327096, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73523831, + "num_input_tokens_seen": 46061095, + "step": 2131, + "time_per_iteration": 2.5541210174560547 + }, + { + "auxiliary_loss_clip": 0.01101008, + "auxiliary_loss_mlp": 0.01049103, + "balance_loss_clip": 1.05238068, + "balance_loss_mlp": 1.02876592, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.3382552888801964, + "language_loss": 0.72897524, + "learning_rate": 3.899728141802197e-06, + "loss": 0.75047636, + "num_input_tokens_seen": 46082670, + "step": 2132, + "time_per_iteration": 2.7614617347717285 + }, + { + "auxiliary_loss_clip": 0.0110523, + "auxiliary_loss_mlp": 0.01054977, + "balance_loss_clip": 1.04953039, + "balance_loss_mlp": 1.03329325, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 1.9642762745602185, + "language_loss": 0.82118356, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84278566, + "num_input_tokens_seen": 46102410, + "step": 2133, + "time_per_iteration": 2.616182804107666 + }, + { + "auxiliary_loss_clip": 0.01158863, + "auxiliary_loss_mlp": 0.0105102, + "balance_loss_clip": 1.05171287, + "balance_loss_mlp": 1.03054023, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 2.7736510827666137, + "language_loss": 0.80198789, + "learning_rate": 3.899484457098528e-06, + "loss": 0.82408673, + "num_input_tokens_seen": 46121145, + "step": 2134, + "time_per_iteration": 2.5145421028137207 + }, + { + "auxiliary_loss_clip": 0.01155995, + "auxiliary_loss_mlp": 0.01049326, + "balance_loss_clip": 1.05894172, + "balance_loss_mlp": 1.0292275, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 2.6031746819427704, + "language_loss": 0.82921124, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85126442, + "num_input_tokens_seen": 46140740, + "step": 2135, + "time_per_iteration": 2.521883010864258 + }, + { + "auxiliary_loss_clip": 0.01140498, + "auxiliary_loss_mlp": 0.01051685, + "balance_loss_clip": 1.05532789, + "balance_loss_mlp": 1.03119266, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.5755676571254673, + "language_loss": 0.77400082, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79592264, + "num_input_tokens_seen": 46156805, + "step": 2136, + "time_per_iteration": 2.53132963180542 + }, + { + "auxiliary_loss_clip": 0.01039973, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.03507328, + "balance_loss_mlp": 1.03131187, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.9062860582387595, + "language_loss": 0.5918833, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61262953, + "num_input_tokens_seen": 46222085, + "step": 2137, + "time_per_iteration": 3.331674098968506 + }, + { + "auxiliary_loss_clip": 0.01152497, + "auxiliary_loss_mlp": 0.01049406, + "balance_loss_clip": 1.05377579, + "balance_loss_mlp": 1.02968824, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.855591330032333, + "language_loss": 0.82489932, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84691828, + "num_input_tokens_seen": 46239970, + "step": 2138, + "time_per_iteration": 2.522308111190796 + }, + { + "auxiliary_loss_clip": 0.01155505, + "auxiliary_loss_mlp": 0.01049386, + "balance_loss_clip": 1.05531025, + "balance_loss_mlp": 1.02796435, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 3.7844505449857095, + "language_loss": 0.78723609, + "learning_rate": 3.898873984919113e-06, + "loss": 0.80928504, + "num_input_tokens_seen": 46257740, + "step": 2139, + "time_per_iteration": 2.539017677307129 + }, + { + "auxiliary_loss_clip": 0.0112953, + "auxiliary_loss_mlp": 0.01046656, + "balance_loss_clip": 1.04851472, + "balance_loss_mlp": 1.0262835, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 1.9667935507730232, + "language_loss": 0.84905934, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.87082118, + "num_input_tokens_seen": 46275445, + "step": 2140, + "time_per_iteration": 2.609879493713379 + }, + { + "auxiliary_loss_clip": 0.01141051, + "auxiliary_loss_mlp": 0.01045196, + "balance_loss_clip": 1.05094457, + "balance_loss_mlp": 1.02605081, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 3.6328093676822437, + "language_loss": 0.86175317, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88361561, + "num_input_tokens_seen": 46291710, + "step": 2141, + "time_per_iteration": 2.5380964279174805 + }, + { + "auxiliary_loss_clip": 0.011425, + "auxiliary_loss_mlp": 0.01042437, + "balance_loss_clip": 1.05265546, + "balance_loss_mlp": 1.02241015, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 6.198603096869942, + "language_loss": 0.68386787, + "learning_rate": 3.898506837508518e-06, + "loss": 0.70571727, + "num_input_tokens_seen": 46311335, + "step": 2142, + "time_per_iteration": 2.606330156326294 + }, + { + "auxiliary_loss_clip": 0.01158568, + "auxiliary_loss_mlp": 0.01208883, + "balance_loss_clip": 1.05319333, + "balance_loss_mlp": 1.73464215, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.1008484213093506, + "language_loss": 0.8318544, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85552895, + "num_input_tokens_seen": 46330985, + "step": 2143, + "time_per_iteration": 2.539196252822876 + }, + { + "auxiliary_loss_clip": 0.01170197, + "auxiliary_loss_mlp": 0.01221131, + "balance_loss_clip": 1.05712664, + "balance_loss_mlp": 1.75766504, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.614771935305401, + "language_loss": 0.81665587, + "learning_rate": 3.898261712602539e-06, + "loss": 0.84056914, + "num_input_tokens_seen": 46351295, + "step": 2144, + "time_per_iteration": 2.4903626441955566 + }, + { + "auxiliary_loss_clip": 0.01128498, + "auxiliary_loss_mlp": 0.01046578, + "balance_loss_clip": 1.04690099, + "balance_loss_mlp": 1.02538228, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 1.9373065418741835, + "language_loss": 0.7809763, + "learning_rate": 3.898139042173813e-06, + "loss": 0.8027271, + "num_input_tokens_seen": 46368600, + "step": 2145, + "time_per_iteration": 2.537078619003296 + }, + { + "auxiliary_loss_clip": 0.01163695, + "auxiliary_loss_mlp": 0.01044335, + "balance_loss_clip": 1.05103672, + "balance_loss_mlp": 1.02461755, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.342339000658999, + "language_loss": 0.82205039, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84413075, + "num_input_tokens_seen": 46387370, + "step": 2146, + "time_per_iteration": 2.4538824558258057 + }, + { + "auxiliary_loss_clip": 0.01137615, + "auxiliary_loss_mlp": 0.01044856, + "balance_loss_clip": 1.05349159, + "balance_loss_mlp": 1.02401853, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.4995566866270744, + "language_loss": 0.7163707, + "learning_rate": 3.897893485388149e-06, + "loss": 0.73819542, + "num_input_tokens_seen": 46409570, + "step": 2147, + "time_per_iteration": 2.6480953693389893 + }, + { + "auxiliary_loss_clip": 0.01139449, + "auxiliary_loss_mlp": 0.01047212, + "balance_loss_clip": 1.05029869, + "balance_loss_mlp": 1.02857935, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 3.214006976296634, + "language_loss": 0.71835649, + "learning_rate": 3.897770599040521e-06, + "loss": 0.74022305, + "num_input_tokens_seen": 46429320, + "step": 2148, + "time_per_iteration": 2.5493922233581543 + }, + { + "auxiliary_loss_clip": 0.01163156, + "auxiliary_loss_mlp": 0.01040048, + "balance_loss_clip": 1.05428028, + "balance_loss_mlp": 1.02163053, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 2.007625228251199, + "language_loss": 0.79210961, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81414163, + "num_input_tokens_seen": 46450155, + "step": 2149, + "time_per_iteration": 2.472214460372925 + }, + { + "auxiliary_loss_clip": 0.0115457, + "auxiliary_loss_mlp": 0.01039553, + "balance_loss_clip": 1.05263031, + "balance_loss_mlp": 1.01915646, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 28.094723201985126, + "language_loss": 0.75932825, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78126955, + "num_input_tokens_seen": 46470280, + "step": 2150, + "time_per_iteration": 2.583061456680298 + }, + { + "auxiliary_loss_clip": 0.01147277, + "auxiliary_loss_mlp": 0.01044536, + "balance_loss_clip": 1.05100036, + "balance_loss_mlp": 1.02525949, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 3.501252316424131, + "language_loss": 0.70738649, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.72930467, + "num_input_tokens_seen": 46487605, + "step": 2151, + "time_per_iteration": 2.495633125305176 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01042354, + "balance_loss_clip": 1.05645084, + "balance_loss_mlp": 1.02379274, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 2.0911879068712906, + "language_loss": 0.83986878, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86194283, + "num_input_tokens_seen": 46505100, + "step": 2152, + "time_per_iteration": 2.4576117992401123 + }, + { + "auxiliary_loss_clip": 0.01155071, + "auxiliary_loss_mlp": 0.01055423, + "balance_loss_clip": 1.052526, + "balance_loss_mlp": 1.03599191, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 2.32682917579046, + "language_loss": 0.78860021, + "learning_rate": 3.897155087940906e-06, + "loss": 0.81070507, + "num_input_tokens_seen": 46524020, + "step": 2153, + "time_per_iteration": 2.4761552810668945 + }, + { + "auxiliary_loss_clip": 0.0112209, + "auxiliary_loss_mlp": 0.01049512, + "balance_loss_clip": 1.0587008, + "balance_loss_mlp": 1.43294477, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 1.7247830739546506, + "language_loss": 0.80130327, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82301927, + "num_input_tokens_seen": 46544640, + "step": 2154, + "time_per_iteration": 4.008934497833252 + }, + { + "auxiliary_loss_clip": 0.01148052, + "auxiliary_loss_mlp": 0.01048444, + "balance_loss_clip": 1.05391407, + "balance_loss_mlp": 1.02896512, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 1.905674736202788, + "language_loss": 0.83265448, + "learning_rate": 3.896908379886188e-06, + "loss": 0.8546195, + "num_input_tokens_seen": 46561395, + "step": 2155, + "time_per_iteration": 2.4858200550079346 + }, + { + "auxiliary_loss_clip": 0.01157406, + "auxiliary_loss_mlp": 0.01048891, + "balance_loss_clip": 1.05346072, + "balance_loss_mlp": 1.02953172, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.6411165690162273, + "language_loss": 0.75885141, + "learning_rate": 3.896784917960055e-06, + "loss": 0.78091443, + "num_input_tokens_seen": 46579395, + "step": 2156, + "time_per_iteration": 4.0071587562561035 + }, + { + "auxiliary_loss_clip": 0.01108867, + "auxiliary_loss_mlp": 0.01047997, + "balance_loss_clip": 1.05753994, + "balance_loss_mlp": 1.02856588, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 2.2329684924901665, + "language_loss": 0.86579466, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88736331, + "num_input_tokens_seen": 46597090, + "step": 2157, + "time_per_iteration": 2.623391628265381 + }, + { + "auxiliary_loss_clip": 0.01163959, + "auxiliary_loss_mlp": 0.01053978, + "balance_loss_clip": 1.05051041, + "balance_loss_mlp": 1.03375971, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.6681095633731333, + "language_loss": 0.80936694, + "learning_rate": 3.896537778333651e-06, + "loss": 0.83154631, + "num_input_tokens_seen": 46617355, + "step": 2158, + "time_per_iteration": 2.556306838989258 + }, + { + "auxiliary_loss_clip": 0.01168612, + "auxiliary_loss_mlp": 0.0105725, + "balance_loss_clip": 1.05498242, + "balance_loss_mlp": 1.03769982, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.548516663119676, + "language_loss": 0.74692291, + "learning_rate": 3.896414100642752e-06, + "loss": 0.76918149, + "num_input_tokens_seen": 46633130, + "step": 2159, + "time_per_iteration": 3.8341352939605713 + }, + { + "auxiliary_loss_clip": 0.01120635, + "auxiliary_loss_mlp": 0.0105665, + "balance_loss_clip": 1.04890037, + "balance_loss_mlp": 1.03551412, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 1.9742773416805233, + "language_loss": 0.82187343, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84364629, + "num_input_tokens_seen": 46650575, + "step": 2160, + "time_per_iteration": 2.619985342025757 + }, + { + "auxiliary_loss_clip": 0.01149181, + "auxiliary_loss_mlp": 0.01042913, + "balance_loss_clip": 1.05828536, + "balance_loss_mlp": 1.02329135, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.6981534412638162, + "language_loss": 0.8222059, + "learning_rate": 3.896166529529008e-06, + "loss": 0.84412682, + "num_input_tokens_seen": 46668780, + "step": 2161, + "time_per_iteration": 2.506188154220581 + }, + { + "auxiliary_loss_clip": 0.01143413, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_clip": 1.05141544, + "balance_loss_mlp": 1.02998543, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.0756361319277556, + "language_loss": 0.82257283, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84450459, + "num_input_tokens_seen": 46687550, + "step": 2162, + "time_per_iteration": 2.5854687690734863 + }, + { + "auxiliary_loss_clip": 0.01128018, + "auxiliary_loss_mlp": 0.01048263, + "balance_loss_clip": 1.04941368, + "balance_loss_mlp": 1.02836716, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.7510951109062467, + "language_loss": 0.72996664, + "learning_rate": 3.895918670803968e-06, + "loss": 0.75172949, + "num_input_tokens_seen": 46706730, + "step": 2163, + "time_per_iteration": 2.556450366973877 + }, + { + "auxiliary_loss_clip": 0.01168068, + "auxiliary_loss_mlp": 0.01027341, + "balance_loss_clip": 1.05443156, + "balance_loss_mlp": 1.38424635, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.66227958513746, + "language_loss": 0.81849277, + "learning_rate": 3.895794633598958e-06, + "loss": 0.84044689, + "num_input_tokens_seen": 46724250, + "step": 2164, + "time_per_iteration": 3.8764946460723877 + }, + { + "auxiliary_loss_clip": 0.01117577, + "auxiliary_loss_mlp": 0.01039249, + "balance_loss_clip": 1.04914641, + "balance_loss_mlp": 1.01976979, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.306127741083391, + "language_loss": 0.72178757, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.74335581, + "num_input_tokens_seen": 46744105, + "step": 2165, + "time_per_iteration": 2.6277711391448975 + }, + { + "auxiliary_loss_clip": 0.01111682, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.05376935, + "balance_loss_mlp": 1.02005792, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 1.8911191195027592, + "language_loss": 0.75086975, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.77239633, + "num_input_tokens_seen": 46764250, + "step": 2166, + "time_per_iteration": 2.7056262493133545 + }, + { + "auxiliary_loss_clip": 0.01162649, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_clip": 1.05048418, + "balance_loss_mlp": 1.02506781, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.8671885225749185, + "language_loss": 0.83024746, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85231912, + "num_input_tokens_seen": 46786865, + "step": 2167, + "time_per_iteration": 2.566591501235962 + }, + { + "auxiliary_loss_clip": 0.01108371, + "auxiliary_loss_mlp": 0.01057247, + "balance_loss_clip": 1.04981565, + "balance_loss_mlp": 1.03690982, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.621918763263411, + "language_loss": 0.83500075, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85665691, + "num_input_tokens_seen": 46807030, + "step": 2168, + "time_per_iteration": 2.6300740242004395 + }, + { + "auxiliary_loss_clip": 0.01077795, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_clip": 1.04512739, + "balance_loss_mlp": 1.03598535, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 1.9985476727546274, + "language_loss": 0.79849136, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.81986725, + "num_input_tokens_seen": 46826280, + "step": 2169, + "time_per_iteration": 2.6715986728668213 + }, + { + "auxiliary_loss_clip": 0.01164418, + "auxiliary_loss_mlp": 0.01042472, + "balance_loss_clip": 1.0521462, + "balance_loss_mlp": 1.02142012, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 2.4646733042315048, + "language_loss": 0.66706544, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68913436, + "num_input_tokens_seen": 46846505, + "step": 2170, + "time_per_iteration": 2.5381481647491455 + }, + { + "auxiliary_loss_clip": 0.01137071, + "auxiliary_loss_mlp": 0.01045607, + "balance_loss_clip": 1.05263996, + "balance_loss_mlp": 1.02616405, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.721648252953711, + "language_loss": 0.66952002, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69134682, + "num_input_tokens_seen": 46867380, + "step": 2171, + "time_per_iteration": 2.6247501373291016 + }, + { + "auxiliary_loss_clip": 0.01151564, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_clip": 1.04969692, + "balance_loss_mlp": 1.02467811, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 1.9853251306567843, + "language_loss": 0.72057128, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74254262, + "num_input_tokens_seen": 46886810, + "step": 2172, + "time_per_iteration": 2.505753755569458 + }, + { + "auxiliary_loss_clip": 0.01122335, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.05043888, + "balance_loss_mlp": 1.02596807, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 2.1508486852354705, + "language_loss": 0.75692248, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77860296, + "num_input_tokens_seen": 46905620, + "step": 2173, + "time_per_iteration": 2.6044085025787354 + }, + { + "auxiliary_loss_clip": 0.01131902, + "auxiliary_loss_mlp": 0.01057356, + "balance_loss_clip": 1.05675721, + "balance_loss_mlp": 1.03577876, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.3458353980653803, + "language_loss": 0.70271695, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72460955, + "num_input_tokens_seen": 46925120, + "step": 2174, + "time_per_iteration": 2.615194082260132 + }, + { + "auxiliary_loss_clip": 0.0106491, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.03661478, + "balance_loss_mlp": 1.0342133, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8230570650558096, + "language_loss": 0.59036255, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61139095, + "num_input_tokens_seen": 46988195, + "step": 2175, + "time_per_iteration": 3.2819745540618896 + }, + { + "auxiliary_loss_clip": 0.0115523, + "auxiliary_loss_mlp": 0.01053874, + "balance_loss_clip": 1.05332994, + "balance_loss_mlp": 1.03453779, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.0294399164933177, + "language_loss": 0.79678428, + "learning_rate": 3.894300581166417e-06, + "loss": 0.81887531, + "num_input_tokens_seen": 47004720, + "step": 2176, + "time_per_iteration": 2.528665542602539 + }, + { + "auxiliary_loss_clip": 0.01162388, + "auxiliary_loss_mlp": 0.01048995, + "balance_loss_clip": 1.05122828, + "balance_loss_mlp": 1.02729893, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.8895248504771243, + "language_loss": 0.74679035, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76890415, + "num_input_tokens_seen": 47024255, + "step": 2177, + "time_per_iteration": 2.5832173824310303 + }, + { + "auxiliary_loss_clip": 0.0112434, + "auxiliary_loss_mlp": 0.01047618, + "balance_loss_clip": 1.04840779, + "balance_loss_mlp": 1.02658916, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 2.07745809167143, + "language_loss": 0.82303649, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84475607, + "num_input_tokens_seen": 47042465, + "step": 2178, + "time_per_iteration": 2.590088129043579 + }, + { + "auxiliary_loss_clip": 0.01164037, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.05400443, + "balance_loss_mlp": 1.02232265, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.6079090705551193, + "language_loss": 0.7456677, + "learning_rate": 3.893925451517562e-06, + "loss": 0.76773536, + "num_input_tokens_seen": 47060370, + "step": 2179, + "time_per_iteration": 2.4402098655700684 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01045727, + "balance_loss_clip": 1.04891157, + "balance_loss_mlp": 1.02579546, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.3631885842601386, + "language_loss": 0.84773183, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86942506, + "num_input_tokens_seen": 47081415, + "step": 2180, + "time_per_iteration": 2.5952231884002686 + }, + { + "auxiliary_loss_clip": 0.01153832, + "auxiliary_loss_mlp": 0.01055493, + "balance_loss_clip": 1.05470276, + "balance_loss_mlp": 1.03588367, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 1.9661799416891512, + "language_loss": 0.89745599, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.91954923, + "num_input_tokens_seen": 47099860, + "step": 2181, + "time_per_iteration": 2.508884906768799 + }, + { + "auxiliary_loss_clip": 0.01149816, + "auxiliary_loss_mlp": 0.01051319, + "balance_loss_clip": 1.0515908, + "balance_loss_mlp": 1.03170872, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 1.913852535144799, + "language_loss": 0.68434632, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70635766, + "num_input_tokens_seen": 47118540, + "step": 2182, + "time_per_iteration": 2.5334997177124023 + }, + { + "auxiliary_loss_clip": 0.01117916, + "auxiliary_loss_mlp": 0.01053002, + "balance_loss_clip": 1.04838145, + "balance_loss_mlp": 1.03208077, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.3167558335095233, + "language_loss": 0.78465056, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80635977, + "num_input_tokens_seen": 47136710, + "step": 2183, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01159859, + "auxiliary_loss_mlp": 0.01044443, + "balance_loss_clip": 1.04996109, + "balance_loss_mlp": 1.02463078, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 1.8070099970127325, + "language_loss": 0.86052251, + "learning_rate": 3.893298799142636e-06, + "loss": 0.8825655, + "num_input_tokens_seen": 47157155, + "step": 2184, + "time_per_iteration": 2.4928627014160156 + }, + { + "auxiliary_loss_clip": 0.01131249, + "auxiliary_loss_mlp": 0.01049525, + "balance_loss_clip": 1.05110741, + "balance_loss_mlp": 1.02860427, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 2.203560559324091, + "language_loss": 0.82368201, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84548974, + "num_input_tokens_seen": 47176820, + "step": 2185, + "time_per_iteration": 2.610435724258423 + }, + { + "auxiliary_loss_clip": 0.01138322, + "auxiliary_loss_mlp": 0.01054274, + "balance_loss_clip": 1.05181158, + "balance_loss_mlp": 1.03403187, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 1.9599731716995554, + "language_loss": 0.72886199, + "learning_rate": 3.893047635600818e-06, + "loss": 0.75078791, + "num_input_tokens_seen": 47195855, + "step": 2186, + "time_per_iteration": 2.5582427978515625 + }, + { + "auxiliary_loss_clip": 0.01151454, + "auxiliary_loss_mlp": 0.01048982, + "balance_loss_clip": 1.05346835, + "balance_loss_mlp": 1.02735758, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.1140512805635683, + "language_loss": 0.79912448, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82112885, + "num_input_tokens_seen": 47214535, + "step": 2187, + "time_per_iteration": 2.5074751377105713 + }, + { + "auxiliary_loss_clip": 0.01051035, + "auxiliary_loss_mlp": 0.01045968, + "balance_loss_clip": 1.04096484, + "balance_loss_mlp": 1.0421052, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8437720511865218, + "language_loss": 0.59011048, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61108047, + "num_input_tokens_seen": 47270300, + "step": 2188, + "time_per_iteration": 3.2248923778533936 + }, + { + "auxiliary_loss_clip": 0.01090779, + "auxiliary_loss_mlp": 0.01046624, + "balance_loss_clip": 1.05075133, + "balance_loss_mlp": 1.02598882, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 1.9237537496699424, + "language_loss": 0.73997676, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76135087, + "num_input_tokens_seen": 47290720, + "step": 2189, + "time_per_iteration": 2.6802003383636475 + }, + { + "auxiliary_loss_clip": 0.01155126, + "auxiliary_loss_mlp": 0.01042717, + "balance_loss_clip": 1.05457592, + "balance_loss_mlp": 1.02209401, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 2.015733404581712, + "language_loss": 0.72549021, + "learning_rate": 3.892544447140657e-06, + "loss": 0.74746859, + "num_input_tokens_seen": 47311820, + "step": 2190, + "time_per_iteration": 2.5325405597686768 + }, + { + "auxiliary_loss_clip": 0.01153132, + "auxiliary_loss_mlp": 0.01047444, + "balance_loss_clip": 1.05429578, + "balance_loss_mlp": 1.02745259, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 2.0957332985675454, + "language_loss": 0.74930894, + "learning_rate": 3.892418470599996e-06, + "loss": 0.77131474, + "num_input_tokens_seen": 47331605, + "step": 2191, + "time_per_iteration": 2.521662473678589 + }, + { + "auxiliary_loss_clip": 0.01127001, + "auxiliary_loss_mlp": 0.01051641, + "balance_loss_clip": 1.05519044, + "balance_loss_mlp": 1.03044522, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 3.0975477094958723, + "language_loss": 0.79357803, + "learning_rate": 3.892292422298637e-06, + "loss": 0.81536448, + "num_input_tokens_seen": 47350455, + "step": 2192, + "time_per_iteration": 2.5818686485290527 + }, + { + "auxiliary_loss_clip": 0.0111124, + "auxiliary_loss_mlp": 0.01047961, + "balance_loss_clip": 1.0508976, + "balance_loss_mlp": 1.02762401, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 5.026571639057798, + "language_loss": 0.8513701, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87296206, + "num_input_tokens_seen": 47368225, + "step": 2193, + "time_per_iteration": 3.9979944229125977 + }, + { + "auxiliary_loss_clip": 0.01075375, + "auxiliary_loss_mlp": 0.01003199, + "balance_loss_clip": 1.05942488, + "balance_loss_mlp": 0.99947929, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7621368862301643, + "language_loss": 0.54098397, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56176972, + "num_input_tokens_seen": 47427125, + "step": 2194, + "time_per_iteration": 3.115755796432495 + }, + { + "auxiliary_loss_clip": 0.01156941, + "auxiliary_loss_mlp": 0.01046521, + "balance_loss_clip": 1.04784358, + "balance_loss_mlp": 1.02641034, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 2.296901037438971, + "language_loss": 0.72432137, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74635601, + "num_input_tokens_seen": 47450275, + "step": 2195, + "time_per_iteration": 3.943492889404297 + }, + { + "auxiliary_loss_clip": 0.01131241, + "auxiliary_loss_mlp": 0.01001149, + "balance_loss_clip": 1.05023837, + "balance_loss_mlp": 1.33380735, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 2.255468187619569, + "language_loss": 0.78381968, + "learning_rate": 3.891787511581859e-06, + "loss": 0.80514354, + "num_input_tokens_seen": 47469155, + "step": 2196, + "time_per_iteration": 2.6567015647888184 + }, + { + "auxiliary_loss_clip": 0.01153103, + "auxiliary_loss_mlp": 0.01043306, + "balance_loss_clip": 1.04967487, + "balance_loss_mlp": 1.02372003, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 2.086610907915476, + "language_loss": 0.75126624, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77323031, + "num_input_tokens_seen": 47488405, + "step": 2197, + "time_per_iteration": 2.4906678199768066 + }, + { + "auxiliary_loss_clip": 0.01165387, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.05117321, + "balance_loss_mlp": 1.0245719, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 2.1508735889054584, + "language_loss": 0.79914016, + "learning_rate": 3.891534625783685e-06, + "loss": 0.82124275, + "num_input_tokens_seen": 47505650, + "step": 2198, + "time_per_iteration": 3.816161632537842 + }, + { + "auxiliary_loss_clip": 0.01158741, + "auxiliary_loss_mlp": 0.01045942, + "balance_loss_clip": 1.04967189, + "balance_loss_mlp": 1.02617764, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.545641986529344, + "language_loss": 0.8295548, + "learning_rate": 3.891408075291425e-06, + "loss": 0.8516016, + "num_input_tokens_seen": 47521540, + "step": 2199, + "time_per_iteration": 2.4313833713531494 + }, + { + "auxiliary_loss_clip": 0.01113431, + "auxiliary_loss_mlp": 0.01053708, + "balance_loss_clip": 1.04757524, + "balance_loss_mlp": 1.03263187, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 1.7663853326046384, + "language_loss": 0.69600111, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71767247, + "num_input_tokens_seen": 47543625, + "step": 2200, + "time_per_iteration": 2.6982061862945557 + }, + { + "auxiliary_loss_clip": 0.01162535, + "auxiliary_loss_mlp": 0.01049152, + "balance_loss_clip": 1.05080104, + "balance_loss_mlp": 1.02829087, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 2.042289833481919, + "language_loss": 0.84593868, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86805552, + "num_input_tokens_seen": 47563740, + "step": 2201, + "time_per_iteration": 2.461141586303711 + }, + { + "auxiliary_loss_clip": 0.01164125, + "auxiliary_loss_mlp": 0.01045651, + "balance_loss_clip": 1.05157948, + "balance_loss_mlp": 1.02519453, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 1.7598336170195388, + "language_loss": 0.86620033, + "learning_rate": 3.891027993499554e-06, + "loss": 0.88829809, + "num_input_tokens_seen": 47582655, + "step": 2202, + "time_per_iteration": 3.9735209941864014 + }, + { + "auxiliary_loss_clip": 0.01131233, + "auxiliary_loss_mlp": 0.0104508, + "balance_loss_clip": 1.05158496, + "balance_loss_mlp": 1.02536297, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.79915510532735, + "language_loss": 0.72507739, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74684048, + "num_input_tokens_seen": 47600875, + "step": 2203, + "time_per_iteration": 2.5517735481262207 + }, + { + "auxiliary_loss_clip": 0.01115622, + "auxiliary_loss_mlp": 0.010484, + "balance_loss_clip": 1.04834199, + "balance_loss_mlp": 1.02914798, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 3.7444895155290223, + "language_loss": 0.73605746, + "learning_rate": 3.890774247090444e-06, + "loss": 0.7576977, + "num_input_tokens_seen": 47619250, + "step": 2204, + "time_per_iteration": 2.643855094909668 + }, + { + "auxiliary_loss_clip": 0.01161217, + "auxiliary_loss_mlp": 0.01049598, + "balance_loss_clip": 1.06220973, + "balance_loss_mlp": 1.02817631, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.9968036017760995, + "language_loss": 0.7844488, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80655694, + "num_input_tokens_seen": 47639445, + "step": 2205, + "time_per_iteration": 2.5740411281585693 + }, + { + "auxiliary_loss_clip": 0.01119615, + "auxiliary_loss_mlp": 0.01050174, + "balance_loss_clip": 1.04946899, + "balance_loss_mlp": 1.03025436, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 1.8496959112258748, + "language_loss": 0.79164529, + "learning_rate": 3.890520213887941e-06, + "loss": 0.81334317, + "num_input_tokens_seen": 47658740, + "step": 2206, + "time_per_iteration": 2.550945281982422 + }, + { + "auxiliary_loss_clip": 0.01122669, + "auxiliary_loss_mlp": 0.01047952, + "balance_loss_clip": 1.05255294, + "balance_loss_mlp": 1.02933192, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 2.2509158196116603, + "language_loss": 0.74600023, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76770639, + "num_input_tokens_seen": 47676880, + "step": 2207, + "time_per_iteration": 2.5660173892974854 + }, + { + "auxiliary_loss_clip": 0.01135705, + "auxiliary_loss_mlp": 0.01044037, + "balance_loss_clip": 1.04789925, + "balance_loss_mlp": 1.0241406, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 1.9173924079756883, + "language_loss": 0.84371287, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86551028, + "num_input_tokens_seen": 47696635, + "step": 2208, + "time_per_iteration": 2.5439419746398926 + }, + { + "auxiliary_loss_clip": 0.01144171, + "auxiliary_loss_mlp": 0.01050499, + "balance_loss_clip": 1.05369985, + "balance_loss_mlp": 1.03180647, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.7599145468641564, + "language_loss": 0.85447544, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87642217, + "num_input_tokens_seen": 47717760, + "step": 2209, + "time_per_iteration": 2.5741400718688965 + }, + { + "auxiliary_loss_clip": 0.01133563, + "auxiliary_loss_mlp": 0.01015829, + "balance_loss_clip": 1.05151224, + "balance_loss_mlp": 1.35585117, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 3.0084070121474307, + "language_loss": 0.82481062, + "learning_rate": 3.890011287256929e-06, + "loss": 0.84630454, + "num_input_tokens_seen": 47737685, + "step": 2210, + "time_per_iteration": 2.5543212890625 + }, + { + "auxiliary_loss_clip": 0.01057908, + "auxiliary_loss_mlp": 0.01921028, + "balance_loss_clip": 1.05018413, + "balance_loss_mlp": 3.04678535, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7721610982941081, + "language_loss": 0.58026803, + "learning_rate": 3.889883876413563e-06, + "loss": 0.61005735, + "num_input_tokens_seen": 47802415, + "step": 2211, + "time_per_iteration": 3.279937744140625 + }, + { + "auxiliary_loss_clip": 0.01067224, + "auxiliary_loss_mlp": 0.01022581, + "balance_loss_clip": 1.03822756, + "balance_loss_mlp": 1.01936269, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.7968550564030759, + "language_loss": 0.55341363, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57431167, + "num_input_tokens_seen": 47871485, + "step": 2212, + "time_per_iteration": 3.2102363109588623 + }, + { + "auxiliary_loss_clip": 0.01129196, + "auxiliary_loss_mlp": 0.01060564, + "balance_loss_clip": 1.05259466, + "balance_loss_mlp": 1.0402627, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 2.6594402613055395, + "language_loss": 0.73812592, + "learning_rate": 3.889628839737908e-06, + "loss": 0.76002353, + "num_input_tokens_seen": 47888315, + "step": 2213, + "time_per_iteration": 2.5477957725524902 + }, + { + "auxiliary_loss_clip": 0.01112257, + "auxiliary_loss_mlp": 0.01052116, + "balance_loss_clip": 1.04930389, + "balance_loss_mlp": 1.03381753, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 1.7875951714101368, + "language_loss": 0.79433095, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81597471, + "num_input_tokens_seen": 47906600, + "step": 2214, + "time_per_iteration": 2.612903594970703 + }, + { + "auxiliary_loss_clip": 0.01137669, + "auxiliary_loss_mlp": 0.01051392, + "balance_loss_clip": 1.05491984, + "balance_loss_mlp": 1.03166258, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 1.6430102214197357, + "language_loss": 0.69154382, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71343446, + "num_input_tokens_seen": 47927630, + "step": 2215, + "time_per_iteration": 2.613560199737549 + }, + { + "auxiliary_loss_clip": 0.01154263, + "auxiliary_loss_mlp": 0.01043917, + "balance_loss_clip": 1.05273855, + "balance_loss_mlp": 1.02456939, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 1.9316770035469533, + "language_loss": 0.81001699, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83199877, + "num_input_tokens_seen": 47947935, + "step": 2216, + "time_per_iteration": 2.543605327606201 + }, + { + "auxiliary_loss_clip": 0.01148044, + "auxiliary_loss_mlp": 0.01051757, + "balance_loss_clip": 1.05174899, + "balance_loss_mlp": 1.03212333, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 2.734904696978511, + "language_loss": 0.87101662, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89301455, + "num_input_tokens_seen": 47965515, + "step": 2217, + "time_per_iteration": 2.4840123653411865 + }, + { + "auxiliary_loss_clip": 0.01135555, + "auxiliary_loss_mlp": 0.01053901, + "balance_loss_clip": 1.04998302, + "balance_loss_mlp": 1.03282475, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.9735238678267306, + "language_loss": 0.73416054, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75605512, + "num_input_tokens_seen": 47985675, + "step": 2218, + "time_per_iteration": 2.5783402919769287 + }, + { + "auxiliary_loss_clip": 0.01118487, + "auxiliary_loss_mlp": 0.0104656, + "balance_loss_clip": 1.04812157, + "balance_loss_mlp": 1.02632976, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 2.0253535752531118, + "language_loss": 0.87439203, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89604247, + "num_input_tokens_seen": 48004985, + "step": 2219, + "time_per_iteration": 2.6086602210998535 + }, + { + "auxiliary_loss_clip": 0.0112659, + "auxiliary_loss_mlp": 0.01058862, + "balance_loss_clip": 1.04855645, + "balance_loss_mlp": 1.04062259, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 2.023515341012075, + "language_loss": 0.7716471, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79350168, + "num_input_tokens_seen": 48024965, + "step": 2220, + "time_per_iteration": 2.5937328338623047 + }, + { + "auxiliary_loss_clip": 0.01134386, + "auxiliary_loss_mlp": 0.01039654, + "balance_loss_clip": 1.04771125, + "balance_loss_mlp": 1.0219276, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 6.662801821267267, + "language_loss": 0.79203391, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81377435, + "num_input_tokens_seen": 48040890, + "step": 2221, + "time_per_iteration": 2.4975669384002686 + }, + { + "auxiliary_loss_clip": 0.01065102, + "auxiliary_loss_mlp": 0.01003194, + "balance_loss_clip": 1.02591014, + "balance_loss_mlp": 0.99995178, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9697818298047008, + "language_loss": 0.68973994, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71042287, + "num_input_tokens_seen": 48091855, + "step": 2222, + "time_per_iteration": 2.9606451988220215 + }, + { + "auxiliary_loss_clip": 0.0112102, + "auxiliary_loss_mlp": 0.01045249, + "balance_loss_clip": 1.05438471, + "balance_loss_mlp": 1.02770185, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 1.8345982665629677, + "language_loss": 0.6713255, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69298828, + "num_input_tokens_seen": 48111350, + "step": 2223, + "time_per_iteration": 2.5771777629852295 + }, + { + "auxiliary_loss_clip": 0.01151218, + "auxiliary_loss_mlp": 0.01061128, + "balance_loss_clip": 1.05272293, + "balance_loss_mlp": 1.04101777, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 3.3760085059909497, + "language_loss": 0.83030772, + "learning_rate": 3.88822101573484e-06, + "loss": 0.85243118, + "num_input_tokens_seen": 48129840, + "step": 2224, + "time_per_iteration": 2.504643678665161 + }, + { + "auxiliary_loss_clip": 0.0116237, + "auxiliary_loss_mlp": 0.01044225, + "balance_loss_clip": 1.05077386, + "balance_loss_mlp": 1.02463865, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.5989701310860682, + "language_loss": 0.66218829, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68425429, + "num_input_tokens_seen": 48149240, + "step": 2225, + "time_per_iteration": 2.4821665287017822 + }, + { + "auxiliary_loss_clip": 0.0114746, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.05018306, + "balance_loss_mlp": 1.02929759, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.6498775674514436, + "language_loss": 0.89707607, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91903585, + "num_input_tokens_seen": 48166330, + "step": 2226, + "time_per_iteration": 2.4977974891662598 + }, + { + "auxiliary_loss_clip": 0.01138867, + "auxiliary_loss_mlp": 0.01054088, + "balance_loss_clip": 1.05010605, + "balance_loss_mlp": 1.03507388, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 2.0737079333817645, + "language_loss": 0.74059176, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76252127, + "num_input_tokens_seen": 48187600, + "step": 2227, + "time_per_iteration": 2.5697526931762695 + }, + { + "auxiliary_loss_clip": 0.01148276, + "auxiliary_loss_mlp": 0.01044078, + "balance_loss_clip": 1.0506593, + "balance_loss_mlp": 1.02455199, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.4721153153154343, + "language_loss": 0.85392839, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87585193, + "num_input_tokens_seen": 48204400, + "step": 2228, + "time_per_iteration": 2.470172166824341 + }, + { + "auxiliary_loss_clip": 0.01131203, + "auxiliary_loss_mlp": 0.01049779, + "balance_loss_clip": 1.05337691, + "balance_loss_mlp": 1.02944207, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 2.0035308025619023, + "language_loss": 0.81444836, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83625817, + "num_input_tokens_seen": 48222180, + "step": 2229, + "time_per_iteration": 2.5160717964172363 + }, + { + "auxiliary_loss_clip": 0.01104894, + "auxiliary_loss_mlp": 0.01054032, + "balance_loss_clip": 1.05339384, + "balance_loss_mlp": 1.03419518, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 1.9480875743486437, + "language_loss": 0.74712706, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76871634, + "num_input_tokens_seen": 48243245, + "step": 2230, + "time_per_iteration": 2.631298780441284 + }, + { + "auxiliary_loss_clip": 0.0112011, + "auxiliary_loss_mlp": 0.01052818, + "balance_loss_clip": 1.0526402, + "balance_loss_mlp": 1.03338706, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 5.388216697466175, + "language_loss": 0.80247843, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82420766, + "num_input_tokens_seen": 48262600, + "step": 2231, + "time_per_iteration": 2.5515410900115967 + }, + { + "auxiliary_loss_clip": 0.01108241, + "auxiliary_loss_mlp": 0.01058047, + "balance_loss_clip": 1.05312228, + "balance_loss_mlp": 1.03704262, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.9239776639139006, + "language_loss": 0.72260904, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74427193, + "num_input_tokens_seen": 48285075, + "step": 2232, + "time_per_iteration": 4.059293746948242 + }, + { + "auxiliary_loss_clip": 0.01121865, + "auxiliary_loss_mlp": 0.01044097, + "balance_loss_clip": 1.05296779, + "balance_loss_mlp": 1.02327108, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 3.144523999811054, + "language_loss": 0.65489459, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.6765542, + "num_input_tokens_seen": 48301285, + "step": 2233, + "time_per_iteration": 4.025724411010742 + }, + { + "auxiliary_loss_clip": 0.01161814, + "auxiliary_loss_mlp": 0.01043739, + "balance_loss_clip": 1.05109823, + "balance_loss_mlp": 1.0247606, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 2.9431583186052763, + "language_loss": 0.81065691, + "learning_rate": 3.886933657403615e-06, + "loss": 0.83271241, + "num_input_tokens_seen": 48317835, + "step": 2234, + "time_per_iteration": 2.4627950191497803 + }, + { + "auxiliary_loss_clip": 0.0113683, + "auxiliary_loss_mlp": 0.01048777, + "balance_loss_clip": 1.05181694, + "balance_loss_mlp": 1.02946496, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 4.455221553151614, + "language_loss": 0.82132041, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84317648, + "num_input_tokens_seen": 48335670, + "step": 2235, + "time_per_iteration": 2.5386533737182617 + }, + { + "auxiliary_loss_clip": 0.01147445, + "auxiliary_loss_mlp": 0.0104795, + "balance_loss_clip": 1.05234993, + "balance_loss_mlp": 1.02694499, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.9119811057177714, + "language_loss": 0.8639977, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88595164, + "num_input_tokens_seen": 48357805, + "step": 2236, + "time_per_iteration": 2.5484073162078857 + }, + { + "auxiliary_loss_clip": 0.0116749, + "auxiliary_loss_mlp": 0.01044766, + "balance_loss_clip": 1.0559864, + "balance_loss_mlp": 1.02481079, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.9374236949961903, + "language_loss": 0.77884907, + "learning_rate": 3.886546054403946e-06, + "loss": 0.80097163, + "num_input_tokens_seen": 48377845, + "step": 2237, + "time_per_iteration": 3.8547585010528564 + }, + { + "auxiliary_loss_clip": 0.01143945, + "auxiliary_loss_mlp": 0.01046596, + "balance_loss_clip": 1.05247378, + "balance_loss_mlp": 1.02535272, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 2.442783748626539, + "language_loss": 0.7894069, + "learning_rate": 3.886416710321491e-06, + "loss": 0.81131232, + "num_input_tokens_seen": 48394735, + "step": 2238, + "time_per_iteration": 2.513523817062378 + }, + { + "auxiliary_loss_clip": 0.0114043, + "auxiliary_loss_mlp": 0.01043693, + "balance_loss_clip": 1.05284238, + "balance_loss_mlp": 1.02278376, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.403131163238022, + "language_loss": 0.67724681, + "learning_rate": 3.886287294705924e-06, + "loss": 0.69908798, + "num_input_tokens_seen": 48414200, + "step": 2239, + "time_per_iteration": 2.5896058082580566 + }, + { + "auxiliary_loss_clip": 0.01148306, + "auxiliary_loss_mlp": 0.01040335, + "balance_loss_clip": 1.05783975, + "balance_loss_mlp": 1.0209161, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.4199869724636653, + "language_loss": 0.8157593, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83764571, + "num_input_tokens_seen": 48431065, + "step": 2240, + "time_per_iteration": 2.498563766479492 + }, + { + "auxiliary_loss_clip": 0.01110983, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_clip": 1.05283463, + "balance_loss_mlp": 1.02866149, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.7659016426780036, + "language_loss": 0.77705002, + "learning_rate": 3.886028248895093e-06, + "loss": 0.79865748, + "num_input_tokens_seen": 48450335, + "step": 2241, + "time_per_iteration": 3.976215124130249 + }, + { + "auxiliary_loss_clip": 0.01162106, + "auxiliary_loss_mlp": 0.01037999, + "balance_loss_clip": 1.05563521, + "balance_loss_mlp": 1.02053452, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.9375682740849078, + "language_loss": 0.83558267, + "learning_rate": 3.88589861870965e-06, + "loss": 0.8575837, + "num_input_tokens_seen": 48468555, + "step": 2242, + "time_per_iteration": 2.509639263153076 + }, + { + "auxiliary_loss_clip": 0.0116496, + "auxiliary_loss_mlp": 0.01051938, + "balance_loss_clip": 1.05413365, + "balance_loss_mlp": 1.03084993, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 3.316340901072225, + "language_loss": 0.64828765, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67045659, + "num_input_tokens_seen": 48488515, + "step": 2243, + "time_per_iteration": 2.5329201221466064 + }, + { + "auxiliary_loss_clip": 0.01124521, + "auxiliary_loss_mlp": 0.01044089, + "balance_loss_clip": 1.04959655, + "balance_loss_mlp": 1.02416933, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.834418701429945, + "language_loss": 0.72754025, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74922633, + "num_input_tokens_seen": 48510515, + "step": 2244, + "time_per_iteration": 2.591916084289551 + }, + { + "auxiliary_loss_clip": 0.01150379, + "auxiliary_loss_mlp": 0.01046797, + "balance_loss_clip": 1.05388165, + "balance_loss_mlp": 1.02922583, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.6825866540382546, + "language_loss": 0.86239713, + "learning_rate": 3.88550929909221e-06, + "loss": 0.8843689, + "num_input_tokens_seen": 48529940, + "step": 2245, + "time_per_iteration": 2.5146827697753906 + }, + { + "auxiliary_loss_clip": 0.0114862, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_clip": 1.05235291, + "balance_loss_mlp": 1.02806032, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.8443911738379126, + "language_loss": 0.78833622, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81029165, + "num_input_tokens_seen": 48548190, + "step": 2246, + "time_per_iteration": 2.5010430812835693 + }, + { + "auxiliary_loss_clip": 0.01034574, + "auxiliary_loss_mlp": 0.01039933, + "balance_loss_clip": 1.03767061, + "balance_loss_mlp": 1.03590381, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7614789262469613, + "language_loss": 0.60618865, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62693375, + "num_input_tokens_seen": 48613165, + "step": 2247, + "time_per_iteration": 3.318683624267578 + }, + { + "auxiliary_loss_clip": 0.01161946, + "auxiliary_loss_mlp": 0.01052983, + "balance_loss_clip": 1.05788255, + "balance_loss_mlp": 1.03111982, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.4976555129548865, + "language_loss": 0.81075227, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83290148, + "num_input_tokens_seen": 48631705, + "step": 2248, + "time_per_iteration": 2.5000176429748535 + }, + { + "auxiliary_loss_clip": 0.01139548, + "auxiliary_loss_mlp": 0.0104347, + "balance_loss_clip": 1.0531553, + "balance_loss_mlp": 1.02530289, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 1.931658032844801, + "language_loss": 0.7689631, + "learning_rate": 3.884989205310157e-06, + "loss": 0.7907933, + "num_input_tokens_seen": 48649740, + "step": 2249, + "time_per_iteration": 2.567361354827881 + }, + { + "auxiliary_loss_clip": 0.01123101, + "auxiliary_loss_mlp": 0.01051764, + "balance_loss_clip": 1.05355668, + "balance_loss_mlp": 1.03345311, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.4971933233877974, + "language_loss": 0.84499663, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86674529, + "num_input_tokens_seen": 48671565, + "step": 2250, + "time_per_iteration": 2.6351797580718994 + }, + { + "auxiliary_loss_clip": 0.01153036, + "auxiliary_loss_mlp": 0.01046047, + "balance_loss_clip": 1.05263269, + "balance_loss_mlp": 1.02565026, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 2.417404150404338, + "language_loss": 0.8200022, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84199297, + "num_input_tokens_seen": 48690425, + "step": 2251, + "time_per_iteration": 2.5219478607177734 + }, + { + "auxiliary_loss_clip": 0.01162721, + "auxiliary_loss_mlp": 0.0104685, + "balance_loss_clip": 1.05275881, + "balance_loss_mlp": 1.02567828, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.7284399293995605, + "language_loss": 0.85800153, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88009727, + "num_input_tokens_seen": 48707505, + "step": 2252, + "time_per_iteration": 2.466893196105957 + }, + { + "auxiliary_loss_clip": 0.01076283, + "auxiliary_loss_mlp": 0.01002599, + "balance_loss_clip": 1.0430131, + "balance_loss_mlp": 0.99928534, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7668495589241298, + "language_loss": 0.61797643, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63876522, + "num_input_tokens_seen": 48775895, + "step": 2253, + "time_per_iteration": 3.204958200454712 + }, + { + "auxiliary_loss_clip": 0.0115278, + "auxiliary_loss_mlp": 0.01053616, + "balance_loss_clip": 1.05585551, + "balance_loss_mlp": 1.03487659, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 1.626941027465393, + "language_loss": 0.89146578, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91352975, + "num_input_tokens_seen": 48798370, + "step": 2254, + "time_per_iteration": 2.568239688873291 + }, + { + "auxiliary_loss_clip": 0.01132267, + "auxiliary_loss_mlp": 0.0105787, + "balance_loss_clip": 1.04951119, + "balance_loss_mlp": 1.03401589, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 4.340635432900999, + "language_loss": 0.84370488, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86560631, + "num_input_tokens_seen": 48817955, + "step": 2255, + "time_per_iteration": 2.53114652633667 + }, + { + "auxiliary_loss_clip": 0.01162188, + "auxiliary_loss_mlp": 0.01048614, + "balance_loss_clip": 1.05252182, + "balance_loss_mlp": 1.02911162, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 3.006749689354804, + "language_loss": 0.75487387, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77698183, + "num_input_tokens_seen": 48836330, + "step": 2256, + "time_per_iteration": 2.4890904426574707 + }, + { + "auxiliary_loss_clip": 0.01119789, + "auxiliary_loss_mlp": 0.01047574, + "balance_loss_clip": 1.04892993, + "balance_loss_mlp": 1.02675962, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 2.1474584147154965, + "language_loss": 0.83619034, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85786396, + "num_input_tokens_seen": 48851890, + "step": 2257, + "time_per_iteration": 2.5046608448028564 + }, + { + "auxiliary_loss_clip": 0.01144622, + "auxiliary_loss_mlp": 0.01136655, + "balance_loss_clip": 1.05074501, + "balance_loss_mlp": 1.59741688, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.964318462281257, + "language_loss": 0.82139659, + "learning_rate": 3.883814813262277e-06, + "loss": 0.84420931, + "num_input_tokens_seen": 48865510, + "step": 2258, + "time_per_iteration": 2.514435291290283 + }, + { + "auxiliary_loss_clip": 0.01151659, + "auxiliary_loss_mlp": 0.01049975, + "balance_loss_clip": 1.04992247, + "balance_loss_mlp": 1.02764678, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.271053524766643, + "language_loss": 0.82505202, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84706837, + "num_input_tokens_seen": 48882360, + "step": 2259, + "time_per_iteration": 2.4597957134246826 + }, + { + "auxiliary_loss_clip": 0.01121454, + "auxiliary_loss_mlp": 0.01054854, + "balance_loss_clip": 1.04950213, + "balance_loss_mlp": 1.03634083, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 2.07385119066564, + "language_loss": 0.73068082, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75244391, + "num_input_tokens_seen": 48902700, + "step": 2260, + "time_per_iteration": 2.5644311904907227 + }, + { + "auxiliary_loss_clip": 0.01136024, + "auxiliary_loss_mlp": 0.0105618, + "balance_loss_clip": 1.05080497, + "balance_loss_mlp": 1.03612888, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.714869700607332, + "language_loss": 0.75204843, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77397048, + "num_input_tokens_seen": 48922525, + "step": 2261, + "time_per_iteration": 2.54692006111145 + }, + { + "auxiliary_loss_clip": 0.01161226, + "auxiliary_loss_mlp": 0.01047018, + "balance_loss_clip": 1.04963684, + "balance_loss_mlp": 1.02776551, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 1.8550541441133357, + "language_loss": 0.6327132, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65479565, + "num_input_tokens_seen": 48942510, + "step": 2262, + "time_per_iteration": 2.565825939178467 + }, + { + "auxiliary_loss_clip": 0.01139478, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.04828572, + "balance_loss_mlp": 1.02507377, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.3077461344029704, + "language_loss": 0.82582843, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84767056, + "num_input_tokens_seen": 48962625, + "step": 2263, + "time_per_iteration": 2.5631628036499023 + }, + { + "auxiliary_loss_clip": 0.01100338, + "auxiliary_loss_mlp": 0.01061882, + "balance_loss_clip": 1.05430818, + "balance_loss_mlp": 1.03943503, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.9552005827508308, + "language_loss": 0.8795957, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90121794, + "num_input_tokens_seen": 48982525, + "step": 2264, + "time_per_iteration": 2.6664767265319824 + }, + { + "auxiliary_loss_clip": 0.01155279, + "auxiliary_loss_mlp": 0.0104504, + "balance_loss_clip": 1.05371094, + "balance_loss_mlp": 1.02339137, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 3.186987629311333, + "language_loss": 0.71239662, + "learning_rate": 3.882897396711683e-06, + "loss": 0.7343998, + "num_input_tokens_seen": 48997605, + "step": 2265, + "time_per_iteration": 2.464078903198242 + }, + { + "auxiliary_loss_clip": 0.01106619, + "auxiliary_loss_mlp": 0.01049545, + "balance_loss_clip": 1.05154991, + "balance_loss_mlp": 1.02983952, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.039058178950468, + "language_loss": 0.66550922, + "learning_rate": 3.882766051566027e-06, + "loss": 0.68707091, + "num_input_tokens_seen": 49018535, + "step": 2266, + "time_per_iteration": 2.681840419769287 + }, + { + "auxiliary_loss_clip": 0.01121816, + "auxiliary_loss_mlp": 0.01055946, + "balance_loss_clip": 1.05363643, + "balance_loss_mlp": 1.03633547, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 2.168834230618332, + "language_loss": 0.76659155, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78836912, + "num_input_tokens_seen": 49038865, + "step": 2267, + "time_per_iteration": 2.6286492347717285 + }, + { + "auxiliary_loss_clip": 0.01133054, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.05271792, + "balance_loss_mlp": 1.02883458, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 1.9708520209954268, + "language_loss": 0.82005966, + "learning_rate": 3.882503147095667e-06, + "loss": 0.84188753, + "num_input_tokens_seen": 49058010, + "step": 2268, + "time_per_iteration": 2.5393872261047363 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.01042817, + "balance_loss_clip": 1.05357718, + "balance_loss_mlp": 1.0231117, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 1.78313009344237, + "language_loss": 0.76058733, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78250957, + "num_input_tokens_seen": 49080330, + "step": 2269, + "time_per_iteration": 2.569965124130249 + }, + { + "auxiliary_loss_clip": 0.01128153, + "auxiliary_loss_mlp": 0.01044932, + "balance_loss_clip": 1.04854822, + "balance_loss_mlp": 1.02463078, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 2.108939899666033, + "language_loss": 0.80979359, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83152449, + "num_input_tokens_seen": 49097035, + "step": 2270, + "time_per_iteration": 2.5515968799591064 + }, + { + "auxiliary_loss_clip": 0.01139119, + "auxiliary_loss_mlp": 0.01059139, + "balance_loss_clip": 1.05348706, + "balance_loss_mlp": 1.03822947, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 2.424910639243959, + "language_loss": 0.7549057, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77688825, + "num_input_tokens_seen": 49113945, + "step": 2271, + "time_per_iteration": 3.8846676349639893 + }, + { + "auxiliary_loss_clip": 0.011509, + "auxiliary_loss_mlp": 0.01053547, + "balance_loss_clip": 1.04985595, + "balance_loss_mlp": 1.03287601, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 1.9492684807401375, + "language_loss": 0.80239272, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82443714, + "num_input_tokens_seen": 49132855, + "step": 2272, + "time_per_iteration": 3.9356212615966797 + }, + { + "auxiliary_loss_clip": 0.01072031, + "auxiliary_loss_mlp": 0.01005953, + "balance_loss_clip": 1.03757954, + "balance_loss_mlp": 1.00261521, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.6955958389701359, + "language_loss": 0.60689259, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62767243, + "num_input_tokens_seen": 49198310, + "step": 2273, + "time_per_iteration": 3.2053537368774414 + }, + { + "auxiliary_loss_clip": 0.0115854, + "auxiliary_loss_mlp": 0.01006597, + "balance_loss_clip": 1.05025077, + "balance_loss_mlp": 1.34720278, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.9366855426528444, + "language_loss": 0.78017938, + "learning_rate": 3.881712720611336e-06, + "loss": 0.80183077, + "num_input_tokens_seen": 49217250, + "step": 2274, + "time_per_iteration": 2.456404209136963 + }, + { + "auxiliary_loss_clip": 0.01148637, + "auxiliary_loss_mlp": 0.01047786, + "balance_loss_clip": 1.0497421, + "balance_loss_mlp": 1.02690029, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 1.729096842189411, + "language_loss": 0.78384799, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80581224, + "num_input_tokens_seen": 49236615, + "step": 2275, + "time_per_iteration": 2.52522873878479 + }, + { + "auxiliary_loss_clip": 0.01148427, + "auxiliary_loss_mlp": 0.01040375, + "balance_loss_clip": 1.05041981, + "balance_loss_mlp": 1.02118206, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.9095049429185926, + "language_loss": 0.81727517, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83916318, + "num_input_tokens_seen": 49253935, + "step": 2276, + "time_per_iteration": 3.882133960723877 + }, + { + "auxiliary_loss_clip": 0.0115809, + "auxiliary_loss_mlp": 0.01052592, + "balance_loss_clip": 1.05173635, + "balance_loss_mlp": 1.0293467, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 3.4740559928285606, + "language_loss": 0.69576287, + "learning_rate": 3.881316544012779e-06, + "loss": 0.7178697, + "num_input_tokens_seen": 49273605, + "step": 2277, + "time_per_iteration": 2.5422699451446533 + }, + { + "auxiliary_loss_clip": 0.01153221, + "auxiliary_loss_mlp": 0.00991125, + "balance_loss_clip": 1.05164742, + "balance_loss_mlp": 1.32036018, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.724646474410736, + "language_loss": 0.8041321, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82557553, + "num_input_tokens_seen": 49291785, + "step": 2278, + "time_per_iteration": 2.5324742794036865 + }, + { + "auxiliary_loss_clip": 0.01150416, + "auxiliary_loss_mlp": 0.01055241, + "balance_loss_clip": 1.05662704, + "balance_loss_mlp": 1.03504694, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 3.683030160081133, + "language_loss": 0.74840397, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77046055, + "num_input_tokens_seen": 49311405, + "step": 2279, + "time_per_iteration": 4.1052680015563965 + }, + { + "auxiliary_loss_clip": 0.01096454, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.05040097, + "balance_loss_mlp": 1.03006852, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 1.8697763356274355, + "language_loss": 0.76655257, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78802341, + "num_input_tokens_seen": 49331835, + "step": 2280, + "time_per_iteration": 2.6671535968780518 + }, + { + "auxiliary_loss_clip": 0.01102642, + "auxiliary_loss_mlp": 0.01048217, + "balance_loss_clip": 1.04763889, + "balance_loss_mlp": 1.02877355, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 2.8537288063464477, + "language_loss": 0.80241853, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82392704, + "num_input_tokens_seen": 49352290, + "step": 2281, + "time_per_iteration": 2.70766282081604 + }, + { + "auxiliary_loss_clip": 0.01165081, + "auxiliary_loss_mlp": 0.01057028, + "balance_loss_clip": 1.05192637, + "balance_loss_mlp": 1.03733516, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.9158884362633664, + "language_loss": 0.83719778, + "learning_rate": 3.880654822954518e-06, + "loss": 0.85941887, + "num_input_tokens_seen": 49370285, + "step": 2282, + "time_per_iteration": 2.4461870193481445 + }, + { + "auxiliary_loss_clip": 0.01136096, + "auxiliary_loss_mlp": 0.01053781, + "balance_loss_clip": 1.04872906, + "balance_loss_mlp": 1.03488588, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.7676907249816516, + "language_loss": 0.73718059, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.75907934, + "num_input_tokens_seen": 49389610, + "step": 2283, + "time_per_iteration": 2.522402286529541 + }, + { + "auxiliary_loss_clip": 0.01148974, + "auxiliary_loss_mlp": 0.01049373, + "balance_loss_clip": 1.05155337, + "balance_loss_mlp": 1.03089523, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 3.1731879068584425, + "language_loss": 0.84101921, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86300272, + "num_input_tokens_seen": 49408390, + "step": 2284, + "time_per_iteration": 2.51814603805542 + }, + { + "auxiliary_loss_clip": 0.01139919, + "auxiliary_loss_mlp": 0.01049953, + "balance_loss_clip": 1.04867959, + "balance_loss_mlp": 1.02907896, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.914265745262067, + "language_loss": 0.75035793, + "learning_rate": 3.880256934503974e-06, + "loss": 0.77225661, + "num_input_tokens_seen": 49427725, + "step": 2285, + "time_per_iteration": 2.587636947631836 + }, + { + "auxiliary_loss_clip": 0.01136329, + "auxiliary_loss_mlp": 0.01047484, + "balance_loss_clip": 1.04937112, + "balance_loss_mlp": 1.02820754, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 2.02130645464963, + "language_loss": 0.74751627, + "learning_rate": 3.880124162414689e-06, + "loss": 0.76935434, + "num_input_tokens_seen": 49449000, + "step": 2286, + "time_per_iteration": 2.5668067932128906 + }, + { + "auxiliary_loss_clip": 0.01118516, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.05126083, + "balance_loss_mlp": 1.02292132, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.715533927091294, + "language_loss": 0.86311233, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88473594, + "num_input_tokens_seen": 49468360, + "step": 2287, + "time_per_iteration": 2.6463422775268555 + }, + { + "auxiliary_loss_clip": 0.01115225, + "auxiliary_loss_mlp": 0.01050458, + "balance_loss_clip": 1.04635429, + "balance_loss_mlp": 1.02933407, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 1.8465752684015717, + "language_loss": 0.68656731, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70822418, + "num_input_tokens_seen": 49493450, + "step": 2288, + "time_per_iteration": 2.6856462955474854 + }, + { + "auxiliary_loss_clip": 0.01103213, + "auxiliary_loss_mlp": 0.0106579, + "balance_loss_clip": 1.0493319, + "balance_loss_mlp": 1.04440439, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 2.4099122478449, + "language_loss": 0.86997288, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89166296, + "num_input_tokens_seen": 49511220, + "step": 2289, + "time_per_iteration": 2.6050283908843994 + }, + { + "auxiliary_loss_clip": 0.01123599, + "auxiliary_loss_mlp": 0.00966133, + "balance_loss_clip": 1.04715586, + "balance_loss_mlp": 1.27487338, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 2.0304873146220057, + "language_loss": 0.74671936, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76761663, + "num_input_tokens_seen": 49529820, + "step": 2290, + "time_per_iteration": 2.554211378097534 + }, + { + "auxiliary_loss_clip": 0.01068571, + "auxiliary_loss_mlp": 0.0101112, + "balance_loss_clip": 1.05483317, + "balance_loss_mlp": 1.00823462, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7070347179497773, + "language_loss": 0.51645386, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53725082, + "num_input_tokens_seen": 49595325, + "step": 2291, + "time_per_iteration": 3.2365164756774902 + }, + { + "auxiliary_loss_clip": 0.01149434, + "auxiliary_loss_mlp": 0.01044375, + "balance_loss_clip": 1.04894972, + "balance_loss_mlp": 1.02389455, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 2.0970545367447504, + "language_loss": 0.71563935, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73757744, + "num_input_tokens_seen": 49615850, + "step": 2292, + "time_per_iteration": 2.5200703144073486 + }, + { + "auxiliary_loss_clip": 0.01146216, + "auxiliary_loss_mlp": 0.01042112, + "balance_loss_clip": 1.04830635, + "balance_loss_mlp": 1.02290738, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 3.044581848004234, + "language_loss": 0.79861236, + "learning_rate": 3.879192761826071e-06, + "loss": 0.82049572, + "num_input_tokens_seen": 49631860, + "step": 2293, + "time_per_iteration": 2.4658217430114746 + }, + { + "auxiliary_loss_clip": 0.0114433, + "auxiliary_loss_mlp": 0.01046829, + "balance_loss_clip": 1.04671526, + "balance_loss_mlp": 1.02745759, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.190518617487034, + "language_loss": 0.78096801, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80287957, + "num_input_tokens_seen": 49652145, + "step": 2294, + "time_per_iteration": 2.551774501800537 + }, + { + "auxiliary_loss_clip": 0.01115299, + "auxiliary_loss_mlp": 0.01043677, + "balance_loss_clip": 1.04824758, + "balance_loss_mlp": 1.02608132, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.3300075200369426, + "language_loss": 0.80142468, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82301444, + "num_input_tokens_seen": 49669880, + "step": 2295, + "time_per_iteration": 2.552255630493164 + }, + { + "auxiliary_loss_clip": 0.01142868, + "auxiliary_loss_mlp": 0.0104394, + "balance_loss_clip": 1.04779553, + "balance_loss_mlp": 1.02458072, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.905518745913337, + "language_loss": 0.78820127, + "learning_rate": 3.878792521156588e-06, + "loss": 0.81006938, + "num_input_tokens_seen": 49687255, + "step": 2296, + "time_per_iteration": 2.4935050010681152 + }, + { + "auxiliary_loss_clip": 0.01145252, + "auxiliary_loss_mlp": 0.0105468, + "balance_loss_clip": 1.05113292, + "balance_loss_mlp": 1.03433061, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 2.141943798818568, + "language_loss": 0.78492808, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80692744, + "num_input_tokens_seen": 49706650, + "step": 2297, + "time_per_iteration": 2.5190489292144775 + }, + { + "auxiliary_loss_clip": 0.01102042, + "auxiliary_loss_mlp": 0.01047479, + "balance_loss_clip": 1.05065906, + "balance_loss_mlp": 1.02852464, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.5837008613617636, + "language_loss": 0.68829018, + "learning_rate": 3.878525337815164e-06, + "loss": 0.70978534, + "num_input_tokens_seen": 49725715, + "step": 2298, + "time_per_iteration": 2.6319966316223145 + }, + { + "auxiliary_loss_clip": 0.01127883, + "auxiliary_loss_mlp": 0.01046483, + "balance_loss_clip": 1.04813814, + "balance_loss_mlp": 1.02681375, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 2.8443869097795424, + "language_loss": 0.86913908, + "learning_rate": 3.878391639291116e-06, + "loss": 0.89088279, + "num_input_tokens_seen": 49744710, + "step": 2299, + "time_per_iteration": 2.5411484241485596 + }, + { + "auxiliary_loss_clip": 0.01154852, + "auxiliary_loss_mlp": 0.01048135, + "balance_loss_clip": 1.04835176, + "balance_loss_mlp": 1.02823901, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.248743651878821, + "language_loss": 0.75596237, + "learning_rate": 3.878257869538267e-06, + "loss": 0.77799225, + "num_input_tokens_seen": 49764300, + "step": 2300, + "time_per_iteration": 2.494223117828369 + }, + { + "auxiliary_loss_clip": 0.01122992, + "auxiliary_loss_mlp": 0.01040171, + "balance_loss_clip": 1.05078089, + "balance_loss_mlp": 1.02152634, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.3585778129568444, + "language_loss": 0.82362247, + "learning_rate": 3.878124028561692e-06, + "loss": 0.845254, + "num_input_tokens_seen": 49778380, + "step": 2301, + "time_per_iteration": 2.5430617332458496 + }, + { + "auxiliary_loss_clip": 0.01126895, + "auxiliary_loss_mlp": 0.00955128, + "balance_loss_clip": 1.0474999, + "balance_loss_mlp": 1.26831317, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 5.303328922028776, + "language_loss": 0.85784739, + "learning_rate": 3.877990116366466e-06, + "loss": 0.87866771, + "num_input_tokens_seen": 49797460, + "step": 2302, + "time_per_iteration": 2.5766992568969727 + }, + { + "auxiliary_loss_clip": 0.01071722, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.04001153, + "balance_loss_mlp": 1.03040981, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7569729372277287, + "language_loss": 0.65610898, + "learning_rate": 3.877856132957667e-06, + "loss": 0.67715645, + "num_input_tokens_seen": 49868005, + "step": 2303, + "time_per_iteration": 3.2609856128692627 + }, + { + "auxiliary_loss_clip": 0.01140572, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.04766881, + "balance_loss_mlp": 1.01987529, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 1.9846981662904268, + "language_loss": 0.78875911, + "learning_rate": 3.877722078340374e-06, + "loss": 0.81054038, + "num_input_tokens_seen": 49885825, + "step": 2304, + "time_per_iteration": 2.5090043544769287 + }, + { + "auxiliary_loss_clip": 0.01146448, + "auxiliary_loss_mlp": 0.0103769, + "balance_loss_clip": 1.05027914, + "balance_loss_mlp": 1.01953459, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.988349723931265, + "language_loss": 0.7780304, + "learning_rate": 3.877587952519672e-06, + "loss": 0.7998718, + "num_input_tokens_seen": 49905975, + "step": 2305, + "time_per_iteration": 2.509162664413452 + }, + { + "auxiliary_loss_clip": 0.01070102, + "auxiliary_loss_mlp": 0.01044866, + "balance_loss_clip": 1.04341972, + "balance_loss_mlp": 1.02595925, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 1.9203680995644727, + "language_loss": 0.87776464, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89891434, + "num_input_tokens_seen": 49925800, + "step": 2306, + "time_per_iteration": 2.67519474029541 + }, + { + "auxiliary_loss_clip": 0.01078296, + "auxiliary_loss_mlp": 0.01004181, + "balance_loss_clip": 1.03114533, + "balance_loss_mlp": 1.00146282, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8773613800815611, + "language_loss": 0.59049106, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61131585, + "num_input_tokens_seen": 49977620, + "step": 2307, + "time_per_iteration": 3.1401150226593018 + }, + { + "auxiliary_loss_clip": 0.01160115, + "auxiliary_loss_mlp": 0.00951028, + "balance_loss_clip": 1.05057335, + "balance_loss_mlp": 1.26436496, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 2.0526371189203036, + "language_loss": 0.79385793, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81496936, + "num_input_tokens_seen": 49996650, + "step": 2308, + "time_per_iteration": 2.5025112628936768 + }, + { + "auxiliary_loss_clip": 0.01120328, + "auxiliary_loss_mlp": 0.01040861, + "balance_loss_clip": 1.04822564, + "balance_loss_mlp": 1.02187061, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.5174666685254095, + "language_loss": 0.77688539, + "learning_rate": 3.877050737304533e-06, + "loss": 0.7984972, + "num_input_tokens_seen": 50015640, + "step": 2309, + "time_per_iteration": 2.5749220848083496 + }, + { + "auxiliary_loss_clip": 0.01120434, + "auxiliary_loss_mlp": 0.01043155, + "balance_loss_clip": 1.04771185, + "balance_loss_mlp": 1.02436769, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 2.410427148926027, + "language_loss": 0.67824566, + "learning_rate": 3.876916255543129e-06, + "loss": 0.69988155, + "num_input_tokens_seen": 50033500, + "step": 2310, + "time_per_iteration": 3.979099988937378 + }, + { + "auxiliary_loss_clip": 0.01156701, + "auxiliary_loss_mlp": 0.01045293, + "balance_loss_clip": 1.0491786, + "balance_loss_mlp": 1.02521801, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 2.3216118989655286, + "language_loss": 0.83904058, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86106056, + "num_input_tokens_seen": 50050075, + "step": 2311, + "time_per_iteration": 3.834423542022705 + }, + { + "auxiliary_loss_clip": 0.01161533, + "auxiliary_loss_mlp": 0.01043047, + "balance_loss_clip": 1.05083394, + "balance_loss_mlp": 1.02425933, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 3.841084755412252, + "language_loss": 0.82023889, + "learning_rate": 3.876647078506866e-06, + "loss": 0.84228468, + "num_input_tokens_seen": 50070080, + "step": 2312, + "time_per_iteration": 2.509683847427368 + }, + { + "auxiliary_loss_clip": 0.01117831, + "auxiliary_loss_mlp": 0.00940582, + "balance_loss_clip": 1.04907608, + "balance_loss_mlp": 1.24452209, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 2.673175701754439, + "language_loss": 0.86887813, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88946223, + "num_input_tokens_seen": 50090040, + "step": 2313, + "time_per_iteration": 2.6080315113067627 + }, + { + "auxiliary_loss_clip": 0.01156216, + "auxiliary_loss_mlp": 0.01047957, + "balance_loss_clip": 1.05036461, + "balance_loss_mlp": 1.02903867, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 2.1793622707873954, + "language_loss": 0.79971445, + "learning_rate": 3.876377616820024e-06, + "loss": 0.82175612, + "num_input_tokens_seen": 50110595, + "step": 2314, + "time_per_iteration": 3.90889310836792 + }, + { + "auxiliary_loss_clip": 0.01117439, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.04879045, + "balance_loss_mlp": 1.02220333, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 2.9176018570663196, + "language_loss": 0.85337019, + "learning_rate": 3.876242779245409e-06, + "loss": 0.87495291, + "num_input_tokens_seen": 50125430, + "step": 2315, + "time_per_iteration": 2.5211880207061768 + }, + { + "auxiliary_loss_clip": 0.01145014, + "auxiliary_loss_mlp": 0.01041614, + "balance_loss_clip": 1.04780459, + "balance_loss_mlp": 1.02245688, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.287500138878055, + "language_loss": 0.77211845, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79398471, + "num_input_tokens_seen": 50144120, + "step": 2316, + "time_per_iteration": 2.5094428062438965 + }, + { + "auxiliary_loss_clip": 0.01153948, + "auxiliary_loss_mlp": 0.00939395, + "balance_loss_clip": 1.04812002, + "balance_loss_mlp": 1.24493027, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.995780809231942, + "language_loss": 0.76920104, + "learning_rate": 3.875972890659349e-06, + "loss": 0.79013443, + "num_input_tokens_seen": 50162500, + "step": 2317, + "time_per_iteration": 2.474806547164917 + }, + { + "auxiliary_loss_clip": 0.01133562, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_clip": 1.04748893, + "balance_loss_mlp": 1.02284956, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 2.2984703660616828, + "language_loss": 0.80349118, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82523894, + "num_input_tokens_seen": 50182415, + "step": 2318, + "time_per_iteration": 4.003982782363892 + }, + { + "auxiliary_loss_clip": 0.01051464, + "auxiliary_loss_mlp": 0.01007544, + "balance_loss_clip": 1.02659345, + "balance_loss_mlp": 1.00514817, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.872366834080059, + "language_loss": 0.59051883, + "learning_rate": 3.87570271752497e-06, + "loss": 0.6111089, + "num_input_tokens_seen": 50245160, + "step": 2319, + "time_per_iteration": 3.1743321418762207 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01048904, + "balance_loss_clip": 1.04519451, + "balance_loss_mlp": 1.02934158, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.224814398313495, + "language_loss": 0.6545074, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67611176, + "num_input_tokens_seen": 50268215, + "step": 2320, + "time_per_iteration": 2.699268102645874 + }, + { + "auxiliary_loss_clip": 0.01092786, + "auxiliary_loss_mlp": 0.01047179, + "balance_loss_clip": 1.04770553, + "balance_loss_mlp": 1.02796245, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.5100877764740497, + "language_loss": 0.7070905, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72849011, + "num_input_tokens_seen": 50288575, + "step": 2321, + "time_per_iteration": 2.6275782585144043 + }, + { + "auxiliary_loss_clip": 0.01110368, + "auxiliary_loss_mlp": 0.01060198, + "balance_loss_clip": 1.04382467, + "balance_loss_mlp": 1.03845382, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 1.9847061420442231, + "language_loss": 0.86002183, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88172746, + "num_input_tokens_seen": 50308735, + "step": 2322, + "time_per_iteration": 2.6159567832946777 + }, + { + "auxiliary_loss_clip": 0.0111402, + "auxiliary_loss_mlp": 0.01047734, + "balance_loss_clip": 1.04425669, + "balance_loss_mlp": 1.02957857, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 2.4228963497799367, + "language_loss": 0.67399824, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69561583, + "num_input_tokens_seen": 50331025, + "step": 2323, + "time_per_iteration": 2.713493824005127 + }, + { + "auxiliary_loss_clip": 0.01120014, + "auxiliary_loss_mlp": 0.01046442, + "balance_loss_clip": 1.04763234, + "balance_loss_mlp": 1.027071, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 1.9718052121175653, + "language_loss": 0.88877928, + "learning_rate": 3.875026040059175e-06, + "loss": 0.9104439, + "num_input_tokens_seen": 50349725, + "step": 2324, + "time_per_iteration": 2.5433168411254883 + }, + { + "auxiliary_loss_clip": 0.01143469, + "auxiliary_loss_mlp": 0.01050386, + "balance_loss_clip": 1.04609513, + "balance_loss_mlp": 1.03112173, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 3.0263083909692226, + "language_loss": 0.71444559, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.73638415, + "num_input_tokens_seen": 50367965, + "step": 2325, + "time_per_iteration": 2.5170705318450928 + }, + { + "auxiliary_loss_clip": 0.01126195, + "auxiliary_loss_mlp": 0.00921269, + "balance_loss_clip": 1.04889286, + "balance_loss_mlp": 1.21076131, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 1.9107831827321762, + "language_loss": 0.81782711, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83830178, + "num_input_tokens_seen": 50385605, + "step": 2326, + "time_per_iteration": 2.5563580989837646 + }, + { + "auxiliary_loss_clip": 0.01142762, + "auxiliary_loss_mlp": 0.01042677, + "balance_loss_clip": 1.04927969, + "balance_loss_mlp": 1.02476001, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 2.169482508787621, + "language_loss": 0.89070171, + "learning_rate": 3.874619180324534e-06, + "loss": 0.91255605, + "num_input_tokens_seen": 50403985, + "step": 2327, + "time_per_iteration": 2.5271570682525635 + }, + { + "auxiliary_loss_clip": 0.01115175, + "auxiliary_loss_mlp": 0.01059997, + "balance_loss_clip": 1.04857409, + "balance_loss_mlp": 1.0397675, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.132554526858539, + "language_loss": 0.85315973, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87491143, + "num_input_tokens_seen": 50421590, + "step": 2328, + "time_per_iteration": 2.554732322692871 + }, + { + "auxiliary_loss_clip": 0.01142902, + "auxiliary_loss_mlp": 0.01043216, + "balance_loss_clip": 1.04781806, + "balance_loss_mlp": 1.02416646, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 2.218225721326118, + "language_loss": 0.73846287, + "learning_rate": 3.874347585064131e-06, + "loss": 0.760324, + "num_input_tokens_seen": 50443945, + "step": 2329, + "time_per_iteration": 2.570281505584717 + }, + { + "auxiliary_loss_clip": 0.01144987, + "auxiliary_loss_mlp": 0.01047085, + "balance_loss_clip": 1.04784083, + "balance_loss_mlp": 1.02715349, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 2.368847262634538, + "language_loss": 0.78365093, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80557162, + "num_input_tokens_seen": 50462065, + "step": 2330, + "time_per_iteration": 2.4848597049713135 + }, + { + "auxiliary_loss_clip": 0.01135589, + "auxiliary_loss_mlp": 0.01042778, + "balance_loss_clip": 1.04652834, + "balance_loss_mlp": 1.02407432, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 2.3785602183077, + "language_loss": 0.72316593, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74494964, + "num_input_tokens_seen": 50479565, + "step": 2331, + "time_per_iteration": 2.51332426071167 + }, + { + "auxiliary_loss_clip": 0.0115392, + "auxiliary_loss_mlp": 0.01054106, + "balance_loss_clip": 1.05134177, + "balance_loss_mlp": 1.03528285, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.703069969585632, + "language_loss": 0.7249344, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74701464, + "num_input_tokens_seen": 50497305, + "step": 2332, + "time_per_iteration": 2.461261749267578 + }, + { + "auxiliary_loss_clip": 0.01069512, + "auxiliary_loss_mlp": 0.01004913, + "balance_loss_clip": 1.03079426, + "balance_loss_mlp": 1.00178957, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8292856690048175, + "language_loss": 0.56106591, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58181012, + "num_input_tokens_seen": 50549735, + "step": 2333, + "time_per_iteration": 2.972109317779541 + }, + { + "auxiliary_loss_clip": 0.01123975, + "auxiliary_loss_mlp": 0.01047276, + "balance_loss_clip": 1.05167317, + "balance_loss_mlp": 1.02846515, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 2.3558414455096384, + "language_loss": 0.82983583, + "learning_rate": 3.873667353183016e-06, + "loss": 0.85154831, + "num_input_tokens_seen": 50570100, + "step": 2334, + "time_per_iteration": 2.617372751235962 + }, + { + "auxiliary_loss_clip": 0.0112794, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.05133891, + "balance_loss_mlp": 1.02966738, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 2.6472498371203574, + "language_loss": 0.80997992, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83173561, + "num_input_tokens_seen": 50589185, + "step": 2335, + "time_per_iteration": 2.616795778274536 + }, + { + "auxiliary_loss_clip": 0.01109947, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.05299914, + "balance_loss_mlp": 1.02606964, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.8835121504200492, + "language_loss": 0.82102263, + "learning_rate": 3.873394763046862e-06, + "loss": 0.84261286, + "num_input_tokens_seen": 50609645, + "step": 2336, + "time_per_iteration": 2.614981174468994 + }, + { + "auxiliary_loss_clip": 0.01147727, + "auxiliary_loss_mlp": 0.01050173, + "balance_loss_clip": 1.05568528, + "balance_loss_mlp": 1.03005028, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.8328119586457146, + "language_loss": 0.80571866, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82769763, + "num_input_tokens_seen": 50628385, + "step": 2337, + "time_per_iteration": 2.5789742469787598 + }, + { + "auxiliary_loss_clip": 0.01144204, + "auxiliary_loss_mlp": 0.01047553, + "balance_loss_clip": 1.05084181, + "balance_loss_mlp": 1.02886081, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 1.9742930519734958, + "language_loss": 0.79585469, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81777227, + "num_input_tokens_seen": 50647260, + "step": 2338, + "time_per_iteration": 2.5856409072875977 + }, + { + "auxiliary_loss_clip": 0.0115211, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_clip": 1.05629122, + "balance_loss_mlp": 1.02501476, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.442499847692067, + "language_loss": 0.80119693, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82317394, + "num_input_tokens_seen": 50666130, + "step": 2339, + "time_per_iteration": 2.5583200454711914 + }, + { + "auxiliary_loss_clip": 0.01095591, + "auxiliary_loss_mlp": 0.01053299, + "balance_loss_clip": 1.04831469, + "balance_loss_mlp": 1.0336895, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.8808790666082227, + "language_loss": 0.65868014, + "learning_rate": 3.872848730344146e-06, + "loss": 0.68016905, + "num_input_tokens_seen": 50687440, + "step": 2340, + "time_per_iteration": 2.792349338531494 + }, + { + "auxiliary_loss_clip": 0.01143736, + "auxiliary_loss_mlp": 0.01046777, + "balance_loss_clip": 1.05116522, + "balance_loss_mlp": 1.02741706, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.8043356207839296, + "language_loss": 0.78892392, + "learning_rate": 3.87271204460899e-06, + "loss": 0.81082904, + "num_input_tokens_seen": 50704030, + "step": 2341, + "time_per_iteration": 2.4812891483306885 + }, + { + "auxiliary_loss_clip": 0.01156651, + "auxiliary_loss_mlp": 0.01047817, + "balance_loss_clip": 1.05123103, + "balance_loss_mlp": 1.02906525, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 2.353033530720655, + "language_loss": 0.80257642, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.8246212, + "num_input_tokens_seen": 50723305, + "step": 2342, + "time_per_iteration": 2.516427755355835 + }, + { + "auxiliary_loss_clip": 0.0114877, + "auxiliary_loss_mlp": 0.01042877, + "balance_loss_clip": 1.05727243, + "balance_loss_mlp": 1.02482891, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 2.225795725898041, + "language_loss": 0.77608055, + "learning_rate": 3.87243846010358e-06, + "loss": 0.797997, + "num_input_tokens_seen": 50743270, + "step": 2343, + "time_per_iteration": 2.540926456451416 + }, + { + "auxiliary_loss_clip": 0.01062581, + "auxiliary_loss_mlp": 0.01003326, + "balance_loss_clip": 1.03434563, + "balance_loss_mlp": 1.00046468, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8436246852505997, + "language_loss": 0.61455333, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63521242, + "num_input_tokens_seen": 50802710, + "step": 2344, + "time_per_iteration": 3.0747182369232178 + }, + { + "auxiliary_loss_clip": 0.01140755, + "auxiliary_loss_mlp": 0.01042702, + "balance_loss_clip": 1.04659581, + "balance_loss_mlp": 1.02534556, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.5533608229908993, + "language_loss": 0.64724469, + "learning_rate": 3.872164591585956e-06, + "loss": 0.6690793, + "num_input_tokens_seen": 50822625, + "step": 2345, + "time_per_iteration": 2.5403249263763428 + }, + { + "auxiliary_loss_clip": 0.01151247, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.04772937, + "balance_loss_mlp": 1.02183163, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.1710525329230896, + "language_loss": 0.7372272, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.75916743, + "num_input_tokens_seen": 50842330, + "step": 2346, + "time_per_iteration": 2.521197557449341 + }, + { + "auxiliary_loss_clip": 0.01145804, + "auxiliary_loss_mlp": 0.0104394, + "balance_loss_clip": 1.05222726, + "balance_loss_mlp": 1.02422285, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 2.11144005097669, + "language_loss": 0.77865791, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.80055535, + "num_input_tokens_seen": 50861035, + "step": 2347, + "time_per_iteration": 2.4952824115753174 + }, + { + "auxiliary_loss_clip": 0.01159488, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_clip": 1.05093908, + "balance_loss_mlp": 1.02966452, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 2.7973745479466285, + "language_loss": 0.7680254, + "learning_rate": 3.8717532563775e-06, + "loss": 0.79009736, + "num_input_tokens_seen": 50880105, + "step": 2348, + "time_per_iteration": 3.8984763622283936 + }, + { + "auxiliary_loss_clip": 0.01144058, + "auxiliary_loss_mlp": 0.01046217, + "balance_loss_clip": 1.04949188, + "balance_loss_mlp": 1.02645206, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.7811837062399303, + "language_loss": 0.86548471, + "learning_rate": 3.871616002680272e-06, + "loss": 0.88738751, + "num_input_tokens_seen": 50897720, + "step": 2349, + "time_per_iteration": 2.504899501800537 + }, + { + "auxiliary_loss_clip": 0.01146344, + "auxiliary_loss_mlp": 0.01047456, + "balance_loss_clip": 1.05331147, + "balance_loss_mlp": 1.0291214, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.744153400578831, + "language_loss": 0.88802528, + "learning_rate": 3.871478678011177e-06, + "loss": 0.90996331, + "num_input_tokens_seen": 50918385, + "step": 2350, + "time_per_iteration": 3.9522485733032227 + }, + { + "auxiliary_loss_clip": 0.01139015, + "auxiliary_loss_mlp": 0.01046643, + "balance_loss_clip": 1.05113626, + "balance_loss_mlp": 1.02601993, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.9686023028834925, + "language_loss": 0.80954373, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83140033, + "num_input_tokens_seen": 50938270, + "step": 2351, + "time_per_iteration": 2.563307523727417 + }, + { + "auxiliary_loss_clip": 0.01145942, + "auxiliary_loss_mlp": 0.01042829, + "balance_loss_clip": 1.0491271, + "balance_loss_mlp": 1.02338612, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.9775861775309647, + "language_loss": 0.84139276, + "learning_rate": 3.871203815778219e-06, + "loss": 0.86328042, + "num_input_tokens_seen": 50958155, + "step": 2352, + "time_per_iteration": 2.5839009284973145 + }, + { + "auxiliary_loss_clip": 0.01074234, + "auxiliary_loss_mlp": 0.0101349, + "balance_loss_clip": 1.03581882, + "balance_loss_mlp": 1.0105691, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.9108708315505235, + "language_loss": 0.61892247, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63979971, + "num_input_tokens_seen": 51020705, + "step": 2353, + "time_per_iteration": 4.441821813583374 + }, + { + "auxiliary_loss_clip": 0.01132038, + "auxiliary_loss_mlp": 0.0104884, + "balance_loss_clip": 1.05330324, + "balance_loss_mlp": 1.03035092, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.7911081258702715, + "language_loss": 0.8733592, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89516801, + "num_input_tokens_seen": 51039995, + "step": 2354, + "time_per_iteration": 2.6256446838378906 + }, + { + "auxiliary_loss_clip": 0.01118874, + "auxiliary_loss_mlp": 0.01046174, + "balance_loss_clip": 1.0518589, + "balance_loss_mlp": 1.0258007, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 2.353340887008484, + "language_loss": 0.74907368, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77072418, + "num_input_tokens_seen": 51059075, + "step": 2355, + "time_per_iteration": 2.5900442600250244 + }, + { + "auxiliary_loss_clip": 0.01072228, + "auxiliary_loss_mlp": 0.01003831, + "balance_loss_clip": 1.03406572, + "balance_loss_mlp": 1.0008868, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6776336419900828, + "language_loss": 0.5181582, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53891879, + "num_input_tokens_seen": 51120380, + "step": 2356, + "time_per_iteration": 4.516743421554565 + }, + { + "auxiliary_loss_clip": 0.01157027, + "auxiliary_loss_mlp": 0.01050056, + "balance_loss_clip": 1.04953122, + "balance_loss_mlp": 1.03070796, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 2.432961504733317, + "language_loss": 0.70673931, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72881019, + "num_input_tokens_seen": 51136950, + "step": 2357, + "time_per_iteration": 2.529110908508301 + }, + { + "auxiliary_loss_clip": 0.01111861, + "auxiliary_loss_mlp": 0.01054162, + "balance_loss_clip": 1.04524815, + "balance_loss_mlp": 1.03496933, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.8516710233737657, + "language_loss": 0.82284719, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84450746, + "num_input_tokens_seen": 51155175, + "step": 2358, + "time_per_iteration": 2.6158409118652344 + }, + { + "auxiliary_loss_clip": 0.0113962, + "auxiliary_loss_mlp": 0.01048205, + "balance_loss_clip": 1.04979026, + "balance_loss_mlp": 1.02823782, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 2.6389974228093855, + "language_loss": 0.72568476, + "learning_rate": 3.870239563115436e-06, + "loss": 0.747563, + "num_input_tokens_seen": 51174500, + "step": 2359, + "time_per_iteration": 2.5967633724212646 + }, + { + "auxiliary_loss_clip": 0.01114598, + "auxiliary_loss_mlp": 0.00891882, + "balance_loss_clip": 1.06378746, + "balance_loss_mlp": 1.16804254, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 2.189460943415681, + "language_loss": 0.75498432, + "learning_rate": 3.870101529014526e-06, + "loss": 0.77504909, + "num_input_tokens_seen": 51194270, + "step": 2360, + "time_per_iteration": 2.625854253768921 + }, + { + "auxiliary_loss_clip": 0.01104416, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_clip": 1.04892302, + "balance_loss_mlp": 1.02754354, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 3.1749598234963394, + "language_loss": 0.81500852, + "learning_rate": 3.869963423999178e-06, + "loss": 0.83654052, + "num_input_tokens_seen": 51211850, + "step": 2361, + "time_per_iteration": 2.5813772678375244 + }, + { + "auxiliary_loss_clip": 0.01143059, + "auxiliary_loss_mlp": 0.01050351, + "balance_loss_clip": 1.04883194, + "balance_loss_mlp": 1.03006184, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 2.1968476439628852, + "language_loss": 0.7432307, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76516485, + "num_input_tokens_seen": 51233545, + "step": 2362, + "time_per_iteration": 2.5793187618255615 + }, + { + "auxiliary_loss_clip": 0.01149184, + "auxiliary_loss_mlp": 0.01045051, + "balance_loss_clip": 1.05184603, + "balance_loss_mlp": 1.02479768, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 2.021978718972557, + "language_loss": 0.7405777, + "learning_rate": 3.869687001246122e-06, + "loss": 0.76252007, + "num_input_tokens_seen": 51257615, + "step": 2363, + "time_per_iteration": 2.6722002029418945 + }, + { + "auxiliary_loss_clip": 0.01120905, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_clip": 1.04627252, + "balance_loss_mlp": 1.02530575, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.897988213137336, + "language_loss": 0.72725266, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.74891818, + "num_input_tokens_seen": 51279645, + "step": 2364, + "time_per_iteration": 2.635650396347046 + }, + { + "auxiliary_loss_clip": 0.01133052, + "auxiliary_loss_mlp": 0.01043589, + "balance_loss_clip": 1.04663527, + "balance_loss_mlp": 1.02571917, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 1.8514624265372541, + "language_loss": 0.90988708, + "learning_rate": 3.869410294898195e-06, + "loss": 0.9316535, + "num_input_tokens_seen": 51299775, + "step": 2365, + "time_per_iteration": 2.5796475410461426 + }, + { + "auxiliary_loss_clip": 0.01119466, + "auxiliary_loss_mlp": 0.01049858, + "balance_loss_clip": 1.04390836, + "balance_loss_mlp": 1.02927017, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 2.10264149058101, + "language_loss": 0.65041018, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67210346, + "num_input_tokens_seen": 51319430, + "step": 2366, + "time_per_iteration": 2.570834159851074 + }, + { + "auxiliary_loss_clip": 0.01139515, + "auxiliary_loss_mlp": 0.01049053, + "balance_loss_clip": 1.05209172, + "balance_loss_mlp": 1.02900195, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.030600819027385, + "language_loss": 0.8067565, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82864219, + "num_input_tokens_seen": 51336045, + "step": 2367, + "time_per_iteration": 2.4998860359191895 + }, + { + "auxiliary_loss_clip": 0.01134484, + "auxiliary_loss_mlp": 0.01055894, + "balance_loss_clip": 1.05267739, + "balance_loss_mlp": 1.03505635, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 1.838936507219784, + "language_loss": 0.82819843, + "learning_rate": 3.868994703727742e-06, + "loss": 0.85010225, + "num_input_tokens_seen": 51357030, + "step": 2368, + "time_per_iteration": 2.5843539237976074 + }, + { + "auxiliary_loss_clip": 0.01119932, + "auxiliary_loss_mlp": 0.01051186, + "balance_loss_clip": 1.0546186, + "balance_loss_mlp": 1.03052735, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.2887122716968538, + "language_loss": 0.870224, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89193517, + "num_input_tokens_seen": 51374890, + "step": 2369, + "time_per_iteration": 2.5474843978881836 + }, + { + "auxiliary_loss_clip": 0.01121296, + "auxiliary_loss_mlp": 0.01049207, + "balance_loss_clip": 1.04974151, + "balance_loss_mlp": 1.02941799, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 2.387897387838576, + "language_loss": 0.76191443, + "learning_rate": 3.868717288576354e-06, + "loss": 0.7836194, + "num_input_tokens_seen": 51398100, + "step": 2370, + "time_per_iteration": 2.6298346519470215 + }, + { + "auxiliary_loss_clip": 0.01144442, + "auxiliary_loss_mlp": 0.00866392, + "balance_loss_clip": 1.04747415, + "balance_loss_mlp": 1.12016606, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 3.1318280629287476, + "language_loss": 0.8292712, + "learning_rate": 3.868578474705109e-06, + "loss": 0.84937954, + "num_input_tokens_seen": 51418745, + "step": 2371, + "time_per_iteration": 2.50667667388916 + }, + { + "auxiliary_loss_clip": 0.01163409, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_clip": 1.05431151, + "balance_loss_mlp": 1.02702498, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 1.8992084806570018, + "language_loss": 0.82754177, + "learning_rate": 3.868439589977181e-06, + "loss": 0.84964448, + "num_input_tokens_seen": 51437455, + "step": 2372, + "time_per_iteration": 2.4611079692840576 + }, + { + "auxiliary_loss_clip": 0.01162847, + "auxiliary_loss_mlp": 0.01046862, + "balance_loss_clip": 1.05429161, + "balance_loss_mlp": 1.02724028, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.347253838390265, + "language_loss": 0.84869987, + "learning_rate": 3.868300634397836e-06, + "loss": 0.87079692, + "num_input_tokens_seen": 51455710, + "step": 2373, + "time_per_iteration": 2.4502651691436768 + }, + { + "auxiliary_loss_clip": 0.0113331, + "auxiliary_loss_mlp": 0.01053804, + "balance_loss_clip": 1.05015886, + "balance_loss_mlp": 1.03592253, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.0192949736253407, + "language_loss": 0.85991794, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88178909, + "num_input_tokens_seen": 51471270, + "step": 2374, + "time_per_iteration": 2.488788604736328 + }, + { + "auxiliary_loss_clip": 0.01150064, + "auxiliary_loss_mlp": 0.01052147, + "balance_loss_clip": 1.04906094, + "balance_loss_mlp": 1.03208458, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 1.8171293793217045, + "language_loss": 0.79282093, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81484306, + "num_input_tokens_seen": 51492705, + "step": 2375, + "time_per_iteration": 2.5768489837646484 + }, + { + "auxiliary_loss_clip": 0.01150901, + "auxiliary_loss_mlp": 0.01055518, + "balance_loss_clip": 1.05348802, + "balance_loss_mlp": 1.03671908, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.9996575294357637, + "language_loss": 0.76530689, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78737104, + "num_input_tokens_seen": 51510780, + "step": 2376, + "time_per_iteration": 2.467540740966797 + }, + { + "auxiliary_loss_clip": 0.01150554, + "auxiliary_loss_mlp": 0.01047552, + "balance_loss_clip": 1.05290425, + "balance_loss_mlp": 1.02831149, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 1.746192914125844, + "language_loss": 0.92959297, + "learning_rate": 3.867744103671717e-06, + "loss": 0.95157409, + "num_input_tokens_seen": 51531400, + "step": 2377, + "time_per_iteration": 2.5246336460113525 + }, + { + "auxiliary_loss_clip": 0.01138522, + "auxiliary_loss_mlp": 0.01045181, + "balance_loss_clip": 1.05441332, + "balance_loss_mlp": 1.0244745, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.9271980538062394, + "language_loss": 0.9178623, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93969929, + "num_input_tokens_seen": 51548215, + "step": 2378, + "time_per_iteration": 2.5184245109558105 + }, + { + "auxiliary_loss_clip": 0.01157365, + "auxiliary_loss_mlp": 0.01047978, + "balance_loss_clip": 1.05692542, + "balance_loss_mlp": 1.02814174, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 2.0016679823466017, + "language_loss": 0.73856318, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.7606166, + "num_input_tokens_seen": 51566820, + "step": 2379, + "time_per_iteration": 2.520867109298706 + }, + { + "auxiliary_loss_clip": 0.0112883, + "auxiliary_loss_mlp": 0.01053705, + "balance_loss_clip": 1.05220819, + "balance_loss_mlp": 1.03351116, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.8646790066835648, + "language_loss": 0.78695369, + "learning_rate": 3.867325961945714e-06, + "loss": 0.808779, + "num_input_tokens_seen": 51585075, + "step": 2380, + "time_per_iteration": 2.5406453609466553 + }, + { + "auxiliary_loss_clip": 0.01115325, + "auxiliary_loss_mlp": 0.01051192, + "balance_loss_clip": 1.05328631, + "balance_loss_mlp": 1.03139162, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.8080046092020066, + "language_loss": 0.88055819, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90222341, + "num_input_tokens_seen": 51603185, + "step": 2381, + "time_per_iteration": 2.545492649078369 + }, + { + "auxiliary_loss_clip": 0.01134981, + "auxiliary_loss_mlp": 0.01050853, + "balance_loss_clip": 1.05584764, + "balance_loss_mlp": 1.0314219, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.09165048714896, + "language_loss": 0.76850283, + "learning_rate": 3.867046846740299e-06, + "loss": 0.79036117, + "num_input_tokens_seen": 51620880, + "step": 2382, + "time_per_iteration": 2.496264696121216 + }, + { + "auxiliary_loss_clip": 0.01122472, + "auxiliary_loss_mlp": 0.01052256, + "balance_loss_clip": 1.05151629, + "balance_loss_mlp": 1.03306341, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 2.0530936131284547, + "language_loss": 0.76514703, + "learning_rate": 3.866907182937039e-06, + "loss": 0.78689432, + "num_input_tokens_seen": 51640170, + "step": 2383, + "time_per_iteration": 2.5881659984588623 + }, + { + "auxiliary_loss_clip": 0.0112931, + "auxiliary_loss_mlp": 0.01054031, + "balance_loss_clip": 1.04982519, + "balance_loss_mlp": 1.03297865, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.4139628588268756, + "language_loss": 0.88001651, + "learning_rate": 3.866767448340471e-06, + "loss": 0.90184993, + "num_input_tokens_seen": 51656580, + "step": 2384, + "time_per_iteration": 2.511021137237549 + }, + { + "auxiliary_loss_clip": 0.0115865, + "auxiliary_loss_mlp": 0.01050277, + "balance_loss_clip": 1.05780602, + "balance_loss_mlp": 1.02905774, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 2.625630882876604, + "language_loss": 0.80109084, + "learning_rate": 3.866627642955895e-06, + "loss": 0.82318014, + "num_input_tokens_seen": 51674645, + "step": 2385, + "time_per_iteration": 2.4832351207733154 + }, + { + "auxiliary_loss_clip": 0.0114987, + "auxiliary_loss_mlp": 0.01046667, + "balance_loss_clip": 1.05168986, + "balance_loss_mlp": 1.02741504, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 9.104349935864557, + "language_loss": 0.75235558, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77432096, + "num_input_tokens_seen": 51695770, + "step": 2386, + "time_per_iteration": 2.5584936141967773 + }, + { + "auxiliary_loss_clip": 0.01163027, + "auxiliary_loss_mlp": 0.0104409, + "balance_loss_clip": 1.05462766, + "balance_loss_mlp": 1.02464676, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.0740357533184173, + "language_loss": 0.78584349, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80791467, + "num_input_tokens_seen": 51714165, + "step": 2387, + "time_per_iteration": 3.848055362701416 + }, + { + "auxiliary_loss_clip": 0.01135868, + "auxiliary_loss_mlp": 0.01049618, + "balance_loss_clip": 1.05529499, + "balance_loss_mlp": 1.02929306, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.1515238488054638, + "language_loss": 0.82122087, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84307569, + "num_input_tokens_seen": 51734440, + "step": 2388, + "time_per_iteration": 3.93045711517334 + }, + { + "auxiliary_loss_clip": 0.0115209, + "auxiliary_loss_mlp": 0.01045783, + "balance_loss_clip": 1.05611753, + "balance_loss_mlp": 1.02688837, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 1.9639154784444621, + "language_loss": 0.82253081, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84450954, + "num_input_tokens_seen": 51753730, + "step": 2389, + "time_per_iteration": 2.552861213684082 + }, + { + "auxiliary_loss_clip": 0.01142006, + "auxiliary_loss_mlp": 0.01053346, + "balance_loss_clip": 1.05546343, + "balance_loss_mlp": 1.03274703, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 1.8818231947866149, + "language_loss": 0.82913232, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.8510859, + "num_input_tokens_seen": 51771195, + "step": 2390, + "time_per_iteration": 2.5056447982788086 + }, + { + "auxiliary_loss_clip": 0.0114839, + "auxiliary_loss_mlp": 0.01051726, + "balance_loss_clip": 1.0550282, + "balance_loss_mlp": 1.03215218, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 1.7342816294104022, + "language_loss": 0.74428582, + "learning_rate": 3.865787324397324e-06, + "loss": 0.76628697, + "num_input_tokens_seen": 51792290, + "step": 2391, + "time_per_iteration": 3.948801279067993 + }, + { + "auxiliary_loss_clip": 0.01063771, + "auxiliary_loss_mlp": 0.01013555, + "balance_loss_clip": 1.04984844, + "balance_loss_mlp": 1.00957346, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8707198708032963, + "language_loss": 0.61781198, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63858521, + "num_input_tokens_seen": 51843675, + "step": 2392, + "time_per_iteration": 3.0174596309661865 + }, + { + "auxiliary_loss_clip": 0.01154601, + "auxiliary_loss_mlp": 0.01048503, + "balance_loss_clip": 1.05302262, + "balance_loss_mlp": 1.02695012, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.428920892386481, + "language_loss": 0.7671051, + "learning_rate": 3.865506652147709e-06, + "loss": 0.78913611, + "num_input_tokens_seen": 51860285, + "step": 2393, + "time_per_iteration": 2.493244171142578 + }, + { + "auxiliary_loss_clip": 0.01167249, + "auxiliary_loss_mlp": 0.0104939, + "balance_loss_clip": 1.05633152, + "balance_loss_mlp": 1.02985144, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 2.077460720931814, + "language_loss": 0.7646414, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78680778, + "num_input_tokens_seen": 51880105, + "step": 2394, + "time_per_iteration": 2.525285005569458 + }, + { + "auxiliary_loss_clip": 0.01161888, + "auxiliary_loss_mlp": 0.01049812, + "balance_loss_clip": 1.05324435, + "balance_loss_mlp": 1.03041649, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.5784814653527055, + "language_loss": 0.86147541, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88359243, + "num_input_tokens_seen": 51905175, + "step": 2395, + "time_per_iteration": 4.01458215713501 + }, + { + "auxiliary_loss_clip": 0.01129677, + "auxiliary_loss_mlp": 0.01056355, + "balance_loss_clip": 1.0573082, + "balance_loss_mlp": 1.03501642, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.772261487307856, + "language_loss": 0.82859159, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85045183, + "num_input_tokens_seen": 51924490, + "step": 2396, + "time_per_iteration": 2.565033435821533 + }, + { + "auxiliary_loss_clip": 0.01125847, + "auxiliary_loss_mlp": 0.00846104, + "balance_loss_clip": 1.0493741, + "balance_loss_mlp": 1.08699703, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.316187957587271, + "language_loss": 0.82636178, + "learning_rate": 3.864944458808712e-06, + "loss": 0.84608126, + "num_input_tokens_seen": 51940490, + "step": 2397, + "time_per_iteration": 2.5220518112182617 + }, + { + "auxiliary_loss_clip": 0.01164316, + "auxiliary_loss_mlp": 0.01045982, + "balance_loss_clip": 1.05332339, + "balance_loss_mlp": 1.02589548, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.860365258585905, + "language_loss": 0.79914647, + "learning_rate": 3.86480373366343e-06, + "loss": 0.82124949, + "num_input_tokens_seen": 51957910, + "step": 2398, + "time_per_iteration": 2.4433844089508057 + }, + { + "auxiliary_loss_clip": 0.01150836, + "auxiliary_loss_mlp": 0.01051757, + "balance_loss_clip": 1.05350804, + "balance_loss_mlp": 1.03277946, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.3800124071001254, + "language_loss": 0.65255529, + "learning_rate": 3.864662937804603e-06, + "loss": 0.67458117, + "num_input_tokens_seen": 51978010, + "step": 2399, + "time_per_iteration": 2.543267011642456 + }, + { + "auxiliary_loss_clip": 0.01133007, + "auxiliary_loss_mlp": 0.01047733, + "balance_loss_clip": 1.05065489, + "balance_loss_mlp": 1.02777779, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.5113858370378281, + "language_loss": 0.8198992, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84170663, + "num_input_tokens_seen": 51998515, + "step": 2400, + "time_per_iteration": 2.561065673828125 + }, + { + "auxiliary_loss_clip": 0.01147108, + "auxiliary_loss_mlp": 0.01054395, + "balance_loss_clip": 1.05582404, + "balance_loss_mlp": 1.03236485, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.9762082167977804, + "language_loss": 0.74498641, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76700145, + "num_input_tokens_seen": 52019270, + "step": 2401, + "time_per_iteration": 2.5629281997680664 + }, + { + "auxiliary_loss_clip": 0.01133008, + "auxiliary_loss_mlp": 0.01049602, + "balance_loss_clip": 1.05148911, + "balance_loss_mlp": 1.03052878, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.5962481101633015, + "language_loss": 0.8081578, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82998383, + "num_input_tokens_seen": 52039315, + "step": 2402, + "time_per_iteration": 2.5392982959747314 + }, + { + "auxiliary_loss_clip": 0.01120054, + "auxiliary_loss_mlp": 0.01048627, + "balance_loss_clip": 1.05200195, + "balance_loss_mlp": 1.02840948, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.2002969160158266, + "language_loss": 0.8418709, + "learning_rate": 3.864099047340673e-06, + "loss": 0.8635577, + "num_input_tokens_seen": 52056555, + "step": 2403, + "time_per_iteration": 2.533567428588867 + }, + { + "auxiliary_loss_clip": 0.01126057, + "auxiliary_loss_mlp": 0.00833696, + "balance_loss_clip": 1.05106664, + "balance_loss_mlp": 1.06204689, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 1.6057215126408448, + "language_loss": 0.70365715, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72325468, + "num_input_tokens_seen": 52075800, + "step": 2404, + "time_per_iteration": 2.571737289428711 + }, + { + "auxiliary_loss_clip": 0.0113364, + "auxiliary_loss_mlp": 0.01053691, + "balance_loss_clip": 1.04930913, + "balance_loss_mlp": 1.03511882, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.3923370709933223, + "language_loss": 0.73765504, + "learning_rate": 3.863816677966381e-06, + "loss": 0.7595284, + "num_input_tokens_seen": 52092585, + "step": 2405, + "time_per_iteration": 2.506847381591797 + }, + { + "auxiliary_loss_clip": 0.01104769, + "auxiliary_loss_mlp": 0.01048467, + "balance_loss_clip": 1.04967916, + "balance_loss_mlp": 1.02873778, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.2066935534490004, + "language_loss": 0.73162353, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75315595, + "num_input_tokens_seen": 52108990, + "step": 2406, + "time_per_iteration": 2.613922357559204 + }, + { + "auxiliary_loss_clip": 0.01150281, + "auxiliary_loss_mlp": 0.01054468, + "balance_loss_clip": 1.05164158, + "balance_loss_mlp": 1.03296304, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 2.3895689963927667, + "language_loss": 0.75921392, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.78126144, + "num_input_tokens_seen": 52125385, + "step": 2407, + "time_per_iteration": 2.526467800140381 + }, + { + "auxiliary_loss_clip": 0.01157601, + "auxiliary_loss_mlp": 0.0104718, + "balance_loss_clip": 1.0497967, + "balance_loss_mlp": 1.02808285, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.5588972128959082, + "language_loss": 0.79662001, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81866777, + "num_input_tokens_seen": 52144985, + "step": 2408, + "time_per_iteration": 2.4909427165985107 + }, + { + "auxiliary_loss_clip": 0.01150315, + "auxiliary_loss_mlp": 0.01054395, + "balance_loss_clip": 1.0530684, + "balance_loss_mlp": 1.03327072, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 3.1724136272465686, + "language_loss": 0.82304269, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84508979, + "num_input_tokens_seen": 52163885, + "step": 2409, + "time_per_iteration": 2.509922981262207 + }, + { + "auxiliary_loss_clip": 0.01108891, + "auxiliary_loss_mlp": 0.01060451, + "balance_loss_clip": 1.04645681, + "balance_loss_mlp": 1.03892159, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 2.681809018346349, + "language_loss": 0.75073659, + "learning_rate": 3.863109517792446e-06, + "loss": 0.77243, + "num_input_tokens_seen": 52184325, + "step": 2410, + "time_per_iteration": 2.6931052207946777 + }, + { + "auxiliary_loss_clip": 0.01161249, + "auxiliary_loss_mlp": 0.01049748, + "balance_loss_clip": 1.05313826, + "balance_loss_mlp": 1.03111529, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 1.8206245857700996, + "language_loss": 0.81976438, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.84187436, + "num_input_tokens_seen": 52202740, + "step": 2411, + "time_per_iteration": 2.4585249423980713 + }, + { + "auxiliary_loss_clip": 0.01137195, + "auxiliary_loss_mlp": 0.01055371, + "balance_loss_clip": 1.0538919, + "balance_loss_mlp": 1.03501034, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 1.9941777373460778, + "language_loss": 0.7031132, + "learning_rate": 3.862826159140214e-06, + "loss": 0.72503889, + "num_input_tokens_seen": 52223100, + "step": 2412, + "time_per_iteration": 2.638899087905884 + }, + { + "auxiliary_loss_clip": 0.0114646, + "auxiliary_loss_mlp": 0.01051787, + "balance_loss_clip": 1.05542278, + "balance_loss_mlp": 1.03167605, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 2.0913156440252902, + "language_loss": 0.76611447, + "learning_rate": 3.862684373853579e-06, + "loss": 0.78809696, + "num_input_tokens_seen": 52239690, + "step": 2413, + "time_per_iteration": 2.4719831943511963 + }, + { + "auxiliary_loss_clip": 0.01079824, + "auxiliary_loss_mlp": 0.01025002, + "balance_loss_clip": 1.04588723, + "balance_loss_mlp": 1.02161682, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9007164244795998, + "language_loss": 0.58844322, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60949147, + "num_input_tokens_seen": 52296705, + "step": 2414, + "time_per_iteration": 3.0326743125915527 + }, + { + "auxiliary_loss_clip": 0.01066175, + "auxiliary_loss_mlp": 0.01007473, + "balance_loss_clip": 1.04564226, + "balance_loss_mlp": 1.00382519, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8384556257576324, + "language_loss": 0.62196672, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64270318, + "num_input_tokens_seen": 52361830, + "step": 2415, + "time_per_iteration": 3.1442854404449463 + }, + { + "auxiliary_loss_clip": 0.01147089, + "auxiliary_loss_mlp": 0.01046561, + "balance_loss_clip": 1.05518508, + "balance_loss_mlp": 1.02602124, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 2.086668956632305, + "language_loss": 0.72539961, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74733615, + "num_input_tokens_seen": 52379420, + "step": 2416, + "time_per_iteration": 2.479311466217041 + }, + { + "auxiliary_loss_clip": 0.01050623, + "auxiliary_loss_mlp": 0.01004478, + "balance_loss_clip": 1.03345704, + "balance_loss_mlp": 1.00111604, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.7160943941984923, + "language_loss": 0.60390902, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62445998, + "num_input_tokens_seen": 52446290, + "step": 2417, + "time_per_iteration": 3.2052416801452637 + }, + { + "auxiliary_loss_clip": 0.01165366, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.05416167, + "balance_loss_mlp": 1.03216887, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.3744759204549495, + "language_loss": 0.78859735, + "learning_rate": 3.861974388030356e-06, + "loss": 0.81077188, + "num_input_tokens_seen": 52467295, + "step": 2418, + "time_per_iteration": 2.566204786300659 + }, + { + "auxiliary_loss_clip": 0.01115758, + "auxiliary_loss_mlp": 0.01050018, + "balance_loss_clip": 1.0517683, + "balance_loss_mlp": 1.03053975, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.770108998851477, + "language_loss": 0.72045422, + "learning_rate": 3.861832179025394e-06, + "loss": 0.74211198, + "num_input_tokens_seen": 52487295, + "step": 2419, + "time_per_iteration": 2.5769503116607666 + }, + { + "auxiliary_loss_clip": 0.01139763, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_clip": 1.05569839, + "balance_loss_mlp": 1.0264926, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.3018782583419393, + "language_loss": 0.89662445, + "learning_rate": 3.861689899419569e-06, + "loss": 0.91848433, + "num_input_tokens_seen": 52504220, + "step": 2420, + "time_per_iteration": 2.534346342086792 + }, + { + "auxiliary_loss_clip": 0.01150702, + "auxiliary_loss_mlp": 0.01053266, + "balance_loss_clip": 1.05225205, + "balance_loss_mlp": 1.03446674, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 2.1373906837566974, + "language_loss": 0.83055341, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85259306, + "num_input_tokens_seen": 52521900, + "step": 2421, + "time_per_iteration": 2.516117811203003 + }, + { + "auxiliary_loss_clip": 0.01104176, + "auxiliary_loss_mlp": 0.01054217, + "balance_loss_clip": 1.0535326, + "balance_loss_mlp": 1.03450036, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.8113527024700427, + "language_loss": 0.8173281, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83891213, + "num_input_tokens_seen": 52540495, + "step": 2422, + "time_per_iteration": 2.6409966945648193 + }, + { + "auxiliary_loss_clip": 0.01050838, + "auxiliary_loss_mlp": 0.01339503, + "balance_loss_clip": 1.04621887, + "balance_loss_mlp": 2.01205993, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9469547979371113, + "language_loss": 0.63309002, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65699339, + "num_input_tokens_seen": 52603305, + "step": 2423, + "time_per_iteration": 3.209451675415039 + }, + { + "auxiliary_loss_clip": 0.01109929, + "auxiliary_loss_mlp": 0.00839185, + "balance_loss_clip": 1.05773211, + "balance_loss_mlp": 1.08094025, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.4795298307862137, + "language_loss": 0.82603246, + "learning_rate": 3.861120075095585e-06, + "loss": 0.8455236, + "num_input_tokens_seen": 52623435, + "step": 2424, + "time_per_iteration": 2.652581214904785 + }, + { + "auxiliary_loss_clip": 0.01140892, + "auxiliary_loss_mlp": 0.0105159, + "balance_loss_clip": 1.0559752, + "balance_loss_mlp": 1.03276658, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.440243862558632, + "language_loss": 0.78948212, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81140697, + "num_input_tokens_seen": 52642255, + "step": 2425, + "time_per_iteration": 2.578481912612915 + }, + { + "auxiliary_loss_clip": 0.01154443, + "auxiliary_loss_mlp": 0.01050828, + "balance_loss_clip": 1.05824244, + "balance_loss_mlp": 1.03236294, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.3641579815089186, + "language_loss": 0.8323037, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85435641, + "num_input_tokens_seen": 52658700, + "step": 2426, + "time_per_iteration": 3.9456679821014404 + }, + { + "auxiliary_loss_clip": 0.01163995, + "auxiliary_loss_mlp": 0.01048136, + "balance_loss_clip": 1.0582515, + "balance_loss_mlp": 1.02945542, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.920247844892834, + "language_loss": 0.87402934, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89615065, + "num_input_tokens_seen": 52678140, + "step": 2427, + "time_per_iteration": 3.8972432613372803 + }, + { + "auxiliary_loss_clip": 0.01126386, + "auxiliary_loss_mlp": 0.01051955, + "balance_loss_clip": 1.05249596, + "balance_loss_mlp": 1.03005588, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 2.9763157749945433, + "language_loss": 0.67227399, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69405746, + "num_input_tokens_seen": 52696825, + "step": 2428, + "time_per_iteration": 2.5681796073913574 + }, + { + "auxiliary_loss_clip": 0.01154114, + "auxiliary_loss_mlp": 0.01049116, + "balance_loss_clip": 1.05558991, + "balance_loss_mlp": 1.0294466, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 2.237858708414702, + "language_loss": 0.83956933, + "learning_rate": 3.860406206819417e-06, + "loss": 0.86160159, + "num_input_tokens_seen": 52715125, + "step": 2429, + "time_per_iteration": 2.551254987716675 + }, + { + "auxiliary_loss_clip": 0.0111886, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_clip": 1.0486846, + "balance_loss_mlp": 1.03144646, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.8981043785102403, + "language_loss": 0.79172194, + "learning_rate": 3.860263221502145e-06, + "loss": 0.81340861, + "num_input_tokens_seen": 52734015, + "step": 2430, + "time_per_iteration": 3.9648454189300537 + }, + { + "auxiliary_loss_clip": 0.01166718, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.0578624, + "balance_loss_mlp": 1.02705181, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.0027946257854152, + "language_loss": 0.82810324, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85023123, + "num_input_tokens_seen": 52753025, + "step": 2431, + "time_per_iteration": 2.5266339778900146 + }, + { + "auxiliary_loss_clip": 0.01158487, + "auxiliary_loss_mlp": 0.01052292, + "balance_loss_clip": 1.05544055, + "balance_loss_mlp": 1.03208649, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 1.7962107326834569, + "language_loss": 0.79103637, + "learning_rate": 3.859977039248921e-06, + "loss": 0.81314421, + "num_input_tokens_seen": 52773420, + "step": 2432, + "time_per_iteration": 2.523865222930908 + }, + { + "auxiliary_loss_clip": 0.01161695, + "auxiliary_loss_mlp": 0.00987369, + "balance_loss_clip": 1.05491257, + "balance_loss_mlp": 1.34084308, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 2.3494882174115386, + "language_loss": 0.80157524, + "learning_rate": 3.859833842323822e-06, + "loss": 0.82306588, + "num_input_tokens_seen": 52792870, + "step": 2433, + "time_per_iteration": 3.980318307876587 + }, + { + "auxiliary_loss_clip": 0.01124782, + "auxiliary_loss_mlp": 0.01052281, + "balance_loss_clip": 1.05803275, + "balance_loss_mlp": 1.03181303, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 3.243569300121905, + "language_loss": 0.78697789, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80874854, + "num_input_tokens_seen": 52811615, + "step": 2434, + "time_per_iteration": 2.5750107765197754 + }, + { + "auxiliary_loss_clip": 0.01053702, + "auxiliary_loss_mlp": 0.01047445, + "balance_loss_clip": 1.03591347, + "balance_loss_mlp": 1.04434586, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8613434107927777, + "language_loss": 0.58438826, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60539973, + "num_input_tokens_seen": 52873230, + "step": 2435, + "time_per_iteration": 3.1452882289886475 + }, + { + "auxiliary_loss_clip": 0.01154763, + "auxiliary_loss_mlp": 0.01048896, + "balance_loss_clip": 1.04962206, + "balance_loss_mlp": 1.02990627, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.0919215929649906, + "language_loss": 0.88277102, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90480763, + "num_input_tokens_seen": 52889325, + "step": 2436, + "time_per_iteration": 2.4759373664855957 + }, + { + "auxiliary_loss_clip": 0.01150011, + "auxiliary_loss_mlp": 0.00981899, + "balance_loss_clip": 1.05321515, + "balance_loss_mlp": 1.33691466, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 3.740906893719455, + "language_loss": 0.74748564, + "learning_rate": 3.85926034942691e-06, + "loss": 0.76880467, + "num_input_tokens_seen": 52909705, + "step": 2437, + "time_per_iteration": 2.5262320041656494 + }, + { + "auxiliary_loss_clip": 0.0115998, + "auxiliary_loss_mlp": 0.01048482, + "balance_loss_clip": 1.0510428, + "balance_loss_mlp": 1.02728701, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 2.196523610987453, + "language_loss": 0.73657143, + "learning_rate": 3.859116799930736e-06, + "loss": 0.75865602, + "num_input_tokens_seen": 52930300, + "step": 2438, + "time_per_iteration": 2.5316321849823 + }, + { + "auxiliary_loss_clip": 0.01149526, + "auxiliary_loss_mlp": 0.01041297, + "balance_loss_clip": 1.0552845, + "balance_loss_mlp": 1.02271271, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.9644113073953406, + "language_loss": 0.74373412, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76564229, + "num_input_tokens_seen": 52949955, + "step": 2439, + "time_per_iteration": 2.5277037620544434 + }, + { + "auxiliary_loss_clip": 0.01144026, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.05004454, + "balance_loss_mlp": 1.02306294, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 2.770591443615531, + "language_loss": 0.74084342, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76271009, + "num_input_tokens_seen": 52972905, + "step": 2440, + "time_per_iteration": 2.64603590965271 + }, + { + "auxiliary_loss_clip": 0.01153334, + "auxiliary_loss_mlp": 0.01049961, + "balance_loss_clip": 1.04913855, + "balance_loss_mlp": 1.03112578, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.658176913543238, + "language_loss": 0.83039391, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85242689, + "num_input_tokens_seen": 52994850, + "step": 2441, + "time_per_iteration": 2.5142879486083984 + }, + { + "auxiliary_loss_clip": 0.01149052, + "auxiliary_loss_mlp": 0.01047132, + "balance_loss_clip": 1.05323052, + "balance_loss_mlp": 1.02632987, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 2.4767375342427544, + "language_loss": 0.72192669, + "learning_rate": 3.858541897021563e-06, + "loss": 0.74388856, + "num_input_tokens_seen": 53014740, + "step": 2442, + "time_per_iteration": 2.558194875717163 + }, + { + "auxiliary_loss_clip": 0.0113025, + "auxiliary_loss_mlp": 0.01041552, + "balance_loss_clip": 1.05616021, + "balance_loss_mlp": 1.02137029, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 3.1335927081817148, + "language_loss": 0.8168655, + "learning_rate": 3.8583979950904e-06, + "loss": 0.83858347, + "num_input_tokens_seen": 53029780, + "step": 2443, + "time_per_iteration": 2.6117029190063477 + }, + { + "auxiliary_loss_clip": 0.01139413, + "auxiliary_loss_mlp": 0.01062802, + "balance_loss_clip": 1.04982018, + "balance_loss_mlp": 1.04197621, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 1.6954272839268876, + "language_loss": 0.82737553, + "learning_rate": 3.858254022688599e-06, + "loss": 0.84939766, + "num_input_tokens_seen": 53048620, + "step": 2444, + "time_per_iteration": 2.5117576122283936 + }, + { + "auxiliary_loss_clip": 0.01127608, + "auxiliary_loss_mlp": 0.01053611, + "balance_loss_clip": 1.05170977, + "balance_loss_mlp": 1.03383398, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.7301628132893199, + "language_loss": 0.71333551, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73514766, + "num_input_tokens_seen": 53070055, + "step": 2445, + "time_per_iteration": 2.568983316421509 + }, + { + "auxiliary_loss_clip": 0.0107602, + "auxiliary_loss_mlp": 0.01018441, + "balance_loss_clip": 1.03388548, + "balance_loss_mlp": 1.01546121, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8300953499734267, + "language_loss": 0.63046026, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65140486, + "num_input_tokens_seen": 53126945, + "step": 2446, + "time_per_iteration": 2.9828553199768066 + }, + { + "auxiliary_loss_clip": 0.01116569, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_clip": 1.0556643, + "balance_loss_mlp": 1.0254674, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.7031534385646947, + "language_loss": 0.74843848, + "learning_rate": 3.857821682713975e-06, + "loss": 0.77006364, + "num_input_tokens_seen": 53149130, + "step": 2447, + "time_per_iteration": 2.6760611534118652 + }, + { + "auxiliary_loss_clip": 0.01157768, + "auxiliary_loss_mlp": 0.0104316, + "balance_loss_clip": 1.0513618, + "balance_loss_mlp": 1.02423012, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.517532151401068, + "language_loss": 0.85507625, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87708557, + "num_input_tokens_seen": 53167120, + "step": 2448, + "time_per_iteration": 2.509150981903076 + }, + { + "auxiliary_loss_clip": 0.01071871, + "auxiliary_loss_mlp": 0.01005493, + "balance_loss_clip": 1.02900696, + "balance_loss_mlp": 1.00278652, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7692393740652745, + "language_loss": 0.56816018, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58893383, + "num_input_tokens_seen": 53227945, + "step": 2449, + "time_per_iteration": 3.0389516353607178 + }, + { + "auxiliary_loss_clip": 0.01131226, + "auxiliary_loss_mlp": 0.01044375, + "balance_loss_clip": 1.05485439, + "balance_loss_mlp": 1.02397823, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.874171497059279, + "language_loss": 0.8522138, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87396979, + "num_input_tokens_seen": 53244615, + "step": 2450, + "time_per_iteration": 2.524604558944702 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01048961, + "balance_loss_clip": 1.05203807, + "balance_loss_mlp": 1.02888668, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 1.870008240168326, + "language_loss": 0.74964827, + "learning_rate": 3.857244243157052e-06, + "loss": 0.77164555, + "num_input_tokens_seen": 53262205, + "step": 2451, + "time_per_iteration": 2.5117850303649902 + }, + { + "auxiliary_loss_clip": 0.01130636, + "auxiliary_loss_mlp": 0.01041167, + "balance_loss_clip": 1.05098557, + "balance_loss_mlp": 1.02261829, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6580491146596554, + "language_loss": 0.81963849, + "learning_rate": 3.85709970718691e-06, + "loss": 0.84135652, + "num_input_tokens_seen": 53282445, + "step": 2452, + "time_per_iteration": 2.5508720874786377 + }, + { + "auxiliary_loss_clip": 0.01096191, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.0548861, + "balance_loss_mlp": 1.01807296, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.5109463044096114, + "language_loss": 0.74402618, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76535213, + "num_input_tokens_seen": 53299060, + "step": 2453, + "time_per_iteration": 2.6646926403045654 + }, + { + "auxiliary_loss_clip": 0.01137097, + "auxiliary_loss_mlp": 0.01051882, + "balance_loss_clip": 1.05282497, + "balance_loss_mlp": 1.03237963, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.1549545530372907, + "language_loss": 0.76012897, + "learning_rate": 3.856810423987889e-06, + "loss": 0.78201866, + "num_input_tokens_seen": 53315970, + "step": 2454, + "time_per_iteration": 2.510106325149536 + }, + { + "auxiliary_loss_clip": 0.01137557, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.05019379, + "balance_loss_mlp": 1.0220809, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.265577839602067, + "language_loss": 0.82894778, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85074019, + "num_input_tokens_seen": 53332940, + "step": 2455, + "time_per_iteration": 2.533565044403076 + }, + { + "auxiliary_loss_clip": 0.0112294, + "auxiliary_loss_mlp": 0.01053676, + "balance_loss_clip": 1.04720056, + "balance_loss_mlp": 1.03449559, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 1.8823770492251262, + "language_loss": 0.84305835, + "learning_rate": 3.85652085914712e-06, + "loss": 0.86482441, + "num_input_tokens_seen": 53353295, + "step": 2456, + "time_per_iteration": 2.655630350112915 + }, + { + "auxiliary_loss_clip": 0.01146791, + "auxiliary_loss_mlp": 0.01045621, + "balance_loss_clip": 1.05030489, + "balance_loss_mlp": 1.02690518, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.8060540515808403, + "language_loss": 0.8435986, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86552274, + "num_input_tokens_seen": 53373410, + "step": 2457, + "time_per_iteration": 2.5273449420928955 + }, + { + "auxiliary_loss_clip": 0.01142774, + "auxiliary_loss_mlp": 0.0104232, + "balance_loss_clip": 1.04881418, + "balance_loss_mlp": 1.02429569, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.9838495513220475, + "language_loss": 0.75447989, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77633083, + "num_input_tokens_seen": 53391430, + "step": 2458, + "time_per_iteration": 2.467681884765625 + }, + { + "auxiliary_loss_clip": 0.01110542, + "auxiliary_loss_mlp": 0.01052805, + "balance_loss_clip": 1.0506289, + "balance_loss_mlp": 1.03163338, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 1.834634078240255, + "language_loss": 0.83709669, + "learning_rate": 3.856085983903782e-06, + "loss": 0.85873014, + "num_input_tokens_seen": 53409960, + "step": 2459, + "time_per_iteration": 2.599130392074585 + }, + { + "auxiliary_loss_clip": 0.01123642, + "auxiliary_loss_mlp": 0.01041213, + "balance_loss_clip": 1.04999483, + "balance_loss_mlp": 1.02315307, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.176376109791904, + "language_loss": 0.75190997, + "learning_rate": 3.855940884716071e-06, + "loss": 0.7735585, + "num_input_tokens_seen": 53426160, + "step": 2460, + "time_per_iteration": 2.4943206310272217 + }, + { + "auxiliary_loss_clip": 0.01123548, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_clip": 1.04941165, + "balance_loss_mlp": 1.02422571, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.927374889449342, + "language_loss": 0.81223053, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83390319, + "num_input_tokens_seen": 53448530, + "step": 2461, + "time_per_iteration": 2.6131250858306885 + }, + { + "auxiliary_loss_clip": 0.01147703, + "auxiliary_loss_mlp": 0.01049002, + "balance_loss_clip": 1.04955769, + "balance_loss_mlp": 1.02865255, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 2.4233182118796206, + "language_loss": 0.66306216, + "learning_rate": 3.855650475213761e-06, + "loss": 0.68502921, + "num_input_tokens_seen": 53465915, + "step": 2462, + "time_per_iteration": 2.452157735824585 + }, + { + "auxiliary_loss_clip": 0.01128782, + "auxiliary_loss_mlp": 0.01049197, + "balance_loss_clip": 1.04962313, + "balance_loss_mlp": 1.02872908, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 2.2887562715226095, + "language_loss": 0.67260385, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69438362, + "num_input_tokens_seen": 53496055, + "step": 2463, + "time_per_iteration": 2.838212490081787 + }, + { + "auxiliary_loss_clip": 0.01144863, + "auxiliary_loss_mlp": 0.01046644, + "balance_loss_clip": 1.04688454, + "balance_loss_mlp": 1.0267961, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 1.8689639233499031, + "language_loss": 0.7730124, + "learning_rate": 3.855359784245646e-06, + "loss": 0.79492748, + "num_input_tokens_seen": 53513790, + "step": 2464, + "time_per_iteration": 2.4797089099884033 + }, + { + "auxiliary_loss_clip": 0.01130844, + "auxiliary_loss_mlp": 0.01053821, + "balance_loss_clip": 1.0510639, + "balance_loss_mlp": 1.03466368, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.6356893213162957, + "language_loss": 0.79758656, + "learning_rate": 3.855214333225688e-06, + "loss": 0.81943321, + "num_input_tokens_seen": 53533410, + "step": 2465, + "time_per_iteration": 3.919557809829712 + }, + { + "auxiliary_loss_clip": 0.01163191, + "auxiliary_loss_mlp": 0.01043837, + "balance_loss_clip": 1.05319953, + "balance_loss_mlp": 1.02433479, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 1.9029268506611359, + "language_loss": 0.76246303, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78453332, + "num_input_tokens_seen": 53554775, + "step": 2466, + "time_per_iteration": 3.8749442100524902 + }, + { + "auxiliary_loss_clip": 0.01037272, + "auxiliary_loss_mlp": 0.01014812, + "balance_loss_clip": 1.05620003, + "balance_loss_mlp": 1.01156926, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.7998652644299296, + "language_loss": 0.60061216, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62113297, + "num_input_tokens_seen": 53609675, + "step": 2467, + "time_per_iteration": 3.2249529361724854 + }, + { + "auxiliary_loss_clip": 0.01135875, + "auxiliary_loss_mlp": 0.0104135, + "balance_loss_clip": 1.04920852, + "balance_loss_mlp": 1.02133441, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.085809278772307, + "language_loss": 0.87784356, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89961576, + "num_input_tokens_seen": 53626950, + "step": 2468, + "time_per_iteration": 2.5537025928497314 + }, + { + "auxiliary_loss_clip": 0.01128679, + "auxiliary_loss_mlp": 0.010492, + "balance_loss_clip": 1.05012918, + "balance_loss_mlp": 1.02901793, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 2.075383674593907, + "language_loss": 0.76332307, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78510183, + "num_input_tokens_seen": 53644200, + "step": 2469, + "time_per_iteration": 3.9496967792510986 + }, + { + "auxiliary_loss_clip": 0.01122919, + "auxiliary_loss_mlp": 0.01042869, + "balance_loss_clip": 1.04995394, + "balance_loss_mlp": 1.0239265, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 2.7376396953558313, + "language_loss": 0.75922924, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78088707, + "num_input_tokens_seen": 53659650, + "step": 2470, + "time_per_iteration": 2.531712055206299 + }, + { + "auxiliary_loss_clip": 0.01154646, + "auxiliary_loss_mlp": 0.01042854, + "balance_loss_clip": 1.05118227, + "balance_loss_mlp": 1.02382874, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 2.2479707314336985, + "language_loss": 0.72155458, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74352956, + "num_input_tokens_seen": 53680275, + "step": 2471, + "time_per_iteration": 2.4988839626312256 + }, + { + "auxiliary_loss_clip": 0.01132005, + "auxiliary_loss_mlp": 0.01051792, + "balance_loss_clip": 1.05407846, + "balance_loss_mlp": 1.03114533, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 1.9727013375037021, + "language_loss": 0.89569962, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91753757, + "num_input_tokens_seen": 53698270, + "step": 2472, + "time_per_iteration": 3.9005980491638184 + }, + { + "auxiliary_loss_clip": 0.01119007, + "auxiliary_loss_mlp": 0.01042492, + "balance_loss_clip": 1.04953325, + "balance_loss_mlp": 1.02333486, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 2.500654269965786, + "language_loss": 0.81027508, + "learning_rate": 3.854048192933008e-06, + "loss": 0.83189011, + "num_input_tokens_seen": 53716845, + "step": 2473, + "time_per_iteration": 2.579444646835327 + }, + { + "auxiliary_loss_clip": 0.01152729, + "auxiliary_loss_mlp": 0.01054864, + "balance_loss_clip": 1.05374527, + "balance_loss_mlp": 1.03589797, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.5779842400356, + "language_loss": 0.77427024, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79634619, + "num_input_tokens_seen": 53734970, + "step": 2474, + "time_per_iteration": 2.5281834602355957 + }, + { + "auxiliary_loss_clip": 0.01124186, + "auxiliary_loss_mlp": 0.01055664, + "balance_loss_clip": 1.05612373, + "balance_loss_mlp": 1.03547025, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 2.216483796459402, + "language_loss": 0.82636696, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84816551, + "num_input_tokens_seen": 53753415, + "step": 2475, + "time_per_iteration": 2.5758626461029053 + }, + { + "auxiliary_loss_clip": 0.01106685, + "auxiliary_loss_mlp": 0.0105121, + "balance_loss_clip": 1.05677629, + "balance_loss_mlp": 1.0323987, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 2.0308729257005633, + "language_loss": 0.80298829, + "learning_rate": 3.85360973012719e-06, + "loss": 0.8245672, + "num_input_tokens_seen": 53770305, + "step": 2476, + "time_per_iteration": 2.578641891479492 + }, + { + "auxiliary_loss_clip": 0.0114552, + "auxiliary_loss_mlp": 0.01046262, + "balance_loss_clip": 1.05288124, + "balance_loss_mlp": 1.02866721, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 2.290198847001989, + "language_loss": 0.77178276, + "learning_rate": 3.853463435273058e-06, + "loss": 0.79370058, + "num_input_tokens_seen": 53788895, + "step": 2477, + "time_per_iteration": 2.5667755603790283 + }, + { + "auxiliary_loss_clip": 0.01066828, + "auxiliary_loss_mlp": 0.01063099, + "balance_loss_clip": 1.04164124, + "balance_loss_mlp": 1.05990434, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8063715614661756, + "language_loss": 0.60128599, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62258524, + "num_input_tokens_seen": 53850260, + "step": 2478, + "time_per_iteration": 3.1874613761901855 + }, + { + "auxiliary_loss_clip": 0.01105649, + "auxiliary_loss_mlp": 0.01042552, + "balance_loss_clip": 1.05316949, + "balance_loss_mlp": 1.02458727, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.535729457658, + "language_loss": 0.71423948, + "learning_rate": 3.853170634719787e-06, + "loss": 0.73572147, + "num_input_tokens_seen": 53867520, + "step": 2479, + "time_per_iteration": 2.60453200340271 + }, + { + "auxiliary_loss_clip": 0.01137076, + "auxiliary_loss_mlp": 0.01044636, + "balance_loss_clip": 1.05295014, + "balance_loss_mlp": 1.02553868, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.637759596947358, + "language_loss": 0.80818486, + "learning_rate": 3.853024129031751e-06, + "loss": 0.83000201, + "num_input_tokens_seen": 53886620, + "step": 2480, + "time_per_iteration": 2.5721001625061035 + }, + { + "auxiliary_loss_clip": 0.01132655, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.05246758, + "balance_loss_mlp": 1.02622771, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.051030781148257, + "language_loss": 0.84475863, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86654007, + "num_input_tokens_seen": 53902230, + "step": 2481, + "time_per_iteration": 2.5518529415130615 + }, + { + "auxiliary_loss_clip": 0.01144685, + "auxiliary_loss_mlp": 0.01051144, + "balance_loss_clip": 1.0528264, + "balance_loss_mlp": 1.03058028, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 2.156740679571822, + "language_loss": 0.77747107, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.7994293, + "num_input_tokens_seen": 53919475, + "step": 2482, + "time_per_iteration": 2.517514944076538 + }, + { + "auxiliary_loss_clip": 0.01134843, + "auxiliary_loss_mlp": 0.01042443, + "balance_loss_clip": 1.05538678, + "balance_loss_mlp": 1.02146244, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.1340305139684514, + "language_loss": 0.78995121, + "learning_rate": 3.852584190388713e-06, + "loss": 0.81172407, + "num_input_tokens_seen": 53939150, + "step": 2483, + "time_per_iteration": 2.59138560295105 + }, + { + "auxiliary_loss_clip": 0.01144995, + "auxiliary_loss_mlp": 0.00924096, + "balance_loss_clip": 1.05083609, + "balance_loss_mlp": 1.22345781, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.6784635016747396, + "language_loss": 0.7075941, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72828501, + "num_input_tokens_seen": 53958735, + "step": 2484, + "time_per_iteration": 2.521122694015503 + }, + { + "auxiliary_loss_clip": 0.01135644, + "auxiliary_loss_mlp": 0.00934919, + "balance_loss_clip": 1.05124986, + "balance_loss_mlp": 1.24374747, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 1.9318308901317545, + "language_loss": 0.84831786, + "learning_rate": 3.852290546699863e-06, + "loss": 0.8690235, + "num_input_tokens_seen": 53975065, + "step": 2485, + "time_per_iteration": 2.5630247592926025 + }, + { + "auxiliary_loss_clip": 0.01140593, + "auxiliary_loss_mlp": 0.01047117, + "balance_loss_clip": 1.05318594, + "balance_loss_mlp": 1.02663684, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 2.1147801382617932, + "language_loss": 0.85118562, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.87306267, + "num_input_tokens_seen": 53993330, + "step": 2486, + "time_per_iteration": 2.509166955947876 + }, + { + "auxiliary_loss_clip": 0.0114271, + "auxiliary_loss_mlp": 0.01042267, + "balance_loss_clip": 1.04925942, + "balance_loss_mlp": 1.02541041, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.0955883566401696, + "language_loss": 0.74721968, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76906949, + "num_input_tokens_seen": 54010515, + "step": 2487, + "time_per_iteration": 2.4926884174346924 + }, + { + "auxiliary_loss_clip": 0.01145746, + "auxiliary_loss_mlp": 0.01046898, + "balance_loss_clip": 1.04930615, + "balance_loss_mlp": 1.02846837, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 2.0762590093721163, + "language_loss": 0.71766913, + "learning_rate": 3.8518495543877e-06, + "loss": 0.73959559, + "num_input_tokens_seen": 54031315, + "step": 2488, + "time_per_iteration": 2.627891778945923 + }, + { + "auxiliary_loss_clip": 0.01134993, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_clip": 1.05414379, + "balance_loss_mlp": 1.02934313, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 4.110003360833931, + "language_loss": 0.70598185, + "learning_rate": 3.851702416498235e-06, + "loss": 0.7278145, + "num_input_tokens_seen": 54045965, + "step": 2489, + "time_per_iteration": 2.513213634490967 + }, + { + "auxiliary_loss_clip": 0.01133772, + "auxiliary_loss_mlp": 0.01048261, + "balance_loss_clip": 1.04835331, + "balance_loss_mlp": 1.02904451, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 4.649488786577494, + "language_loss": 0.81687176, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.83869207, + "num_input_tokens_seen": 54059960, + "step": 2490, + "time_per_iteration": 2.507173776626587 + }, + { + "auxiliary_loss_clip": 0.01123518, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.05591297, + "balance_loss_mlp": 1.0271697, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 2.167732959182962, + "language_loss": 0.80146402, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82316381, + "num_input_tokens_seen": 54079330, + "step": 2491, + "time_per_iteration": 2.715275526046753 + }, + { + "auxiliary_loss_clip": 0.01140685, + "auxiliary_loss_mlp": 0.01053055, + "balance_loss_clip": 1.0504148, + "balance_loss_mlp": 1.03179955, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 2.064398087951897, + "language_loss": 0.90649939, + "learning_rate": 3.851260581551727e-06, + "loss": 0.92843676, + "num_input_tokens_seen": 54097555, + "step": 2492, + "time_per_iteration": 2.518488645553589 + }, + { + "auxiliary_loss_clip": 0.01148818, + "auxiliary_loss_mlp": 0.01053491, + "balance_loss_clip": 1.05246234, + "balance_loss_mlp": 1.03411913, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 3.583506721024229, + "language_loss": 0.79328847, + "learning_rate": 3.851113162828802e-06, + "loss": 0.81531155, + "num_input_tokens_seen": 54115600, + "step": 2493, + "time_per_iteration": 2.505173444747925 + }, + { + "auxiliary_loss_clip": 0.01148176, + "auxiliary_loss_mlp": 0.01045096, + "balance_loss_clip": 1.05128086, + "balance_loss_mlp": 1.02510512, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 2.5900376005730488, + "language_loss": 0.80314076, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82507348, + "num_input_tokens_seen": 54135220, + "step": 2494, + "time_per_iteration": 2.523502826690674 + }, + { + "auxiliary_loss_clip": 0.0113976, + "auxiliary_loss_mlp": 0.010477, + "balance_loss_clip": 1.05485439, + "balance_loss_mlp": 1.0276494, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.9666951098265444, + "language_loss": 0.66467059, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68654519, + "num_input_tokens_seen": 54161065, + "step": 2495, + "time_per_iteration": 2.974846601486206 + }, + { + "auxiliary_loss_clip": 0.01084751, + "auxiliary_loss_mlp": 0.01019043, + "balance_loss_clip": 1.04845512, + "balance_loss_mlp": 1.01605058, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 0.8982829157603746, + "language_loss": 0.59466273, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61570066, + "num_input_tokens_seen": 54225095, + "step": 2496, + "time_per_iteration": 3.1170949935913086 + }, + { + "auxiliary_loss_clip": 0.01160417, + "auxiliary_loss_mlp": 0.0105546, + "balance_loss_clip": 1.05074561, + "balance_loss_mlp": 1.03505135, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 1.7901043711673648, + "language_loss": 0.65183806, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67399681, + "num_input_tokens_seen": 54243750, + "step": 2497, + "time_per_iteration": 2.4556875228881836 + }, + { + "auxiliary_loss_clip": 0.01124939, + "auxiliary_loss_mlp": 0.0106176, + "balance_loss_clip": 1.05286956, + "balance_loss_mlp": 1.04002798, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4858828350386342, + "language_loss": 0.75043976, + "learning_rate": 3.850375016410121e-06, + "loss": 0.7723068, + "num_input_tokens_seen": 54266185, + "step": 2498, + "time_per_iteration": 2.6588547229766846 + }, + { + "auxiliary_loss_clip": 0.01127115, + "auxiliary_loss_mlp": 0.01050967, + "balance_loss_clip": 1.05204296, + "balance_loss_mlp": 1.03031969, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 4.640523968907937, + "language_loss": 0.72077155, + "learning_rate": 3.850227176604761e-06, + "loss": 0.7425524, + "num_input_tokens_seen": 54283940, + "step": 2499, + "time_per_iteration": 2.542942523956299 + }, + { + "auxiliary_loss_clip": 0.01132318, + "auxiliary_loss_mlp": 0.01067322, + "balance_loss_clip": 1.05226731, + "balance_loss_mlp": 1.04684246, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.8843672582138238, + "language_loss": 0.71581495, + "learning_rate": 3.850079266638601e-06, + "loss": 0.73781133, + "num_input_tokens_seen": 54304830, + "step": 2500, + "time_per_iteration": 2.6081840991973877 + }, + { + "auxiliary_loss_clip": 0.01124739, + "auxiliary_loss_mlp": 0.01070387, + "balance_loss_clip": 1.04889917, + "balance_loss_mlp": 1.05043149, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 1.744526931384772, + "language_loss": 0.65146405, + "learning_rate": 3.849931286517249e-06, + "loss": 0.6734153, + "num_input_tokens_seen": 54325595, + "step": 2501, + "time_per_iteration": 2.73694109916687 + }, + { + "auxiliary_loss_clip": 0.01134365, + "auxiliary_loss_mlp": 0.01079556, + "balance_loss_clip": 1.04861701, + "balance_loss_mlp": 1.05830109, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.314857911102935, + "language_loss": 0.83793402, + "learning_rate": 3.849783236246318e-06, + "loss": 0.86007321, + "num_input_tokens_seen": 54342180, + "step": 2502, + "time_per_iteration": 2.5477378368377686 + }, + { + "auxiliary_loss_clip": 0.01118484, + "auxiliary_loss_mlp": 0.01065443, + "balance_loss_clip": 1.0469476, + "balance_loss_mlp": 1.04653668, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.0537044022343305, + "language_loss": 0.77425718, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79609644, + "num_input_tokens_seen": 54360255, + "step": 2503, + "time_per_iteration": 2.5523369312286377 + }, + { + "auxiliary_loss_clip": 0.01157119, + "auxiliary_loss_mlp": 0.0105673, + "balance_loss_clip": 1.05201411, + "balance_loss_mlp": 1.03819287, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 2.583759616398654, + "language_loss": 0.85529566, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87743413, + "num_input_tokens_seen": 54378260, + "step": 2504, + "time_per_iteration": 3.9146196842193604 + }, + { + "auxiliary_loss_clip": 0.01143468, + "auxiliary_loss_mlp": 0.01049228, + "balance_loss_clip": 1.05002737, + "balance_loss_mlp": 1.03102481, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.6398931919116126, + "language_loss": 0.83018124, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85210818, + "num_input_tokens_seen": 54399745, + "step": 2505, + "time_per_iteration": 3.9763729572296143 + }, + { + "auxiliary_loss_clip": 0.01120317, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_clip": 1.05058157, + "balance_loss_mlp": 1.0377475, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.3076563837183803, + "language_loss": 0.76510113, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78687847, + "num_input_tokens_seen": 54417105, + "step": 2506, + "time_per_iteration": 2.564502716064453 + }, + { + "auxiliary_loss_clip": 0.01162492, + "auxiliary_loss_mlp": 0.01059004, + "balance_loss_clip": 1.05216813, + "balance_loss_mlp": 1.03885794, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 3.6867381562255575, + "language_loss": 0.768471, + "learning_rate": 3.849041932844552e-06, + "loss": 0.79068601, + "num_input_tokens_seen": 54433920, + "step": 2507, + "time_per_iteration": 3.873556613922119 + }, + { + "auxiliary_loss_clip": 0.01141533, + "auxiliary_loss_mlp": 0.01053177, + "balance_loss_clip": 1.04697347, + "balance_loss_mlp": 1.03407955, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 2.022889721079339, + "language_loss": 0.69416809, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71611518, + "num_input_tokens_seen": 54451540, + "step": 2508, + "time_per_iteration": 2.4934346675872803 + }, + { + "auxiliary_loss_clip": 0.01128214, + "auxiliary_loss_mlp": 0.0106073, + "balance_loss_clip": 1.05350566, + "balance_loss_mlp": 1.04187119, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 1.8281911134299806, + "language_loss": 0.77949977, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.80138922, + "num_input_tokens_seen": 54470800, + "step": 2509, + "time_per_iteration": 2.5723891258239746 + }, + { + "auxiliary_loss_clip": 0.01137045, + "auxiliary_loss_mlp": 0.00880045, + "balance_loss_clip": 1.04927325, + "balance_loss_mlp": 1.14530373, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 3.1766073472814895, + "language_loss": 0.80245662, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82262748, + "num_input_tokens_seen": 54486525, + "step": 2510, + "time_per_iteration": 2.488866090774536 + }, + { + "auxiliary_loss_clip": 0.0114898, + "auxiliary_loss_mlp": 0.01059693, + "balance_loss_clip": 1.05192995, + "balance_loss_mlp": 1.03912961, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 2.5355982351648128, + "language_loss": 0.7406621, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76274884, + "num_input_tokens_seen": 54503795, + "step": 2511, + "time_per_iteration": 3.987982988357544 + }, + { + "auxiliary_loss_clip": 0.01092794, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_clip": 1.04707658, + "balance_loss_mlp": 1.03017139, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.2046265161962144, + "language_loss": 0.69163018, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71305513, + "num_input_tokens_seen": 54523025, + "step": 2512, + "time_per_iteration": 2.6761438846588135 + }, + { + "auxiliary_loss_clip": 0.01146222, + "auxiliary_loss_mlp": 0.01054872, + "balance_loss_clip": 1.04937458, + "balance_loss_mlp": 1.03579879, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 2.11195926861456, + "language_loss": 0.73997235, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76198328, + "num_input_tokens_seen": 54545025, + "step": 2513, + "time_per_iteration": 2.553710460662842 + }, + { + "auxiliary_loss_clip": 0.0102298, + "auxiliary_loss_mlp": 0.01038038, + "balance_loss_clip": 1.03179371, + "balance_loss_mlp": 1.03474748, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8740523425864012, + "language_loss": 0.64763385, + "learning_rate": 3.84800116337411e-06, + "loss": 0.668244, + "num_input_tokens_seen": 54604545, + "step": 2514, + "time_per_iteration": 3.1574530601501465 + }, + { + "auxiliary_loss_clip": 0.01141346, + "auxiliary_loss_mlp": 0.0103963, + "balance_loss_clip": 1.05046594, + "balance_loss_mlp": 1.02077067, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 2.055907157330445, + "language_loss": 0.72729599, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.74910581, + "num_input_tokens_seen": 54620590, + "step": 2515, + "time_per_iteration": 2.474595785140991 + }, + { + "auxiliary_loss_clip": 0.01126926, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.04961264, + "balance_loss_mlp": 1.0208714, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 1.8932481374314358, + "language_loss": 0.77344477, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79513085, + "num_input_tokens_seen": 54640410, + "step": 2516, + "time_per_iteration": 2.562945604324341 + }, + { + "auxiliary_loss_clip": 0.01068518, + "auxiliary_loss_mlp": 0.0100878, + "balance_loss_clip": 1.03562427, + "balance_loss_mlp": 1.00612187, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.7259082508010725, + "language_loss": 0.54675132, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56752437, + "num_input_tokens_seen": 54701430, + "step": 2517, + "time_per_iteration": 3.12744402885437 + }, + { + "auxiliary_loss_clip": 0.01111521, + "auxiliary_loss_mlp": 0.01049102, + "balance_loss_clip": 1.04592192, + "balance_loss_mlp": 1.02680957, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 2.1264570267527487, + "language_loss": 0.7876718, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.80927801, + "num_input_tokens_seen": 54720845, + "step": 2518, + "time_per_iteration": 2.555051803588867 + }, + { + "auxiliary_loss_clip": 0.01148479, + "auxiliary_loss_mlp": 0.01048283, + "balance_loss_clip": 1.05069709, + "balance_loss_mlp": 1.02639639, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.452230839740843, + "language_loss": 0.70433033, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72629797, + "num_input_tokens_seen": 54740495, + "step": 2519, + "time_per_iteration": 2.5511722564697266 + }, + { + "auxiliary_loss_clip": 0.01147159, + "auxiliary_loss_mlp": 0.01047765, + "balance_loss_clip": 1.05064321, + "balance_loss_mlp": 1.0274514, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 2.0091587017778445, + "language_loss": 0.7884922, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81044143, + "num_input_tokens_seen": 54758415, + "step": 2520, + "time_per_iteration": 2.4996514320373535 + }, + { + "auxiliary_loss_clip": 0.01139298, + "auxiliary_loss_mlp": 0.01054512, + "balance_loss_clip": 1.04932129, + "balance_loss_mlp": 1.03318512, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 2.027122658888103, + "language_loss": 0.74943638, + "learning_rate": 3.846956960161114e-06, + "loss": 0.77137446, + "num_input_tokens_seen": 54779355, + "step": 2521, + "time_per_iteration": 2.563396453857422 + }, + { + "auxiliary_loss_clip": 0.01131464, + "auxiliary_loss_mlp": 0.01049253, + "balance_loss_clip": 1.04858923, + "balance_loss_mlp": 1.02840304, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.5785418498676407, + "language_loss": 0.82430768, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84611481, + "num_input_tokens_seen": 54799465, + "step": 2522, + "time_per_iteration": 2.560260772705078 + }, + { + "auxiliary_loss_clip": 0.01026068, + "auxiliary_loss_mlp": 0.01005294, + "balance_loss_clip": 1.0333817, + "balance_loss_mlp": 1.0027194, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.8202544696166308, + "language_loss": 0.57920825, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59952188, + "num_input_tokens_seen": 54857665, + "step": 2523, + "time_per_iteration": 3.1694202423095703 + }, + { + "auxiliary_loss_clip": 0.01138149, + "auxiliary_loss_mlp": 0.01053853, + "balance_loss_clip": 1.04839444, + "balance_loss_mlp": 1.03114343, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.8439307734729804, + "language_loss": 0.75148451, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.7734046, + "num_input_tokens_seen": 54879895, + "step": 2524, + "time_per_iteration": 2.6086173057556152 + }, + { + "auxiliary_loss_clip": 0.01135205, + "auxiliary_loss_mlp": 0.01044264, + "balance_loss_clip": 1.04756415, + "balance_loss_mlp": 1.0243206, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 2.1940824193768673, + "language_loss": 0.74776828, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76956296, + "num_input_tokens_seen": 54898245, + "step": 2525, + "time_per_iteration": 2.52813982963562 + }, + { + "auxiliary_loss_clip": 0.01144014, + "auxiliary_loss_mlp": 0.01048067, + "balance_loss_clip": 1.05349541, + "balance_loss_mlp": 1.02691936, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.8756965286249285, + "language_loss": 0.80041063, + "learning_rate": 3.846208999506402e-06, + "loss": 0.82233143, + "num_input_tokens_seen": 54917060, + "step": 2526, + "time_per_iteration": 2.4979918003082275 + }, + { + "auxiliary_loss_clip": 0.01135094, + "auxiliary_loss_mlp": 0.01050416, + "balance_loss_clip": 1.05089903, + "balance_loss_mlp": 1.03113997, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.7793354380422595, + "language_loss": 0.85154343, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87339842, + "num_input_tokens_seen": 54936365, + "step": 2527, + "time_per_iteration": 2.5334696769714355 + }, + { + "auxiliary_loss_clip": 0.0112417, + "auxiliary_loss_mlp": 0.010507, + "balance_loss_clip": 1.0487659, + "balance_loss_mlp": 1.03019643, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 1.704276548762612, + "language_loss": 0.69371343, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71546209, + "num_input_tokens_seen": 54961365, + "step": 2528, + "time_per_iteration": 2.6612491607666016 + }, + { + "auxiliary_loss_clip": 0.01133501, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.05222726, + "balance_loss_mlp": 1.03640664, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 1.7518748121359065, + "language_loss": 0.86890495, + "learning_rate": 3.845759382967026e-06, + "loss": 0.89080477, + "num_input_tokens_seen": 54980750, + "step": 2529, + "time_per_iteration": 2.571930408477783 + }, + { + "auxiliary_loss_clip": 0.01124579, + "auxiliary_loss_mlp": 0.01042148, + "balance_loss_clip": 1.04975533, + "balance_loss_mlp": 1.02213299, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 2.510939320720816, + "language_loss": 0.83184993, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85351717, + "num_input_tokens_seen": 54999675, + "step": 2530, + "time_per_iteration": 2.585864305496216 + }, + { + "auxiliary_loss_clip": 0.01125927, + "auxiliary_loss_mlp": 0.01049013, + "balance_loss_clip": 1.05047512, + "balance_loss_mlp": 1.02858031, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 1.9514312696968228, + "language_loss": 0.80374604, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82549536, + "num_input_tokens_seen": 55018295, + "step": 2531, + "time_per_iteration": 2.5352721214294434 + }, + { + "auxiliary_loss_clip": 0.01143014, + "auxiliary_loss_mlp": 0.01052997, + "balance_loss_clip": 1.04903853, + "balance_loss_mlp": 1.03401923, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 1.9844243455813229, + "language_loss": 0.7896744, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81163454, + "num_input_tokens_seen": 55037975, + "step": 2532, + "time_per_iteration": 2.533263921737671 + }, + { + "auxiliary_loss_clip": 0.01144622, + "auxiliary_loss_mlp": 0.01055249, + "balance_loss_clip": 1.04882407, + "balance_loss_mlp": 1.03402925, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.822884760811584, + "language_loss": 0.8783493, + "learning_rate": 3.845158914395105e-06, + "loss": 0.90034807, + "num_input_tokens_seen": 55057135, + "step": 2533, + "time_per_iteration": 2.529696226119995 + }, + { + "auxiliary_loss_clip": 0.01119351, + "auxiliary_loss_mlp": 0.01056793, + "balance_loss_clip": 1.05187714, + "balance_loss_mlp": 1.03645563, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.5763237261782663, + "language_loss": 0.78841448, + "learning_rate": 3.84500862231636e-06, + "loss": 0.8101759, + "num_input_tokens_seen": 55075525, + "step": 2534, + "time_per_iteration": 2.56652569770813 + }, + { + "auxiliary_loss_clip": 0.01161246, + "auxiliary_loss_mlp": 0.01054156, + "balance_loss_clip": 1.05030704, + "balance_loss_mlp": 1.03329444, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 3.4762863431661337, + "language_loss": 0.77463758, + "learning_rate": 3.844858260274702e-06, + "loss": 0.79679161, + "num_input_tokens_seen": 55090845, + "step": 2535, + "time_per_iteration": 2.4526448249816895 + }, + { + "auxiliary_loss_clip": 0.01144788, + "auxiliary_loss_mlp": 0.01052256, + "balance_loss_clip": 1.05274749, + "balance_loss_mlp": 1.03209734, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.554221678649223, + "language_loss": 0.78224379, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80421424, + "num_input_tokens_seen": 55108750, + "step": 2536, + "time_per_iteration": 2.4952940940856934 + }, + { + "auxiliary_loss_clip": 0.01124818, + "auxiliary_loss_mlp": 0.01061625, + "balance_loss_clip": 1.05017316, + "balance_loss_mlp": 1.04196715, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.852800899099084, + "language_loss": 0.75776547, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77962995, + "num_input_tokens_seen": 55126750, + "step": 2537, + "time_per_iteration": 2.521355628967285 + }, + { + "auxiliary_loss_clip": 0.01146749, + "auxiliary_loss_mlp": 0.01058097, + "balance_loss_clip": 1.05160391, + "balance_loss_mlp": 1.03852248, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.7656485179567114, + "language_loss": 0.78022754, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.80227602, + "num_input_tokens_seen": 55144690, + "step": 2538, + "time_per_iteration": 2.483314275741577 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01049015, + "balance_loss_clip": 1.05090356, + "balance_loss_mlp": 1.0300374, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.5921112778681392, + "language_loss": 0.89959598, + "learning_rate": 3.844256112593029e-06, + "loss": 0.92116082, + "num_input_tokens_seen": 55166055, + "step": 2539, + "time_per_iteration": 2.6397695541381836 + }, + { + "auxiliary_loss_clip": 0.01142273, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.05230188, + "balance_loss_mlp": 1.030303, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 4.493926878854294, + "language_loss": 0.93545157, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95737684, + "num_input_tokens_seen": 55186285, + "step": 2540, + "time_per_iteration": 2.5667154788970947 + }, + { + "auxiliary_loss_clip": 0.01129979, + "auxiliary_loss_mlp": 0.01052404, + "balance_loss_clip": 1.0473243, + "balance_loss_mlp": 1.033831, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.8158228762175372, + "language_loss": 0.75441462, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77623844, + "num_input_tokens_seen": 55207915, + "step": 2541, + "time_per_iteration": 2.6231472492218018 + }, + { + "auxiliary_loss_clip": 0.01120513, + "auxiliary_loss_mlp": 0.01051743, + "balance_loss_clip": 1.0493474, + "balance_loss_mlp": 1.03252649, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.7183786107362886, + "language_loss": 0.81710762, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83883023, + "num_input_tokens_seen": 55227860, + "step": 2542, + "time_per_iteration": 2.5596468448638916 + }, + { + "auxiliary_loss_clip": 0.01160009, + "auxiliary_loss_mlp": 0.0105614, + "balance_loss_clip": 1.05316317, + "balance_loss_mlp": 1.03616071, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.3327947569238834, + "language_loss": 0.78019363, + "learning_rate": 3.843652845961383e-06, + "loss": 0.80235505, + "num_input_tokens_seen": 55247330, + "step": 2543, + "time_per_iteration": 5.326687335968018 + }, + { + "auxiliary_loss_clip": 0.01145659, + "auxiliary_loss_mlp": 0.01053998, + "balance_loss_clip": 1.0511229, + "balance_loss_mlp": 1.03477001, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.0183930676047583, + "language_loss": 0.86343306, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88542962, + "num_input_tokens_seen": 55266195, + "step": 2544, + "time_per_iteration": 2.527777910232544 + }, + { + "auxiliary_loss_clip": 0.01148784, + "auxiliary_loss_mlp": 0.01054293, + "balance_loss_clip": 1.04916787, + "balance_loss_mlp": 1.03347957, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 4.615663077139077, + "language_loss": 0.82433456, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84636533, + "num_input_tokens_seen": 55283305, + "step": 2545, + "time_per_iteration": 2.5196216106414795 + }, + { + "auxiliary_loss_clip": 0.01158488, + "auxiliary_loss_mlp": 0.01047582, + "balance_loss_clip": 1.05189228, + "balance_loss_mlp": 1.02862787, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.3450245893343964, + "language_loss": 0.71101475, + "learning_rate": 3.843199661896884e-06, + "loss": 0.7330755, + "num_input_tokens_seen": 55303035, + "step": 2546, + "time_per_iteration": 3.9465863704681396 + }, + { + "auxiliary_loss_clip": 0.01129636, + "auxiliary_loss_mlp": 0.01051002, + "balance_loss_clip": 1.04975724, + "balance_loss_mlp": 1.03081965, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.758471411871533, + "language_loss": 0.77432281, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79612923, + "num_input_tokens_seen": 55327570, + "step": 2547, + "time_per_iteration": 2.7889163494110107 + }, + { + "auxiliary_loss_clip": 0.01115147, + "auxiliary_loss_mlp": 0.010451, + "balance_loss_clip": 1.05391181, + "balance_loss_mlp": 1.02537072, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.137997111189484, + "language_loss": 0.74430847, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76591098, + "num_input_tokens_seen": 55351090, + "step": 2548, + "time_per_iteration": 2.7302422523498535 + }, + { + "auxiliary_loss_clip": 0.0113904, + "auxiliary_loss_mlp": 0.01048724, + "balance_loss_clip": 1.05182588, + "balance_loss_mlp": 1.02821994, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 2.0238430258452698, + "language_loss": 0.80559909, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82747674, + "num_input_tokens_seen": 55371050, + "step": 2549, + "time_per_iteration": 2.5879571437835693 + }, + { + "auxiliary_loss_clip": 0.01146272, + "auxiliary_loss_mlp": 0.01047281, + "balance_loss_clip": 1.050583, + "balance_loss_mlp": 1.02745676, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.9548365062171056, + "language_loss": 0.74873894, + "learning_rate": 3.842594437983917e-06, + "loss": 0.77067447, + "num_input_tokens_seen": 55390375, + "step": 2550, + "time_per_iteration": 3.9963526725769043 + }, + { + "auxiliary_loss_clip": 0.01152815, + "auxiliary_loss_mlp": 0.01044814, + "balance_loss_clip": 1.05254066, + "balance_loss_mlp": 1.02457261, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.4050823229418365, + "language_loss": 0.76782262, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.78979892, + "num_input_tokens_seen": 55408890, + "step": 2551, + "time_per_iteration": 2.4923460483551025 + }, + { + "auxiliary_loss_clip": 0.01076759, + "auxiliary_loss_mlp": 0.01005417, + "balance_loss_clip": 1.04264545, + "balance_loss_mlp": 1.00258017, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9658240234898465, + "language_loss": 0.56676167, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58758336, + "num_input_tokens_seen": 55463815, + "step": 2552, + "time_per_iteration": 3.0442583560943604 + }, + { + "auxiliary_loss_clip": 0.01117284, + "auxiliary_loss_mlp": 0.01043056, + "balance_loss_clip": 1.05243623, + "balance_loss_mlp": 1.02281451, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 2.2322835760058597, + "language_loss": 0.88332069, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90492404, + "num_input_tokens_seen": 55481050, + "step": 2553, + "time_per_iteration": 2.5500214099884033 + }, + { + "auxiliary_loss_clip": 0.0114822, + "auxiliary_loss_mlp": 0.01047968, + "balance_loss_clip": 1.05233574, + "balance_loss_mlp": 1.02718961, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.9482394906536369, + "language_loss": 0.78227711, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80423903, + "num_input_tokens_seen": 55500050, + "step": 2554, + "time_per_iteration": 2.517559051513672 + }, + { + "auxiliary_loss_clip": 0.01093373, + "auxiliary_loss_mlp": 0.01059305, + "balance_loss_clip": 1.05001712, + "balance_loss_mlp": 1.03687036, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.485267507428169, + "language_loss": 0.77934194, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80086869, + "num_input_tokens_seen": 55518125, + "step": 2555, + "time_per_iteration": 2.577765703201294 + }, + { + "auxiliary_loss_clip": 0.0112801, + "auxiliary_loss_mlp": 0.01047697, + "balance_loss_clip": 1.05090237, + "balance_loss_mlp": 1.0291357, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.5491674311202603, + "language_loss": 0.76928902, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.79104602, + "num_input_tokens_seen": 55540960, + "step": 2556, + "time_per_iteration": 2.6282296180725098 + }, + { + "auxiliary_loss_clip": 0.01139179, + "auxiliary_loss_mlp": 0.00860632, + "balance_loss_clip": 1.05250299, + "balance_loss_mlp": 1.11179781, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 2.2018105070249288, + "language_loss": 0.90251791, + "learning_rate": 3.84153260631005e-06, + "loss": 0.92251599, + "num_input_tokens_seen": 55559210, + "step": 2557, + "time_per_iteration": 2.4976017475128174 + }, + { + "auxiliary_loss_clip": 0.01138358, + "auxiliary_loss_mlp": 0.01049309, + "balance_loss_clip": 1.04956651, + "balance_loss_mlp": 1.02872157, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 1.9548326111825167, + "language_loss": 0.71053481, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73241144, + "num_input_tokens_seen": 55578925, + "step": 2558, + "time_per_iteration": 2.563786029815674 + }, + { + "auxiliary_loss_clip": 0.01134482, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_clip": 1.05161393, + "balance_loss_mlp": 1.02787685, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 1.9826286102375483, + "language_loss": 0.92315483, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94497675, + "num_input_tokens_seen": 55597255, + "step": 2559, + "time_per_iteration": 2.5170583724975586 + }, + { + "auxiliary_loss_clip": 0.01135743, + "auxiliary_loss_mlp": 0.01053967, + "balance_loss_clip": 1.05424476, + "balance_loss_mlp": 1.03341496, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 3.08303640166999, + "language_loss": 0.63885242, + "learning_rate": 3.841076488011055e-06, + "loss": 0.66074955, + "num_input_tokens_seen": 55619515, + "step": 2560, + "time_per_iteration": 2.6054844856262207 + }, + { + "auxiliary_loss_clip": 0.0113726, + "auxiliary_loss_mlp": 0.01050918, + "balance_loss_clip": 1.05251837, + "balance_loss_mlp": 1.03021097, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 4.994365369489055, + "language_loss": 0.88214368, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.9040255, + "num_input_tokens_seen": 55640050, + "step": 2561, + "time_per_iteration": 2.553596019744873 + }, + { + "auxiliary_loss_clip": 0.01145112, + "auxiliary_loss_mlp": 0.01048583, + "balance_loss_clip": 1.05259371, + "balance_loss_mlp": 1.02948546, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.7033734633207729, + "language_loss": 0.8320027, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85393965, + "num_input_tokens_seen": 55658695, + "step": 2562, + "time_per_iteration": 2.492812156677246 + }, + { + "auxiliary_loss_clip": 0.01134478, + "auxiliary_loss_mlp": 0.00869171, + "balance_loss_clip": 1.05442357, + "balance_loss_mlp": 1.12458491, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 2.4911158370411797, + "language_loss": 0.74867189, + "learning_rate": 3.840619741387832e-06, + "loss": 0.76870835, + "num_input_tokens_seen": 55676340, + "step": 2563, + "time_per_iteration": 2.5173745155334473 + }, + { + "auxiliary_loss_clip": 0.01119423, + "auxiliary_loss_mlp": 0.01048466, + "balance_loss_clip": 1.05064583, + "balance_loss_mlp": 1.02784252, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 2.0408635080308675, + "language_loss": 0.76217675, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78385568, + "num_input_tokens_seen": 55698890, + "step": 2564, + "time_per_iteration": 2.6974117755889893 + }, + { + "auxiliary_loss_clip": 0.01133136, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.05012202, + "balance_loss_mlp": 1.03629804, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 2.0292507503310597, + "language_loss": 0.70627475, + "learning_rate": 3.840314894646969e-06, + "loss": 0.72815984, + "num_input_tokens_seen": 55718535, + "step": 2565, + "time_per_iteration": 2.554295301437378 + }, + { + "auxiliary_loss_clip": 0.01142283, + "auxiliary_loss_mlp": 0.01052779, + "balance_loss_clip": 1.04870152, + "balance_loss_mlp": 1.03320503, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 1.9466351950972853, + "language_loss": 0.71872288, + "learning_rate": 3.840162366596259e-06, + "loss": 0.74067342, + "num_input_tokens_seen": 55738970, + "step": 2566, + "time_per_iteration": 2.5477030277252197 + }, + { + "auxiliary_loss_clip": 0.01153744, + "auxiliary_loss_mlp": 0.01042989, + "balance_loss_clip": 1.04815865, + "balance_loss_mlp": 1.02430868, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.6322097085438834, + "language_loss": 0.8511951, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87316245, + "num_input_tokens_seen": 55759585, + "step": 2567, + "time_per_iteration": 2.4784634113311768 + }, + { + "auxiliary_loss_clip": 0.01110045, + "auxiliary_loss_mlp": 0.01054278, + "balance_loss_clip": 1.04885781, + "balance_loss_mlp": 1.03478742, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.211941030955953, + "language_loss": 0.78245115, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80409437, + "num_input_tokens_seen": 55779250, + "step": 2568, + "time_per_iteration": 2.6046855449676514 + }, + { + "auxiliary_loss_clip": 0.01128518, + "auxiliary_loss_mlp": 0.01038183, + "balance_loss_clip": 1.05311489, + "balance_loss_mlp": 1.01730943, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 2.005929588279904, + "language_loss": 0.7026242, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72429121, + "num_input_tokens_seen": 55800470, + "step": 2569, + "time_per_iteration": 2.547910690307617 + }, + { + "auxiliary_loss_clip": 0.01130472, + "auxiliary_loss_mlp": 0.01043024, + "balance_loss_clip": 1.05044353, + "balance_loss_mlp": 1.02381897, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.9269957749430744, + "language_loss": 0.76616973, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78790468, + "num_input_tokens_seen": 55817795, + "step": 2570, + "time_per_iteration": 2.5354466438293457 + }, + { + "auxiliary_loss_clip": 0.01142975, + "auxiliary_loss_mlp": 0.01045213, + "balance_loss_clip": 1.05332744, + "balance_loss_mlp": 1.02488828, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 2.278589381739884, + "language_loss": 0.775446, + "learning_rate": 3.839398679771359e-06, + "loss": 0.79732788, + "num_input_tokens_seen": 55836125, + "step": 2571, + "time_per_iteration": 2.472949743270874 + }, + { + "auxiliary_loss_clip": 0.01138649, + "auxiliary_loss_mlp": 0.01046947, + "balance_loss_clip": 1.05138683, + "balance_loss_mlp": 1.02762341, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 1.9657092301537848, + "language_loss": 0.82674485, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84860086, + "num_input_tokens_seen": 55855280, + "step": 2572, + "time_per_iteration": 2.548285722732544 + }, + { + "auxiliary_loss_clip": 0.01161078, + "auxiliary_loss_mlp": 0.01047176, + "balance_loss_clip": 1.0529201, + "balance_loss_mlp": 1.02773345, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.5463024952676792, + "language_loss": 0.90564501, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92772758, + "num_input_tokens_seen": 55875695, + "step": 2573, + "time_per_iteration": 2.476029396057129 + }, + { + "auxiliary_loss_clip": 0.01097899, + "auxiliary_loss_mlp": 0.01050663, + "balance_loss_clip": 1.05549121, + "balance_loss_mlp": 1.02971756, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.8673872232848931, + "language_loss": 0.70427346, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72575909, + "num_input_tokens_seen": 55894575, + "step": 2574, + "time_per_iteration": 2.580514430999756 + }, + { + "auxiliary_loss_clip": 0.01130388, + "auxiliary_loss_mlp": 0.01048142, + "balance_loss_clip": 1.04904199, + "balance_loss_mlp": 1.0263741, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.8217795631431755, + "language_loss": 0.82517636, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84696174, + "num_input_tokens_seen": 55912855, + "step": 2575, + "time_per_iteration": 2.52793550491333 + }, + { + "auxiliary_loss_clip": 0.01133294, + "auxiliary_loss_mlp": 0.01052214, + "balance_loss_clip": 1.04802489, + "balance_loss_mlp": 1.03308117, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 1.8544409588099326, + "language_loss": 0.84676719, + "learning_rate": 3.838633249192036e-06, + "loss": 0.86862224, + "num_input_tokens_seen": 55932375, + "step": 2576, + "time_per_iteration": 2.557020425796509 + }, + { + "auxiliary_loss_clip": 0.01156903, + "auxiliary_loss_mlp": 0.01045763, + "balance_loss_clip": 1.04918814, + "balance_loss_mlp": 1.0260458, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.8340139993553701, + "language_loss": 0.82088304, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84290969, + "num_input_tokens_seen": 55953970, + "step": 2577, + "time_per_iteration": 2.5128233432769775 + }, + { + "auxiliary_loss_clip": 0.01129615, + "auxiliary_loss_mlp": 0.01050876, + "balance_loss_clip": 1.05319309, + "balance_loss_mlp": 1.0313375, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.953090357528126, + "language_loss": 0.76897573, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.79078066, + "num_input_tokens_seen": 55973120, + "step": 2578, + "time_per_iteration": 2.5880885124206543 + }, + { + "auxiliary_loss_clip": 0.01121793, + "auxiliary_loss_mlp": 0.01046604, + "balance_loss_clip": 1.04757071, + "balance_loss_mlp": 1.02655268, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 1.8286879994927463, + "language_loss": 0.82634807, + "learning_rate": 3.83817315414411e-06, + "loss": 0.848032, + "num_input_tokens_seen": 55993260, + "step": 2579, + "time_per_iteration": 2.558387517929077 + }, + { + "auxiliary_loss_clip": 0.01139256, + "auxiliary_loss_mlp": 0.01047385, + "balance_loss_clip": 1.05205345, + "balance_loss_mlp": 1.02754855, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.9448337536451037, + "language_loss": 0.80735278, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82921916, + "num_input_tokens_seen": 56012130, + "step": 2580, + "time_per_iteration": 2.5290110111236572 + }, + { + "auxiliary_loss_clip": 0.01067171, + "auxiliary_loss_mlp": 0.0100769, + "balance_loss_clip": 1.0338341, + "balance_loss_mlp": 1.00486457, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.8479080104445912, + "language_loss": 0.58838534, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60913396, + "num_input_tokens_seen": 56079045, + "step": 2581, + "time_per_iteration": 4.556372165679932 + }, + { + "auxiliary_loss_clip": 0.01116365, + "auxiliary_loss_mlp": 0.01050844, + "balance_loss_clip": 1.05249953, + "balance_loss_mlp": 1.02869534, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.8857941419449658, + "language_loss": 0.84954005, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87121218, + "num_input_tokens_seen": 56098745, + "step": 2582, + "time_per_iteration": 3.993600845336914 + }, + { + "auxiliary_loss_clip": 0.01146047, + "auxiliary_loss_mlp": 0.01055366, + "balance_loss_clip": 1.05201161, + "balance_loss_mlp": 1.03501713, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.2228599941288585, + "language_loss": 0.78595889, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.80797297, + "num_input_tokens_seen": 56117655, + "step": 2583, + "time_per_iteration": 2.518681764602661 + }, + { + "auxiliary_loss_clip": 0.01143751, + "auxiliary_loss_mlp": 0.01057379, + "balance_loss_clip": 1.05177617, + "balance_loss_mlp": 1.03462195, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 2.1204701696313, + "language_loss": 0.76430607, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78631735, + "num_input_tokens_seen": 56141960, + "step": 2584, + "time_per_iteration": 3.971644639968872 + }, + { + "auxiliary_loss_clip": 0.01141764, + "auxiliary_loss_mlp": 0.01042944, + "balance_loss_clip": 1.05445266, + "balance_loss_mlp": 1.02283311, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 1.7437975099688403, + "language_loss": 0.75670063, + "learning_rate": 3.837251082205368e-06, + "loss": 0.77854764, + "num_input_tokens_seen": 56161430, + "step": 2585, + "time_per_iteration": 2.5077338218688965 + }, + { + "auxiliary_loss_clip": 0.01115827, + "auxiliary_loss_mlp": 0.01043049, + "balance_loss_clip": 1.04827201, + "balance_loss_mlp": 1.02289057, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 1.8928225421284093, + "language_loss": 0.61282724, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63441598, + "num_input_tokens_seen": 56179390, + "step": 2586, + "time_per_iteration": 2.550844430923462 + }, + { + "auxiliary_loss_clip": 0.01133037, + "auxiliary_loss_mlp": 0.01043845, + "balance_loss_clip": 1.05139494, + "balance_loss_mlp": 1.02425861, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.6379651136131848, + "language_loss": 0.80988228, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83165109, + "num_input_tokens_seen": 56198020, + "step": 2587, + "time_per_iteration": 2.511411666870117 + }, + { + "auxiliary_loss_clip": 0.01163191, + "auxiliary_loss_mlp": 0.01056787, + "balance_loss_clip": 1.05253649, + "balance_loss_mlp": 1.03339815, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 2.1017308810604987, + "language_loss": 0.88740063, + "learning_rate": 3.836789105629236e-06, + "loss": 0.90960038, + "num_input_tokens_seen": 56218165, + "step": 2588, + "time_per_iteration": 4.034830331802368 + }, + { + "auxiliary_loss_clip": 0.01103605, + "auxiliary_loss_mlp": 0.01054511, + "balance_loss_clip": 1.05438316, + "balance_loss_mlp": 1.03190923, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 1.8822686605393952, + "language_loss": 0.64537483, + "learning_rate": 3.83663497412695e-06, + "loss": 0.66695595, + "num_input_tokens_seen": 56237160, + "step": 2589, + "time_per_iteration": 2.635561466217041 + }, + { + "auxiliary_loss_clip": 0.0110919, + "auxiliary_loss_mlp": 0.01046284, + "balance_loss_clip": 1.05230737, + "balance_loss_mlp": 1.02383757, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 2.5015777904066767, + "language_loss": 0.83204043, + "learning_rate": 3.836480772979281e-06, + "loss": 0.85359514, + "num_input_tokens_seen": 56257610, + "step": 2590, + "time_per_iteration": 2.60048508644104 + }, + { + "auxiliary_loss_clip": 0.01121143, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.04782736, + "balance_loss_mlp": 1.02139163, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.471849204214886, + "language_loss": 0.79266214, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81428438, + "num_input_tokens_seen": 56275215, + "step": 2591, + "time_per_iteration": 2.53827166557312 + }, + { + "auxiliary_loss_clip": 0.0114508, + "auxiliary_loss_mlp": 0.01050597, + "balance_loss_clip": 1.05001843, + "balance_loss_mlp": 1.03157139, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 2.8990394122230105, + "language_loss": 0.64942372, + "learning_rate": 3.836172161771189e-06, + "loss": 0.67138052, + "num_input_tokens_seen": 56297130, + "step": 2592, + "time_per_iteration": 2.62737774848938 + }, + { + "auxiliary_loss_clip": 0.01137842, + "auxiliary_loss_mlp": 0.01050049, + "balance_loss_clip": 1.05554557, + "balance_loss_mlp": 1.0294255, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 1.9967419005426041, + "language_loss": 0.82070494, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84258384, + "num_input_tokens_seen": 56314995, + "step": 2593, + "time_per_iteration": 2.5337977409362793 + }, + { + "auxiliary_loss_clip": 0.01147251, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_clip": 1.05376887, + "balance_loss_mlp": 1.02248597, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.345644485779454, + "language_loss": 0.72896183, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.75086671, + "num_input_tokens_seen": 56334005, + "step": 2594, + "time_per_iteration": 2.490967035293579 + }, + { + "auxiliary_loss_clip": 0.01122448, + "auxiliary_loss_mlp": 0.01043658, + "balance_loss_clip": 1.05042076, + "balance_loss_mlp": 1.02353597, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 2.0160700319514806, + "language_loss": 0.81734759, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83900857, + "num_input_tokens_seen": 56353795, + "step": 2595, + "time_per_iteration": 2.584061861038208 + }, + { + "auxiliary_loss_clip": 0.0115793, + "auxiliary_loss_mlp": 0.01052562, + "balance_loss_clip": 1.05071187, + "balance_loss_mlp": 1.03295159, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 1.867048680812236, + "language_loss": 0.86728686, + "learning_rate": 3.835554103867876e-06, + "loss": 0.88939178, + "num_input_tokens_seen": 56373195, + "step": 2596, + "time_per_iteration": 2.473767042160034 + }, + { + "auxiliary_loss_clip": 0.0114731, + "auxiliary_loss_mlp": 0.01046314, + "balance_loss_clip": 1.05201685, + "balance_loss_mlp": 1.02677536, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.7133444338726196, + "language_loss": 0.68706858, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70900482, + "num_input_tokens_seen": 56391525, + "step": 2597, + "time_per_iteration": 2.5299389362335205 + }, + { + "auxiliary_loss_clip": 0.0112504, + "auxiliary_loss_mlp": 0.0104955, + "balance_loss_clip": 1.05099702, + "balance_loss_mlp": 1.03072715, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.920193386187035, + "language_loss": 0.7996304, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82137632, + "num_input_tokens_seen": 56410715, + "step": 2598, + "time_per_iteration": 2.5381078720092773 + }, + { + "auxiliary_loss_clip": 0.01131501, + "auxiliary_loss_mlp": 0.00830907, + "balance_loss_clip": 1.04837883, + "balance_loss_mlp": 1.05966735, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 1.942526942435539, + "language_loss": 0.82932025, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.84894431, + "num_input_tokens_seen": 56429170, + "step": 2599, + "time_per_iteration": 2.5336642265319824 + }, + { + "auxiliary_loss_clip": 0.01164703, + "auxiliary_loss_mlp": 0.01049898, + "balance_loss_clip": 1.05414498, + "balance_loss_mlp": 1.02902436, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 2.2564295678241195, + "language_loss": 0.81096494, + "learning_rate": 3.834934932294287e-06, + "loss": 0.83311093, + "num_input_tokens_seen": 56445685, + "step": 2600, + "time_per_iteration": 2.4484968185424805 + }, + { + "auxiliary_loss_clip": 0.01162567, + "auxiliary_loss_mlp": 0.00822825, + "balance_loss_clip": 1.05523181, + "balance_loss_mlp": 1.04498982, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.9120580260250803, + "language_loss": 0.8840012, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90385514, + "num_input_tokens_seen": 56465900, + "step": 2601, + "time_per_iteration": 2.5022640228271484 + }, + { + "auxiliary_loss_clip": 0.0116383, + "auxiliary_loss_mlp": 0.01070277, + "balance_loss_clip": 1.05589998, + "balance_loss_mlp": 1.04824758, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 2.2427485748434357, + "language_loss": 0.7833873, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80572832, + "num_input_tokens_seen": 56485020, + "step": 2602, + "time_per_iteration": 2.4814453125 + }, + { + "auxiliary_loss_clip": 0.01127145, + "auxiliary_loss_mlp": 0.01049286, + "balance_loss_clip": 1.0498929, + "balance_loss_mlp": 1.02853215, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.870434730899044, + "language_loss": 0.73711717, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.75888145, + "num_input_tokens_seen": 56505205, + "step": 2603, + "time_per_iteration": 2.584061622619629 + }, + { + "auxiliary_loss_clip": 0.01149512, + "auxiliary_loss_mlp": 0.01054733, + "balance_loss_clip": 1.05262041, + "balance_loss_mlp": 1.03499222, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 2.7713403994135657, + "language_loss": 0.87974572, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.90178818, + "num_input_tokens_seen": 56521495, + "step": 2604, + "time_per_iteration": 2.4781904220581055 + }, + { + "auxiliary_loss_clip": 0.01150665, + "auxiliary_loss_mlp": 0.01047196, + "balance_loss_clip": 1.0523665, + "balance_loss_mlp": 1.02780056, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 2.059091206466095, + "language_loss": 0.85477799, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87675661, + "num_input_tokens_seen": 56540665, + "step": 2605, + "time_per_iteration": 2.5495266914367676 + }, + { + "auxiliary_loss_clip": 0.01152897, + "auxiliary_loss_mlp": 0.01057772, + "balance_loss_clip": 1.05248463, + "balance_loss_mlp": 1.0364809, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 1.89210486431735, + "language_loss": 0.73640132, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75850791, + "num_input_tokens_seen": 56560805, + "step": 2606, + "time_per_iteration": 2.541048288345337 + }, + { + "auxiliary_loss_clip": 0.01164064, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.05788541, + "balance_loss_mlp": 1.03430176, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 2.120554732423527, + "language_loss": 0.76248789, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78465962, + "num_input_tokens_seen": 56576335, + "step": 2607, + "time_per_iteration": 2.4469385147094727 + }, + { + "auxiliary_loss_clip": 0.01122012, + "auxiliary_loss_mlp": 0.01049303, + "balance_loss_clip": 1.05333447, + "balance_loss_mlp": 1.03030109, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 2.3748372323619766, + "language_loss": 0.82001567, + "learning_rate": 3.833693249639615e-06, + "loss": 0.84172881, + "num_input_tokens_seen": 56595880, + "step": 2608, + "time_per_iteration": 2.5525760650634766 + }, + { + "auxiliary_loss_clip": 0.0113239, + "auxiliary_loss_mlp": 0.01054914, + "balance_loss_clip": 1.05113292, + "balance_loss_mlp": 1.0330863, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 2.300273580300276, + "language_loss": 0.72607803, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74795109, + "num_input_tokens_seen": 56615130, + "step": 2609, + "time_per_iteration": 2.560511350631714 + }, + { + "auxiliary_loss_clip": 0.01147219, + "auxiliary_loss_mlp": 0.01046477, + "balance_loss_clip": 1.05044079, + "balance_loss_mlp": 1.02612782, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 1.7517836051760087, + "language_loss": 0.72035408, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74229103, + "num_input_tokens_seen": 56634005, + "step": 2610, + "time_per_iteration": 2.491142511367798 + }, + { + "auxiliary_loss_clip": 0.01161558, + "auxiliary_loss_mlp": 0.01056694, + "balance_loss_clip": 1.05189931, + "balance_loss_mlp": 1.0352006, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 1.75391907176345, + "language_loss": 0.73206174, + "learning_rate": 3.833226471173919e-06, + "loss": 0.75424427, + "num_input_tokens_seen": 56653480, + "step": 2611, + "time_per_iteration": 2.4889700412750244 + }, + { + "auxiliary_loss_clip": 0.01143729, + "auxiliary_loss_mlp": 0.01044535, + "balance_loss_clip": 1.0509181, + "balance_loss_mlp": 1.0248059, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.3213041859120573, + "language_loss": 0.71256876, + "learning_rate": 3.833070739311887e-06, + "loss": 0.73445135, + "num_input_tokens_seen": 56672270, + "step": 2612, + "time_per_iteration": 2.4896185398101807 + }, + { + "auxiliary_loss_clip": 0.01125027, + "auxiliary_loss_mlp": 0.010595, + "balance_loss_clip": 1.05591702, + "balance_loss_mlp": 1.03861475, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.8910469238577052, + "language_loss": 0.75777924, + "learning_rate": 3.83291493793963e-06, + "loss": 0.77962446, + "num_input_tokens_seen": 56691510, + "step": 2613, + "time_per_iteration": 2.5613925457000732 + }, + { + "auxiliary_loss_clip": 0.01119408, + "auxiliary_loss_mlp": 0.01060748, + "balance_loss_clip": 1.04981589, + "balance_loss_mlp": 1.040447, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.733329974169963, + "language_loss": 0.65678358, + "learning_rate": 3.832759067063055e-06, + "loss": 0.67858517, + "num_input_tokens_seen": 56712230, + "step": 2614, + "time_per_iteration": 2.5945029258728027 + }, + { + "auxiliary_loss_clip": 0.01154315, + "auxiliary_loss_mlp": 0.01047056, + "balance_loss_clip": 1.05488086, + "balance_loss_mlp": 1.02637279, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.2321556286864017, + "language_loss": 0.75306499, + "learning_rate": 3.832603126688072e-06, + "loss": 0.77507877, + "num_input_tokens_seen": 56727490, + "step": 2615, + "time_per_iteration": 2.4856534004211426 + }, + { + "auxiliary_loss_clip": 0.01142303, + "auxiliary_loss_mlp": 0.0105755, + "balance_loss_clip": 1.05513799, + "balance_loss_mlp": 1.03655779, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.8559273202831703, + "language_loss": 0.72939742, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75139594, + "num_input_tokens_seen": 56747385, + "step": 2616, + "time_per_iteration": 2.500152111053467 + }, + { + "auxiliary_loss_clip": 0.011338, + "auxiliary_loss_mlp": 0.01048576, + "balance_loss_clip": 1.0529027, + "balance_loss_mlp": 1.02795291, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 2.1843930363104773, + "language_loss": 0.72392285, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74574661, + "num_input_tokens_seen": 56768055, + "step": 2617, + "time_per_iteration": 2.551046848297119 + }, + { + "auxiliary_loss_clip": 0.01142016, + "auxiliary_loss_mlp": 0.01056252, + "balance_loss_clip": 1.05274332, + "balance_loss_mlp": 1.03493714, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.1206294291603838, + "language_loss": 0.74147701, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76345974, + "num_input_tokens_seen": 56785110, + "step": 2618, + "time_per_iteration": 2.490840196609497 + }, + { + "auxiliary_loss_clip": 0.01164222, + "auxiliary_loss_mlp": 0.0104674, + "balance_loss_clip": 1.05414999, + "balance_loss_mlp": 1.02511501, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.6671900837979865, + "language_loss": 0.78794169, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.81005132, + "num_input_tokens_seen": 56804975, + "step": 2619, + "time_per_iteration": 2.488884687423706 + }, + { + "auxiliary_loss_clip": 0.01132177, + "auxiliary_loss_mlp": 0.01050029, + "balance_loss_clip": 1.05761254, + "balance_loss_mlp": 1.03039479, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.8358367933463895, + "language_loss": 0.76949549, + "learning_rate": 3.831822382544101e-06, + "loss": 0.79131758, + "num_input_tokens_seen": 56822470, + "step": 2620, + "time_per_iteration": 3.9031753540039062 + }, + { + "auxiliary_loss_clip": 0.01142328, + "auxiliary_loss_mlp": 0.01052935, + "balance_loss_clip": 1.05396438, + "balance_loss_mlp": 1.03175139, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.785181033788137, + "language_loss": 0.70912296, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73107553, + "num_input_tokens_seen": 56842100, + "step": 2621, + "time_per_iteration": 3.987882614135742 + }, + { + "auxiliary_loss_clip": 0.01102618, + "auxiliary_loss_mlp": 0.01050125, + "balance_loss_clip": 1.05142379, + "balance_loss_mlp": 1.02733171, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 2.4527770316609967, + "language_loss": 0.72637117, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74789858, + "num_input_tokens_seen": 56865920, + "step": 2622, + "time_per_iteration": 2.8701534271240234 + }, + { + "auxiliary_loss_clip": 0.0109935, + "auxiliary_loss_mlp": 0.01047971, + "balance_loss_clip": 1.05285454, + "balance_loss_mlp": 1.02808726, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.762116110632521, + "language_loss": 0.87398648, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89545971, + "num_input_tokens_seen": 56885265, + "step": 2623, + "time_per_iteration": 3.9830081462860107 + }, + { + "auxiliary_loss_clip": 0.01162755, + "auxiliary_loss_mlp": 0.01044541, + "balance_loss_clip": 1.05709052, + "balance_loss_mlp": 1.02549171, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 3.3152341447330844, + "language_loss": 0.81674856, + "learning_rate": 3.831196536861448e-06, + "loss": 0.83882153, + "num_input_tokens_seen": 56906710, + "step": 2624, + "time_per_iteration": 2.524467945098877 + }, + { + "auxiliary_loss_clip": 0.01124708, + "auxiliary_loss_mlp": 0.01052011, + "balance_loss_clip": 1.05098176, + "balance_loss_mlp": 1.03155518, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.29388260702212, + "language_loss": 0.80304754, + "learning_rate": 3.831039901828054e-06, + "loss": 0.82481474, + "num_input_tokens_seen": 56924275, + "step": 2625, + "time_per_iteration": 2.5585405826568604 + }, + { + "auxiliary_loss_clip": 0.01161201, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_clip": 1.05663323, + "balance_loss_mlp": 1.02571464, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.096946721628493, + "language_loss": 0.80662417, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82868326, + "num_input_tokens_seen": 56941525, + "step": 2626, + "time_per_iteration": 2.514620542526245 + }, + { + "auxiliary_loss_clip": 0.01099858, + "auxiliary_loss_mlp": 0.01057298, + "balance_loss_clip": 1.05168235, + "balance_loss_mlp": 1.03523231, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 1.8903550082680596, + "language_loss": 0.73452771, + "learning_rate": 3.830726423467561e-06, + "loss": 0.75609928, + "num_input_tokens_seen": 56962145, + "step": 2627, + "time_per_iteration": 4.030405044555664 + }, + { + "auxiliary_loss_clip": 0.01119838, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.05533814, + "balance_loss_mlp": 1.03202415, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.479595297301469, + "language_loss": 0.84946352, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87117916, + "num_input_tokens_seen": 56977505, + "step": 2628, + "time_per_iteration": 2.574367046356201 + }, + { + "auxiliary_loss_clip": 0.01132314, + "auxiliary_loss_mlp": 0.01040389, + "balance_loss_clip": 1.05449271, + "balance_loss_mlp": 1.02257872, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.9310631673366168, + "language_loss": 0.7676568, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78938383, + "num_input_tokens_seen": 56996770, + "step": 2629, + "time_per_iteration": 2.533308982849121 + }, + { + "auxiliary_loss_clip": 0.01146697, + "auxiliary_loss_mlp": 0.01047947, + "balance_loss_clip": 1.0554018, + "balance_loss_mlp": 1.02797925, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.617215189417647, + "language_loss": 0.73822027, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.76016676, + "num_input_tokens_seen": 57014970, + "step": 2630, + "time_per_iteration": 2.496027946472168 + }, + { + "auxiliary_loss_clip": 0.0115365, + "auxiliary_loss_mlp": 0.01048386, + "balance_loss_clip": 1.05359316, + "balance_loss_mlp": 1.02832305, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 3.4481435477759046, + "language_loss": 0.83666772, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85868806, + "num_input_tokens_seen": 57034045, + "step": 2631, + "time_per_iteration": 2.4975268840789795 + }, + { + "auxiliary_loss_clip": 0.01161182, + "auxiliary_loss_mlp": 0.01047184, + "balance_loss_clip": 1.05502796, + "balance_loss_mlp": 1.0283016, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.6876165232554075, + "language_loss": 0.78216863, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80425227, + "num_input_tokens_seen": 57053695, + "step": 2632, + "time_per_iteration": 2.4962499141693115 + }, + { + "auxiliary_loss_clip": 0.01154114, + "auxiliary_loss_mlp": 0.01058494, + "balance_loss_clip": 1.05756056, + "balance_loss_mlp": 1.03882432, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 2.411049366150869, + "language_loss": 0.83279407, + "learning_rate": 3.829784322464594e-06, + "loss": 0.85492015, + "num_input_tokens_seen": 57071290, + "step": 2633, + "time_per_iteration": 2.481889486312866 + }, + { + "auxiliary_loss_clip": 0.01166307, + "auxiliary_loss_mlp": 0.0104524, + "balance_loss_clip": 1.05930114, + "balance_loss_mlp": 1.02588093, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.633802985447358, + "language_loss": 0.77220398, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79431939, + "num_input_tokens_seen": 57091465, + "step": 2634, + "time_per_iteration": 2.521247625350952 + }, + { + "auxiliary_loss_clip": 0.01127288, + "auxiliary_loss_mlp": 0.00822557, + "balance_loss_clip": 1.05660152, + "balance_loss_mlp": 1.04901791, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.0312836582656724, + "language_loss": 0.88842142, + "learning_rate": 3.829469733648552e-06, + "loss": 0.90791988, + "num_input_tokens_seen": 57110075, + "step": 2635, + "time_per_iteration": 2.5634281635284424 + }, + { + "auxiliary_loss_clip": 0.01095231, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_clip": 1.05062938, + "balance_loss_mlp": 1.03240561, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.161211880770881, + "language_loss": 0.75658792, + "learning_rate": 3.829312335177034e-06, + "loss": 0.77806282, + "num_input_tokens_seen": 57128945, + "step": 2636, + "time_per_iteration": 2.6574482917785645 + }, + { + "auxiliary_loss_clip": 0.01121429, + "auxiliary_loss_mlp": 0.01045384, + "balance_loss_clip": 1.05568707, + "balance_loss_mlp": 1.02464163, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.4803510457512474, + "language_loss": 0.72497612, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74664426, + "num_input_tokens_seen": 57152385, + "step": 2637, + "time_per_iteration": 2.70961594581604 + }, + { + "auxiliary_loss_clip": 0.01149333, + "auxiliary_loss_mlp": 0.01048343, + "balance_loss_clip": 1.05533028, + "balance_loss_mlp": 1.02974629, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 2.1167733594815497, + "language_loss": 0.77890044, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80087721, + "num_input_tokens_seen": 57172620, + "step": 2638, + "time_per_iteration": 2.543856382369995 + }, + { + "auxiliary_loss_clip": 0.0112673, + "auxiliary_loss_mlp": 0.0106155, + "balance_loss_clip": 1.05089581, + "balance_loss_mlp": 1.0409863, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 1.9338920979058634, + "language_loss": 0.75877416, + "learning_rate": 3.828839723580128e-06, + "loss": 0.78065705, + "num_input_tokens_seen": 57194680, + "step": 2639, + "time_per_iteration": 2.610563278198242 + }, + { + "auxiliary_loss_clip": 0.01100198, + "auxiliary_loss_mlp": 0.01064608, + "balance_loss_clip": 1.04998398, + "balance_loss_mlp": 1.04448533, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 1.824889954406104, + "language_loss": 0.8116073, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83325535, + "num_input_tokens_seen": 57214675, + "step": 2640, + "time_per_iteration": 2.6685256958007812 + }, + { + "auxiliary_loss_clip": 0.01129693, + "auxiliary_loss_mlp": 0.01062522, + "balance_loss_clip": 1.04976082, + "balance_loss_mlp": 1.04157734, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.3898970448036656, + "language_loss": 0.66897982, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69090199, + "num_input_tokens_seen": 57235830, + "step": 2641, + "time_per_iteration": 2.5933640003204346 + }, + { + "auxiliary_loss_clip": 0.01141186, + "auxiliary_loss_mlp": 0.01055763, + "balance_loss_clip": 1.05005074, + "balance_loss_mlp": 1.0358665, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.948118422299727, + "language_loss": 0.75368565, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77565515, + "num_input_tokens_seen": 57255970, + "step": 2642, + "time_per_iteration": 2.5654218196868896 + }, + { + "auxiliary_loss_clip": 0.01152968, + "auxiliary_loss_mlp": 0.0105584, + "balance_loss_clip": 1.05715704, + "balance_loss_mlp": 1.03639734, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 1.8657306217653318, + "language_loss": 0.70521414, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72730225, + "num_input_tokens_seen": 57274435, + "step": 2643, + "time_per_iteration": 2.5392372608184814 + }, + { + "auxiliary_loss_clip": 0.01160129, + "auxiliary_loss_mlp": 0.01047644, + "balance_loss_clip": 1.05610108, + "balance_loss_mlp": 1.0296073, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 1.9825744542407426, + "language_loss": 0.78503489, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80711257, + "num_input_tokens_seen": 57293115, + "step": 2644, + "time_per_iteration": 2.4848146438598633 + }, + { + "auxiliary_loss_clip": 0.01148743, + "auxiliary_loss_mlp": 0.01055261, + "balance_loss_clip": 1.05507874, + "balance_loss_mlp": 1.03646207, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 2.1094900718985548, + "language_loss": 0.82461941, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84665942, + "num_input_tokens_seen": 57312565, + "step": 2645, + "time_per_iteration": 2.537905693054199 + }, + { + "auxiliary_loss_clip": 0.0116056, + "auxiliary_loss_mlp": 0.010586, + "balance_loss_clip": 1.05173063, + "balance_loss_mlp": 1.03841782, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 2.002229436167892, + "language_loss": 0.70346981, + "learning_rate": 3.827734536224087e-06, + "loss": 0.7256614, + "num_input_tokens_seen": 57333360, + "step": 2646, + "time_per_iteration": 2.582068681716919 + }, + { + "auxiliary_loss_clip": 0.01131172, + "auxiliary_loss_mlp": 0.01047677, + "balance_loss_clip": 1.05218029, + "balance_loss_mlp": 1.02896059, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 3.300115635235867, + "language_loss": 0.62569183, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64748031, + "num_input_tokens_seen": 57350575, + "step": 2647, + "time_per_iteration": 2.517565965652466 + }, + { + "auxiliary_loss_clip": 0.01159617, + "auxiliary_loss_mlp": 0.01048813, + "balance_loss_clip": 1.05520296, + "balance_loss_mlp": 1.02998996, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 1.9507275686818082, + "language_loss": 0.89596862, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91805291, + "num_input_tokens_seen": 57367570, + "step": 2648, + "time_per_iteration": 2.471160650253296 + }, + { + "auxiliary_loss_clip": 0.01155235, + "auxiliary_loss_mlp": 0.0104797, + "balance_loss_clip": 1.05411148, + "balance_loss_mlp": 1.03049397, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.8869840243088019, + "language_loss": 0.91607773, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93810976, + "num_input_tokens_seen": 57383980, + "step": 2649, + "time_per_iteration": 2.438542604446411 + }, + { + "auxiliary_loss_clip": 0.01092358, + "auxiliary_loss_mlp": 0.01044758, + "balance_loss_clip": 1.04958189, + "balance_loss_mlp": 1.02440882, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 3.2032205815575083, + "language_loss": 0.71672189, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73809302, + "num_input_tokens_seen": 57400840, + "step": 2650, + "time_per_iteration": 2.624683380126953 + }, + { + "auxiliary_loss_clip": 0.01144273, + "auxiliary_loss_mlp": 0.01044331, + "balance_loss_clip": 1.05125725, + "balance_loss_mlp": 1.02692628, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 1.946444085560815, + "language_loss": 0.71285963, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73474568, + "num_input_tokens_seen": 57419230, + "step": 2651, + "time_per_iteration": 2.4705519676208496 + }, + { + "auxiliary_loss_clip": 0.01118748, + "auxiliary_loss_mlp": 0.00817511, + "balance_loss_clip": 1.05197144, + "balance_loss_mlp": 1.03868854, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.7946529271345346, + "language_loss": 0.799007, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.81836963, + "num_input_tokens_seen": 57439315, + "step": 2652, + "time_per_iteration": 2.621919870376587 + }, + { + "auxiliary_loss_clip": 0.01133714, + "auxiliary_loss_mlp": 0.00817177, + "balance_loss_clip": 1.05190325, + "balance_loss_mlp": 1.04060185, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.5720532530749685, + "language_loss": 0.70038092, + "learning_rate": 3.826625952782601e-06, + "loss": 0.71988988, + "num_input_tokens_seen": 57454635, + "step": 2653, + "time_per_iteration": 2.493992328643799 + }, + { + "auxiliary_loss_clip": 0.01143734, + "auxiliary_loss_mlp": 0.01039332, + "balance_loss_clip": 1.05204964, + "balance_loss_mlp": 1.02043772, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.0521348935328776, + "language_loss": 0.76803803, + "learning_rate": 3.826467306608095e-06, + "loss": 0.78986865, + "num_input_tokens_seen": 57476805, + "step": 2654, + "time_per_iteration": 2.5978341102600098 + }, + { + "auxiliary_loss_clip": 0.01115774, + "auxiliary_loss_mlp": 0.01043785, + "balance_loss_clip": 1.05055571, + "balance_loss_mlp": 1.02511716, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 1.792724133478427, + "language_loss": 0.82296503, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84456062, + "num_input_tokens_seen": 57496400, + "step": 2655, + "time_per_iteration": 2.579822063446045 + }, + { + "auxiliary_loss_clip": 0.01118326, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.05385828, + "balance_loss_mlp": 1.02533197, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 1.829073733520542, + "language_loss": 0.73182213, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75344276, + "num_input_tokens_seen": 57513700, + "step": 2656, + "time_per_iteration": 2.560667037963867 + }, + { + "auxiliary_loss_clip": 0.01115151, + "auxiliary_loss_mlp": 0.0104495, + "balance_loss_clip": 1.05292308, + "balance_loss_mlp": 1.02675855, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 2.552769703960878, + "language_loss": 0.77842367, + "learning_rate": 3.825990952549713e-06, + "loss": 0.80002463, + "num_input_tokens_seen": 57536180, + "step": 2657, + "time_per_iteration": 2.8569767475128174 + }, + { + "auxiliary_loss_clip": 0.01143549, + "auxiliary_loss_mlp": 0.01048259, + "balance_loss_clip": 1.05322146, + "balance_loss_mlp": 1.02888799, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.7255591243680666, + "language_loss": 0.74017644, + "learning_rate": 3.825832029372035e-06, + "loss": 0.76209456, + "num_input_tokens_seen": 57555025, + "step": 2658, + "time_per_iteration": 3.867783784866333 + }, + { + "auxiliary_loss_clip": 0.01129995, + "auxiliary_loss_mlp": 0.01051151, + "balance_loss_clip": 1.05215693, + "balance_loss_mlp": 1.02891827, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.9309510807958796, + "language_loss": 0.7556963, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77750778, + "num_input_tokens_seen": 57577660, + "step": 2659, + "time_per_iteration": 4.059804916381836 + }, + { + "auxiliary_loss_clip": 0.01124172, + "auxiliary_loss_mlp": 0.01058702, + "balance_loss_clip": 1.05380368, + "balance_loss_mlp": 1.03868699, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.1394496725422174, + "language_loss": 0.9068619, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92869055, + "num_input_tokens_seen": 57596335, + "step": 2660, + "time_per_iteration": 2.578277587890625 + }, + { + "auxiliary_loss_clip": 0.01112146, + "auxiliary_loss_mlp": 0.01062072, + "balance_loss_clip": 1.05358768, + "balance_loss_mlp": 1.04035187, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 1.6972217622954069, + "language_loss": 0.77824885, + "learning_rate": 3.82535484444872e-06, + "loss": 0.79999095, + "num_input_tokens_seen": 57616830, + "step": 2661, + "time_per_iteration": 4.143225908279419 + }, + { + "auxiliary_loss_clip": 0.01133866, + "auxiliary_loss_mlp": 0.00810849, + "balance_loss_clip": 1.04980874, + "balance_loss_mlp": 1.02856135, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.9451730892349675, + "language_loss": 0.74215937, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76160651, + "num_input_tokens_seen": 57635515, + "step": 2662, + "time_per_iteration": 2.5956945419311523 + }, + { + "auxiliary_loss_clip": 0.01135404, + "auxiliary_loss_mlp": 0.00814726, + "balance_loss_clip": 1.0543046, + "balance_loss_mlp": 1.02848375, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 2.2731088562404795, + "language_loss": 0.82065177, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84015298, + "num_input_tokens_seen": 57654250, + "step": 2663, + "time_per_iteration": 2.5595338344573975 + }, + { + "auxiliary_loss_clip": 0.01119153, + "auxiliary_loss_mlp": 0.01055359, + "balance_loss_clip": 1.05884743, + "balance_loss_mlp": 1.03582036, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.6667364800866964, + "language_loss": 0.79649299, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81823814, + "num_input_tokens_seen": 57672645, + "step": 2664, + "time_per_iteration": 2.596796751022339 + }, + { + "auxiliary_loss_clip": 0.01151995, + "auxiliary_loss_mlp": 0.010507, + "balance_loss_clip": 1.05370629, + "balance_loss_mlp": 1.03109038, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 2.0726836612525803, + "language_loss": 0.94360006, + "learning_rate": 3.824717628865561e-06, + "loss": 0.96562696, + "num_input_tokens_seen": 57691055, + "step": 2665, + "time_per_iteration": 3.9174795150756836 + }, + { + "auxiliary_loss_clip": 0.01124236, + "auxiliary_loss_mlp": 0.01048907, + "balance_loss_clip": 1.05014634, + "balance_loss_mlp": 1.02833188, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 2.0535486574164223, + "language_loss": 0.8491565, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87088788, + "num_input_tokens_seen": 57707235, + "step": 2666, + "time_per_iteration": 2.5604076385498047 + }, + { + "auxiliary_loss_clip": 0.01132116, + "auxiliary_loss_mlp": 0.00805964, + "balance_loss_clip": 1.05669987, + "balance_loss_mlp": 1.01958537, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 3.2321630582335192, + "language_loss": 0.81585872, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83523953, + "num_input_tokens_seen": 57724190, + "step": 2667, + "time_per_iteration": 2.5674827098846436 + }, + { + "auxiliary_loss_clip": 0.01160824, + "auxiliary_loss_mlp": 0.01052651, + "balance_loss_clip": 1.05561233, + "balance_loss_mlp": 1.03274298, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 1.8128773187372504, + "language_loss": 0.7402916, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76242638, + "num_input_tokens_seen": 57743620, + "step": 2668, + "time_per_iteration": 2.492417097091675 + }, + { + "auxiliary_loss_clip": 0.01147232, + "auxiliary_loss_mlp": 0.01049533, + "balance_loss_clip": 1.05566549, + "balance_loss_mlp": 1.02960134, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 2.045418817292727, + "language_loss": 0.77290076, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79486835, + "num_input_tokens_seen": 57764810, + "step": 2669, + "time_per_iteration": 2.5273842811584473 + }, + { + "auxiliary_loss_clip": 0.01078794, + "auxiliary_loss_mlp": 0.01053905, + "balance_loss_clip": 1.04636586, + "balance_loss_mlp": 1.05059052, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8082869695151813, + "language_loss": 0.55556238, + "learning_rate": 3.823919552578861e-06, + "loss": 0.5768894, + "num_input_tokens_seen": 57824390, + "step": 2670, + "time_per_iteration": 3.0069684982299805 + }, + { + "auxiliary_loss_clip": 0.01145902, + "auxiliary_loss_mlp": 0.01045369, + "balance_loss_clip": 1.05050063, + "balance_loss_mlp": 1.02577078, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 2.243360587984137, + "language_loss": 0.77546376, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79737651, + "num_input_tokens_seen": 57843665, + "step": 2671, + "time_per_iteration": 2.4860310554504395 + }, + { + "auxiliary_loss_clip": 0.01149399, + "auxiliary_loss_mlp": 0.01045514, + "balance_loss_clip": 1.05460429, + "balance_loss_mlp": 1.02607107, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 1.941381399036769, + "language_loss": 0.6496228, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.67157197, + "num_input_tokens_seen": 57863305, + "step": 2672, + "time_per_iteration": 2.5939865112304688 + }, + { + "auxiliary_loss_clip": 0.01145771, + "auxiliary_loss_mlp": 0.01044762, + "balance_loss_clip": 1.05370796, + "balance_loss_mlp": 1.02351892, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 2.064521066290296, + "language_loss": 0.85700703, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87891233, + "num_input_tokens_seen": 57883025, + "step": 2673, + "time_per_iteration": 2.478292942047119 + }, + { + "auxiliary_loss_clip": 0.01126271, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_clip": 1.05585647, + "balance_loss_mlp": 1.02972484, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.536638039839282, + "language_loss": 0.72950184, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75125176, + "num_input_tokens_seen": 57901430, + "step": 2674, + "time_per_iteration": 2.5571987628936768 + }, + { + "auxiliary_loss_clip": 0.01145474, + "auxiliary_loss_mlp": 0.01043662, + "balance_loss_clip": 1.04974127, + "balance_loss_mlp": 1.02212143, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 1.6752462353462072, + "language_loss": 0.84308785, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86497927, + "num_input_tokens_seen": 57919550, + "step": 2675, + "time_per_iteration": 2.4934885501861572 + }, + { + "auxiliary_loss_clip": 0.01112207, + "auxiliary_loss_mlp": 0.01051017, + "balance_loss_clip": 1.05016637, + "balance_loss_mlp": 1.02935624, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.964529401307919, + "language_loss": 0.82749355, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84912586, + "num_input_tokens_seen": 57939890, + "step": 2676, + "time_per_iteration": 2.703517198562622 + }, + { + "auxiliary_loss_clip": 0.01146499, + "auxiliary_loss_mlp": 0.01046412, + "balance_loss_clip": 1.05404162, + "balance_loss_mlp": 1.02838731, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1.913289631672295, + "language_loss": 0.73580086, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75773001, + "num_input_tokens_seen": 57957410, + "step": 2677, + "time_per_iteration": 2.4746367931365967 + }, + { + "auxiliary_loss_clip": 0.01136591, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.05234051, + "balance_loss_mlp": 1.02236903, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 1.922411149232369, + "language_loss": 0.76130587, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78309351, + "num_input_tokens_seen": 57977900, + "step": 2678, + "time_per_iteration": 2.642801523208618 + }, + { + "auxiliary_loss_clip": 0.01146102, + "auxiliary_loss_mlp": 0.01041986, + "balance_loss_clip": 1.05253959, + "balance_loss_mlp": 1.02177954, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 2.3976193499428806, + "language_loss": 0.70637459, + "learning_rate": 3.822478658490228e-06, + "loss": 0.72825545, + "num_input_tokens_seen": 57998210, + "step": 2679, + "time_per_iteration": 2.6104655265808105 + }, + { + "auxiliary_loss_clip": 0.0104153, + "auxiliary_loss_mlp": 0.01292318, + "balance_loss_clip": 1.03249192, + "balance_loss_mlp": 1.93311465, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.8203695036103317, + "language_loss": 0.51809406, + "learning_rate": 3.822318213523154e-06, + "loss": 0.54143256, + "num_input_tokens_seen": 58059420, + "step": 2680, + "time_per_iteration": 3.2261202335357666 + }, + { + "auxiliary_loss_clip": 0.01139868, + "auxiliary_loss_mlp": 0.01048515, + "balance_loss_clip": 1.05118573, + "balance_loss_mlp": 1.02677083, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 19.173513603471626, + "language_loss": 0.80556095, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82744479, + "num_input_tokens_seen": 58078370, + "step": 2681, + "time_per_iteration": 2.526144027709961 + }, + { + "auxiliary_loss_clip": 0.011337, + "auxiliary_loss_mlp": 0.01055404, + "balance_loss_clip": 1.05234659, + "balance_loss_mlp": 1.03596091, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.9641037547900773, + "language_loss": 0.68823719, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71012831, + "num_input_tokens_seen": 58097395, + "step": 2682, + "time_per_iteration": 2.590691089630127 + }, + { + "auxiliary_loss_clip": 0.01134463, + "auxiliary_loss_mlp": 0.01052011, + "balance_loss_clip": 1.05395436, + "balance_loss_mlp": 1.03147149, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.8904593025900922, + "language_loss": 0.87582201, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89768672, + "num_input_tokens_seen": 58115630, + "step": 2683, + "time_per_iteration": 2.5146102905273438 + }, + { + "auxiliary_loss_clip": 0.0115975, + "auxiliary_loss_mlp": 0.01050858, + "balance_loss_clip": 1.05319107, + "balance_loss_mlp": 1.03066361, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 2.7716169471614602, + "language_loss": 0.74349028, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76559633, + "num_input_tokens_seen": 58138655, + "step": 2684, + "time_per_iteration": 2.62308931350708 + }, + { + "auxiliary_loss_clip": 0.01135941, + "auxiliary_loss_mlp": 0.00904749, + "balance_loss_clip": 1.05666447, + "balance_loss_mlp": 1.20612693, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 2.0487747295492396, + "language_loss": 0.70849597, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72890282, + "num_input_tokens_seen": 58157440, + "step": 2685, + "time_per_iteration": 2.6421124935150146 + }, + { + "auxiliary_loss_clip": 0.01108729, + "auxiliary_loss_mlp": 0.01061225, + "balance_loss_clip": 1.04943681, + "balance_loss_mlp": 1.04029191, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 1.9709972730415162, + "language_loss": 0.71785879, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73955834, + "num_input_tokens_seen": 58176660, + "step": 2686, + "time_per_iteration": 2.628288984298706 + }, + { + "auxiliary_loss_clip": 0.01151475, + "auxiliary_loss_mlp": 0.01050293, + "balance_loss_clip": 1.0534507, + "balance_loss_mlp": 1.03005087, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 1.8580722792049957, + "language_loss": 0.82284951, + "learning_rate": 3.821193164224981e-06, + "loss": 0.84486723, + "num_input_tokens_seen": 58195085, + "step": 2687, + "time_per_iteration": 2.4869236946105957 + }, + { + "auxiliary_loss_clip": 0.01149124, + "auxiliary_loss_mlp": 0.01053927, + "balance_loss_clip": 1.04907477, + "balance_loss_mlp": 1.03201663, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.7549357476938325, + "language_loss": 0.71679032, + "learning_rate": 3.821032166608568e-06, + "loss": 0.73882091, + "num_input_tokens_seen": 58213540, + "step": 2688, + "time_per_iteration": 2.499110460281372 + }, + { + "auxiliary_loss_clip": 0.01118553, + "auxiliary_loss_mlp": 0.01048761, + "balance_loss_clip": 1.05201614, + "balance_loss_mlp": 1.02966356, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 1.5681496384520943, + "language_loss": 0.75843716, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78011024, + "num_input_tokens_seen": 58236995, + "step": 2689, + "time_per_iteration": 2.635648012161255 + }, + { + "auxiliary_loss_clip": 0.01161245, + "auxiliary_loss_mlp": 0.0105629, + "balance_loss_clip": 1.05544615, + "balance_loss_mlp": 1.03584576, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 1.8666053779539216, + "language_loss": 0.87753564, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89971095, + "num_input_tokens_seen": 58257230, + "step": 2690, + "time_per_iteration": 2.5588769912719727 + }, + { + "auxiliary_loss_clip": 0.01143548, + "auxiliary_loss_mlp": 0.01046307, + "balance_loss_clip": 1.05520725, + "balance_loss_mlp": 1.02830637, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.7010302015746661, + "language_loss": 0.88332486, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.90522337, + "num_input_tokens_seen": 58277080, + "step": 2691, + "time_per_iteration": 2.5193605422973633 + }, + { + "auxiliary_loss_clip": 0.01146197, + "auxiliary_loss_mlp": 0.01048051, + "balance_loss_clip": 1.05163872, + "balance_loss_mlp": 1.02658141, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.532509606446305, + "language_loss": 0.82574093, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84768337, + "num_input_tokens_seen": 58294815, + "step": 2692, + "time_per_iteration": 2.512941837310791 + }, + { + "auxiliary_loss_clip": 0.01162553, + "auxiliary_loss_mlp": 0.01054436, + "balance_loss_clip": 1.051404, + "balance_loss_mlp": 1.03391981, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 2.4320561661533247, + "language_loss": 0.81254935, + "learning_rate": 3.820226142842862e-06, + "loss": 0.8347193, + "num_input_tokens_seen": 58313215, + "step": 2693, + "time_per_iteration": 2.5429534912109375 + }, + { + "auxiliary_loss_clip": 0.01155519, + "auxiliary_loss_mlp": 0.01058818, + "balance_loss_clip": 1.05269861, + "balance_loss_mlp": 1.04014993, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.4476927854723711, + "language_loss": 0.83949208, + "learning_rate": 3.820064730995783e-06, + "loss": 0.86163551, + "num_input_tokens_seen": 58333215, + "step": 2694, + "time_per_iteration": 2.5380687713623047 + }, + { + "auxiliary_loss_clip": 0.01110841, + "auxiliary_loss_mlp": 0.01064812, + "balance_loss_clip": 1.04859245, + "balance_loss_mlp": 1.04368806, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 2.0655928621359414, + "language_loss": 0.68864882, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71040535, + "num_input_tokens_seen": 58351160, + "step": 2695, + "time_per_iteration": 2.578021764755249 + }, + { + "auxiliary_loss_clip": 0.01149614, + "auxiliary_loss_mlp": 0.01057184, + "balance_loss_clip": 1.05414295, + "balance_loss_mlp": 1.03648901, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.1456998644879106, + "language_loss": 0.82850838, + "learning_rate": 3.819741700256637e-06, + "loss": 0.8505764, + "num_input_tokens_seen": 58368505, + "step": 2696, + "time_per_iteration": 2.5328567028045654 + }, + { + "auxiliary_loss_clip": 0.01167228, + "auxiliary_loss_mlp": 0.0105814, + "balance_loss_clip": 1.05514598, + "balance_loss_mlp": 1.03682566, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 2.31476607850295, + "language_loss": 0.88740689, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90966058, + "num_input_tokens_seen": 58385085, + "step": 2697, + "time_per_iteration": 3.868121385574341 + }, + { + "auxiliary_loss_clip": 0.01152294, + "auxiliary_loss_mlp": 0.01047971, + "balance_loss_clip": 1.05025125, + "balance_loss_mlp": 1.02944541, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.457150840493522, + "language_loss": 0.80595386, + "learning_rate": 3.819418393498343e-06, + "loss": 0.82795656, + "num_input_tokens_seen": 58406985, + "step": 2698, + "time_per_iteration": 3.9901914596557617 + }, + { + "auxiliary_loss_clip": 0.01143422, + "auxiliary_loss_mlp": 0.01044197, + "balance_loss_clip": 1.05300224, + "balance_loss_mlp": 1.02493238, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.95575769639907, + "language_loss": 0.77431965, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79619586, + "num_input_tokens_seen": 58426205, + "step": 2699, + "time_per_iteration": 3.984302520751953 + }, + { + "auxiliary_loss_clip": 0.01133137, + "auxiliary_loss_mlp": 0.01040544, + "balance_loss_clip": 1.05134153, + "balance_loss_mlp": 1.02256763, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.172024885654685, + "language_loss": 0.85671866, + "learning_rate": 3.81909481076994e-06, + "loss": 0.8784554, + "num_input_tokens_seen": 58443830, + "step": 2700, + "time_per_iteration": 2.5627851486206055 + }, + { + "auxiliary_loss_clip": 0.01143, + "auxiliary_loss_mlp": 0.00950719, + "balance_loss_clip": 1.0490042, + "balance_loss_mlp": 1.28568542, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.5941704100801748, + "language_loss": 0.80655563, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82749283, + "num_input_tokens_seen": 58464405, + "step": 2701, + "time_per_iteration": 2.5487163066864014 + }, + { + "auxiliary_loss_clip": 0.01139772, + "auxiliary_loss_mlp": 0.010461, + "balance_loss_clip": 1.0538224, + "balance_loss_mlp": 1.02662086, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 2.2729532913807087, + "language_loss": 0.73288035, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75473905, + "num_input_tokens_seen": 58483295, + "step": 2702, + "time_per_iteration": 2.5764291286468506 + }, + { + "auxiliary_loss_clip": 0.01147615, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.05262816, + "balance_loss_mlp": 1.0274359, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 1.9882712654332582, + "language_loss": 0.73034507, + "learning_rate": 3.81860891934076e-06, + "loss": 0.75231016, + "num_input_tokens_seen": 58501205, + "step": 2703, + "time_per_iteration": 2.483491897583008 + }, + { + "auxiliary_loss_clip": 0.01158105, + "auxiliary_loss_mlp": 0.01049314, + "balance_loss_clip": 1.04997206, + "balance_loss_mlp": 1.02814245, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 2.0299743179780476, + "language_loss": 0.70864922, + "learning_rate": 3.818446817599176e-06, + "loss": 0.73072338, + "num_input_tokens_seen": 58522315, + "step": 2704, + "time_per_iteration": 3.8935294151306152 + }, + { + "auxiliary_loss_clip": 0.01028706, + "auxiliary_loss_mlp": 0.01036622, + "balance_loss_clip": 1.03337717, + "balance_loss_mlp": 1.03354633, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7859325215468879, + "language_loss": 0.53329688, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55395019, + "num_input_tokens_seen": 58586695, + "step": 2705, + "time_per_iteration": 3.1838834285736084 + }, + { + "auxiliary_loss_clip": 0.01133318, + "auxiliary_loss_mlp": 0.00917205, + "balance_loss_clip": 1.05283189, + "balance_loss_mlp": 1.21622825, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 6.433154380322071, + "language_loss": 0.75327158, + "learning_rate": 3.818122407255102e-06, + "loss": 0.77377683, + "num_input_tokens_seen": 58602435, + "step": 2706, + "time_per_iteration": 2.5242412090301514 + }, + { + "auxiliary_loss_clip": 0.01130941, + "auxiliary_loss_mlp": 0.0105163, + "balance_loss_clip": 1.05226767, + "balance_loss_mlp": 1.03231764, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 2.018028329783564, + "language_loss": 0.7244699, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74629557, + "num_input_tokens_seen": 58621275, + "step": 2707, + "time_per_iteration": 2.650559663772583 + }, + { + "auxiliary_loss_clip": 0.01143647, + "auxiliary_loss_mlp": 0.01052479, + "balance_loss_clip": 1.05551732, + "balance_loss_mlp": 1.03259492, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 12.305593870749805, + "language_loss": 0.83561224, + "learning_rate": 3.817797721137495e-06, + "loss": 0.85757351, + "num_input_tokens_seen": 58637550, + "step": 2708, + "time_per_iteration": 2.53810453414917 + }, + { + "auxiliary_loss_clip": 0.01098201, + "auxiliary_loss_mlp": 0.00896643, + "balance_loss_clip": 1.04948211, + "balance_loss_mlp": 1.1797936, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.5043668165943576, + "language_loss": 0.86525786, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88520634, + "num_input_tokens_seen": 58654135, + "step": 2709, + "time_per_iteration": 2.617201566696167 + }, + { + "auxiliary_loss_clip": 0.01136284, + "auxiliary_loss_mlp": 0.0088481, + "balance_loss_clip": 1.05112839, + "balance_loss_mlp": 1.16346121, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.664927471703942, + "language_loss": 0.91589355, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93610454, + "num_input_tokens_seen": 58674320, + "step": 2710, + "time_per_iteration": 2.5667223930358887 + }, + { + "auxiliary_loss_clip": 0.01114927, + "auxiliary_loss_mlp": 0.01056324, + "balance_loss_clip": 1.05328012, + "balance_loss_mlp": 1.03635621, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 2.5136879560345227, + "language_loss": 0.81871688, + "learning_rate": 3.817310174993453e-06, + "loss": 0.84042943, + "num_input_tokens_seen": 58691000, + "step": 2711, + "time_per_iteration": 2.5697169303894043 + }, + { + "auxiliary_loss_clip": 0.01142942, + "auxiliary_loss_mlp": 0.01044481, + "balance_loss_clip": 1.05013776, + "balance_loss_mlp": 1.02437019, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.8629389741387907, + "language_loss": 0.80807483, + "learning_rate": 3.817147521778719e-06, + "loss": 0.82994908, + "num_input_tokens_seen": 58710230, + "step": 2712, + "time_per_iteration": 2.5495386123657227 + }, + { + "auxiliary_loss_clip": 0.01166705, + "auxiliary_loss_mlp": 0.01051614, + "balance_loss_clip": 1.05663764, + "balance_loss_mlp": 1.03104997, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 2.2854744352140224, + "language_loss": 0.76789558, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79007876, + "num_input_tokens_seen": 58728610, + "step": 2713, + "time_per_iteration": 2.4847543239593506 + }, + { + "auxiliary_loss_clip": 0.01152401, + "auxiliary_loss_mlp": 0.01060568, + "balance_loss_clip": 1.05872989, + "balance_loss_mlp": 1.04069555, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.3405320815460158, + "language_loss": 0.7922554, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.81438506, + "num_input_tokens_seen": 58744385, + "step": 2714, + "time_per_iteration": 2.506089448928833 + }, + { + "auxiliary_loss_clip": 0.01146366, + "auxiliary_loss_mlp": 0.01057776, + "balance_loss_clip": 1.05278206, + "balance_loss_mlp": 1.03799951, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 1.6955057359460362, + "language_loss": 0.77986825, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80190957, + "num_input_tokens_seen": 58763905, + "step": 2715, + "time_per_iteration": 2.5383594036102295 + }, + { + "auxiliary_loss_clip": 0.01128552, + "auxiliary_loss_mlp": 0.01045344, + "balance_loss_clip": 1.04975367, + "balance_loss_mlp": 1.02666366, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.893480651727335, + "language_loss": 0.81707764, + "learning_rate": 3.816496219917336e-06, + "loss": 0.83881664, + "num_input_tokens_seen": 58785580, + "step": 2716, + "time_per_iteration": 2.589437246322632 + }, + { + "auxiliary_loss_clip": 0.0114013, + "auxiliary_loss_mlp": 0.0105566, + "balance_loss_clip": 1.05536127, + "balance_loss_mlp": 1.03681254, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 1.8542855053271434, + "language_loss": 0.85953832, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88149625, + "num_input_tokens_seen": 58806075, + "step": 2717, + "time_per_iteration": 2.5721468925476074 + }, + { + "auxiliary_loss_clip": 0.01134098, + "auxiliary_loss_mlp": 0.01049251, + "balance_loss_clip": 1.05409896, + "balance_loss_mlp": 1.03070211, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.8059476607240434, + "language_loss": 0.76089865, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78273207, + "num_input_tokens_seen": 58827405, + "step": 2718, + "time_per_iteration": 2.6224029064178467 + }, + { + "auxiliary_loss_clip": 0.01142386, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_clip": 1.05344653, + "balance_loss_mlp": 1.02741766, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 2.2112671199320864, + "language_loss": 0.73815763, + "learning_rate": 3.816007020241652e-06, + "loss": 0.76003963, + "num_input_tokens_seen": 58847205, + "step": 2719, + "time_per_iteration": 2.549989938735962 + }, + { + "auxiliary_loss_clip": 0.01129278, + "auxiliary_loss_mlp": 0.010507, + "balance_loss_clip": 1.05130601, + "balance_loss_mlp": 1.03073192, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.614564790886584, + "language_loss": 0.72302604, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74482584, + "num_input_tokens_seen": 58866865, + "step": 2720, + "time_per_iteration": 2.597231864929199 + }, + { + "auxiliary_loss_clip": 0.01105314, + "auxiliary_loss_mlp": 0.01051581, + "balance_loss_clip": 1.04918301, + "balance_loss_mlp": 1.02998054, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.2121347083315563, + "language_loss": 0.75938642, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.78095531, + "num_input_tokens_seen": 58885200, + "step": 2721, + "time_per_iteration": 2.5674898624420166 + }, + { + "auxiliary_loss_clip": 0.01109349, + "auxiliary_loss_mlp": 0.01059766, + "balance_loss_clip": 1.04759002, + "balance_loss_mlp": 1.03749776, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.693655638988614, + "language_loss": 0.79608893, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81778002, + "num_input_tokens_seen": 58906385, + "step": 2722, + "time_per_iteration": 2.6328768730163574 + }, + { + "auxiliary_loss_clip": 0.01146882, + "auxiliary_loss_mlp": 0.0085739, + "balance_loss_clip": 1.05432165, + "balance_loss_mlp": 1.10834622, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.1104889977473524, + "language_loss": 0.84645343, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86649621, + "num_input_tokens_seen": 58925040, + "step": 2723, + "time_per_iteration": 2.522733688354492 + }, + { + "auxiliary_loss_clip": 0.01105832, + "auxiliary_loss_mlp": 0.01040816, + "balance_loss_clip": 1.05046821, + "balance_loss_mlp": 1.02120614, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 2.0786211436900954, + "language_loss": 0.70985049, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73131692, + "num_input_tokens_seen": 58944790, + "step": 2724, + "time_per_iteration": 2.6469380855560303 + }, + { + "auxiliary_loss_clip": 0.01115712, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.05354619, + "balance_loss_mlp": 1.03049791, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.0033095433008135, + "language_loss": 0.71085691, + "learning_rate": 3.815026761751955e-06, + "loss": 0.73250365, + "num_input_tokens_seen": 58962500, + "step": 2725, + "time_per_iteration": 2.5601398944854736 + }, + { + "auxiliary_loss_clip": 0.01111557, + "auxiliary_loss_mlp": 0.01041789, + "balance_loss_clip": 1.05357718, + "balance_loss_mlp": 1.02275145, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.9595531103129147, + "language_loss": 0.88314617, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90467966, + "num_input_tokens_seen": 58980355, + "step": 2726, + "time_per_iteration": 2.555626392364502 + }, + { + "auxiliary_loss_clip": 0.01150561, + "auxiliary_loss_mlp": 0.01045766, + "balance_loss_clip": 1.05519652, + "balance_loss_mlp": 1.02632308, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 2.1930004731506196, + "language_loss": 0.73689544, + "learning_rate": 3.814699458247963e-06, + "loss": 0.75885874, + "num_input_tokens_seen": 58999505, + "step": 2727, + "time_per_iteration": 2.515079975128174 + }, + { + "auxiliary_loss_clip": 0.01144875, + "auxiliary_loss_mlp": 0.0104531, + "balance_loss_clip": 1.05317283, + "balance_loss_mlp": 1.02817988, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.5949431818501372, + "language_loss": 0.82782567, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84972751, + "num_input_tokens_seen": 59017930, + "step": 2728, + "time_per_iteration": 2.524932622909546 + }, + { + "auxiliary_loss_clip": 0.01151869, + "auxiliary_loss_mlp": 0.01047286, + "balance_loss_clip": 1.05423307, + "balance_loss_mlp": 1.02737808, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.48458424643167, + "language_loss": 0.8504076, + "learning_rate": 3.814371879489633e-06, + "loss": 0.87239915, + "num_input_tokens_seen": 59035130, + "step": 2729, + "time_per_iteration": 2.4818005561828613 + }, + { + "auxiliary_loss_clip": 0.01159237, + "auxiliary_loss_mlp": 0.01047934, + "balance_loss_clip": 1.05298805, + "balance_loss_mlp": 1.02976656, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 2.200805089311736, + "language_loss": 0.73442566, + "learning_rate": 3.814207986905616e-06, + "loss": 0.75649738, + "num_input_tokens_seen": 59053080, + "step": 2730, + "time_per_iteration": 2.4542348384857178 + }, + { + "auxiliary_loss_clip": 0.01136704, + "auxiliary_loss_mlp": 0.01051464, + "balance_loss_clip": 1.04773128, + "balance_loss_mlp": 1.03099561, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 2.401223118860667, + "language_loss": 0.74759614, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76947778, + "num_input_tokens_seen": 59075610, + "step": 2731, + "time_per_iteration": 2.719670057296753 + }, + { + "auxiliary_loss_clip": 0.01124269, + "auxiliary_loss_mlp": 0.01053074, + "balance_loss_clip": 1.05562377, + "balance_loss_mlp": 1.03280807, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.0724753605297694, + "language_loss": 0.79272354, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.81449693, + "num_input_tokens_seen": 59094555, + "step": 2732, + "time_per_iteration": 2.564490795135498 + }, + { + "auxiliary_loss_clip": 0.01137078, + "auxiliary_loss_mlp": 0.01047394, + "balance_loss_clip": 1.05202723, + "balance_loss_mlp": 1.02822483, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 1.8733094710945413, + "language_loss": 0.69387954, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71572423, + "num_input_tokens_seen": 59113515, + "step": 2733, + "time_per_iteration": 2.5508759021759033 + }, + { + "auxiliary_loss_clip": 0.01139445, + "auxiliary_loss_mlp": 0.01050231, + "balance_loss_clip": 1.05349338, + "balance_loss_mlp": 1.02829647, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 1.678468618487968, + "language_loss": 0.80700803, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.82890475, + "num_input_tokens_seen": 59133275, + "step": 2734, + "time_per_iteration": 2.5930395126342773 + }, + { + "auxiliary_loss_clip": 0.01130431, + "auxiliary_loss_mlp": 0.01061516, + "balance_loss_clip": 1.05277228, + "balance_loss_mlp": 1.04003453, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 2.5719655992304484, + "language_loss": 0.82100368, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84292316, + "num_input_tokens_seen": 59154095, + "step": 2735, + "time_per_iteration": 2.640289068222046 + }, + { + "auxiliary_loss_clip": 0.01075643, + "auxiliary_loss_mlp": 0.01044192, + "balance_loss_clip": 1.05443358, + "balance_loss_mlp": 1.02566671, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.7720807421999303, + "language_loss": 0.7813046, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80250287, + "num_input_tokens_seen": 59173795, + "step": 2736, + "time_per_iteration": 5.4762349128723145 + }, + { + "auxiliary_loss_clip": 0.01141235, + "auxiliary_loss_mlp": 0.01057864, + "balance_loss_clip": 1.05281842, + "balance_loss_mlp": 1.03896964, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.9880645836291604, + "language_loss": 0.81722987, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83922088, + "num_input_tokens_seen": 59191610, + "step": 2737, + "time_per_iteration": 2.5748472213745117 + }, + { + "auxiliary_loss_clip": 0.0114202, + "auxiliary_loss_mlp": 0.01061765, + "balance_loss_clip": 1.04953671, + "balance_loss_mlp": 1.04118967, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.703648875689689, + "language_loss": 0.87450475, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89654267, + "num_input_tokens_seen": 59213000, + "step": 2738, + "time_per_iteration": 3.930824041366577 + }, + { + "auxiliary_loss_clip": 0.01133666, + "auxiliary_loss_mlp": 0.01055938, + "balance_loss_clip": 1.05129695, + "balance_loss_mlp": 1.03600669, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 1.8475232512129576, + "language_loss": 0.72380507, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74570119, + "num_input_tokens_seen": 59232340, + "step": 2739, + "time_per_iteration": 2.562509298324585 + }, + { + "auxiliary_loss_clip": 0.01146177, + "auxiliary_loss_mlp": 0.01049913, + "balance_loss_clip": 1.05047858, + "balance_loss_mlp": 1.03033888, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 2.163373955545855, + "language_loss": 0.82031256, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.84227347, + "num_input_tokens_seen": 59253950, + "step": 2740, + "time_per_iteration": 2.6030588150024414 + }, + { + "auxiliary_loss_clip": 0.01108025, + "auxiliary_loss_mlp": 0.01071131, + "balance_loss_clip": 1.04780722, + "balance_loss_mlp": 1.04565573, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 1.954869785782303, + "language_loss": 0.68999875, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71179032, + "num_input_tokens_seen": 59275545, + "step": 2741, + "time_per_iteration": 2.7714107036590576 + }, + { + "auxiliary_loss_clip": 0.01160855, + "auxiliary_loss_mlp": 0.01046396, + "balance_loss_clip": 1.05374622, + "balance_loss_mlp": 1.02709639, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 1.925985116242982, + "language_loss": 0.79738945, + "learning_rate": 3.812235911671472e-06, + "loss": 0.81946194, + "num_input_tokens_seen": 59293480, + "step": 2742, + "time_per_iteration": 3.853848934173584 + }, + { + "auxiliary_loss_clip": 0.01135781, + "auxiliary_loss_mlp": 0.01055787, + "balance_loss_clip": 1.05246782, + "balance_loss_mlp": 1.03581953, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 3.47812481950133, + "language_loss": 0.84971619, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.87163186, + "num_input_tokens_seen": 59313435, + "step": 2743, + "time_per_iteration": 2.5474965572357178 + }, + { + "auxiliary_loss_clip": 0.01157272, + "auxiliary_loss_mlp": 0.01054161, + "balance_loss_clip": 1.05260515, + "balance_loss_mlp": 1.03405011, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.6094137354799132, + "language_loss": 0.85668492, + "learning_rate": 3.811906270092265e-06, + "loss": 0.8787992, + "num_input_tokens_seen": 59331535, + "step": 2744, + "time_per_iteration": 2.518165111541748 + }, + { + "auxiliary_loss_clip": 0.011294, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.05277145, + "balance_loss_mlp": 1.03540587, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.7944534838794501, + "language_loss": 0.82840991, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85024822, + "num_input_tokens_seen": 59350680, + "step": 2745, + "time_per_iteration": 2.5560920238494873 + }, + { + "auxiliary_loss_clip": 0.01130745, + "auxiliary_loss_mlp": 0.01057626, + "balance_loss_clip": 1.05787516, + "balance_loss_mlp": 1.03807592, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 1.9164788750900865, + "language_loss": 0.76553077, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.78741455, + "num_input_tokens_seen": 59367020, + "step": 2746, + "time_per_iteration": 2.560053825378418 + }, + { + "auxiliary_loss_clip": 0.01159431, + "auxiliary_loss_mlp": 0.01055617, + "balance_loss_clip": 1.05262733, + "balance_loss_mlp": 1.03542352, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.587969598327242, + "language_loss": 0.80787677, + "learning_rate": 3.811411292431592e-06, + "loss": 0.83002722, + "num_input_tokens_seen": 59386075, + "step": 2747, + "time_per_iteration": 2.4530060291290283 + }, + { + "auxiliary_loss_clip": 0.01151838, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_clip": 1.05556643, + "balance_loss_mlp": 1.02791572, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 2.107758699088543, + "language_loss": 0.70029032, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.72229028, + "num_input_tokens_seen": 59402690, + "step": 2748, + "time_per_iteration": 2.507517099380493 + }, + { + "auxiliary_loss_clip": 0.01167088, + "auxiliary_loss_mlp": 0.00842586, + "balance_loss_clip": 1.0583694, + "balance_loss_mlp": 1.08000112, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.303862665908159, + "language_loss": 0.88183355, + "learning_rate": 3.811080963869561e-06, + "loss": 0.90193033, + "num_input_tokens_seen": 59421130, + "step": 2749, + "time_per_iteration": 2.5047249794006348 + }, + { + "auxiliary_loss_clip": 0.01147185, + "auxiliary_loss_mlp": 0.01049129, + "balance_loss_clip": 1.05149901, + "balance_loss_mlp": 1.02889872, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 1.9479541715285893, + "language_loss": 0.78941536, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81137848, + "num_input_tokens_seen": 59438970, + "step": 2750, + "time_per_iteration": 2.488950252532959 + }, + { + "auxiliary_loss_clip": 0.01150154, + "auxiliary_loss_mlp": 0.01049764, + "balance_loss_clip": 1.05504632, + "balance_loss_mlp": 1.03015387, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.7251288368220443, + "language_loss": 0.94874507, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.97074425, + "num_input_tokens_seen": 59458510, + "step": 2751, + "time_per_iteration": 2.5006072521209717 + }, + { + "auxiliary_loss_clip": 0.01069651, + "auxiliary_loss_mlp": 0.01056015, + "balance_loss_clip": 1.05213952, + "balance_loss_mlp": 1.03590417, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 1.9117433547643519, + "language_loss": 0.70594549, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.72720212, + "num_input_tokens_seen": 59477110, + "step": 2752, + "time_per_iteration": 2.661979913711548 + }, + { + "auxiliary_loss_clip": 0.01066976, + "auxiliary_loss_mlp": 0.01016386, + "balance_loss_clip": 1.03223252, + "balance_loss_mlp": 1.01290512, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7815292572606064, + "language_loss": 0.54066873, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56150234, + "num_input_tokens_seen": 59541155, + "step": 2753, + "time_per_iteration": 3.227463722229004 + }, + { + "auxiliary_loss_clip": 0.01160053, + "auxiliary_loss_mlp": 0.00830464, + "balance_loss_clip": 1.05320168, + "balance_loss_mlp": 1.05913115, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.7259836614352193, + "language_loss": 0.75803649, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.7779417, + "num_input_tokens_seen": 59561155, + "step": 2754, + "time_per_iteration": 2.5059049129486084 + }, + { + "auxiliary_loss_clip": 0.01140449, + "auxiliary_loss_mlp": 0.01066673, + "balance_loss_clip": 1.05783117, + "balance_loss_mlp": 1.04321241, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 2.3473405878336604, + "language_loss": 0.86670291, + "learning_rate": 3.810088330151188e-06, + "loss": 0.8887741, + "num_input_tokens_seen": 59580460, + "step": 2755, + "time_per_iteration": 2.5629043579101562 + }, + { + "auxiliary_loss_clip": 0.01119381, + "auxiliary_loss_mlp": 0.01060666, + "balance_loss_clip": 1.04720521, + "balance_loss_mlp": 1.04103231, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.770263404644781, + "language_loss": 0.73287815, + "learning_rate": 3.80992265092595e-06, + "loss": 0.75467861, + "num_input_tokens_seen": 59600025, + "step": 2756, + "time_per_iteration": 2.604966878890991 + }, + { + "auxiliary_loss_clip": 0.01125309, + "auxiliary_loss_mlp": 0.01056156, + "balance_loss_clip": 1.05057299, + "balance_loss_mlp": 1.03591418, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.5955189974992747, + "language_loss": 0.74747181, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.76928639, + "num_input_tokens_seen": 59620600, + "step": 2757, + "time_per_iteration": 2.637022018432617 + }, + { + "auxiliary_loss_clip": 0.01140092, + "auxiliary_loss_mlp": 0.01051765, + "balance_loss_clip": 1.05561233, + "balance_loss_mlp": 1.03273916, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 1.558935458985494, + "language_loss": 0.84743607, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86935461, + "num_input_tokens_seen": 59641385, + "step": 2758, + "time_per_iteration": 2.5889735221862793 + }, + { + "auxiliary_loss_clip": 0.011641, + "auxiliary_loss_mlp": 0.01064298, + "balance_loss_clip": 1.05810249, + "balance_loss_mlp": 1.04582095, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 2.221960236237499, + "language_loss": 0.79371798, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81600201, + "num_input_tokens_seen": 59659865, + "step": 2759, + "time_per_iteration": 2.4944968223571777 + }, + { + "auxiliary_loss_clip": 0.01102583, + "auxiliary_loss_mlp": 0.01059153, + "balance_loss_clip": 1.05547428, + "balance_loss_mlp": 1.03892279, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 2.4007074875532606, + "language_loss": 0.75360668, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77522403, + "num_input_tokens_seen": 59678780, + "step": 2760, + "time_per_iteration": 2.614659547805786 + }, + { + "auxiliary_loss_clip": 0.01116559, + "auxiliary_loss_mlp": 0.01045098, + "balance_loss_clip": 1.05282128, + "balance_loss_mlp": 1.02582169, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 2.052881874590094, + "language_loss": 0.73143721, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75305378, + "num_input_tokens_seen": 59698795, + "step": 2761, + "time_per_iteration": 2.6424593925476074 + }, + { + "auxiliary_loss_clip": 0.01133773, + "auxiliary_loss_mlp": 0.01052576, + "balance_loss_clip": 1.05281854, + "balance_loss_mlp": 1.03325224, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 1.9152253881458319, + "language_loss": 0.88713109, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.90899462, + "num_input_tokens_seen": 59718795, + "step": 2762, + "time_per_iteration": 2.56805682182312 + }, + { + "auxiliary_loss_clip": 0.0111623, + "auxiliary_loss_mlp": 0.01053285, + "balance_loss_clip": 1.05409181, + "balance_loss_mlp": 1.03363895, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.6552529489392105, + "language_loss": 0.88188791, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90358317, + "num_input_tokens_seen": 59737555, + "step": 2763, + "time_per_iteration": 2.6354095935821533 + }, + { + "auxiliary_loss_clip": 0.01068965, + "auxiliary_loss_mlp": 0.01010918, + "balance_loss_clip": 1.02821684, + "balance_loss_mlp": 1.00746059, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7833965595945858, + "language_loss": 0.59812993, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61892879, + "num_input_tokens_seen": 59800915, + "step": 2764, + "time_per_iteration": 3.097243070602417 + }, + { + "auxiliary_loss_clip": 0.01153564, + "auxiliary_loss_mlp": 0.01055013, + "balance_loss_clip": 1.05557036, + "balance_loss_mlp": 1.03322196, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 2.5324096391724056, + "language_loss": 0.82291096, + "learning_rate": 3.808428450193401e-06, + "loss": 0.84499675, + "num_input_tokens_seen": 59822910, + "step": 2765, + "time_per_iteration": 2.5553781986236572 + }, + { + "auxiliary_loss_clip": 0.01170687, + "auxiliary_loss_mlp": 0.0104995, + "balance_loss_clip": 1.05783081, + "balance_loss_mlp": 1.02845681, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 6.920890050147014, + "language_loss": 0.70718098, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72938728, + "num_input_tokens_seen": 59838805, + "step": 2766, + "time_per_iteration": 2.4572510719299316 + }, + { + "auxiliary_loss_clip": 0.01152187, + "auxiliary_loss_mlp": 0.01044869, + "balance_loss_clip": 1.05945277, + "balance_loss_mlp": 1.02618921, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.302468328561425, + "language_loss": 0.88861644, + "learning_rate": 3.808095651090769e-06, + "loss": 0.91058707, + "num_input_tokens_seen": 59855345, + "step": 2767, + "time_per_iteration": 2.484997034072876 + }, + { + "auxiliary_loss_clip": 0.01055447, + "auxiliary_loss_mlp": 0.01000249, + "balance_loss_clip": 1.02626586, + "balance_loss_mlp": 0.99674428, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.6430342034551394, + "language_loss": 0.52914703, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54970396, + "num_input_tokens_seen": 59917710, + "step": 2768, + "time_per_iteration": 3.228600263595581 + }, + { + "auxiliary_loss_clip": 0.01138346, + "auxiliary_loss_mlp": 0.01049124, + "balance_loss_clip": 1.05658722, + "balance_loss_mlp": 1.0286324, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 4.971555357494953, + "language_loss": 0.84875786, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87063265, + "num_input_tokens_seen": 59935105, + "step": 2769, + "time_per_iteration": 2.5082907676696777 + }, + { + "auxiliary_loss_clip": 0.01048094, + "auxiliary_loss_mlp": 0.01007213, + "balance_loss_clip": 1.02633929, + "balance_loss_mlp": 1.00389886, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.808535131728097, + "language_loss": 0.57394886, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59450191, + "num_input_tokens_seen": 59984085, + "step": 2770, + "time_per_iteration": 2.990462303161621 + }, + { + "auxiliary_loss_clip": 0.01044578, + "auxiliary_loss_mlp": 0.01016288, + "balance_loss_clip": 1.03779888, + "balance_loss_mlp": 1.0131644, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8658153599191056, + "language_loss": 0.56229889, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58290756, + "num_input_tokens_seen": 60043470, + "step": 2771, + "time_per_iteration": 3.000206708908081 + }, + { + "auxiliary_loss_clip": 0.01113926, + "auxiliary_loss_mlp": 0.01057924, + "balance_loss_clip": 1.05184734, + "balance_loss_mlp": 1.03655005, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.31771319077267, + "language_loss": 0.70174706, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72346556, + "num_input_tokens_seen": 60063045, + "step": 2772, + "time_per_iteration": 2.6458218097686768 + }, + { + "auxiliary_loss_clip": 0.01146642, + "auxiliary_loss_mlp": 0.01051476, + "balance_loss_clip": 1.05087829, + "balance_loss_mlp": 1.03088856, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 3.553758573174903, + "language_loss": 0.86218131, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88416255, + "num_input_tokens_seen": 60081945, + "step": 2773, + "time_per_iteration": 2.556415557861328 + }, + { + "auxiliary_loss_clip": 0.0109972, + "auxiliary_loss_mlp": 0.01048147, + "balance_loss_clip": 1.0508914, + "balance_loss_mlp": 1.02863228, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.3103319451787088, + "language_loss": 0.82341361, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84489226, + "num_input_tokens_seen": 60096820, + "step": 2774, + "time_per_iteration": 5.434317111968994 + }, + { + "auxiliary_loss_clip": 0.01121414, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_clip": 1.05211473, + "balance_loss_mlp": 1.02733469, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.259361592647879, + "language_loss": 0.83034939, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85204631, + "num_input_tokens_seen": 60116140, + "step": 2775, + "time_per_iteration": 2.5777127742767334 + }, + { + "auxiliary_loss_clip": 0.01146371, + "auxiliary_loss_mlp": 0.01053117, + "balance_loss_clip": 1.05434072, + "balance_loss_mlp": 1.0349257, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.9432640804092056, + "language_loss": 0.80774808, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82974303, + "num_input_tokens_seen": 60134235, + "step": 2776, + "time_per_iteration": 2.4948902130126953 + }, + { + "auxiliary_loss_clip": 0.01137109, + "auxiliary_loss_mlp": 0.01055926, + "balance_loss_clip": 1.05468833, + "balance_loss_mlp": 1.03642368, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 2.0741148090418626, + "language_loss": 0.79971433, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.8216446, + "num_input_tokens_seen": 60153275, + "step": 2777, + "time_per_iteration": 3.995971441268921 + }, + { + "auxiliary_loss_clip": 0.01147476, + "auxiliary_loss_mlp": 0.01051888, + "balance_loss_clip": 1.05179811, + "balance_loss_mlp": 1.03181338, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.7653427400063066, + "language_loss": 0.85309923, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87509286, + "num_input_tokens_seen": 60173215, + "step": 2778, + "time_per_iteration": 2.545367956161499 + }, + { + "auxiliary_loss_clip": 0.01134148, + "auxiliary_loss_mlp": 0.01044373, + "balance_loss_clip": 1.05365181, + "balance_loss_mlp": 1.02503705, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 1.9797931050808406, + "language_loss": 0.74465191, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76643711, + "num_input_tokens_seen": 60190515, + "step": 2779, + "time_per_iteration": 2.5579113960266113 + }, + { + "auxiliary_loss_clip": 0.01117804, + "auxiliary_loss_mlp": 0.00818864, + "balance_loss_clip": 1.05177867, + "balance_loss_mlp": 1.03772378, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.102113771708161, + "language_loss": 0.65222681, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67159349, + "num_input_tokens_seen": 60211655, + "step": 2780, + "time_per_iteration": 2.6560912132263184 + }, + { + "auxiliary_loss_clip": 0.01124293, + "auxiliary_loss_mlp": 0.01047569, + "balance_loss_clip": 1.05295038, + "balance_loss_mlp": 1.02662373, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.0755797024539584, + "language_loss": 0.78413785, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80585647, + "num_input_tokens_seen": 60230860, + "step": 2781, + "time_per_iteration": 3.9903244972229004 + }, + { + "auxiliary_loss_clip": 0.01097083, + "auxiliary_loss_mlp": 0.01052436, + "balance_loss_clip": 1.04681349, + "balance_loss_mlp": 1.03207493, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.857862100382753, + "language_loss": 0.75287533, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77437055, + "num_input_tokens_seen": 60250535, + "step": 2782, + "time_per_iteration": 2.623544931411743 + }, + { + "auxiliary_loss_clip": 0.01134296, + "auxiliary_loss_mlp": 0.01052835, + "balance_loss_clip": 1.05419338, + "balance_loss_mlp": 1.03165174, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 3.042160868309584, + "language_loss": 0.6727193, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.69459057, + "num_input_tokens_seen": 60269530, + "step": 2783, + "time_per_iteration": 2.6395814418792725 + }, + { + "auxiliary_loss_clip": 0.01158213, + "auxiliary_loss_mlp": 0.01054921, + "balance_loss_clip": 1.05325818, + "balance_loss_mlp": 1.03605044, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 1.6621122398479173, + "language_loss": 0.70134687, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72347814, + "num_input_tokens_seen": 60289900, + "step": 2784, + "time_per_iteration": 2.4844400882720947 + }, + { + "auxiliary_loss_clip": 0.01137966, + "auxiliary_loss_mlp": 0.0106009, + "balance_loss_clip": 1.05268896, + "balance_loss_mlp": 1.03878713, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.9849786131432834, + "language_loss": 0.60861927, + "learning_rate": 3.805088123868126e-06, + "loss": 0.63059986, + "num_input_tokens_seen": 60310025, + "step": 2785, + "time_per_iteration": 2.606717348098755 + }, + { + "auxiliary_loss_clip": 0.01048988, + "auxiliary_loss_mlp": 0.01042785, + "balance_loss_clip": 1.02979243, + "balance_loss_mlp": 1.03944707, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.7999354336654856, + "language_loss": 0.58763623, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60855401, + "num_input_tokens_seen": 60377800, + "step": 2786, + "time_per_iteration": 3.1775310039520264 + }, + { + "auxiliary_loss_clip": 0.01143264, + "auxiliary_loss_mlp": 0.01050149, + "balance_loss_clip": 1.05446756, + "balance_loss_mlp": 1.03014541, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 2.4271606608275627, + "language_loss": 0.75829065, + "learning_rate": 3.80475258451721e-06, + "loss": 0.7802248, + "num_input_tokens_seen": 60398215, + "step": 2787, + "time_per_iteration": 2.56078839302063 + }, + { + "auxiliary_loss_clip": 0.01149544, + "auxiliary_loss_mlp": 0.01043303, + "balance_loss_clip": 1.05504918, + "balance_loss_mlp": 1.02442026, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 1.7878398542518648, + "language_loss": 0.77468491, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79661345, + "num_input_tokens_seen": 60416910, + "step": 2788, + "time_per_iteration": 2.556183338165283 + }, + { + "auxiliary_loss_clip": 0.01047736, + "auxiliary_loss_mlp": 0.01012217, + "balance_loss_clip": 1.02857876, + "balance_loss_mlp": 1.00909352, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.8608655528684915, + "language_loss": 0.59292656, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61352611, + "num_input_tokens_seen": 60468660, + "step": 2789, + "time_per_iteration": 2.9948930740356445 + }, + { + "auxiliary_loss_clip": 0.0114994, + "auxiliary_loss_mlp": 0.01055117, + "balance_loss_clip": 1.05471325, + "balance_loss_mlp": 1.03555465, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.4226874175524844, + "language_loss": 0.7017808, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72383142, + "num_input_tokens_seen": 60492370, + "step": 2790, + "time_per_iteration": 2.683670997619629 + }, + { + "auxiliary_loss_clip": 0.01129467, + "auxiliary_loss_mlp": 0.01053769, + "balance_loss_clip": 1.05511653, + "balance_loss_mlp": 1.03480268, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.8049917860175317, + "language_loss": 0.79625928, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81809163, + "num_input_tokens_seen": 60512655, + "step": 2791, + "time_per_iteration": 2.5963921546936035 + }, + { + "auxiliary_loss_clip": 0.01127295, + "auxiliary_loss_mlp": 0.01048254, + "balance_loss_clip": 1.05591583, + "balance_loss_mlp": 1.02754712, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.878703192623841, + "language_loss": 0.70956194, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73131746, + "num_input_tokens_seen": 60533090, + "step": 2792, + "time_per_iteration": 2.639566659927368 + }, + { + "auxiliary_loss_clip": 0.01133869, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.05612993, + "balance_loss_mlp": 1.02598786, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 2.0135349324887746, + "language_loss": 0.7139616, + "learning_rate": 3.803744324194691e-06, + "loss": 0.73575604, + "num_input_tokens_seen": 60553190, + "step": 2793, + "time_per_iteration": 2.578856945037842 + }, + { + "auxiliary_loss_clip": 0.01148071, + "auxiliary_loss_mlp": 0.01055871, + "balance_loss_clip": 1.05815482, + "balance_loss_mlp": 1.03581977, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 3.7339494185166417, + "language_loss": 0.77306449, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79510397, + "num_input_tokens_seen": 60571995, + "step": 2794, + "time_per_iteration": 2.518659830093384 + }, + { + "auxiliary_loss_clip": 0.01142254, + "auxiliary_loss_mlp": 0.01058243, + "balance_loss_clip": 1.05457461, + "balance_loss_mlp": 1.03868043, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 3.0862546171751597, + "language_loss": 0.71866846, + "learning_rate": 3.803407690167187e-06, + "loss": 0.74067342, + "num_input_tokens_seen": 60591275, + "step": 2795, + "time_per_iteration": 2.581310272216797 + }, + { + "auxiliary_loss_clip": 0.01137226, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.05232096, + "balance_loss_mlp": 1.02258492, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 2.8185065503971565, + "language_loss": 0.84333503, + "learning_rate": 3.803239270572142e-06, + "loss": 0.86512411, + "num_input_tokens_seen": 60609235, + "step": 2796, + "time_per_iteration": 2.5261318683624268 + }, + { + "auxiliary_loss_clip": 0.01103091, + "auxiliary_loss_mlp": 0.01057202, + "balance_loss_clip": 1.04967391, + "balance_loss_mlp": 1.03647184, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 2.094683133649052, + "language_loss": 0.81720412, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83880711, + "num_input_tokens_seen": 60629880, + "step": 2797, + "time_per_iteration": 2.6530420780181885 + }, + { + "auxiliary_loss_clip": 0.01144697, + "auxiliary_loss_mlp": 0.01052443, + "balance_loss_clip": 1.05383873, + "balance_loss_mlp": 1.03475273, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.433874522870038, + "language_loss": 0.75096226, + "learning_rate": 3.802902226251401e-06, + "loss": 0.77293366, + "num_input_tokens_seen": 60651175, + "step": 2798, + "time_per_iteration": 2.575908899307251 + }, + { + "auxiliary_loss_clip": 0.011616, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_clip": 1.05643892, + "balance_loss_mlp": 1.03566754, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.4903272763580981, + "language_loss": 0.79860234, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.82076061, + "num_input_tokens_seen": 60670210, + "step": 2799, + "time_per_iteration": 2.484650135040283 + }, + { + "auxiliary_loss_clip": 0.01078325, + "auxiliary_loss_mlp": 0.01049711, + "balance_loss_clip": 1.04935384, + "balance_loss_mlp": 1.02811027, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.067409789230319, + "language_loss": 0.7043817, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.72566199, + "num_input_tokens_seen": 60690895, + "step": 2800, + "time_per_iteration": 2.7426886558532715 + }, + { + "auxiliary_loss_clip": 0.01116848, + "auxiliary_loss_mlp": 0.00815912, + "balance_loss_clip": 1.05614543, + "balance_loss_mlp": 1.03632808, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.9589669041468236, + "language_loss": 0.83796978, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.8572973, + "num_input_tokens_seen": 60708280, + "step": 2801, + "time_per_iteration": 2.572870969772339 + }, + { + "auxiliary_loss_clip": 0.01132135, + "auxiliary_loss_mlp": 0.01052263, + "balance_loss_clip": 1.0545789, + "balance_loss_mlp": 1.03270042, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 4.35534201899318, + "language_loss": 0.83268082, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.85452479, + "num_input_tokens_seen": 60724150, + "step": 2802, + "time_per_iteration": 2.5374155044555664 + }, + { + "auxiliary_loss_clip": 0.0114978, + "auxiliary_loss_mlp": 0.01050041, + "balance_loss_clip": 1.0542295, + "balance_loss_mlp": 1.03076482, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.878996672053155, + "language_loss": 0.80818331, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83018148, + "num_input_tokens_seen": 60746485, + "step": 2803, + "time_per_iteration": 2.5968077182769775 + }, + { + "auxiliary_loss_clip": 0.01149443, + "auxiliary_loss_mlp": 0.01051143, + "balance_loss_clip": 1.05551386, + "balance_loss_mlp": 1.03199804, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 4.269622778214664, + "language_loss": 0.76563495, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78764075, + "num_input_tokens_seen": 60762875, + "step": 2804, + "time_per_iteration": 2.6066489219665527 + }, + { + "auxiliary_loss_clip": 0.01032891, + "auxiliary_loss_mlp": 0.01024183, + "balance_loss_clip": 1.02185345, + "balance_loss_mlp": 1.02048743, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.841192131715085, + "language_loss": 0.5541274, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57469809, + "num_input_tokens_seen": 60825510, + "step": 2805, + "time_per_iteration": 3.1317827701568604 + }, + { + "auxiliary_loss_clip": 0.01140098, + "auxiliary_loss_mlp": 0.01042308, + "balance_loss_clip": 1.04931712, + "balance_loss_mlp": 1.02430749, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 1.8589829084899674, + "language_loss": 0.72706515, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.74888921, + "num_input_tokens_seen": 60844440, + "step": 2806, + "time_per_iteration": 2.5054423809051514 + }, + { + "auxiliary_loss_clip": 0.01116756, + "auxiliary_loss_mlp": 0.01050378, + "balance_loss_clip": 1.05362415, + "balance_loss_mlp": 1.03114986, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 2.709972491241616, + "language_loss": 0.69998997, + "learning_rate": 3.80138214341862e-06, + "loss": 0.72166133, + "num_input_tokens_seen": 60863210, + "step": 2807, + "time_per_iteration": 2.558473587036133 + }, + { + "auxiliary_loss_clip": 0.01133727, + "auxiliary_loss_mlp": 0.01054799, + "balance_loss_clip": 1.05029583, + "balance_loss_mlp": 1.03434253, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 2.379129622492227, + "language_loss": 0.70180202, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72368729, + "num_input_tokens_seen": 60882510, + "step": 2808, + "time_per_iteration": 2.5451200008392334 + }, + { + "auxiliary_loss_clip": 0.01122449, + "auxiliary_loss_mlp": 0.01055377, + "balance_loss_clip": 1.05403125, + "balance_loss_mlp": 1.03463459, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.8240274502934346, + "language_loss": 0.8008033, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.82258153, + "num_input_tokens_seen": 60901105, + "step": 2809, + "time_per_iteration": 2.5653305053710938 + }, + { + "auxiliary_loss_clip": 0.01154969, + "auxiliary_loss_mlp": 0.01051801, + "balance_loss_clip": 1.05429101, + "balance_loss_mlp": 1.03225076, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.673546072027647, + "language_loss": 0.88939863, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.91146636, + "num_input_tokens_seen": 60915340, + "step": 2810, + "time_per_iteration": 2.479069948196411 + }, + { + "auxiliary_loss_clip": 0.01155447, + "auxiliary_loss_mlp": 0.01059385, + "balance_loss_clip": 1.0583353, + "balance_loss_mlp": 1.03914332, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.1569316294488874, + "language_loss": 0.92818463, + "learning_rate": 3.800704774747416e-06, + "loss": 0.950333, + "num_input_tokens_seen": 60933735, + "step": 2811, + "time_per_iteration": 2.495182514190674 + }, + { + "auxiliary_loss_clip": 0.01143532, + "auxiliary_loss_mlp": 0.01058253, + "balance_loss_clip": 1.05532682, + "balance_loss_mlp": 1.03947771, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 2.058008745810162, + "language_loss": 0.78869855, + "learning_rate": 3.800535261856291e-06, + "loss": 0.81071639, + "num_input_tokens_seen": 60953105, + "step": 2812, + "time_per_iteration": 2.5183701515197754 + }, + { + "auxiliary_loss_clip": 0.01148615, + "auxiliary_loss_mlp": 0.01053824, + "balance_loss_clip": 1.05564356, + "balance_loss_mlp": 1.03572762, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.5787391742812424, + "language_loss": 0.74925363, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.77127802, + "num_input_tokens_seen": 60969150, + "step": 2813, + "time_per_iteration": 5.329814910888672 + }, + { + "auxiliary_loss_clip": 0.01135406, + "auxiliary_loss_mlp": 0.01054144, + "balance_loss_clip": 1.05116761, + "balance_loss_mlp": 1.03453445, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.7575196087928, + "language_loss": 0.68377042, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.70566595, + "num_input_tokens_seen": 60982825, + "step": 2814, + "time_per_iteration": 2.5003573894500732 + }, + { + "auxiliary_loss_clip": 0.01162158, + "auxiliary_loss_mlp": 0.01047419, + "balance_loss_clip": 1.05583191, + "balance_loss_mlp": 1.02790463, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 2.093891283805485, + "language_loss": 0.62375343, + "learning_rate": 3.800026313549776e-06, + "loss": 0.64584911, + "num_input_tokens_seen": 61000875, + "step": 2815, + "time_per_iteration": 2.4901442527770996 + }, + { + "auxiliary_loss_clip": 0.01130296, + "auxiliary_loss_mlp": 0.01049339, + "balance_loss_clip": 1.04966307, + "balance_loss_mlp": 1.02970529, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.7606678870380874, + "language_loss": 0.82603586, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84783226, + "num_input_tokens_seen": 61021940, + "step": 2816, + "time_per_iteration": 3.9851162433624268 + }, + { + "auxiliary_loss_clip": 0.01135926, + "auxiliary_loss_mlp": 0.01052109, + "balance_loss_clip": 1.05540931, + "balance_loss_mlp": 1.03303576, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.0481528165656164, + "language_loss": 0.87482119, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89670157, + "num_input_tokens_seen": 61040285, + "step": 2817, + "time_per_iteration": 2.5469205379486084 + }, + { + "auxiliary_loss_clip": 0.01136878, + "auxiliary_loss_mlp": 0.01058703, + "balance_loss_clip": 1.05142415, + "balance_loss_mlp": 1.0388906, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.6768906303230233, + "language_loss": 0.81476372, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83671951, + "num_input_tokens_seen": 61059020, + "step": 2818, + "time_per_iteration": 2.533008098602295 + }, + { + "auxiliary_loss_clip": 0.01157952, + "auxiliary_loss_mlp": 0.01055409, + "balance_loss_clip": 1.05231798, + "balance_loss_mlp": 1.03591847, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 4.918806692755507, + "language_loss": 0.81568801, + "learning_rate": 3.799346760237336e-06, + "loss": 0.83782166, + "num_input_tokens_seen": 61074245, + "step": 2819, + "time_per_iteration": 2.439852476119995 + }, + { + "auxiliary_loss_clip": 0.01041122, + "auxiliary_loss_mlp": 0.01015026, + "balance_loss_clip": 1.01840436, + "balance_loss_mlp": 1.01229656, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9584793877451202, + "language_loss": 0.61029911, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63086057, + "num_input_tokens_seen": 61127080, + "step": 2820, + "time_per_iteration": 4.429564476013184 + }, + { + "auxiliary_loss_clip": 0.01125681, + "auxiliary_loss_mlp": 0.010565, + "balance_loss_clip": 1.0513742, + "balance_loss_mlp": 1.03662801, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 1.8761655187640343, + "language_loss": 0.78699982, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.80882168, + "num_input_tokens_seen": 61146955, + "step": 2821, + "time_per_iteration": 2.616596221923828 + }, + { + "auxiliary_loss_clip": 0.01138749, + "auxiliary_loss_mlp": 0.01058574, + "balance_loss_clip": 1.05174422, + "balance_loss_mlp": 1.03764069, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 1.9886440315890659, + "language_loss": 0.78717947, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80915272, + "num_input_tokens_seen": 61166605, + "step": 2822, + "time_per_iteration": 2.5253169536590576 + }, + { + "auxiliary_loss_clip": 0.01141731, + "auxiliary_loss_mlp": 0.00815681, + "balance_loss_clip": 1.05007362, + "balance_loss_mlp": 1.03610158, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.900436378881157, + "language_loss": 0.75207156, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.77164567, + "num_input_tokens_seen": 61186535, + "step": 2823, + "time_per_iteration": 2.5225589275360107 + }, + { + "auxiliary_loss_clip": 0.01131181, + "auxiliary_loss_mlp": 0.01056333, + "balance_loss_clip": 1.05307794, + "balance_loss_mlp": 1.03648424, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 3.176394596317063, + "language_loss": 0.60262775, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.6245029, + "num_input_tokens_seen": 61208965, + "step": 2824, + "time_per_iteration": 2.659937620162964 + }, + { + "auxiliary_loss_clip": 0.01141142, + "auxiliary_loss_mlp": 0.01045696, + "balance_loss_clip": 1.05468094, + "balance_loss_mlp": 1.02564538, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 1.922048086476283, + "language_loss": 0.73146099, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75332934, + "num_input_tokens_seen": 61230670, + "step": 2825, + "time_per_iteration": 2.626640796661377 + }, + { + "auxiliary_loss_clip": 0.01160219, + "auxiliary_loss_mlp": 0.01053899, + "balance_loss_clip": 1.05124629, + "balance_loss_mlp": 1.03236961, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 2.256685349965444, + "language_loss": 0.85578978, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87793094, + "num_input_tokens_seen": 61249510, + "step": 2826, + "time_per_iteration": 2.4618210792541504 + }, + { + "auxiliary_loss_clip": 0.01139968, + "auxiliary_loss_mlp": 0.01051969, + "balance_loss_clip": 1.05083585, + "balance_loss_mlp": 1.03188169, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 1.6772071811374327, + "language_loss": 0.82641202, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84833133, + "num_input_tokens_seen": 61269440, + "step": 2827, + "time_per_iteration": 2.532891273498535 + }, + { + "auxiliary_loss_clip": 0.01130923, + "auxiliary_loss_mlp": 0.01048404, + "balance_loss_clip": 1.05160379, + "balance_loss_mlp": 1.02763808, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 1.9489119121400609, + "language_loss": 0.73699236, + "learning_rate": 3.797813774376267e-06, + "loss": 0.75878561, + "num_input_tokens_seen": 61288195, + "step": 2828, + "time_per_iteration": 2.541637897491455 + }, + { + "auxiliary_loss_clip": 0.01046746, + "auxiliary_loss_mlp": 0.01012118, + "balance_loss_clip": 1.03354216, + "balance_loss_mlp": 1.00924516, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.7692616405283027, + "language_loss": 0.56463939, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58522803, + "num_input_tokens_seen": 61350850, + "step": 2829, + "time_per_iteration": 3.238046169281006 + }, + { + "auxiliary_loss_clip": 0.01116048, + "auxiliary_loss_mlp": 0.01058123, + "balance_loss_clip": 1.0470506, + "balance_loss_mlp": 1.03696346, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 2.1169824269108406, + "language_loss": 0.83226085, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85400259, + "num_input_tokens_seen": 61370765, + "step": 2830, + "time_per_iteration": 2.596724033355713 + }, + { + "auxiliary_loss_clip": 0.01122765, + "auxiliary_loss_mlp": 0.01046039, + "balance_loss_clip": 1.04926872, + "balance_loss_mlp": 1.025177, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.131226399170026, + "language_loss": 0.78473425, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80642235, + "num_input_tokens_seen": 61388935, + "step": 2831, + "time_per_iteration": 2.6141417026519775 + }, + { + "auxiliary_loss_clip": 0.01121098, + "auxiliary_loss_mlp": 0.01050652, + "balance_loss_clip": 1.04808974, + "balance_loss_mlp": 1.03020775, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 1.8909029481621262, + "language_loss": 0.79606998, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81778753, + "num_input_tokens_seen": 61407350, + "step": 2832, + "time_per_iteration": 2.549471616744995 + }, + { + "auxiliary_loss_clip": 0.01127174, + "auxiliary_loss_mlp": 0.01049976, + "balance_loss_clip": 1.05005372, + "balance_loss_mlp": 1.03098631, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.61894353972642, + "language_loss": 0.88773465, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.9095062, + "num_input_tokens_seen": 61429010, + "step": 2833, + "time_per_iteration": 2.5802981853485107 + }, + { + "auxiliary_loss_clip": 0.01156881, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.05127704, + "balance_loss_mlp": 1.02648854, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 1.9953578583775617, + "language_loss": 0.72482467, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74684489, + "num_input_tokens_seen": 61450040, + "step": 2834, + "time_per_iteration": 2.61782169342041 + }, + { + "auxiliary_loss_clip": 0.01116645, + "auxiliary_loss_mlp": 0.01053443, + "balance_loss_clip": 1.05182886, + "balance_loss_mlp": 1.03550243, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.8730871466290222, + "language_loss": 0.86474597, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88644683, + "num_input_tokens_seen": 61468585, + "step": 2835, + "time_per_iteration": 2.581347703933716 + }, + { + "auxiliary_loss_clip": 0.01147046, + "auxiliary_loss_mlp": 0.01052362, + "balance_loss_clip": 1.049124, + "balance_loss_mlp": 1.03123844, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 2.0804408048148324, + "language_loss": 0.73723835, + "learning_rate": 3.796446484348989e-06, + "loss": 0.7592324, + "num_input_tokens_seen": 61486330, + "step": 2836, + "time_per_iteration": 2.501251220703125 + }, + { + "auxiliary_loss_clip": 0.01099788, + "auxiliary_loss_mlp": 0.01050927, + "balance_loss_clip": 1.04683924, + "balance_loss_mlp": 1.02983844, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.361696416651369, + "language_loss": 0.80045748, + "learning_rate": 3.796275266481036e-06, + "loss": 0.82196468, + "num_input_tokens_seen": 61503950, + "step": 2837, + "time_per_iteration": 2.574387311935425 + }, + { + "auxiliary_loss_clip": 0.011433, + "auxiliary_loss_mlp": 0.0104545, + "balance_loss_clip": 1.05222154, + "balance_loss_mlp": 1.02697217, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 2.2148056290928904, + "language_loss": 0.83477557, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85666311, + "num_input_tokens_seen": 61523550, + "step": 2838, + "time_per_iteration": 2.493795394897461 + }, + { + "auxiliary_loss_clip": 0.01101927, + "auxiliary_loss_mlp": 0.01050641, + "balance_loss_clip": 1.04810166, + "balance_loss_mlp": 1.03228331, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.668853732832902, + "language_loss": 0.93644625, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95797193, + "num_input_tokens_seen": 61542720, + "step": 2839, + "time_per_iteration": 2.6169276237487793 + }, + { + "auxiliary_loss_clip": 0.01126758, + "auxiliary_loss_mlp": 0.01049011, + "balance_loss_clip": 1.05036759, + "balance_loss_mlp": 1.02811384, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.2211664911154076, + "language_loss": 0.83836174, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86011934, + "num_input_tokens_seen": 61563040, + "step": 2840, + "time_per_iteration": 2.5939884185791016 + }, + { + "auxiliary_loss_clip": 0.01148966, + "auxiliary_loss_mlp": 0.01048162, + "balance_loss_clip": 1.05243576, + "balance_loss_mlp": 1.0278486, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 1.8834625867711212, + "language_loss": 0.7618587, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78382993, + "num_input_tokens_seen": 61581890, + "step": 2841, + "time_per_iteration": 2.518474578857422 + }, + { + "auxiliary_loss_clip": 0.01139046, + "auxiliary_loss_mlp": 0.01050159, + "balance_loss_clip": 1.0511148, + "balance_loss_mlp": 1.03112113, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.9107173062652114, + "language_loss": 0.76767266, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.78956473, + "num_input_tokens_seen": 61602095, + "step": 2842, + "time_per_iteration": 2.5793943405151367 + }, + { + "auxiliary_loss_clip": 0.01154695, + "auxiliary_loss_mlp": 0.01049024, + "balance_loss_clip": 1.05174875, + "balance_loss_mlp": 1.02985573, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 2.4634089537561747, + "language_loss": 0.85774994, + "learning_rate": 3.795246529087043e-06, + "loss": 0.87978715, + "num_input_tokens_seen": 61620400, + "step": 2843, + "time_per_iteration": 2.4594357013702393 + }, + { + "auxiliary_loss_clip": 0.01154128, + "auxiliary_loss_mlp": 0.01046365, + "balance_loss_clip": 1.05159092, + "balance_loss_mlp": 1.02754235, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 2.2110534605919727, + "language_loss": 0.68393481, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70593965, + "num_input_tokens_seen": 61637680, + "step": 2844, + "time_per_iteration": 2.4244508743286133 + }, + { + "auxiliary_loss_clip": 0.01134126, + "auxiliary_loss_mlp": 0.00819531, + "balance_loss_clip": 1.05077648, + "balance_loss_mlp": 1.04483581, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.7250157561601462, + "language_loss": 0.78357446, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80311108, + "num_input_tokens_seen": 61655630, + "step": 2845, + "time_per_iteration": 2.5046427249908447 + }, + { + "auxiliary_loss_clip": 0.01143701, + "auxiliary_loss_mlp": 0.01049833, + "balance_loss_clip": 1.05107212, + "balance_loss_mlp": 1.03185618, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.3059295613339157, + "language_loss": 0.77924454, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.80117989, + "num_input_tokens_seen": 61673475, + "step": 2846, + "time_per_iteration": 2.4778501987457275 + }, + { + "auxiliary_loss_clip": 0.01143761, + "auxiliary_loss_mlp": 0.01050453, + "balance_loss_clip": 1.05174589, + "balance_loss_mlp": 1.0316658, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 2.4505242993590137, + "language_loss": 0.79787141, + "learning_rate": 3.794559342552472e-06, + "loss": 0.81981349, + "num_input_tokens_seen": 61693370, + "step": 2847, + "time_per_iteration": 2.511660099029541 + }, + { + "auxiliary_loss_clip": 0.01143354, + "auxiliary_loss_mlp": 0.01053771, + "balance_loss_clip": 1.04879951, + "balance_loss_mlp": 1.03428042, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.2416680243469598, + "language_loss": 0.86965472, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.891626, + "num_input_tokens_seen": 61710820, + "step": 2848, + "time_per_iteration": 2.472407579421997 + }, + { + "auxiliary_loss_clip": 0.01114898, + "auxiliary_loss_mlp": 0.01049191, + "balance_loss_clip": 1.05062985, + "balance_loss_mlp": 1.02946186, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.813068522178027, + "language_loss": 0.75010598, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77174687, + "num_input_tokens_seen": 61729855, + "step": 2849, + "time_per_iteration": 2.583392381668091 + }, + { + "auxiliary_loss_clip": 0.01038658, + "auxiliary_loss_mlp": 0.01007124, + "balance_loss_clip": 1.03035414, + "balance_loss_mlp": 1.00442946, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7959005706383175, + "language_loss": 0.57507271, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59553051, + "num_input_tokens_seen": 61790290, + "step": 2850, + "time_per_iteration": 3.1467602252960205 + }, + { + "auxiliary_loss_clip": 0.01116767, + "auxiliary_loss_mlp": 0.010455, + "balance_loss_clip": 1.04968834, + "balance_loss_mlp": 1.02658117, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.11599141610493, + "language_loss": 0.81807655, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83969921, + "num_input_tokens_seen": 61809265, + "step": 2851, + "time_per_iteration": 4.017199516296387 + }, + { + "auxiliary_loss_clip": 0.01113454, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.04965353, + "balance_loss_mlp": 1.02694929, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 1.8869235806991436, + "language_loss": 0.93335748, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95494413, + "num_input_tokens_seen": 61828980, + "step": 2852, + "time_per_iteration": 2.568448543548584 + }, + { + "auxiliary_loss_clip": 0.01123863, + "auxiliary_loss_mlp": 0.01052194, + "balance_loss_clip": 1.0502429, + "balance_loss_mlp": 1.03318, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 9.605017439504994, + "language_loss": 0.69408739, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71584797, + "num_input_tokens_seen": 61847915, + "step": 2853, + "time_per_iteration": 2.564440965652466 + }, + { + "auxiliary_loss_clip": 0.01122869, + "auxiliary_loss_mlp": 0.0104638, + "balance_loss_clip": 1.05676186, + "balance_loss_mlp": 1.02805793, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.484251734421179, + "language_loss": 0.66595155, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.687644, + "num_input_tokens_seen": 61865570, + "step": 2854, + "time_per_iteration": 4.05839729309082 + }, + { + "auxiliary_loss_clip": 0.01124257, + "auxiliary_loss_mlp": 0.01041639, + "balance_loss_clip": 1.05196619, + "balance_loss_mlp": 1.02316189, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.6082338942048273, + "language_loss": 0.89584315, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91750211, + "num_input_tokens_seen": 61883340, + "step": 2855, + "time_per_iteration": 2.5535194873809814 + }, + { + "auxiliary_loss_clip": 0.01160299, + "auxiliary_loss_mlp": 0.01052593, + "balance_loss_clip": 1.05430984, + "balance_loss_mlp": 1.03396046, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.17527094775061, + "language_loss": 0.83149886, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.8536278, + "num_input_tokens_seen": 61900610, + "step": 2856, + "time_per_iteration": 2.5007376670837402 + }, + { + "auxiliary_loss_clip": 0.01148659, + "auxiliary_loss_mlp": 0.01051232, + "balance_loss_clip": 1.05486631, + "balance_loss_mlp": 1.03267086, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 2.0522905906691995, + "language_loss": 0.86260235, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88460124, + "num_input_tokens_seen": 61916795, + "step": 2857, + "time_per_iteration": 2.510795831680298 + }, + { + "auxiliary_loss_clip": 0.01148081, + "auxiliary_loss_mlp": 0.01056964, + "balance_loss_clip": 1.05462861, + "balance_loss_mlp": 1.03703237, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.135113707951393, + "language_loss": 0.78303277, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80508327, + "num_input_tokens_seen": 61936665, + "step": 2858, + "time_per_iteration": 3.926889657974243 + }, + { + "auxiliary_loss_clip": 0.01160172, + "auxiliary_loss_mlp": 0.01053156, + "balance_loss_clip": 1.07450402, + "balance_loss_mlp": 1.03256857, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 2.300572422560323, + "language_loss": 0.77463281, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.7967661, + "num_input_tokens_seen": 61954415, + "step": 2859, + "time_per_iteration": 2.5157456398010254 + }, + { + "auxiliary_loss_clip": 0.01120102, + "auxiliary_loss_mlp": 0.01041227, + "balance_loss_clip": 1.06873417, + "balance_loss_mlp": 1.02162886, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 2.3104229179389293, + "language_loss": 0.77066261, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79227591, + "num_input_tokens_seen": 61973940, + "step": 2860, + "time_per_iteration": 2.62282133102417 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.0104497, + "balance_loss_clip": 1.05280316, + "balance_loss_mlp": 1.02606344, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 2.6750246423837787, + "language_loss": 0.81662339, + "learning_rate": 3.792145618140317e-06, + "loss": 0.83855712, + "num_input_tokens_seen": 61991845, + "step": 2861, + "time_per_iteration": 2.5068347454071045 + }, + { + "auxiliary_loss_clip": 0.01132087, + "auxiliary_loss_mlp": 0.01051852, + "balance_loss_clip": 1.05180585, + "balance_loss_mlp": 1.03352928, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 2.3776555817811356, + "language_loss": 0.85964882, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.8814882, + "num_input_tokens_seen": 62009395, + "step": 2862, + "time_per_iteration": 2.5348544120788574 + }, + { + "auxiliary_loss_clip": 0.01120728, + "auxiliary_loss_mlp": 0.01041975, + "balance_loss_clip": 1.05348825, + "balance_loss_mlp": 1.0247612, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 1.8299949932070538, + "language_loss": 0.78020906, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80183613, + "num_input_tokens_seen": 62029005, + "step": 2863, + "time_per_iteration": 2.620295524597168 + }, + { + "auxiliary_loss_clip": 0.01125914, + "auxiliary_loss_mlp": 0.00809431, + "balance_loss_clip": 1.05410981, + "balance_loss_mlp": 1.02629542, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.8323399339164812, + "language_loss": 0.72376615, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.7431196, + "num_input_tokens_seen": 62048730, + "step": 2864, + "time_per_iteration": 2.639220952987671 + }, + { + "auxiliary_loss_clip": 0.01123396, + "auxiliary_loss_mlp": 0.01052639, + "balance_loss_clip": 1.05321884, + "balance_loss_mlp": 1.03326738, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.6501035713580428, + "language_loss": 0.72621214, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.74797249, + "num_input_tokens_seen": 62069000, + "step": 2865, + "time_per_iteration": 2.5765178203582764 + }, + { + "auxiliary_loss_clip": 0.01146229, + "auxiliary_loss_mlp": 0.00816258, + "balance_loss_clip": 1.05602038, + "balance_loss_mlp": 1.03677559, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 3.3573541874259583, + "language_loss": 0.7855742, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.80519903, + "num_input_tokens_seen": 62086750, + "step": 2866, + "time_per_iteration": 2.525108814239502 + }, + { + "auxiliary_loss_clip": 0.01161477, + "auxiliary_loss_mlp": 0.01048584, + "balance_loss_clip": 1.05604911, + "balance_loss_mlp": 1.02875924, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.7022854103544645, + "language_loss": 0.79753268, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.81963325, + "num_input_tokens_seen": 62106240, + "step": 2867, + "time_per_iteration": 2.493462562561035 + }, + { + "auxiliary_loss_clip": 0.01132599, + "auxiliary_loss_mlp": 0.01039114, + "balance_loss_clip": 1.05193472, + "balance_loss_mlp": 1.02013576, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.8120038090180435, + "language_loss": 0.79348248, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81519961, + "num_input_tokens_seen": 62124895, + "step": 2868, + "time_per_iteration": 2.519169807434082 + }, + { + "auxiliary_loss_clip": 0.01130655, + "auxiliary_loss_mlp": 0.01044807, + "balance_loss_clip": 1.07659304, + "balance_loss_mlp": 1.0270927, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 2.1895452992331887, + "language_loss": 0.83900356, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86075819, + "num_input_tokens_seen": 62143510, + "step": 2869, + "time_per_iteration": 2.587867498397827 + }, + { + "auxiliary_loss_clip": 0.01133376, + "auxiliary_loss_mlp": 0.01049601, + "balance_loss_clip": 1.05313921, + "balance_loss_mlp": 1.03039598, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 15.045694308476499, + "language_loss": 0.7726475, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79447722, + "num_input_tokens_seen": 62162285, + "step": 2870, + "time_per_iteration": 2.5454421043395996 + }, + { + "auxiliary_loss_clip": 0.01154093, + "auxiliary_loss_mlp": 0.01043176, + "balance_loss_clip": 1.0541718, + "balance_loss_mlp": 1.02612948, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 1.9954769955860114, + "language_loss": 0.77207547, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.79404813, + "num_input_tokens_seen": 62180970, + "step": 2871, + "time_per_iteration": 2.48834228515625 + }, + { + "auxiliary_loss_clip": 0.0113783, + "auxiliary_loss_mlp": 0.01045111, + "balance_loss_clip": 1.05822349, + "balance_loss_mlp": 1.02597761, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 3.3521514224342, + "language_loss": 0.74119055, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76301998, + "num_input_tokens_seen": 62198965, + "step": 2872, + "time_per_iteration": 2.6126041412353516 + }, + { + "auxiliary_loss_clip": 0.0115308, + "auxiliary_loss_mlp": 0.01047126, + "balance_loss_clip": 1.05242538, + "balance_loss_mlp": 1.02838659, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.802066445844266, + "language_loss": 0.82550919, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84751129, + "num_input_tokens_seen": 62219890, + "step": 2873, + "time_per_iteration": 2.499656915664673 + }, + { + "auxiliary_loss_clip": 0.01111993, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_clip": 1.04831696, + "balance_loss_mlp": 1.02734423, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 1.9589609037378661, + "language_loss": 0.75076938, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77236122, + "num_input_tokens_seen": 62237140, + "step": 2874, + "time_per_iteration": 2.5713133811950684 + }, + { + "auxiliary_loss_clip": 0.01157366, + "auxiliary_loss_mlp": 0.01044858, + "balance_loss_clip": 1.05296826, + "balance_loss_mlp": 1.02504563, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 2.6546478076809614, + "language_loss": 0.81097305, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.8329953, + "num_input_tokens_seen": 62255405, + "step": 2875, + "time_per_iteration": 2.483370542526245 + }, + { + "auxiliary_loss_clip": 0.0113856, + "auxiliary_loss_mlp": 0.01050962, + "balance_loss_clip": 1.05571795, + "balance_loss_mlp": 1.03106582, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.519842669443865, + "language_loss": 0.87820888, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.90010417, + "num_input_tokens_seen": 62271280, + "step": 2876, + "time_per_iteration": 2.5212438106536865 + }, + { + "auxiliary_loss_clip": 0.01137039, + "auxiliary_loss_mlp": 0.01044408, + "balance_loss_clip": 1.05570066, + "balance_loss_mlp": 1.02522755, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.7344409557400629, + "language_loss": 0.8494249, + "learning_rate": 3.789370767013681e-06, + "loss": 0.87123942, + "num_input_tokens_seen": 62289140, + "step": 2877, + "time_per_iteration": 2.522310972213745 + }, + { + "auxiliary_loss_clip": 0.01134601, + "auxiliary_loss_mlp": 0.01046224, + "balance_loss_clip": 1.06645799, + "balance_loss_mlp": 1.02678108, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 2.2471510325457085, + "language_loss": 0.79827631, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.82008457, + "num_input_tokens_seen": 62307490, + "step": 2878, + "time_per_iteration": 2.606806993484497 + }, + { + "auxiliary_loss_clip": 0.01134007, + "auxiliary_loss_mlp": 0.01048817, + "balance_loss_clip": 1.05076122, + "balance_loss_mlp": 1.03045881, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 3.44638218731754, + "language_loss": 0.70749921, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72932744, + "num_input_tokens_seen": 62328570, + "step": 2879, + "time_per_iteration": 2.59049391746521 + }, + { + "auxiliary_loss_clip": 0.01129497, + "auxiliary_loss_mlp": 0.01051114, + "balance_loss_clip": 1.04865837, + "balance_loss_mlp": 1.03233874, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 1.956952823864924, + "language_loss": 0.83193946, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85374558, + "num_input_tokens_seen": 62345735, + "step": 2880, + "time_per_iteration": 2.52827525138855 + }, + { + "auxiliary_loss_clip": 0.01111483, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.05306923, + "balance_loss_mlp": 1.03343642, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 1.882345190860551, + "language_loss": 0.81325865, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.83492982, + "num_input_tokens_seen": 62365525, + "step": 2881, + "time_per_iteration": 2.59496808052063 + }, + { + "auxiliary_loss_clip": 0.01136789, + "auxiliary_loss_mlp": 0.01045535, + "balance_loss_clip": 1.05265951, + "balance_loss_mlp": 1.02773666, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 2.075185895255664, + "language_loss": 0.77298248, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79480577, + "num_input_tokens_seen": 62385160, + "step": 2882, + "time_per_iteration": 2.5467443466186523 + }, + { + "auxiliary_loss_clip": 0.01116555, + "auxiliary_loss_mlp": 0.01053975, + "balance_loss_clip": 1.05174637, + "balance_loss_mlp": 1.03523517, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 2.2844508223984272, + "language_loss": 0.76611286, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78781819, + "num_input_tokens_seen": 62405280, + "step": 2883, + "time_per_iteration": 2.706714391708374 + }, + { + "auxiliary_loss_clip": 0.0111167, + "auxiliary_loss_mlp": 0.01045112, + "balance_loss_clip": 1.04906106, + "balance_loss_mlp": 1.02661061, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 2.0213885815811605, + "language_loss": 0.85953999, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.88110775, + "num_input_tokens_seen": 62423665, + "step": 2884, + "time_per_iteration": 2.589056968688965 + }, + { + "auxiliary_loss_clip": 0.01138139, + "auxiliary_loss_mlp": 0.00810753, + "balance_loss_clip": 1.0543443, + "balance_loss_mlp": 1.02889562, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 2.458062701847446, + "language_loss": 0.74333674, + "learning_rate": 3.787976825866055e-06, + "loss": 0.76282567, + "num_input_tokens_seen": 62445170, + "step": 2885, + "time_per_iteration": 2.626267671585083 + }, + { + "auxiliary_loss_clip": 0.01129591, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_clip": 1.05308473, + "balance_loss_mlp": 1.02745342, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.5538529015647276, + "language_loss": 0.70770383, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72944427, + "num_input_tokens_seen": 62466135, + "step": 2886, + "time_per_iteration": 2.5850133895874023 + }, + { + "auxiliary_loss_clip": 0.01145017, + "auxiliary_loss_mlp": 0.01043581, + "balance_loss_clip": 1.05082798, + "balance_loss_mlp": 1.02468622, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 2.5645110637781627, + "language_loss": 0.69637185, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.71825778, + "num_input_tokens_seen": 62483910, + "step": 2887, + "time_per_iteration": 2.5200905799865723 + }, + { + "auxiliary_loss_clip": 0.0111647, + "auxiliary_loss_mlp": 0.01048838, + "balance_loss_clip": 1.05194426, + "balance_loss_mlp": 1.03055155, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.6220846818290815, + "language_loss": 0.8487035, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87035656, + "num_input_tokens_seen": 62501530, + "step": 2888, + "time_per_iteration": 2.555405378341675 + }, + { + "auxiliary_loss_clip": 0.01093828, + "auxiliary_loss_mlp": 0.01053272, + "balance_loss_clip": 1.04740274, + "balance_loss_mlp": 1.03142107, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 2.196358872859896, + "language_loss": 0.78521848, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.8066895, + "num_input_tokens_seen": 62521295, + "step": 2889, + "time_per_iteration": 4.0509352684021 + }, + { + "auxiliary_loss_clip": 0.01112576, + "auxiliary_loss_mlp": 0.00810837, + "balance_loss_clip": 1.05193305, + "balance_loss_mlp": 1.03017211, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.087964727654909, + "language_loss": 0.8406595, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.85989368, + "num_input_tokens_seen": 62539615, + "step": 2890, + "time_per_iteration": 3.9716060161590576 + }, + { + "auxiliary_loss_clip": 0.01143392, + "auxiliary_loss_mlp": 0.01046908, + "balance_loss_clip": 1.05453706, + "balance_loss_mlp": 1.02801323, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 2.177067399017797, + "language_loss": 0.81991309, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84181607, + "num_input_tokens_seen": 62556820, + "step": 2891, + "time_per_iteration": 2.5609354972839355 + }, + { + "auxiliary_loss_clip": 0.01099412, + "auxiliary_loss_mlp": 0.01055288, + "balance_loss_clip": 1.04281652, + "balance_loss_mlp": 1.03288865, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.209771387820264, + "language_loss": 0.81475335, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.83630037, + "num_input_tokens_seen": 62572450, + "step": 2892, + "time_per_iteration": 2.5467910766601562 + }, + { + "auxiliary_loss_clip": 0.0115299, + "auxiliary_loss_mlp": 0.01055426, + "balance_loss_clip": 1.05900836, + "balance_loss_mlp": 1.03562522, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 1.9176163667144206, + "language_loss": 0.74642324, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76850736, + "num_input_tokens_seen": 62592580, + "step": 2893, + "time_per_iteration": 4.018085956573486 + }, + { + "auxiliary_loss_clip": 0.01134681, + "auxiliary_loss_mlp": 0.01045023, + "balance_loss_clip": 1.05330515, + "balance_loss_mlp": 1.02544916, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 1.9665593900601435, + "language_loss": 0.83077586, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85257286, + "num_input_tokens_seen": 62611220, + "step": 2894, + "time_per_iteration": 2.5483009815216064 + }, + { + "auxiliary_loss_clip": 0.01114045, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_clip": 1.05007482, + "balance_loss_mlp": 1.02249968, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.5608366414010697, + "language_loss": 0.74558687, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76717061, + "num_input_tokens_seen": 62629185, + "step": 2895, + "time_per_iteration": 2.582894802093506 + }, + { + "auxiliary_loss_clip": 0.01037035, + "auxiliary_loss_mlp": 0.01049476, + "balance_loss_clip": 1.04274738, + "balance_loss_mlp": 1.04623365, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.9170684436718151, + "language_loss": 0.62770057, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64856565, + "num_input_tokens_seen": 62691895, + "step": 2896, + "time_per_iteration": 3.274322748184204 + }, + { + "auxiliary_loss_clip": 0.01131099, + "auxiliary_loss_mlp": 0.0080351, + "balance_loss_clip": 1.05140066, + "balance_loss_mlp": 1.01638937, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 1.5861034590362757, + "language_loss": 0.75830138, + "learning_rate": 3.785877779175034e-06, + "loss": 0.7776475, + "num_input_tokens_seen": 62713790, + "step": 2897, + "time_per_iteration": 4.00433874130249 + }, + { + "auxiliary_loss_clip": 0.01143519, + "auxiliary_loss_mlp": 0.01040146, + "balance_loss_clip": 1.05438113, + "balance_loss_mlp": 1.02163267, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.9228870348704274, + "language_loss": 0.69416428, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71600091, + "num_input_tokens_seen": 62736285, + "step": 2898, + "time_per_iteration": 2.6113712787628174 + }, + { + "auxiliary_loss_clip": 0.011343, + "auxiliary_loss_mlp": 0.01042858, + "balance_loss_clip": 1.05959964, + "balance_loss_mlp": 1.02355766, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.441435762802727, + "language_loss": 0.76013374, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78190535, + "num_input_tokens_seen": 62756240, + "step": 2899, + "time_per_iteration": 2.5884125232696533 + }, + { + "auxiliary_loss_clip": 0.01096951, + "auxiliary_loss_mlp": 0.01047743, + "balance_loss_clip": 1.0499835, + "balance_loss_mlp": 1.02776325, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.8304664900337155, + "language_loss": 0.72745997, + "learning_rate": 3.785351493339121e-06, + "loss": 0.74890685, + "num_input_tokens_seen": 62775910, + "step": 2900, + "time_per_iteration": 2.6228227615356445 + }, + { + "auxiliary_loss_clip": 0.01110019, + "auxiliary_loss_mlp": 0.00806612, + "balance_loss_clip": 1.05024481, + "balance_loss_mlp": 1.02162075, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.603977032354177, + "language_loss": 0.69919294, + "learning_rate": 3.785175929316863e-06, + "loss": 0.71835923, + "num_input_tokens_seen": 62799385, + "step": 2901, + "time_per_iteration": 2.760504722595215 + }, + { + "auxiliary_loss_clip": 0.01134691, + "auxiliary_loss_mlp": 0.01055807, + "balance_loss_clip": 1.05881882, + "balance_loss_mlp": 1.03707922, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 2.0881185534782403, + "language_loss": 0.7653079, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78721285, + "num_input_tokens_seen": 62819380, + "step": 2902, + "time_per_iteration": 2.6073036193847656 + }, + { + "auxiliary_loss_clip": 0.01147196, + "auxiliary_loss_mlp": 0.0105225, + "balance_loss_clip": 1.05186105, + "balance_loss_mlp": 1.03389192, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 1.9897002853826633, + "language_loss": 0.81272733, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.83472186, + "num_input_tokens_seen": 62836205, + "step": 2903, + "time_per_iteration": 2.4902195930480957 + }, + { + "auxiliary_loss_clip": 0.01125882, + "auxiliary_loss_mlp": 0.01042788, + "balance_loss_clip": 1.05343187, + "balance_loss_mlp": 1.02392936, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 1.9274455743239058, + "language_loss": 0.73514318, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75682992, + "num_input_tokens_seen": 62854045, + "step": 2904, + "time_per_iteration": 2.5359294414520264 + }, + { + "auxiliary_loss_clip": 0.01114866, + "auxiliary_loss_mlp": 0.01048177, + "balance_loss_clip": 1.06286395, + "balance_loss_mlp": 1.02953219, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.819004443431058, + "language_loss": 0.64677048, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.66840088, + "num_input_tokens_seen": 62873075, + "step": 2905, + "time_per_iteration": 2.665915012359619 + }, + { + "auxiliary_loss_clip": 0.01130966, + "auxiliary_loss_mlp": 0.01048798, + "balance_loss_clip": 1.05209064, + "balance_loss_mlp": 1.0290693, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 1.7576675873137075, + "language_loss": 0.79257977, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81437743, + "num_input_tokens_seen": 62892675, + "step": 2906, + "time_per_iteration": 2.589737892150879 + }, + { + "auxiliary_loss_clip": 0.01147097, + "auxiliary_loss_mlp": 0.01052327, + "balance_loss_clip": 1.0552305, + "balance_loss_mlp": 1.03317058, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 1.7449451131445253, + "language_loss": 0.81293046, + "learning_rate": 3.784121123841449e-06, + "loss": 0.8349247, + "num_input_tokens_seen": 62910675, + "step": 2907, + "time_per_iteration": 2.483914375305176 + }, + { + "auxiliary_loss_clip": 0.01144974, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_clip": 1.05353308, + "balance_loss_mlp": 1.03238726, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.184213289837377, + "language_loss": 0.8074159, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.82936925, + "num_input_tokens_seen": 62928130, + "step": 2908, + "time_per_iteration": 2.5117392539978027 + }, + { + "auxiliary_loss_clip": 0.01127626, + "auxiliary_loss_mlp": 0.01053014, + "balance_loss_clip": 1.05169904, + "balance_loss_mlp": 1.03341591, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 2.8160251244991485, + "language_loss": 0.80545259, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.827259, + "num_input_tokens_seen": 62944290, + "step": 2909, + "time_per_iteration": 2.5063986778259277 + }, + { + "auxiliary_loss_clip": 0.01086176, + "auxiliary_loss_mlp": 0.01059912, + "balance_loss_clip": 1.04710054, + "balance_loss_mlp": 1.03705943, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.6675221592735325, + "language_loss": 0.76633048, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78779137, + "num_input_tokens_seen": 62963505, + "step": 2910, + "time_per_iteration": 2.60888934135437 + }, + { + "auxiliary_loss_clip": 0.01160387, + "auxiliary_loss_mlp": 0.01054138, + "balance_loss_clip": 1.05428052, + "balance_loss_mlp": 1.03376555, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.8879186430586612, + "language_loss": 0.87313139, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89527667, + "num_input_tokens_seen": 62985020, + "step": 2911, + "time_per_iteration": 2.546182870864868 + }, + { + "auxiliary_loss_clip": 0.01154638, + "auxiliary_loss_mlp": 0.00811498, + "balance_loss_clip": 1.05016589, + "balance_loss_mlp": 1.03046119, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.334245540448843, + "language_loss": 0.90028405, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91994542, + "num_input_tokens_seen": 63001745, + "step": 2912, + "time_per_iteration": 2.4731333255767822 + }, + { + "auxiliary_loss_clip": 0.01147029, + "auxiliary_loss_mlp": 0.01046319, + "balance_loss_clip": 1.05090904, + "balance_loss_mlp": 1.02691126, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 1.8446631591636307, + "language_loss": 0.72207189, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74400538, + "num_input_tokens_seen": 63019750, + "step": 2913, + "time_per_iteration": 2.4917943477630615 + }, + { + "auxiliary_loss_clip": 0.01137521, + "auxiliary_loss_mlp": 0.01047929, + "balance_loss_clip": 1.05378938, + "balance_loss_mlp": 1.02934456, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 1.9011188384433966, + "language_loss": 0.69629109, + "learning_rate": 3.782887439295741e-06, + "loss": 0.71814555, + "num_input_tokens_seen": 63039500, + "step": 2914, + "time_per_iteration": 2.5492985248565674 + }, + { + "auxiliary_loss_clip": 0.01142344, + "auxiliary_loss_mlp": 0.01047109, + "balance_loss_clip": 1.05402493, + "balance_loss_mlp": 1.02807164, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.8822013484975568, + "language_loss": 0.93579727, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95769185, + "num_input_tokens_seen": 63059785, + "step": 2915, + "time_per_iteration": 2.5085201263427734 + }, + { + "auxiliary_loss_clip": 0.01128061, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.06387877, + "balance_loss_mlp": 1.02508819, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.7790503673452518, + "language_loss": 0.80966485, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83139145, + "num_input_tokens_seen": 63079385, + "step": 2916, + "time_per_iteration": 2.5945425033569336 + }, + { + "auxiliary_loss_clip": 0.01148648, + "auxiliary_loss_mlp": 0.0105452, + "balance_loss_clip": 1.0540539, + "balance_loss_mlp": 1.03527927, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.6225705648262605, + "language_loss": 0.73973149, + "learning_rate": 3.782357703104799e-06, + "loss": 0.76176322, + "num_input_tokens_seen": 63098970, + "step": 2917, + "time_per_iteration": 2.5192577838897705 + }, + { + "auxiliary_loss_clip": 0.01133818, + "auxiliary_loss_mlp": 0.01054567, + "balance_loss_clip": 1.05175865, + "balance_loss_mlp": 1.03508854, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 1.8975282640332778, + "language_loss": 0.77469969, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79658353, + "num_input_tokens_seen": 63118750, + "step": 2918, + "time_per_iteration": 2.5319983959198 + }, + { + "auxiliary_loss_clip": 0.01097203, + "auxiliary_loss_mlp": 0.01051678, + "balance_loss_clip": 1.0494715, + "balance_loss_mlp": 1.02964783, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 3.4502126401328503, + "language_loss": 0.73827893, + "learning_rate": 3.782004207697098e-06, + "loss": 0.75976777, + "num_input_tokens_seen": 63136865, + "step": 2919, + "time_per_iteration": 2.6653575897216797 + }, + { + "auxiliary_loss_clip": 0.01127333, + "auxiliary_loss_mlp": 0.01051635, + "balance_loss_clip": 1.04983449, + "balance_loss_mlp": 1.03223991, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 1.7893724690115456, + "language_loss": 0.74213469, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76392436, + "num_input_tokens_seen": 63158325, + "step": 2920, + "time_per_iteration": 2.61352801322937 + }, + { + "auxiliary_loss_clip": 0.01119797, + "auxiliary_loss_mlp": 0.01051027, + "balance_loss_clip": 1.04715121, + "balance_loss_mlp": 1.03090453, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.616717268931314, + "language_loss": 0.79901445, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.8207227, + "num_input_tokens_seen": 63173115, + "step": 2921, + "time_per_iteration": 2.5060837268829346 + }, + { + "auxiliary_loss_clip": 0.01124643, + "auxiliary_loss_mlp": 0.01050056, + "balance_loss_clip": 1.05328786, + "balance_loss_mlp": 1.03012466, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.76945238221899, + "language_loss": 0.87558311, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89733005, + "num_input_tokens_seen": 63192880, + "step": 2922, + "time_per_iteration": 2.594142198562622 + }, + { + "auxiliary_loss_clip": 0.01148048, + "auxiliary_loss_mlp": 0.01049887, + "balance_loss_clip": 1.05295837, + "balance_loss_mlp": 1.03105187, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 2.5324387990587587, + "language_loss": 0.62315881, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.6451382, + "num_input_tokens_seen": 63214395, + "step": 2923, + "time_per_iteration": 2.5555741786956787 + }, + { + "auxiliary_loss_clip": 0.01129741, + "auxiliary_loss_mlp": 0.01046968, + "balance_loss_clip": 1.05321574, + "balance_loss_mlp": 1.02620149, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.220825986283819, + "language_loss": 0.80323148, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82499862, + "num_input_tokens_seen": 63231020, + "step": 2924, + "time_per_iteration": 2.5847175121307373 + }, + { + "auxiliary_loss_clip": 0.0113489, + "auxiliary_loss_mlp": 0.01054956, + "balance_loss_clip": 1.05398726, + "balance_loss_mlp": 1.03416562, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 2.222199879307277, + "language_loss": 0.71246302, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73436141, + "num_input_tokens_seen": 63246245, + "step": 2925, + "time_per_iteration": 2.4941041469573975 + }, + { + "auxiliary_loss_clip": 0.01120903, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.05313802, + "balance_loss_mlp": 1.02537644, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.6268663801303818, + "language_loss": 0.71947503, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.74112314, + "num_input_tokens_seen": 63267790, + "step": 2926, + "time_per_iteration": 2.6124072074890137 + }, + { + "auxiliary_loss_clip": 0.01104773, + "auxiliary_loss_mlp": 0.01051932, + "balance_loss_clip": 1.0483458, + "balance_loss_mlp": 1.02829254, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 2.0881672192192777, + "language_loss": 0.84935397, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.87092102, + "num_input_tokens_seen": 63286830, + "step": 2927, + "time_per_iteration": 2.553095579147339 + }, + { + "auxiliary_loss_clip": 0.0111044, + "auxiliary_loss_mlp": 0.01051211, + "balance_loss_clip": 1.06372523, + "balance_loss_mlp": 1.03390181, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 2.5710698712522313, + "language_loss": 0.72354817, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74516469, + "num_input_tokens_seen": 63308870, + "step": 2928, + "time_per_iteration": 5.548267364501953 + }, + { + "auxiliary_loss_clip": 0.01119141, + "auxiliary_loss_mlp": 0.01049737, + "balance_loss_clip": 1.05032706, + "balance_loss_mlp": 1.02990055, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 1.854679366371153, + "language_loss": 0.8325026, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85419136, + "num_input_tokens_seen": 63329005, + "step": 2929, + "time_per_iteration": 2.597723960876465 + }, + { + "auxiliary_loss_clip": 0.01131542, + "auxiliary_loss_mlp": 0.01041608, + "balance_loss_clip": 1.05742049, + "balance_loss_mlp": 1.02254689, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.670080609555047, + "language_loss": 0.79195368, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81368512, + "num_input_tokens_seen": 63349390, + "step": 2930, + "time_per_iteration": 2.60609769821167 + }, + { + "auxiliary_loss_clip": 0.01159387, + "auxiliary_loss_mlp": 0.01046545, + "balance_loss_clip": 1.05531502, + "balance_loss_mlp": 1.02707839, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.1082321549569887, + "language_loss": 0.76353794, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78559732, + "num_input_tokens_seen": 63368835, + "step": 2931, + "time_per_iteration": 2.5234673023223877 + }, + { + "auxiliary_loss_clip": 0.01074287, + "auxiliary_loss_mlp": 0.01045357, + "balance_loss_clip": 1.0438714, + "balance_loss_mlp": 1.02653337, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 3.6679281437694073, + "language_loss": 0.7504459, + "learning_rate": 3.779699901503696e-06, + "loss": 0.77164233, + "num_input_tokens_seen": 63385220, + "step": 2932, + "time_per_iteration": 4.031884670257568 + }, + { + "auxiliary_loss_clip": 0.01151382, + "auxiliary_loss_mlp": 0.01042373, + "balance_loss_clip": 1.0529952, + "balance_loss_mlp": 1.02190518, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.8303994654671554, + "language_loss": 0.89996243, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.92189991, + "num_input_tokens_seen": 63400865, + "step": 2933, + "time_per_iteration": 2.484219551086426 + }, + { + "auxiliary_loss_clip": 0.0115756, + "auxiliary_loss_mlp": 0.01055157, + "balance_loss_clip": 1.05555522, + "balance_loss_mlp": 1.03692961, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.7267112615777147, + "language_loss": 0.8834486, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90557581, + "num_input_tokens_seen": 63421390, + "step": 2934, + "time_per_iteration": 2.5124008655548096 + }, + { + "auxiliary_loss_clip": 0.01129148, + "auxiliary_loss_mlp": 0.01048756, + "balance_loss_clip": 1.05531383, + "balance_loss_mlp": 1.03086305, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.6936556109248069, + "language_loss": 0.70558494, + "learning_rate": 3.779166518324077e-06, + "loss": 0.727364, + "num_input_tokens_seen": 63444715, + "step": 2935, + "time_per_iteration": 4.226144313812256 + }, + { + "auxiliary_loss_clip": 0.01125911, + "auxiliary_loss_mlp": 0.01039227, + "balance_loss_clip": 1.05263793, + "balance_loss_mlp": 1.02039206, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 1.9699932267242088, + "language_loss": 0.69386798, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.71551931, + "num_input_tokens_seen": 63465525, + "step": 2936, + "time_per_iteration": 2.573772430419922 + }, + { + "auxiliary_loss_clip": 0.01108151, + "auxiliary_loss_mlp": 0.01045841, + "balance_loss_clip": 1.05289817, + "balance_loss_mlp": 1.02744734, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 1.9356638839950495, + "language_loss": 0.71262199, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73416191, + "num_input_tokens_seen": 63485815, + "step": 2937, + "time_per_iteration": 2.6495559215545654 + }, + { + "auxiliary_loss_clip": 0.01139355, + "auxiliary_loss_mlp": 0.01041233, + "balance_loss_clip": 1.05821943, + "balance_loss_mlp": 1.02167094, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.858890818710361, + "language_loss": 0.75788647, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.77969241, + "num_input_tokens_seen": 63503905, + "step": 2938, + "time_per_iteration": 2.5561225414276123 + }, + { + "auxiliary_loss_clip": 0.01150566, + "auxiliary_loss_mlp": 0.01039174, + "balance_loss_clip": 1.05687737, + "balance_loss_mlp": 1.02119684, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.19156361058105, + "language_loss": 0.7131179, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73501527, + "num_input_tokens_seen": 63521985, + "step": 2939, + "time_per_iteration": 2.530850410461426 + }, + { + "auxiliary_loss_clip": 0.01161116, + "auxiliary_loss_mlp": 0.01044606, + "balance_loss_clip": 1.05687857, + "balance_loss_mlp": 1.02559209, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 2.4514631158252187, + "language_loss": 0.73399305, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.75605023, + "num_input_tokens_seen": 63539830, + "step": 2940, + "time_per_iteration": 2.4759058952331543 + }, + { + "auxiliary_loss_clip": 0.011243, + "auxiliary_loss_mlp": 0.01048859, + "balance_loss_clip": 1.05572355, + "balance_loss_mlp": 1.02896345, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.388246139232071, + "language_loss": 0.854873, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87660456, + "num_input_tokens_seen": 63555495, + "step": 2941, + "time_per_iteration": 2.543886184692383 + }, + { + "auxiliary_loss_clip": 0.01159987, + "auxiliary_loss_mlp": 0.01040707, + "balance_loss_clip": 1.05449998, + "balance_loss_mlp": 1.02176499, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.3282555167707026, + "language_loss": 0.7657854, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.78779238, + "num_input_tokens_seen": 63575290, + "step": 2942, + "time_per_iteration": 2.514430522918701 + }, + { + "auxiliary_loss_clip": 0.01101157, + "auxiliary_loss_mlp": 0.00823422, + "balance_loss_clip": 1.04736876, + "balance_loss_mlp": 1.05175447, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 1.7995989176202907, + "language_loss": 0.80293238, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82217824, + "num_input_tokens_seen": 63594670, + "step": 2943, + "time_per_iteration": 2.679992437362671 + }, + { + "auxiliary_loss_clip": 0.01131252, + "auxiliary_loss_mlp": 0.01050522, + "balance_loss_clip": 1.05131817, + "balance_loss_mlp": 1.0295651, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 1.980561818627125, + "language_loss": 0.80788219, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82969999, + "num_input_tokens_seen": 63614780, + "step": 2944, + "time_per_iteration": 2.5805845260620117 + }, + { + "auxiliary_loss_clip": 0.01159749, + "auxiliary_loss_mlp": 0.01056431, + "balance_loss_clip": 1.05233479, + "balance_loss_mlp": 1.03763199, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 2.1220926651857726, + "language_loss": 0.74408811, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.76624995, + "num_input_tokens_seen": 63637190, + "step": 2945, + "time_per_iteration": 2.656189203262329 + }, + { + "auxiliary_loss_clip": 0.01146003, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_clip": 1.05539978, + "balance_loss_mlp": 1.02602863, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 2.495924806849644, + "language_loss": 0.78333086, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.80523854, + "num_input_tokens_seen": 63652140, + "step": 2946, + "time_per_iteration": 2.44804310798645 + }, + { + "auxiliary_loss_clip": 0.01109647, + "auxiliary_loss_mlp": 0.01052108, + "balance_loss_clip": 1.05279922, + "balance_loss_mlp": 1.03246212, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.695759742234261, + "language_loss": 0.76017606, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78179359, + "num_input_tokens_seen": 63671700, + "step": 2947, + "time_per_iteration": 2.5979435443878174 + }, + { + "auxiliary_loss_clip": 0.01146334, + "auxiliary_loss_mlp": 0.01044093, + "balance_loss_clip": 1.05201852, + "balance_loss_mlp": 1.02438784, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.181650091116727, + "language_loss": 0.72913307, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.75103736, + "num_input_tokens_seen": 63691685, + "step": 2948, + "time_per_iteration": 2.6115617752075195 + }, + { + "auxiliary_loss_clip": 0.01158297, + "auxiliary_loss_mlp": 0.01044127, + "balance_loss_clip": 1.06611979, + "balance_loss_mlp": 1.02545917, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.8233519653521333, + "language_loss": 0.82257777, + "learning_rate": 3.776669371292171e-06, + "loss": 0.84460199, + "num_input_tokens_seen": 63711720, + "step": 2949, + "time_per_iteration": 2.5919477939605713 + }, + { + "auxiliary_loss_clip": 0.01077799, + "auxiliary_loss_mlp": 0.01010389, + "balance_loss_clip": 1.04600704, + "balance_loss_mlp": 1.00719452, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7529679864600998, + "language_loss": 0.65026027, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67114216, + "num_input_tokens_seen": 63776280, + "step": 2950, + "time_per_iteration": 3.190380573272705 + }, + { + "auxiliary_loss_clip": 0.01120892, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_clip": 1.05293047, + "balance_loss_mlp": 1.02476192, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 2.5714958596618422, + "language_loss": 0.83865118, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.86029553, + "num_input_tokens_seen": 63797535, + "step": 2951, + "time_per_iteration": 2.6427111625671387 + }, + { + "auxiliary_loss_clip": 0.01125666, + "auxiliary_loss_mlp": 0.01044665, + "balance_loss_clip": 1.05146229, + "balance_loss_mlp": 1.02486384, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 2.5105126934937716, + "language_loss": 0.80213618, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82383943, + "num_input_tokens_seen": 63817045, + "step": 2952, + "time_per_iteration": 2.5700857639312744 + }, + { + "auxiliary_loss_clip": 0.01160397, + "auxiliary_loss_mlp": 0.01048424, + "balance_loss_clip": 1.05545664, + "balance_loss_mlp": 1.02832568, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.192834126458989, + "language_loss": 0.79649723, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.8185854, + "num_input_tokens_seen": 63837665, + "step": 2953, + "time_per_iteration": 2.5272061824798584 + }, + { + "auxiliary_loss_clip": 0.01120993, + "auxiliary_loss_mlp": 0.01043632, + "balance_loss_clip": 1.05447936, + "balance_loss_mlp": 1.0244509, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.8340359156995647, + "language_loss": 0.8778832, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.8995294, + "num_input_tokens_seen": 63858455, + "step": 2954, + "time_per_iteration": 2.680642604827881 + }, + { + "auxiliary_loss_clip": 0.0113957, + "auxiliary_loss_mlp": 0.01052792, + "balance_loss_clip": 1.05514598, + "balance_loss_mlp": 1.0339092, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 2.103274935313538, + "language_loss": 0.84631824, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.86824191, + "num_input_tokens_seen": 63876935, + "step": 2955, + "time_per_iteration": 2.563159465789795 + }, + { + "auxiliary_loss_clip": 0.01130116, + "auxiliary_loss_mlp": 0.01050814, + "balance_loss_clip": 1.05283022, + "balance_loss_mlp": 1.03057241, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 1.7264236232643893, + "language_loss": 0.70892906, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.7307384, + "num_input_tokens_seen": 63896815, + "step": 2956, + "time_per_iteration": 2.5730204582214355 + }, + { + "auxiliary_loss_clip": 0.01149784, + "auxiliary_loss_mlp": 0.01053103, + "balance_loss_clip": 1.05607748, + "balance_loss_mlp": 1.03377891, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 1.8600708881288932, + "language_loss": 0.83256441, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85459328, + "num_input_tokens_seen": 63916140, + "step": 2957, + "time_per_iteration": 2.5378241539001465 + }, + { + "auxiliary_loss_clip": 0.01101997, + "auxiliary_loss_mlp": 0.01044855, + "balance_loss_clip": 1.05079389, + "balance_loss_mlp": 1.02498317, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.6235078718362561, + "language_loss": 0.75598133, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.77744991, + "num_input_tokens_seen": 63935220, + "step": 2958, + "time_per_iteration": 2.655172348022461 + }, + { + "auxiliary_loss_clip": 0.01147604, + "auxiliary_loss_mlp": 0.01048053, + "balance_loss_clip": 1.05839539, + "balance_loss_mlp": 1.02917051, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 5.289409080568085, + "language_loss": 0.80405802, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.82601458, + "num_input_tokens_seen": 63954550, + "step": 2959, + "time_per_iteration": 2.5301895141601562 + }, + { + "auxiliary_loss_clip": 0.011677, + "auxiliary_loss_mlp": 0.01051278, + "balance_loss_clip": 1.05932474, + "balance_loss_mlp": 1.03131032, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 2.041852633684074, + "language_loss": 0.52284062, + "learning_rate": 3.774698062689362e-06, + "loss": 0.54503036, + "num_input_tokens_seen": 63972425, + "step": 2960, + "time_per_iteration": 2.4874885082244873 + }, + { + "auxiliary_loss_clip": 0.0111395, + "auxiliary_loss_mlp": 0.01062552, + "balance_loss_clip": 1.05589724, + "balance_loss_mlp": 1.04159474, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.9261056559183216, + "language_loss": 0.89137936, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.91314441, + "num_input_tokens_seen": 63992165, + "step": 2961, + "time_per_iteration": 2.6163876056671143 + }, + { + "auxiliary_loss_clip": 0.01115183, + "auxiliary_loss_mlp": 0.01051772, + "balance_loss_clip": 1.05429649, + "balance_loss_mlp": 1.03178072, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 1.6345170373938518, + "language_loss": 0.79134417, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81301367, + "num_input_tokens_seen": 64013470, + "step": 2962, + "time_per_iteration": 2.6353790760040283 + }, + { + "auxiliary_loss_clip": 0.01145812, + "auxiliary_loss_mlp": 0.01055773, + "balance_loss_clip": 1.05503273, + "balance_loss_mlp": 1.03351617, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.666630943113837, + "language_loss": 0.74846959, + "learning_rate": 3.774159019458203e-06, + "loss": 0.77048546, + "num_input_tokens_seen": 64030975, + "step": 2963, + "time_per_iteration": 2.494568347930908 + }, + { + "auxiliary_loss_clip": 0.01142043, + "auxiliary_loss_mlp": 0.01046798, + "balance_loss_clip": 1.05823302, + "balance_loss_mlp": 1.02668691, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.815175761070858, + "language_loss": 0.78628916, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80817759, + "num_input_tokens_seen": 64050075, + "step": 2964, + "time_per_iteration": 2.5668118000030518 + }, + { + "auxiliary_loss_clip": 0.01153507, + "auxiliary_loss_mlp": 0.00810305, + "balance_loss_clip": 1.05846453, + "balance_loss_mlp": 1.03116012, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 2.0596756142120025, + "language_loss": 0.81510031, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83473849, + "num_input_tokens_seen": 64071920, + "step": 2965, + "time_per_iteration": 2.539760112762451 + }, + { + "auxiliary_loss_clip": 0.01148169, + "auxiliary_loss_mlp": 0.01043922, + "balance_loss_clip": 1.05505729, + "balance_loss_mlp": 1.02627921, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.2357623177728456, + "language_loss": 0.94421065, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.96613163, + "num_input_tokens_seen": 64086835, + "step": 2966, + "time_per_iteration": 2.463391065597534 + }, + { + "auxiliary_loss_clip": 0.01116936, + "auxiliary_loss_mlp": 0.00807526, + "balance_loss_clip": 1.05628777, + "balance_loss_mlp": 1.0272001, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.2411834757480515, + "language_loss": 0.730708, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74995267, + "num_input_tokens_seen": 64107360, + "step": 2967, + "time_per_iteration": 4.2493064403533936 + }, + { + "auxiliary_loss_clip": 0.01130226, + "auxiliary_loss_mlp": 0.01050694, + "balance_loss_clip": 1.05512106, + "balance_loss_mlp": 1.0313468, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 1.9294636075824487, + "language_loss": 0.7694909, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79130006, + "num_input_tokens_seen": 64124690, + "step": 2968, + "time_per_iteration": 3.89522647857666 + }, + { + "auxiliary_loss_clip": 0.01086208, + "auxiliary_loss_mlp": 0.01046969, + "balance_loss_clip": 1.049227, + "balance_loss_mlp": 1.0275141, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 1.9301122986116048, + "language_loss": 0.75794005, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.77927172, + "num_input_tokens_seen": 64146315, + "step": 2969, + "time_per_iteration": 2.6725833415985107 + }, + { + "auxiliary_loss_clip": 0.01052277, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.03901553, + "balance_loss_mlp": 1.03241432, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8489364856842697, + "language_loss": 0.69037056, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71124494, + "num_input_tokens_seen": 64210875, + "step": 2970, + "time_per_iteration": 3.2114129066467285 + }, + { + "auxiliary_loss_clip": 0.01127135, + "auxiliary_loss_mlp": 0.01043091, + "balance_loss_clip": 1.05182469, + "balance_loss_mlp": 1.02340984, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 1.9842444486096198, + "language_loss": 0.67778349, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69948578, + "num_input_tokens_seen": 64230740, + "step": 2971, + "time_per_iteration": 4.137200593948364 + }, + { + "auxiliary_loss_clip": 0.01117033, + "auxiliary_loss_mlp": 0.0104903, + "balance_loss_clip": 1.06215239, + "balance_loss_mlp": 1.02837086, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.7883837237609734, + "language_loss": 0.89930332, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.92096394, + "num_input_tokens_seen": 64252300, + "step": 2972, + "time_per_iteration": 2.642200469970703 + }, + { + "auxiliary_loss_clip": 0.01119764, + "auxiliary_loss_mlp": 0.01063871, + "balance_loss_clip": 1.05089903, + "balance_loss_mlp": 1.04244876, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.422009188626755, + "language_loss": 0.88810194, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90993828, + "num_input_tokens_seen": 64270105, + "step": 2973, + "time_per_iteration": 3.967153787612915 + }, + { + "auxiliary_loss_clip": 0.01159211, + "auxiliary_loss_mlp": 0.01056544, + "balance_loss_clip": 1.05417514, + "balance_loss_mlp": 1.03713655, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.600292413082903, + "language_loss": 0.76018912, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.78234673, + "num_input_tokens_seen": 64287250, + "step": 2974, + "time_per_iteration": 2.476093053817749 + }, + { + "auxiliary_loss_clip": 0.01137264, + "auxiliary_loss_mlp": 0.01049182, + "balance_loss_clip": 1.05524969, + "balance_loss_mlp": 1.03053725, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.942190638923503, + "language_loss": 0.7417407, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76360524, + "num_input_tokens_seen": 64307140, + "step": 2975, + "time_per_iteration": 2.5626392364501953 + }, + { + "auxiliary_loss_clip": 0.01146186, + "auxiliary_loss_mlp": 0.01056344, + "balance_loss_clip": 1.05167723, + "balance_loss_mlp": 1.03784227, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.7722499725876983, + "language_loss": 0.73453283, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.75655812, + "num_input_tokens_seen": 64328760, + "step": 2976, + "time_per_iteration": 2.52687931060791 + }, + { + "auxiliary_loss_clip": 0.01142246, + "auxiliary_loss_mlp": 0.01057605, + "balance_loss_clip": 1.05069447, + "balance_loss_mlp": 1.04156566, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.6049410023588304, + "language_loss": 0.77360809, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79560661, + "num_input_tokens_seen": 64348800, + "step": 2977, + "time_per_iteration": 2.538892984390259 + }, + { + "auxiliary_loss_clip": 0.01131891, + "auxiliary_loss_mlp": 0.01057257, + "balance_loss_clip": 1.0691061, + "balance_loss_mlp": 1.039101, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.002933069468797, + "language_loss": 0.80088222, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.82277369, + "num_input_tokens_seen": 64367955, + "step": 2978, + "time_per_iteration": 2.5949902534484863 + }, + { + "auxiliary_loss_clip": 0.01139165, + "auxiliary_loss_mlp": 0.0105252, + "balance_loss_clip": 1.05300617, + "balance_loss_mlp": 1.03292179, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.45521750891281, + "language_loss": 0.7597605, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78167737, + "num_input_tokens_seen": 64389805, + "step": 2979, + "time_per_iteration": 2.618556261062622 + }, + { + "auxiliary_loss_clip": 0.01116271, + "auxiliary_loss_mlp": 0.01053324, + "balance_loss_clip": 1.05147791, + "balance_loss_mlp": 1.03506088, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 1.74307087557171, + "language_loss": 0.69252264, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.71421856, + "num_input_tokens_seen": 64408220, + "step": 2980, + "time_per_iteration": 2.5602591037750244 + }, + { + "auxiliary_loss_clip": 0.0114631, + "auxiliary_loss_mlp": 0.01052111, + "balance_loss_clip": 1.05086327, + "balance_loss_mlp": 1.03139281, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.9401448111318798, + "language_loss": 0.71133256, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.73331678, + "num_input_tokens_seen": 64426380, + "step": 2981, + "time_per_iteration": 2.47927188873291 + }, + { + "auxiliary_loss_clip": 0.01127728, + "auxiliary_loss_mlp": 0.01060073, + "balance_loss_clip": 1.05087996, + "balance_loss_mlp": 1.0408926, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.1332203709780524, + "language_loss": 0.82051134, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.8423894, + "num_input_tokens_seen": 64444355, + "step": 2982, + "time_per_iteration": 2.491520404815674 + }, + { + "auxiliary_loss_clip": 0.01156488, + "auxiliary_loss_mlp": 0.01048009, + "balance_loss_clip": 1.05254304, + "balance_loss_mlp": 1.03012717, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 1.4991512572639922, + "language_loss": 0.82984984, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.8518948, + "num_input_tokens_seen": 64467800, + "step": 2983, + "time_per_iteration": 2.544684410095215 + }, + { + "auxiliary_loss_clip": 0.01148823, + "auxiliary_loss_mlp": 0.01055903, + "balance_loss_clip": 1.04920626, + "balance_loss_mlp": 1.03657937, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 2.0758128034831254, + "language_loss": 0.85186774, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87391508, + "num_input_tokens_seen": 64487230, + "step": 2984, + "time_per_iteration": 2.484694719314575 + }, + { + "auxiliary_loss_clip": 0.01122614, + "auxiliary_loss_mlp": 0.01048255, + "balance_loss_clip": 1.04867697, + "balance_loss_mlp": 1.02881169, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 1.6059347019433265, + "language_loss": 0.89364874, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91535747, + "num_input_tokens_seen": 64509165, + "step": 2985, + "time_per_iteration": 2.634368658065796 + }, + { + "auxiliary_loss_clip": 0.0115297, + "auxiliary_loss_mlp": 0.01048757, + "balance_loss_clip": 1.05143094, + "balance_loss_mlp": 1.03209198, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 2.1165325995121353, + "language_loss": 0.6966691, + "learning_rate": 3.770006252694922e-06, + "loss": 0.7186864, + "num_input_tokens_seen": 64527940, + "step": 2986, + "time_per_iteration": 2.4444000720977783 + }, + { + "auxiliary_loss_clip": 0.01152276, + "auxiliary_loss_mlp": 0.0079674, + "balance_loss_clip": 1.04967439, + "balance_loss_mlp": 1.01363623, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.783359728227293, + "language_loss": 0.77185631, + "learning_rate": 3.769824891588688e-06, + "loss": 0.79134643, + "num_input_tokens_seen": 64545230, + "step": 2987, + "time_per_iteration": 2.52290940284729 + }, + { + "auxiliary_loss_clip": 0.0115758, + "auxiliary_loss_mlp": 0.0104509, + "balance_loss_clip": 1.05029488, + "balance_loss_mlp": 1.02518225, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 1.744209969047335, + "language_loss": 0.78374046, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.80576718, + "num_input_tokens_seen": 64563820, + "step": 2988, + "time_per_iteration": 2.4398508071899414 + }, + { + "auxiliary_loss_clip": 0.01016111, + "auxiliary_loss_mlp": 0.01181222, + "balance_loss_clip": 1.03070402, + "balance_loss_mlp": 1.71153152, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7704829214551564, + "language_loss": 0.62785143, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64982474, + "num_input_tokens_seen": 64621315, + "step": 2989, + "time_per_iteration": 3.1377387046813965 + }, + { + "auxiliary_loss_clip": 0.01134246, + "auxiliary_loss_mlp": 0.0104258, + "balance_loss_clip": 1.05122733, + "balance_loss_mlp": 1.02457988, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 3.4872757119252746, + "language_loss": 0.70585704, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72762531, + "num_input_tokens_seen": 64639885, + "step": 2990, + "time_per_iteration": 2.533043146133423 + }, + { + "auxiliary_loss_clip": 0.01134341, + "auxiliary_loss_mlp": 0.01043714, + "balance_loss_clip": 1.05176306, + "balance_loss_mlp": 1.0259515, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.9639711331875813, + "language_loss": 0.68470728, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.70648777, + "num_input_tokens_seen": 64661220, + "step": 2991, + "time_per_iteration": 2.6862313747406006 + }, + { + "auxiliary_loss_clip": 0.01096572, + "auxiliary_loss_mlp": 0.01039946, + "balance_loss_clip": 1.046386, + "balance_loss_mlp": 1.02125394, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.462522516451752, + "language_loss": 0.82618791, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84755307, + "num_input_tokens_seen": 64682530, + "step": 2992, + "time_per_iteration": 2.6289451122283936 + }, + { + "auxiliary_loss_clip": 0.01138257, + "auxiliary_loss_mlp": 0.01042597, + "balance_loss_clip": 1.04864097, + "balance_loss_mlp": 1.02491856, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 1.9128477920315141, + "language_loss": 0.82128233, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84309083, + "num_input_tokens_seen": 64701025, + "step": 2993, + "time_per_iteration": 2.4856393337249756 + }, + { + "auxiliary_loss_clip": 0.01132431, + "auxiliary_loss_mlp": 0.01044049, + "balance_loss_clip": 1.04718649, + "balance_loss_mlp": 1.02513099, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.6130680162347455, + "language_loss": 0.78369772, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80546248, + "num_input_tokens_seen": 64719570, + "step": 2994, + "time_per_iteration": 2.5127711296081543 + }, + { + "auxiliary_loss_clip": 0.01155384, + "auxiliary_loss_mlp": 0.01042928, + "balance_loss_clip": 1.05014777, + "balance_loss_mlp": 1.02520192, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 2.3191833306215828, + "language_loss": 0.8078959, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82987905, + "num_input_tokens_seen": 64738110, + "step": 2995, + "time_per_iteration": 2.451361894607544 + }, + { + "auxiliary_loss_clip": 0.01142184, + "auxiliary_loss_mlp": 0.01046727, + "balance_loss_clip": 1.05161071, + "balance_loss_mlp": 1.02946556, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.7660657225275864, + "language_loss": 0.84471494, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86660403, + "num_input_tokens_seen": 64756345, + "step": 2996, + "time_per_iteration": 2.476933717727661 + }, + { + "auxiliary_loss_clip": 0.01117264, + "auxiliary_loss_mlp": 0.01041703, + "balance_loss_clip": 1.05009878, + "balance_loss_mlp": 1.02391648, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 1.6173462268177818, + "language_loss": 0.8791225, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90071219, + "num_input_tokens_seen": 64776375, + "step": 2997, + "time_per_iteration": 2.58276629447937 + }, + { + "auxiliary_loss_clip": 0.01133796, + "auxiliary_loss_mlp": 0.01045833, + "balance_loss_clip": 1.04833353, + "balance_loss_mlp": 1.02690291, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 4.081127300282766, + "language_loss": 0.85438192, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87617826, + "num_input_tokens_seen": 64796210, + "step": 2998, + "time_per_iteration": 2.550574541091919 + }, + { + "auxiliary_loss_clip": 0.01156082, + "auxiliary_loss_mlp": 0.01046001, + "balance_loss_clip": 1.05402803, + "balance_loss_mlp": 1.02864361, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 2.170713770723279, + "language_loss": 0.84199572, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86401659, + "num_input_tokens_seen": 64818590, + "step": 2999, + "time_per_iteration": 2.5431764125823975 + }, + { + "auxiliary_loss_clip": 0.01141226, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.04998398, + "balance_loss_mlp": 1.03043675, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.7999630525523629, + "language_loss": 0.74776196, + "learning_rate": 3.76746109252814e-06, + "loss": 0.76967198, + "num_input_tokens_seen": 64838350, + "step": 3000, + "time_per_iteration": 2.498661994934082 + }, + { + "auxiliary_loss_clip": 0.01129777, + "auxiliary_loss_mlp": 0.00921389, + "balance_loss_clip": 1.05021095, + "balance_loss_mlp": 1.24534202, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 1.8750708883012825, + "language_loss": 0.71183366, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73234534, + "num_input_tokens_seen": 64858065, + "step": 3001, + "time_per_iteration": 2.5668869018554688 + }, + { + "auxiliary_loss_clip": 0.01145984, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.05052125, + "balance_loss_mlp": 1.02805054, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.3021249917825073, + "language_loss": 0.88340354, + "learning_rate": 3.767096425420011e-06, + "loss": 0.90533066, + "num_input_tokens_seen": 64877305, + "step": 3002, + "time_per_iteration": 2.496974468231201 + }, + { + "auxiliary_loss_clip": 0.01154538, + "auxiliary_loss_mlp": 0.01046526, + "balance_loss_clip": 1.05124021, + "balance_loss_mlp": 1.02888346, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.7876322873087913, + "language_loss": 0.80654389, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.82855451, + "num_input_tokens_seen": 64896955, + "step": 3003, + "time_per_iteration": 2.46458101272583 + }, + { + "auxiliary_loss_clip": 0.0115587, + "auxiliary_loss_mlp": 0.01045139, + "balance_loss_clip": 1.05087829, + "balance_loss_mlp": 1.02735305, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 2.436893985401114, + "language_loss": 0.67536902, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69737911, + "num_input_tokens_seen": 64917080, + "step": 3004, + "time_per_iteration": 2.51444149017334 + }, + { + "auxiliary_loss_clip": 0.01143871, + "auxiliary_loss_mlp": 0.01047194, + "balance_loss_clip": 1.05086207, + "balance_loss_mlp": 1.02872872, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 2.8838017094796307, + "language_loss": 0.85379237, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87570298, + "num_input_tokens_seen": 64935215, + "step": 3005, + "time_per_iteration": 2.492448568344116 + }, + { + "auxiliary_loss_clip": 0.0113753, + "auxiliary_loss_mlp": 0.01043215, + "balance_loss_clip": 1.04862356, + "balance_loss_mlp": 1.02646589, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.63410259564564, + "language_loss": 0.83076191, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85256934, + "num_input_tokens_seen": 64956275, + "step": 3006, + "time_per_iteration": 5.353426456451416 + }, + { + "auxiliary_loss_clip": 0.01125479, + "auxiliary_loss_mlp": 0.01051436, + "balance_loss_clip": 1.04964197, + "balance_loss_mlp": 1.03229141, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.6617736784832295, + "language_loss": 0.77349281, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79526198, + "num_input_tokens_seen": 64979390, + "step": 3007, + "time_per_iteration": 2.588491439819336 + }, + { + "auxiliary_loss_clip": 0.01060962, + "auxiliary_loss_mlp": 0.01142458, + "balance_loss_clip": 1.0342288, + "balance_loss_mlp": 1.13869143, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.832232338729336, + "language_loss": 0.56997347, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59200776, + "num_input_tokens_seen": 65043135, + "step": 3008, + "time_per_iteration": 3.2681915760040283 + }, + { + "auxiliary_loss_clip": 0.01129715, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_clip": 1.05132258, + "balance_loss_mlp": 1.02988529, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 1.8147284224596485, + "language_loss": 0.67225385, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69403815, + "num_input_tokens_seen": 65062845, + "step": 3009, + "time_per_iteration": 2.5605862140655518 + }, + { + "auxiliary_loss_clip": 0.01156802, + "auxiliary_loss_mlp": 0.01040984, + "balance_loss_clip": 1.05197465, + "balance_loss_mlp": 1.02343678, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.734316243490035, + "language_loss": 0.75327349, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.77525139, + "num_input_tokens_seen": 65082110, + "step": 3010, + "time_per_iteration": 3.882336139678955 + }, + { + "auxiliary_loss_clip": 0.01124392, + "auxiliary_loss_mlp": 0.01038183, + "balance_loss_clip": 1.0531168, + "balance_loss_mlp": 1.02146983, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.5432644083753309, + "language_loss": 0.66847444, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.69010019, + "num_input_tokens_seen": 65101985, + "step": 3011, + "time_per_iteration": 2.5455150604248047 + }, + { + "auxiliary_loss_clip": 0.01104526, + "auxiliary_loss_mlp": 0.00888436, + "balance_loss_clip": 1.04520905, + "balance_loss_mlp": 1.17580414, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 2.4008576555381205, + "language_loss": 0.71091032, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73083991, + "num_input_tokens_seen": 65129295, + "step": 3012, + "time_per_iteration": 4.329438209533691 + }, + { + "auxiliary_loss_clip": 0.01133053, + "auxiliary_loss_mlp": 0.0104953, + "balance_loss_clip": 1.05447125, + "balance_loss_mlp": 1.03175616, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.186009145711596, + "language_loss": 0.6240747, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64590055, + "num_input_tokens_seen": 65150625, + "step": 3013, + "time_per_iteration": 2.6439266204833984 + }, + { + "auxiliary_loss_clip": 0.011292, + "auxiliary_loss_mlp": 0.0104913, + "balance_loss_clip": 1.0508852, + "balance_loss_mlp": 1.03167796, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.6275097663988767, + "language_loss": 0.7624439, + "learning_rate": 3.764902795998309e-06, + "loss": 0.78422725, + "num_input_tokens_seen": 65170880, + "step": 3014, + "time_per_iteration": 2.5523548126220703 + }, + { + "auxiliary_loss_clip": 0.01159059, + "auxiliary_loss_mlp": 0.01050494, + "balance_loss_clip": 1.05304039, + "balance_loss_mlp": 1.03151572, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.6048348246258177, + "language_loss": 0.65874755, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.68084306, + "num_input_tokens_seen": 65192530, + "step": 3015, + "time_per_iteration": 2.5248751640319824 + }, + { + "auxiliary_loss_clip": 0.01127777, + "auxiliary_loss_mlp": 0.00856035, + "balance_loss_clip": 1.0529263, + "balance_loss_mlp": 1.12223363, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.8759364840424069, + "language_loss": 0.77984363, + "learning_rate": 3.764536253816785e-06, + "loss": 0.79968178, + "num_input_tokens_seen": 65211675, + "step": 3016, + "time_per_iteration": 2.5431647300720215 + }, + { + "auxiliary_loss_clip": 0.01146265, + "auxiliary_loss_mlp": 0.01052754, + "balance_loss_clip": 1.05923653, + "balance_loss_mlp": 1.03317976, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 1.847018906288768, + "language_loss": 0.83317745, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85516763, + "num_input_tokens_seen": 65231185, + "step": 3017, + "time_per_iteration": 2.517423629760742 + }, + { + "auxiliary_loss_clip": 0.01137922, + "auxiliary_loss_mlp": 0.01044953, + "balance_loss_clip": 1.05007684, + "balance_loss_mlp": 1.02729809, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 2.572438735080716, + "language_loss": 0.67508125, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69691002, + "num_input_tokens_seen": 65251645, + "step": 3018, + "time_per_iteration": 2.6610617637634277 + }, + { + "auxiliary_loss_clip": 0.01143004, + "auxiliary_loss_mlp": 0.00836998, + "balance_loss_clip": 1.05155325, + "balance_loss_mlp": 1.08633137, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 3.5959550620740406, + "language_loss": 0.76214302, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.78194308, + "num_input_tokens_seen": 65271125, + "step": 3019, + "time_per_iteration": 2.518836736679077 + }, + { + "auxiliary_loss_clip": 0.01112079, + "auxiliary_loss_mlp": 0.01052473, + "balance_loss_clip": 1.04689407, + "balance_loss_mlp": 1.03391218, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.0383154848364913, + "language_loss": 0.8129406, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83458602, + "num_input_tokens_seen": 65290600, + "step": 3020, + "time_per_iteration": 2.5971333980560303 + }, + { + "auxiliary_loss_clip": 0.01130494, + "auxiliary_loss_mlp": 0.01046242, + "balance_loss_clip": 1.05212879, + "balance_loss_mlp": 1.02644134, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 2.519290447259399, + "language_loss": 0.77664095, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79840827, + "num_input_tokens_seen": 65311040, + "step": 3021, + "time_per_iteration": 2.571307420730591 + }, + { + "auxiliary_loss_clip": 0.01139989, + "auxiliary_loss_mlp": 0.01043632, + "balance_loss_clip": 1.04967475, + "balance_loss_mlp": 1.02544022, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.6882240826822532, + "language_loss": 0.8503387, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87217486, + "num_input_tokens_seen": 65332115, + "step": 3022, + "time_per_iteration": 2.5278990268707275 + }, + { + "auxiliary_loss_clip": 0.01127891, + "auxiliary_loss_mlp": 0.01047785, + "balance_loss_clip": 1.04890752, + "balance_loss_mlp": 1.0276866, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 1.9335829017026491, + "language_loss": 0.69473904, + "learning_rate": 3.763251248837859e-06, + "loss": 0.71649581, + "num_input_tokens_seen": 65352210, + "step": 3023, + "time_per_iteration": 2.577880620956421 + }, + { + "auxiliary_loss_clip": 0.01125279, + "auxiliary_loss_mlp": 0.01042451, + "balance_loss_clip": 1.0456773, + "balance_loss_mlp": 1.02419996, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.9938471822198907, + "language_loss": 0.74280739, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76448464, + "num_input_tokens_seen": 65370600, + "step": 3024, + "time_per_iteration": 2.5060901641845703 + }, + { + "auxiliary_loss_clip": 0.01139085, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.05050659, + "balance_loss_mlp": 1.02834988, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 2.07232941820835, + "language_loss": 0.88403791, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90589684, + "num_input_tokens_seen": 65387270, + "step": 3025, + "time_per_iteration": 2.487823009490967 + }, + { + "auxiliary_loss_clip": 0.01128431, + "auxiliary_loss_mlp": 0.01048904, + "balance_loss_clip": 1.04842067, + "balance_loss_mlp": 1.03039062, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 1.755384225735399, + "language_loss": 0.78998554, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.81175888, + "num_input_tokens_seen": 65406550, + "step": 3026, + "time_per_iteration": 2.5295052528381348 + }, + { + "auxiliary_loss_clip": 0.01123457, + "auxiliary_loss_mlp": 0.01050685, + "balance_loss_clip": 1.05085063, + "balance_loss_mlp": 1.03212392, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6237678337750117, + "language_loss": 0.76157218, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78331357, + "num_input_tokens_seen": 65425955, + "step": 3027, + "time_per_iteration": 2.58508563041687 + }, + { + "auxiliary_loss_clip": 0.01156619, + "auxiliary_loss_mlp": 0.01050518, + "balance_loss_clip": 1.05017304, + "balance_loss_mlp": 1.03144443, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.92110293280215, + "language_loss": 0.85164356, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87371492, + "num_input_tokens_seen": 65442820, + "step": 3028, + "time_per_iteration": 2.461099863052368 + }, + { + "auxiliary_loss_clip": 0.01150103, + "auxiliary_loss_mlp": 0.01042013, + "balance_loss_clip": 1.04958451, + "balance_loss_mlp": 1.02443004, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.76432801193489, + "language_loss": 0.82592273, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.84784383, + "num_input_tokens_seen": 65461825, + "step": 3029, + "time_per_iteration": 2.530620813369751 + }, + { + "auxiliary_loss_clip": 0.01112183, + "auxiliary_loss_mlp": 0.01048256, + "balance_loss_clip": 1.04885101, + "balance_loss_mlp": 1.02766812, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.383843248053209, + "language_loss": 0.77767396, + "learning_rate": 3.761962967588891e-06, + "loss": 0.79927832, + "num_input_tokens_seen": 65479480, + "step": 3030, + "time_per_iteration": 2.544044256210327 + }, + { + "auxiliary_loss_clip": 0.01134863, + "auxiliary_loss_mlp": 0.01049217, + "balance_loss_clip": 1.04780674, + "balance_loss_mlp": 1.03023887, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 2.147355474320735, + "language_loss": 0.84680527, + "learning_rate": 3.761778660099352e-06, + "loss": 0.86864603, + "num_input_tokens_seen": 65497775, + "step": 3031, + "time_per_iteration": 2.503438711166382 + }, + { + "auxiliary_loss_clip": 0.01114433, + "auxiliary_loss_mlp": 0.00799019, + "balance_loss_clip": 1.04642272, + "balance_loss_mlp": 1.01588941, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.816780386456844, + "language_loss": 0.79894567, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81808019, + "num_input_tokens_seen": 65516505, + "step": 3032, + "time_per_iteration": 2.555082082748413 + }, + { + "auxiliary_loss_clip": 0.01157713, + "auxiliary_loss_mlp": 0.01052839, + "balance_loss_clip": 1.05180621, + "balance_loss_mlp": 1.03445685, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 3.5027688852048864, + "language_loss": 0.81180245, + "learning_rate": 3.761409844706795e-06, + "loss": 0.83390796, + "num_input_tokens_seen": 65536160, + "step": 3033, + "time_per_iteration": 2.522714376449585 + }, + { + "auxiliary_loss_clip": 0.01045753, + "auxiliary_loss_mlp": 0.01005405, + "balance_loss_clip": 1.04537964, + "balance_loss_mlp": 1.00209129, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8803895618834763, + "language_loss": 0.63503361, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.65554512, + "num_input_tokens_seen": 65589375, + "step": 3034, + "time_per_iteration": 3.121440887451172 + }, + { + "auxiliary_loss_clip": 0.01120553, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.04749155, + "balance_loss_mlp": 1.02265751, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 2.4173441212920204, + "language_loss": 0.79455721, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81616002, + "num_input_tokens_seen": 65606720, + "step": 3035, + "time_per_iteration": 2.547461748123169 + }, + { + "auxiliary_loss_clip": 0.01124515, + "auxiliary_loss_mlp": 0.01043108, + "balance_loss_clip": 1.04666245, + "balance_loss_mlp": 1.02616882, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 1.8993409684698206, + "language_loss": 0.85167599, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87335217, + "num_input_tokens_seen": 65625495, + "step": 3036, + "time_per_iteration": 2.536027669906616 + }, + { + "auxiliary_loss_clip": 0.01136625, + "auxiliary_loss_mlp": 0.01041347, + "balance_loss_clip": 1.05230117, + "balance_loss_mlp": 1.02424026, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 1.941360197198144, + "language_loss": 0.79997623, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82175589, + "num_input_tokens_seen": 65643515, + "step": 3037, + "time_per_iteration": 2.489978075027466 + }, + { + "auxiliary_loss_clip": 0.01134634, + "auxiliary_loss_mlp": 0.00802021, + "balance_loss_clip": 1.05343008, + "balance_loss_mlp": 1.02217436, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 3.0855967349767783, + "language_loss": 0.79188943, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81125599, + "num_input_tokens_seen": 65658155, + "step": 3038, + "time_per_iteration": 2.496943712234497 + }, + { + "auxiliary_loss_clip": 0.01120084, + "auxiliary_loss_mlp": 0.01051857, + "balance_loss_clip": 1.04518008, + "balance_loss_mlp": 1.03321242, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 2.3292607438771724, + "language_loss": 0.67858362, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.70030302, + "num_input_tokens_seen": 65679310, + "step": 3039, + "time_per_iteration": 2.644120216369629 + }, + { + "auxiliary_loss_clip": 0.01125927, + "auxiliary_loss_mlp": 0.01048927, + "balance_loss_clip": 1.0490737, + "balance_loss_mlp": 1.03034258, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 4.270624063924718, + "language_loss": 0.73843759, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.76018608, + "num_input_tokens_seen": 65705235, + "step": 3040, + "time_per_iteration": 2.813457727432251 + }, + { + "auxiliary_loss_clip": 0.01137662, + "auxiliary_loss_mlp": 0.0104323, + "balance_loss_clip": 1.04861677, + "balance_loss_mlp": 1.02477646, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 2.0987730895272594, + "language_loss": 0.59925443, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62106329, + "num_input_tokens_seen": 65727575, + "step": 3041, + "time_per_iteration": 2.5746612548828125 + }, + { + "auxiliary_loss_clip": 0.01115929, + "auxiliary_loss_mlp": 0.01050737, + "balance_loss_clip": 1.04949045, + "balance_loss_mlp": 1.03259373, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.7230038706479454, + "language_loss": 0.59818041, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.61984706, + "num_input_tokens_seen": 65751370, + "step": 3042, + "time_per_iteration": 2.856874942779541 + }, + { + "auxiliary_loss_clip": 0.0112052, + "auxiliary_loss_mlp": 0.01047073, + "balance_loss_clip": 1.04972649, + "balance_loss_mlp": 1.02926278, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.7002060007697757, + "language_loss": 0.87647045, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.89814645, + "num_input_tokens_seen": 65771040, + "step": 3043, + "time_per_iteration": 2.5682122707366943 + }, + { + "auxiliary_loss_clip": 0.01055352, + "auxiliary_loss_mlp": 0.01051113, + "balance_loss_clip": 1.04976773, + "balance_loss_mlp": 1.03078806, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 2.222263798331082, + "language_loss": 0.70991659, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.73098123, + "num_input_tokens_seen": 65789345, + "step": 3044, + "time_per_iteration": 4.109682559967041 + }, + { + "auxiliary_loss_clip": 0.01101691, + "auxiliary_loss_mlp": 0.01049206, + "balance_loss_clip": 1.04916775, + "balance_loss_mlp": 1.03002512, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 2.2139638467993743, + "language_loss": 0.64183354, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66334254, + "num_input_tokens_seen": 65810990, + "step": 3045, + "time_per_iteration": 4.099668264389038 + }, + { + "auxiliary_loss_clip": 0.01151257, + "auxiliary_loss_mlp": 0.01044936, + "balance_loss_clip": 1.05058086, + "balance_loss_mlp": 1.02736449, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 7.627348590818992, + "language_loss": 0.79761964, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81958157, + "num_input_tokens_seen": 65827230, + "step": 3046, + "time_per_iteration": 2.4398183822631836 + }, + { + "auxiliary_loss_clip": 0.01115213, + "auxiliary_loss_mlp": 0.01044748, + "balance_loss_clip": 1.04415274, + "balance_loss_mlp": 1.02628255, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 1.9832759410969631, + "language_loss": 0.79380405, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81540358, + "num_input_tokens_seen": 65845900, + "step": 3047, + "time_per_iteration": 2.556051015853882 + }, + { + "auxiliary_loss_clip": 0.01138514, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.04990268, + "balance_loss_mlp": 1.02427101, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.4956425557172026, + "language_loss": 0.81094456, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83274174, + "num_input_tokens_seen": 65868730, + "step": 3048, + "time_per_iteration": 3.9817967414855957 + }, + { + "auxiliary_loss_clip": 0.01135013, + "auxiliary_loss_mlp": 0.01045857, + "balance_loss_clip": 1.04692936, + "balance_loss_mlp": 1.02587724, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 1.8157378427220674, + "language_loss": 0.86672133, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88853002, + "num_input_tokens_seen": 65888420, + "step": 3049, + "time_per_iteration": 2.513273239135742 + }, + { + "auxiliary_loss_clip": 0.01139509, + "auxiliary_loss_mlp": 0.01049137, + "balance_loss_clip": 1.04845548, + "balance_loss_mlp": 1.02899051, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.492652110335141, + "language_loss": 0.77594405, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79783052, + "num_input_tokens_seen": 65905840, + "step": 3050, + "time_per_iteration": 2.465207099914551 + }, + { + "auxiliary_loss_clip": 0.01123233, + "auxiliary_loss_mlp": 0.01039915, + "balance_loss_clip": 1.04478538, + "balance_loss_mlp": 1.02198553, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 1.9689854727808425, + "language_loss": 0.99462783, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.01625919, + "num_input_tokens_seen": 65922845, + "step": 3051, + "time_per_iteration": 3.9788222312927246 + }, + { + "auxiliary_loss_clip": 0.01121681, + "auxiliary_loss_mlp": 0.01039334, + "balance_loss_clip": 1.04715991, + "balance_loss_mlp": 1.02004576, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.672531151101026, + "language_loss": 0.86072123, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.88233137, + "num_input_tokens_seen": 65945555, + "step": 3052, + "time_per_iteration": 2.6032707691192627 + }, + { + "auxiliary_loss_clip": 0.01148519, + "auxiliary_loss_mlp": 0.01044179, + "balance_loss_clip": 1.04846859, + "balance_loss_mlp": 1.02628589, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.9876039844477378, + "language_loss": 0.73149574, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75342268, + "num_input_tokens_seen": 65963965, + "step": 3053, + "time_per_iteration": 2.4534664154052734 + }, + { + "auxiliary_loss_clip": 0.01154114, + "auxiliary_loss_mlp": 0.01043306, + "balance_loss_clip": 1.05102777, + "balance_loss_mlp": 1.02478147, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.9103898394068557, + "language_loss": 0.61786938, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.63984364, + "num_input_tokens_seen": 65985965, + "step": 3054, + "time_per_iteration": 2.54599666595459 + }, + { + "auxiliary_loss_clip": 0.01103799, + "auxiliary_loss_mlp": 0.01047439, + "balance_loss_clip": 1.05103052, + "balance_loss_mlp": 1.03017771, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.0949643679263215, + "language_loss": 0.78160024, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80311263, + "num_input_tokens_seen": 66005645, + "step": 3055, + "time_per_iteration": 2.6206202507019043 + }, + { + "auxiliary_loss_clip": 0.01092097, + "auxiliary_loss_mlp": 0.01052607, + "balance_loss_clip": 1.0456965, + "balance_loss_mlp": 1.0337956, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 2.579964854704961, + "language_loss": 0.7009266, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72237366, + "num_input_tokens_seen": 66025675, + "step": 3056, + "time_per_iteration": 2.654783248901367 + }, + { + "auxiliary_loss_clip": 0.01139668, + "auxiliary_loss_mlp": 0.0104184, + "balance_loss_clip": 1.04824591, + "balance_loss_mlp": 1.02466178, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.522596740654723, + "language_loss": 0.80604124, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.8278563, + "num_input_tokens_seen": 66046125, + "step": 3057, + "time_per_iteration": 2.4931893348693848 + }, + { + "auxiliary_loss_clip": 0.01145774, + "auxiliary_loss_mlp": 0.01042374, + "balance_loss_clip": 1.04888403, + "balance_loss_mlp": 1.02183425, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.1576110602964205, + "language_loss": 0.82501209, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84689355, + "num_input_tokens_seen": 66064375, + "step": 3058, + "time_per_iteration": 2.4946346282958984 + }, + { + "auxiliary_loss_clip": 0.01113085, + "auxiliary_loss_mlp": 0.00802743, + "balance_loss_clip": 1.04399538, + "balance_loss_mlp": 1.02248967, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.7544625627371118, + "language_loss": 0.85848188, + "learning_rate": 3.756590952429017e-06, + "loss": 0.87764013, + "num_input_tokens_seen": 66084590, + "step": 3059, + "time_per_iteration": 2.58817458152771 + }, + { + "auxiliary_loss_clip": 0.0114813, + "auxiliary_loss_mlp": 0.00801536, + "balance_loss_clip": 1.04827285, + "balance_loss_mlp": 1.02199769, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 1.5258600786635184, + "language_loss": 0.72777092, + "learning_rate": 3.756404710389396e-06, + "loss": 0.7472676, + "num_input_tokens_seen": 66107105, + "step": 3060, + "time_per_iteration": 2.5587899684906006 + }, + { + "auxiliary_loss_clip": 0.01142343, + "auxiliary_loss_mlp": 0.01037989, + "balance_loss_clip": 1.04887867, + "balance_loss_mlp": 1.01915431, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.9628524085829961, + "language_loss": 0.72702318, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.7488265, + "num_input_tokens_seen": 66129295, + "step": 3061, + "time_per_iteration": 2.5410821437835693 + }, + { + "auxiliary_loss_clip": 0.01133042, + "auxiliary_loss_mlp": 0.0104823, + "balance_loss_clip": 1.04785919, + "balance_loss_mlp": 1.02914417, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 2.3964887658841176, + "language_loss": 0.81591642, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.8377291, + "num_input_tokens_seen": 66146910, + "step": 3062, + "time_per_iteration": 2.510471820831299 + }, + { + "auxiliary_loss_clip": 0.01138764, + "auxiliary_loss_mlp": 0.01042435, + "balance_loss_clip": 1.05029237, + "balance_loss_mlp": 1.02426767, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 2.382880010780113, + "language_loss": 0.7276963, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.74950832, + "num_input_tokens_seen": 66165370, + "step": 3063, + "time_per_iteration": 2.498452663421631 + }, + { + "auxiliary_loss_clip": 0.0112776, + "auxiliary_loss_mlp": 0.01037247, + "balance_loss_clip": 1.04674637, + "balance_loss_mlp": 1.02147579, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 1.7063326577187148, + "language_loss": 0.65845263, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.68010271, + "num_input_tokens_seen": 66186210, + "step": 3064, + "time_per_iteration": 2.5683610439300537 + }, + { + "auxiliary_loss_clip": 0.01136273, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_clip": 1.04829431, + "balance_loss_mlp": 1.02810907, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 1.9732491546125797, + "language_loss": 0.68794978, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.709768, + "num_input_tokens_seen": 66204800, + "step": 3065, + "time_per_iteration": 2.5219485759735107 + }, + { + "auxiliary_loss_clip": 0.01131389, + "auxiliary_loss_mlp": 0.01041872, + "balance_loss_clip": 1.04947162, + "balance_loss_mlp": 1.02321553, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 3.16796414263879, + "language_loss": 0.73329222, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.75502485, + "num_input_tokens_seen": 66222195, + "step": 3066, + "time_per_iteration": 2.593531847000122 + }, + { + "auxiliary_loss_clip": 0.01124078, + "auxiliary_loss_mlp": 0.010383, + "balance_loss_clip": 1.04920459, + "balance_loss_mlp": 1.0208714, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 2.16173903392107, + "language_loss": 0.81802773, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.83965158, + "num_input_tokens_seen": 66239505, + "step": 3067, + "time_per_iteration": 2.5086212158203125 + }, + { + "auxiliary_loss_clip": 0.01080239, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.03781104, + "balance_loss_mlp": 1.43463171, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7940779191365099, + "language_loss": 0.59711432, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61825311, + "num_input_tokens_seen": 66295695, + "step": 3068, + "time_per_iteration": 2.94635009765625 + }, + { + "auxiliary_loss_clip": 0.01121956, + "auxiliary_loss_mlp": 0.0103793, + "balance_loss_clip": 1.05223703, + "balance_loss_mlp": 1.0216105, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.716663015529881, + "language_loss": 0.76590329, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78750217, + "num_input_tokens_seen": 66315315, + "step": 3069, + "time_per_iteration": 2.5468287467956543 + }, + { + "auxiliary_loss_clip": 0.01139705, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.04732788, + "balance_loss_mlp": 1.02419376, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.9678401891460418, + "language_loss": 0.84665549, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.86846936, + "num_input_tokens_seen": 66333675, + "step": 3070, + "time_per_iteration": 2.5022239685058594 + }, + { + "auxiliary_loss_clip": 0.0111995, + "auxiliary_loss_mlp": 0.01045263, + "balance_loss_clip": 1.04642272, + "balance_loss_mlp": 1.02695227, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 2.0233314103901523, + "language_loss": 0.77879, + "learning_rate": 3.754351653708265e-06, + "loss": 0.8004421, + "num_input_tokens_seen": 66354075, + "step": 3071, + "time_per_iteration": 2.5916078090667725 + }, + { + "auxiliary_loss_clip": 0.01108532, + "auxiliary_loss_mlp": 0.01044425, + "balance_loss_clip": 1.05208874, + "balance_loss_mlp": 1.0270561, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 1.9459660169171797, + "language_loss": 0.76967871, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79120827, + "num_input_tokens_seen": 66372520, + "step": 3072, + "time_per_iteration": 2.5603513717651367 + }, + { + "auxiliary_loss_clip": 0.01135489, + "auxiliary_loss_mlp": 0.01041117, + "balance_loss_clip": 1.04911721, + "balance_loss_mlp": 1.02265108, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 2.2883955609671034, + "language_loss": 0.86214983, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88391584, + "num_input_tokens_seen": 66390745, + "step": 3073, + "time_per_iteration": 2.487492322921753 + }, + { + "auxiliary_loss_clip": 0.0115172, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.05145657, + "balance_loss_mlp": 1.02616906, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.3549350129025335, + "language_loss": 0.91308981, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.93503416, + "num_input_tokens_seen": 66410525, + "step": 3074, + "time_per_iteration": 2.4682490825653076 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_clip": 1.04643607, + "balance_loss_mlp": 1.02630329, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.7614725777912241, + "language_loss": 0.64965224, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67111653, + "num_input_tokens_seen": 66432535, + "step": 3075, + "time_per_iteration": 2.6090705394744873 + }, + { + "auxiliary_loss_clip": 0.01130095, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.05128849, + "balance_loss_mlp": 1.02317572, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.7856848149354143, + "language_loss": 0.73173785, + "learning_rate": 3.753415784551761e-06, + "loss": 0.75344253, + "num_input_tokens_seen": 66450620, + "step": 3076, + "time_per_iteration": 2.5256187915802 + }, + { + "auxiliary_loss_clip": 0.01115072, + "auxiliary_loss_mlp": 0.01037941, + "balance_loss_clip": 1.05146003, + "balance_loss_mlp": 1.02145422, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.363784441141818, + "language_loss": 0.81269217, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83422232, + "num_input_tokens_seen": 66467865, + "step": 3077, + "time_per_iteration": 2.5930373668670654 + }, + { + "auxiliary_loss_clip": 0.011202, + "auxiliary_loss_mlp": 0.01043445, + "balance_loss_clip": 1.04882789, + "balance_loss_mlp": 1.02664876, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 2.112828914358928, + "language_loss": 0.78953946, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.81117594, + "num_input_tokens_seen": 66486245, + "step": 3078, + "time_per_iteration": 2.5690486431121826 + }, + { + "auxiliary_loss_clip": 0.01150232, + "auxiliary_loss_mlp": 0.01047011, + "balance_loss_clip": 1.04968452, + "balance_loss_mlp": 1.03051209, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 2.086056853094844, + "language_loss": 0.77418655, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79615897, + "num_input_tokens_seen": 66506510, + "step": 3079, + "time_per_iteration": 2.52986216545105 + }, + { + "auxiliary_loss_clip": 0.01115944, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.04744911, + "balance_loss_mlp": 1.02146292, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 1.8103780844410122, + "language_loss": 0.82101977, + "learning_rate": 3.752665892369369e-06, + "loss": 0.84256518, + "num_input_tokens_seen": 66530960, + "step": 3080, + "time_per_iteration": 2.782543659210205 + }, + { + "auxiliary_loss_clip": 0.01110571, + "auxiliary_loss_mlp": 0.01045729, + "balance_loss_clip": 1.04607666, + "balance_loss_mlp": 1.02768087, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 1.8618077101822463, + "language_loss": 0.74423003, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.76579297, + "num_input_tokens_seen": 66550275, + "step": 3081, + "time_per_iteration": 2.615497589111328 + }, + { + "auxiliary_loss_clip": 0.01123855, + "auxiliary_loss_mlp": 0.01053373, + "balance_loss_clip": 1.05157971, + "balance_loss_mlp": 1.03541994, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.0844464833460634, + "language_loss": 0.72034919, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.74212146, + "num_input_tokens_seen": 66569040, + "step": 3082, + "time_per_iteration": 2.577605724334717 + }, + { + "auxiliary_loss_clip": 0.01119939, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_clip": 1.05342233, + "balance_loss_mlp": 1.03066218, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 2.477494426279642, + "language_loss": 0.69530118, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71699154, + "num_input_tokens_seen": 66587775, + "step": 3083, + "time_per_iteration": 3.943408727645874 + }, + { + "auxiliary_loss_clip": 0.01119309, + "auxiliary_loss_mlp": 0.01048838, + "balance_loss_clip": 1.0493331, + "balance_loss_mlp": 1.03173113, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 2.5278872220185153, + "language_loss": 0.68733144, + "learning_rate": 3.751914936806767e-06, + "loss": 0.70901287, + "num_input_tokens_seen": 66610800, + "step": 3084, + "time_per_iteration": 2.6786043643951416 + }, + { + "auxiliary_loss_clip": 0.01149162, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.04949522, + "balance_loss_mlp": 1.02436566, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.575877891965453, + "language_loss": 0.78138876, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80329418, + "num_input_tokens_seen": 66630960, + "step": 3085, + "time_per_iteration": 2.5270917415618896 + }, + { + "auxiliary_loss_clip": 0.01148856, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_clip": 1.04882574, + "balance_loss_mlp": 1.03093529, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 2.204581040648067, + "language_loss": 0.73339611, + "learning_rate": 3.751539060400244e-06, + "loss": 0.7553612, + "num_input_tokens_seen": 66650585, + "step": 3086, + "time_per_iteration": 2.5025787353515625 + }, + { + "auxiliary_loss_clip": 0.01137533, + "auxiliary_loss_mlp": 0.01044697, + "balance_loss_clip": 1.04881334, + "balance_loss_mlp": 1.02670801, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 2.7222905177005496, + "language_loss": 0.69978809, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72161043, + "num_input_tokens_seen": 66670045, + "step": 3087, + "time_per_iteration": 3.920544385910034 + }, + { + "auxiliary_loss_clip": 0.01111702, + "auxiliary_loss_mlp": 0.0104868, + "balance_loss_clip": 1.04850507, + "balance_loss_mlp": 1.03003573, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.2290092395405114, + "language_loss": 0.72391838, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.7455222, + "num_input_tokens_seen": 66688790, + "step": 3088, + "time_per_iteration": 2.5305416584014893 + }, + { + "auxiliary_loss_clip": 0.01125026, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_clip": 1.04864609, + "balance_loss_mlp": 1.02584159, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 1.9347994149812453, + "language_loss": 0.92510533, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94678533, + "num_input_tokens_seen": 66708090, + "step": 3089, + "time_per_iteration": 2.5448367595672607 + }, + { + "auxiliary_loss_clip": 0.01099744, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.04734337, + "balance_loss_mlp": 1.02311504, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.9771058074317103, + "language_loss": 0.57897866, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.60038859, + "num_input_tokens_seen": 66727320, + "step": 3090, + "time_per_iteration": 4.047714471817017 + }, + { + "auxiliary_loss_clip": 0.01123541, + "auxiliary_loss_mlp": 0.01043226, + "balance_loss_clip": 1.04704332, + "balance_loss_mlp": 1.02545214, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.7428998132119318, + "language_loss": 0.81814951, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.83981717, + "num_input_tokens_seen": 66747505, + "step": 3091, + "time_per_iteration": 2.548675298690796 + }, + { + "auxiliary_loss_clip": 0.01100407, + "auxiliary_loss_mlp": 0.01047538, + "balance_loss_clip": 1.05060649, + "balance_loss_mlp": 1.02960908, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.824681383892353, + "language_loss": 0.84140384, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.86288333, + "num_input_tokens_seen": 66766425, + "step": 3092, + "time_per_iteration": 2.594717264175415 + }, + { + "auxiliary_loss_clip": 0.0113232, + "auxiliary_loss_mlp": 0.01044708, + "balance_loss_clip": 1.05010223, + "balance_loss_mlp": 1.02700543, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 2.4554560254813222, + "language_loss": 0.93363714, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95540738, + "num_input_tokens_seen": 66781130, + "step": 3093, + "time_per_iteration": 2.483088254928589 + }, + { + "auxiliary_loss_clip": 0.01125791, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.0513432, + "balance_loss_mlp": 1.0212276, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 1.9381110238798234, + "language_loss": 0.77577317, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79742384, + "num_input_tokens_seen": 66797535, + "step": 3094, + "time_per_iteration": 2.520418643951416 + }, + { + "auxiliary_loss_clip": 0.01090066, + "auxiliary_loss_mlp": 0.0104444, + "balance_loss_clip": 1.04865122, + "balance_loss_mlp": 1.02853799, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.6883284309125253, + "language_loss": 0.69915903, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72050411, + "num_input_tokens_seen": 66821720, + "step": 3095, + "time_per_iteration": 2.8828094005584717 + }, + { + "auxiliary_loss_clip": 0.01105937, + "auxiliary_loss_mlp": 0.01047806, + "balance_loss_clip": 1.04636264, + "balance_loss_mlp": 1.02764726, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 2.701053987258419, + "language_loss": 0.80605948, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82759684, + "num_input_tokens_seen": 66839060, + "step": 3096, + "time_per_iteration": 2.5307981967926025 + }, + { + "auxiliary_loss_clip": 0.01142109, + "auxiliary_loss_mlp": 0.01044447, + "balance_loss_clip": 1.05121088, + "balance_loss_mlp": 1.02619576, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.0599506016461744, + "language_loss": 0.74928552, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77115113, + "num_input_tokens_seen": 66857760, + "step": 3097, + "time_per_iteration": 2.5017731189727783 + }, + { + "auxiliary_loss_clip": 0.01126968, + "auxiliary_loss_mlp": 0.01040437, + "balance_loss_clip": 1.051736, + "balance_loss_mlp": 1.02329433, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 3.076231370208987, + "language_loss": 0.66519624, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68687028, + "num_input_tokens_seen": 66876460, + "step": 3098, + "time_per_iteration": 2.5133728981018066 + }, + { + "auxiliary_loss_clip": 0.01155515, + "auxiliary_loss_mlp": 0.01047103, + "balance_loss_clip": 1.05186284, + "balance_loss_mlp": 1.02750468, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.6806715123253373, + "language_loss": 0.69731337, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.71933955, + "num_input_tokens_seen": 66897960, + "step": 3099, + "time_per_iteration": 2.500502586364746 + }, + { + "auxiliary_loss_clip": 0.01145787, + "auxiliary_loss_mlp": 0.01051155, + "balance_loss_clip": 1.05427432, + "balance_loss_mlp": 1.03233242, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.5751821065655003, + "language_loss": 0.72072864, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.74269807, + "num_input_tokens_seen": 66917675, + "step": 3100, + "time_per_iteration": 2.5126194953918457 + }, + { + "auxiliary_loss_clip": 0.01131234, + "auxiliary_loss_mlp": 0.01049043, + "balance_loss_clip": 1.0506202, + "balance_loss_mlp": 1.0305419, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 2.1709066462401374, + "language_loss": 0.80442011, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82622284, + "num_input_tokens_seen": 66936000, + "step": 3101, + "time_per_iteration": 2.600951671600342 + }, + { + "auxiliary_loss_clip": 0.01106484, + "auxiliary_loss_mlp": 0.01041968, + "balance_loss_clip": 1.05099702, + "balance_loss_mlp": 1.02502847, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 12.509489298863416, + "language_loss": 0.77376753, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.79525208, + "num_input_tokens_seen": 66955700, + "step": 3102, + "time_per_iteration": 2.59336519241333 + }, + { + "auxiliary_loss_clip": 0.011459, + "auxiliary_loss_mlp": 0.01039978, + "balance_loss_clip": 1.05109727, + "balance_loss_mlp": 1.02238226, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.648559357207612, + "language_loss": 0.76482046, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.78667921, + "num_input_tokens_seen": 66972815, + "step": 3103, + "time_per_iteration": 2.502370595932007 + }, + { + "auxiliary_loss_clip": 0.01130083, + "auxiliary_loss_mlp": 0.0104286, + "balance_loss_clip": 1.05139387, + "balance_loss_mlp": 1.02502596, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 2.1434524373939983, + "language_loss": 0.79194182, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81367123, + "num_input_tokens_seen": 66992280, + "step": 3104, + "time_per_iteration": 2.521860361099243 + }, + { + "auxiliary_loss_clip": 0.01104575, + "auxiliary_loss_mlp": 0.01048774, + "balance_loss_clip": 1.05004692, + "balance_loss_mlp": 1.02918792, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.8484207999780256, + "language_loss": 0.85143232, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87296581, + "num_input_tokens_seen": 67012220, + "step": 3105, + "time_per_iteration": 2.6337544918060303 + }, + { + "auxiliary_loss_clip": 0.01122488, + "auxiliary_loss_mlp": 0.01041457, + "balance_loss_clip": 1.05227411, + "balance_loss_mlp": 1.02221656, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.9537184223659891, + "language_loss": 0.87117696, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89281631, + "num_input_tokens_seen": 67032030, + "step": 3106, + "time_per_iteration": 2.6148219108581543 + }, + { + "auxiliary_loss_clip": 0.01146064, + "auxiliary_loss_mlp": 0.01041221, + "balance_loss_clip": 1.05292046, + "balance_loss_mlp": 1.02308953, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 2.0598563380315897, + "language_loss": 0.78379792, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.8056708, + "num_input_tokens_seen": 67048920, + "step": 3107, + "time_per_iteration": 2.506296157836914 + }, + { + "auxiliary_loss_clip": 0.01147444, + "auxiliary_loss_mlp": 0.01050681, + "balance_loss_clip": 1.05191696, + "balance_loss_mlp": 1.03073752, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 2.6609821942071554, + "language_loss": 0.74351448, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76549578, + "num_input_tokens_seen": 67068645, + "step": 3108, + "time_per_iteration": 2.5623955726623535 + }, + { + "auxiliary_loss_clip": 0.01106227, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.05180824, + "balance_loss_mlp": 1.02580428, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.621525458217393, + "language_loss": 0.74324751, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76475936, + "num_input_tokens_seen": 67087075, + "step": 3109, + "time_per_iteration": 2.5804126262664795 + }, + { + "auxiliary_loss_clip": 0.01146294, + "auxiliary_loss_mlp": 0.0104539, + "balance_loss_clip": 1.05607414, + "balance_loss_mlp": 1.02667451, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.50832115242426, + "language_loss": 0.84388179, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86579859, + "num_input_tokens_seen": 67108040, + "step": 3110, + "time_per_iteration": 2.5255074501037598 + }, + { + "auxiliary_loss_clip": 0.01146019, + "auxiliary_loss_mlp": 0.01046129, + "balance_loss_clip": 1.05870283, + "balance_loss_mlp": 1.02710366, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.585798863719456, + "language_loss": 0.84511781, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86703926, + "num_input_tokens_seen": 67127605, + "step": 3111, + "time_per_iteration": 2.5558245182037354 + }, + { + "auxiliary_loss_clip": 0.01129778, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.05585754, + "balance_loss_mlp": 1.0244081, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.8508455899820442, + "language_loss": 0.76561999, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.7873404, + "num_input_tokens_seen": 67145785, + "step": 3112, + "time_per_iteration": 2.5254390239715576 + }, + { + "auxiliary_loss_clip": 0.01145501, + "auxiliary_loss_mlp": 0.01043759, + "balance_loss_clip": 1.05343652, + "balance_loss_mlp": 1.02602077, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 1.8611499340789506, + "language_loss": 0.64874673, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.6706394, + "num_input_tokens_seen": 67165930, + "step": 3113, + "time_per_iteration": 2.5528404712677 + }, + { + "auxiliary_loss_clip": 0.01154639, + "auxiliary_loss_mlp": 0.01043146, + "balance_loss_clip": 1.05802035, + "balance_loss_mlp": 1.02440596, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 3.214071877337392, + "language_loss": 0.81084085, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83281869, + "num_input_tokens_seen": 67185830, + "step": 3114, + "time_per_iteration": 2.5382561683654785 + }, + { + "auxiliary_loss_clip": 0.01110655, + "auxiliary_loss_mlp": 0.01050046, + "balance_loss_clip": 1.05757356, + "balance_loss_mlp": 1.02993512, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 16.832589873260904, + "language_loss": 0.57366586, + "learning_rate": 3.74605902628851e-06, + "loss": 0.5952729, + "num_input_tokens_seen": 67206930, + "step": 3115, + "time_per_iteration": 2.632845163345337 + }, + { + "auxiliary_loss_clip": 0.0111921, + "auxiliary_loss_mlp": 0.01056416, + "balance_loss_clip": 1.04986918, + "balance_loss_mlp": 1.03568602, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 2.2715935179971884, + "language_loss": 0.71301734, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73477364, + "num_input_tokens_seen": 67226290, + "step": 3116, + "time_per_iteration": 2.5672287940979004 + }, + { + "auxiliary_loss_clip": 0.01152633, + "auxiliary_loss_mlp": 0.01035792, + "balance_loss_clip": 1.05427766, + "balance_loss_mlp": 1.01800644, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 2.2712902489541302, + "language_loss": 0.79156202, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.81344628, + "num_input_tokens_seen": 67244410, + "step": 3117, + "time_per_iteration": 2.467670202255249 + }, + { + "auxiliary_loss_clip": 0.01137376, + "auxiliary_loss_mlp": 0.010459, + "balance_loss_clip": 1.05664897, + "balance_loss_mlp": 1.02741027, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.810810998212889, + "language_loss": 0.84140038, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86323315, + "num_input_tokens_seen": 67264470, + "step": 3118, + "time_per_iteration": 2.6290299892425537 + }, + { + "auxiliary_loss_clip": 0.01149504, + "auxiliary_loss_mlp": 0.01047661, + "balance_loss_clip": 1.05797732, + "balance_loss_mlp": 1.03048337, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.874953408214316, + "language_loss": 0.76747, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78944159, + "num_input_tokens_seen": 67284315, + "step": 3119, + "time_per_iteration": 2.5235116481781006 + }, + { + "auxiliary_loss_clip": 0.01157087, + "auxiliary_loss_mlp": 0.01048192, + "balance_loss_clip": 1.05461001, + "balance_loss_mlp": 1.03099012, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.773234312306949, + "language_loss": 0.82316542, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84521818, + "num_input_tokens_seen": 67302780, + "step": 3120, + "time_per_iteration": 2.4811439514160156 + }, + { + "auxiliary_loss_clip": 0.01133081, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.05465388, + "balance_loss_mlp": 1.02001297, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 2.1301678503823935, + "language_loss": 0.84947604, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.87117672, + "num_input_tokens_seen": 67323405, + "step": 3121, + "time_per_iteration": 2.5861263275146484 + }, + { + "auxiliary_loss_clip": 0.01102948, + "auxiliary_loss_mlp": 0.01045671, + "balance_loss_clip": 1.05325556, + "balance_loss_mlp": 1.02740812, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 1.8660693236759498, + "language_loss": 0.70085263, + "learning_rate": 3.744727910244937e-06, + "loss": 0.7223388, + "num_input_tokens_seen": 67345800, + "step": 3122, + "time_per_iteration": 5.541229724884033 + }, + { + "auxiliary_loss_clip": 0.0115375, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_clip": 1.05394304, + "balance_loss_mlp": 1.02406061, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.313320283487551, + "language_loss": 0.70765674, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72962767, + "num_input_tokens_seen": 67363575, + "step": 3123, + "time_per_iteration": 2.4887890815734863 + }, + { + "auxiliary_loss_clip": 0.01149016, + "auxiliary_loss_mlp": 0.0104474, + "balance_loss_clip": 1.05778861, + "balance_loss_mlp": 1.02766967, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 2.2689133989347496, + "language_loss": 0.74061, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76254761, + "num_input_tokens_seen": 67381765, + "step": 3124, + "time_per_iteration": 2.547609329223633 + }, + { + "auxiliary_loss_clip": 0.01158569, + "auxiliary_loss_mlp": 0.01050467, + "balance_loss_clip": 1.05605769, + "balance_loss_mlp": 1.03122616, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.7449987423048428, + "language_loss": 0.80309111, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82518148, + "num_input_tokens_seen": 67405000, + "step": 3125, + "time_per_iteration": 2.655449628829956 + }, + { + "auxiliary_loss_clip": 0.01054827, + "auxiliary_loss_mlp": 0.0100223, + "balance_loss_clip": 1.05279732, + "balance_loss_mlp": 0.99908334, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9485236786419264, + "language_loss": 0.63581312, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65638369, + "num_input_tokens_seen": 67467140, + "step": 3126, + "time_per_iteration": 4.642847299575806 + }, + { + "auxiliary_loss_clip": 0.01129138, + "auxiliary_loss_mlp": 0.01040972, + "balance_loss_clip": 1.05679226, + "balance_loss_mlp": 1.02365065, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.6152107839059424, + "language_loss": 0.81562865, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83732975, + "num_input_tokens_seen": 67487980, + "step": 3127, + "time_per_iteration": 2.6079609394073486 + }, + { + "auxiliary_loss_clip": 0.01095939, + "auxiliary_loss_mlp": 0.01001559, + "balance_loss_clip": 1.05485034, + "balance_loss_mlp": 0.99798292, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7616735039825855, + "language_loss": 0.61884987, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.63982487, + "num_input_tokens_seen": 67552500, + "step": 3128, + "time_per_iteration": 3.158031463623047 + }, + { + "auxiliary_loss_clip": 0.01105159, + "auxiliary_loss_mlp": 0.010516, + "balance_loss_clip": 1.04524243, + "balance_loss_mlp": 1.03115535, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.1351766080006276, + "language_loss": 0.71328521, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73485279, + "num_input_tokens_seen": 67573295, + "step": 3129, + "time_per_iteration": 4.139815092086792 + }, + { + "auxiliary_loss_clip": 0.01153037, + "auxiliary_loss_mlp": 0.01047578, + "balance_loss_clip": 1.05226612, + "balance_loss_mlp": 1.02936244, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 1.8722798737773303, + "language_loss": 0.85386759, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87587374, + "num_input_tokens_seen": 67590010, + "step": 3130, + "time_per_iteration": 2.483309268951416 + }, + { + "auxiliary_loss_clip": 0.01105965, + "auxiliary_loss_mlp": 0.01052706, + "balance_loss_clip": 1.04861379, + "balance_loss_mlp": 1.03434777, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 1.9349992610665965, + "language_loss": 0.76662588, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.7882126, + "num_input_tokens_seen": 67611110, + "step": 3131, + "time_per_iteration": 2.6089823246002197 + }, + { + "auxiliary_loss_clip": 0.01121369, + "auxiliary_loss_mlp": 0.01051756, + "balance_loss_clip": 1.05152321, + "balance_loss_mlp": 1.03312349, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 2.739358372754681, + "language_loss": 0.81386244, + "learning_rate": 3.74282069289017e-06, + "loss": 0.8355937, + "num_input_tokens_seen": 67631990, + "step": 3132, + "time_per_iteration": 2.6457366943359375 + }, + { + "auxiliary_loss_clip": 0.01093236, + "auxiliary_loss_mlp": 0.00861407, + "balance_loss_clip": 1.05040884, + "balance_loss_mlp": 1.12681246, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 2.429540064108657, + "language_loss": 0.80056357, + "learning_rate": 3.742629607551614e-06, + "loss": 0.82011002, + "num_input_tokens_seen": 67650490, + "step": 3133, + "time_per_iteration": 2.6529829502105713 + }, + { + "auxiliary_loss_clip": 0.01115145, + "auxiliary_loss_mlp": 0.01057059, + "balance_loss_clip": 1.04798937, + "balance_loss_mlp": 1.03773499, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 3.218304039361362, + "language_loss": 0.82768905, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.84941113, + "num_input_tokens_seen": 67668860, + "step": 3134, + "time_per_iteration": 2.5607805252075195 + }, + { + "auxiliary_loss_clip": 0.01125933, + "auxiliary_loss_mlp": 0.01055015, + "balance_loss_clip": 1.04672325, + "balance_loss_mlp": 1.0367645, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.8410570938681095, + "language_loss": 0.83068079, + "learning_rate": 3.742247238639684e-06, + "loss": 0.85249019, + "num_input_tokens_seen": 67690220, + "step": 3135, + "time_per_iteration": 2.588998556137085 + }, + { + "auxiliary_loss_clip": 0.01137759, + "auxiliary_loss_mlp": 0.0104639, + "balance_loss_clip": 1.05010724, + "balance_loss_mlp": 1.02879429, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 2.3729982336318973, + "language_loss": 0.78126276, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.80310428, + "num_input_tokens_seen": 67709820, + "step": 3136, + "time_per_iteration": 2.59486722946167 + }, + { + "auxiliary_loss_clip": 0.01135105, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.05540645, + "balance_loss_mlp": 1.02753437, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 2.792536715879011, + "language_loss": 0.80920219, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83101106, + "num_input_tokens_seen": 67729490, + "step": 3137, + "time_per_iteration": 2.55684757232666 + }, + { + "auxiliary_loss_clip": 0.01154003, + "auxiliary_loss_mlp": 0.01044509, + "balance_loss_clip": 1.0529933, + "balance_loss_mlp": 1.02817726, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.684333843490975, + "language_loss": 0.80857658, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83056176, + "num_input_tokens_seen": 67749665, + "step": 3138, + "time_per_iteration": 2.4614830017089844 + }, + { + "auxiliary_loss_clip": 0.01142649, + "auxiliary_loss_mlp": 0.0105575, + "balance_loss_clip": 1.04961538, + "balance_loss_mlp": 1.03805912, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 2.1928299576356514, + "language_loss": 0.637743, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.65972698, + "num_input_tokens_seen": 67776230, + "step": 3139, + "time_per_iteration": 2.6656370162963867 + }, + { + "auxiliary_loss_clip": 0.01147128, + "auxiliary_loss_mlp": 0.01041451, + "balance_loss_clip": 1.04748297, + "balance_loss_mlp": 1.02390301, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 2.390915533982515, + "language_loss": 0.71573037, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73761618, + "num_input_tokens_seen": 67795080, + "step": 3140, + "time_per_iteration": 2.448986768722534 + }, + { + "auxiliary_loss_clip": 0.01149948, + "auxiliary_loss_mlp": 0.01044863, + "balance_loss_clip": 1.04775953, + "balance_loss_mlp": 1.02615869, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 2.837267779820146, + "language_loss": 0.87176251, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89371061, + "num_input_tokens_seen": 67813110, + "step": 3141, + "time_per_iteration": 2.4590044021606445 + }, + { + "auxiliary_loss_clip": 0.01129874, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.05082643, + "balance_loss_mlp": 1.02291274, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 1.9230278645302414, + "language_loss": 0.77511013, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79683083, + "num_input_tokens_seen": 67831070, + "step": 3142, + "time_per_iteration": 2.4964609146118164 + }, + { + "auxiliary_loss_clip": 0.01124937, + "auxiliary_loss_mlp": 0.01039824, + "balance_loss_clip": 1.0519954, + "balance_loss_mlp": 1.02396965, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.7161157722232474, + "language_loss": 0.79051793, + "learning_rate": 3.740715120924971e-06, + "loss": 0.8121655, + "num_input_tokens_seen": 67852170, + "step": 3143, + "time_per_iteration": 2.604966163635254 + }, + { + "auxiliary_loss_clip": 0.01117021, + "auxiliary_loss_mlp": 0.01045017, + "balance_loss_clip": 1.04959381, + "balance_loss_mlp": 1.02718329, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 2.296211626538829, + "language_loss": 0.71433902, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73595941, + "num_input_tokens_seen": 67869945, + "step": 3144, + "time_per_iteration": 2.5658531188964844 + }, + { + "auxiliary_loss_clip": 0.01121426, + "auxiliary_loss_mlp": 0.01047033, + "balance_loss_clip": 1.04734325, + "balance_loss_mlp": 1.02806687, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.3934980820132044, + "language_loss": 0.73537242, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75705707, + "num_input_tokens_seen": 67890240, + "step": 3145, + "time_per_iteration": 2.579765796661377 + }, + { + "auxiliary_loss_clip": 0.01110249, + "auxiliary_loss_mlp": 0.01042109, + "balance_loss_clip": 1.04836643, + "balance_loss_mlp": 1.02463293, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.287367767842877, + "language_loss": 0.76335245, + "learning_rate": 3.740139487448616e-06, + "loss": 0.78487599, + "num_input_tokens_seen": 67907825, + "step": 3146, + "time_per_iteration": 2.5484111309051514 + }, + { + "auxiliary_loss_clip": 0.01094328, + "auxiliary_loss_mlp": 0.01048161, + "balance_loss_clip": 1.04731989, + "balance_loss_mlp": 1.02873015, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 1.801036692118683, + "language_loss": 0.78431177, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80573666, + "num_input_tokens_seen": 67926670, + "step": 3147, + "time_per_iteration": 2.594238758087158 + }, + { + "auxiliary_loss_clip": 0.01139248, + "auxiliary_loss_mlp": 0.01043541, + "balance_loss_clip": 1.05026007, + "balance_loss_mlp": 1.02651751, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 2.7881785024500174, + "language_loss": 0.67721522, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69904315, + "num_input_tokens_seen": 67943645, + "step": 3148, + "time_per_iteration": 2.5157783031463623 + }, + { + "auxiliary_loss_clip": 0.0111221, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.0487802, + "balance_loss_mlp": 1.02007198, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 4.335875345147421, + "language_loss": 0.75466877, + "learning_rate": 3.739563260095902e-06, + "loss": 0.77616054, + "num_input_tokens_seen": 67962345, + "step": 3149, + "time_per_iteration": 2.5807857513427734 + }, + { + "auxiliary_loss_clip": 0.01131255, + "auxiliary_loss_mlp": 0.01041833, + "balance_loss_clip": 1.0532701, + "balance_loss_mlp": 1.02538264, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.6745940393010486, + "language_loss": 0.80833673, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.83006763, + "num_input_tokens_seen": 67979760, + "step": 3150, + "time_per_iteration": 2.5003201961517334 + }, + { + "auxiliary_loss_clip": 0.01133646, + "auxiliary_loss_mlp": 0.01048665, + "balance_loss_clip": 1.05087316, + "balance_loss_mlp": 1.03124869, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.192040602204963, + "language_loss": 0.85416484, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87598795, + "num_input_tokens_seen": 67996895, + "step": 3151, + "time_per_iteration": 2.515146493911743 + }, + { + "auxiliary_loss_clip": 0.01115984, + "auxiliary_loss_mlp": 0.01046396, + "balance_loss_clip": 1.0494597, + "balance_loss_mlp": 1.02882504, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 2.2682169020949208, + "language_loss": 0.74344379, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76506758, + "num_input_tokens_seen": 68018365, + "step": 3152, + "time_per_iteration": 2.620814085006714 + }, + { + "auxiliary_loss_clip": 0.01122908, + "auxiliary_loss_mlp": 0.01040603, + "balance_loss_clip": 1.05359054, + "balance_loss_mlp": 1.02230477, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 2.1571687451511035, + "language_loss": 0.75515741, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77679253, + "num_input_tokens_seen": 68037985, + "step": 3153, + "time_per_iteration": 2.60398268699646 + }, + { + "auxiliary_loss_clip": 0.01156851, + "auxiliary_loss_mlp": 0.01046606, + "balance_loss_clip": 1.05503583, + "balance_loss_mlp": 1.02890348, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 1.991562524014051, + "language_loss": 0.79263568, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81467026, + "num_input_tokens_seen": 68057975, + "step": 3154, + "time_per_iteration": 2.471313238143921 + }, + { + "auxiliary_loss_clip": 0.01113229, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_clip": 1.04617906, + "balance_loss_mlp": 1.03215694, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.459614285361488, + "language_loss": 0.7260896, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74773633, + "num_input_tokens_seen": 68074175, + "step": 3155, + "time_per_iteration": 2.543168783187866 + }, + { + "auxiliary_loss_clip": 0.01125782, + "auxiliary_loss_mlp": 0.01040861, + "balance_loss_clip": 1.05263019, + "balance_loss_mlp": 1.02346873, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.9262185060066148, + "language_loss": 0.74085855, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76252496, + "num_input_tokens_seen": 68095230, + "step": 3156, + "time_per_iteration": 2.5644567012786865 + }, + { + "auxiliary_loss_clip": 0.01154657, + "auxiliary_loss_mlp": 0.01036422, + "balance_loss_clip": 1.05285203, + "balance_loss_mlp": 1.02014995, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 2.0136307409696945, + "language_loss": 0.6851272, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70703804, + "num_input_tokens_seen": 68113805, + "step": 3157, + "time_per_iteration": 2.513479709625244 + }, + { + "auxiliary_loss_clip": 0.01116053, + "auxiliary_loss_mlp": 0.01042306, + "balance_loss_clip": 1.05185533, + "balance_loss_mlp": 1.0250448, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 1.87248048120988, + "language_loss": 0.79659534, + "learning_rate": 3.737831016747176e-06, + "loss": 0.81817889, + "num_input_tokens_seen": 68133190, + "step": 3158, + "time_per_iteration": 2.5977070331573486 + }, + { + "auxiliary_loss_clip": 0.01159632, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.05497706, + "balance_loss_mlp": 1.02059078, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 1.7985147850493108, + "language_loss": 0.72232509, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74430895, + "num_input_tokens_seen": 68152330, + "step": 3159, + "time_per_iteration": 2.520251750946045 + }, + { + "auxiliary_loss_clip": 0.0114881, + "auxiliary_loss_mlp": 0.01043493, + "balance_loss_clip": 1.05892944, + "balance_loss_mlp": 1.02518213, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 2.052464138762349, + "language_loss": 0.84915459, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.87107766, + "num_input_tokens_seen": 68170185, + "step": 3160, + "time_per_iteration": 3.872096300125122 + }, + { + "auxiliary_loss_clip": 0.01134257, + "auxiliary_loss_mlp": 0.01043575, + "balance_loss_clip": 1.05487597, + "balance_loss_mlp": 1.0267663, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 1.9732324573788618, + "language_loss": 0.738029, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75980723, + "num_input_tokens_seen": 68191665, + "step": 3161, + "time_per_iteration": 3.9717118740081787 + }, + { + "auxiliary_loss_clip": 0.01134354, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_clip": 1.04970181, + "balance_loss_mlp": 1.03038263, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 2.584442324043975, + "language_loss": 0.8078618, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.8297013, + "num_input_tokens_seen": 68214635, + "step": 3162, + "time_per_iteration": 2.6530282497406006 + }, + { + "auxiliary_loss_clip": 0.01156605, + "auxiliary_loss_mlp": 0.01038415, + "balance_loss_clip": 1.0555445, + "balance_loss_mlp": 1.02035534, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 2.213712549662201, + "language_loss": 0.75113177, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77308202, + "num_input_tokens_seen": 68232150, + "step": 3163, + "time_per_iteration": 2.453986167907715 + }, + { + "auxiliary_loss_clip": 0.01097874, + "auxiliary_loss_mlp": 0.01047035, + "balance_loss_clip": 1.05051947, + "balance_loss_mlp": 1.02805734, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 5.5148660894389705, + "language_loss": 0.74428195, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76573104, + "num_input_tokens_seen": 68253370, + "step": 3164, + "time_per_iteration": 2.740471363067627 + }, + { + "auxiliary_loss_clip": 0.01139773, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.05293989, + "balance_loss_mlp": 1.01729584, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.4886743482912517, + "language_loss": 0.66815525, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.68990284, + "num_input_tokens_seen": 68278895, + "step": 3165, + "time_per_iteration": 4.264160871505737 + }, + { + "auxiliary_loss_clip": 0.01144192, + "auxiliary_loss_mlp": 0.01045761, + "balance_loss_clip": 1.05406356, + "balance_loss_mlp": 1.02685475, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.264479347608474, + "language_loss": 0.74551225, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76741183, + "num_input_tokens_seen": 68294880, + "step": 3166, + "time_per_iteration": 2.49582839012146 + }, + { + "auxiliary_loss_clip": 0.01051974, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.04178309, + "balance_loss_mlp": 1.02801287, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.7809703193964058, + "language_loss": 0.50365973, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52449584, + "num_input_tokens_seen": 68359665, + "step": 3167, + "time_per_iteration": 4.57387375831604 + }, + { + "auxiliary_loss_clip": 0.01134425, + "auxiliary_loss_mlp": 0.01047349, + "balance_loss_clip": 1.05223072, + "balance_loss_mlp": 1.03005219, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.7251222301897158, + "language_loss": 0.74446106, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.7662788, + "num_input_tokens_seen": 68378950, + "step": 3168, + "time_per_iteration": 2.5324606895446777 + }, + { + "auxiliary_loss_clip": 0.0104654, + "auxiliary_loss_mlp": 0.01026165, + "balance_loss_clip": 1.04538429, + "balance_loss_mlp": 1.02215958, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8618994983497107, + "language_loss": 0.60080254, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62152958, + "num_input_tokens_seen": 68434235, + "step": 3169, + "time_per_iteration": 3.084881067276001 + }, + { + "auxiliary_loss_clip": 0.01100922, + "auxiliary_loss_mlp": 0.01045677, + "balance_loss_clip": 1.05237222, + "balance_loss_mlp": 1.02772391, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.528538023908902, + "language_loss": 0.78332877, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80479479, + "num_input_tokens_seen": 68453830, + "step": 3170, + "time_per_iteration": 2.667311429977417 + }, + { + "auxiliary_loss_clip": 0.01137995, + "auxiliary_loss_mlp": 0.01040602, + "balance_loss_clip": 1.05046082, + "balance_loss_mlp": 1.02337599, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.8022990796524612, + "language_loss": 0.78557408, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80736005, + "num_input_tokens_seen": 68473005, + "step": 3171, + "time_per_iteration": 2.5501315593719482 + }, + { + "auxiliary_loss_clip": 0.01154214, + "auxiliary_loss_mlp": 0.01039188, + "balance_loss_clip": 1.0514214, + "balance_loss_mlp": 1.02122295, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 2.518474228196192, + "language_loss": 0.7850731, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80700713, + "num_input_tokens_seen": 68493470, + "step": 3172, + "time_per_iteration": 2.6355881690979004 + }, + { + "auxiliary_loss_clip": 0.01142465, + "auxiliary_loss_mlp": 0.01049564, + "balance_loss_clip": 1.0514698, + "balance_loss_mlp": 1.03132463, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.6148597307678, + "language_loss": 0.80158269, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82350302, + "num_input_tokens_seen": 68511290, + "step": 3173, + "time_per_iteration": 2.508340835571289 + }, + { + "auxiliary_loss_clip": 0.01115119, + "auxiliary_loss_mlp": 0.00820695, + "balance_loss_clip": 1.05007529, + "balance_loss_mlp": 1.05415177, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 3.073745500484498, + "language_loss": 0.78793609, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.80729425, + "num_input_tokens_seen": 68532575, + "step": 3174, + "time_per_iteration": 2.6058740615844727 + }, + { + "auxiliary_loss_clip": 0.01105906, + "auxiliary_loss_mlp": 0.01045613, + "balance_loss_clip": 1.05221391, + "balance_loss_mlp": 1.02706361, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 3.2278587820142004, + "language_loss": 0.81073898, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.83225423, + "num_input_tokens_seen": 68548760, + "step": 3175, + "time_per_iteration": 2.589191436767578 + }, + { + "auxiliary_loss_clip": 0.01079814, + "auxiliary_loss_mlp": 0.01056029, + "balance_loss_clip": 1.04832006, + "balance_loss_mlp": 1.03687191, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.576244461817974, + "language_loss": 0.85596657, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87732506, + "num_input_tokens_seen": 68563100, + "step": 3176, + "time_per_iteration": 2.5945746898651123 + }, + { + "auxiliary_loss_clip": 0.01135796, + "auxiliary_loss_mlp": 0.01048391, + "balance_loss_clip": 1.05574751, + "balance_loss_mlp": 1.02893651, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 4.08661053896423, + "language_loss": 0.80961251, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.8314544, + "num_input_tokens_seen": 68581650, + "step": 3177, + "time_per_iteration": 2.5872857570648193 + }, + { + "auxiliary_loss_clip": 0.01122665, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.05095506, + "balance_loss_mlp": 1.02158237, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 2.1730871014979685, + "language_loss": 0.74718797, + "learning_rate": 3.73396248424356e-06, + "loss": 0.76881164, + "num_input_tokens_seen": 68600360, + "step": 3178, + "time_per_iteration": 2.5505893230438232 + }, + { + "auxiliary_loss_clip": 0.01141092, + "auxiliary_loss_mlp": 0.01035392, + "balance_loss_clip": 1.0502696, + "balance_loss_mlp": 1.01815486, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 1.8404925008512711, + "language_loss": 0.81425065, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83601546, + "num_input_tokens_seen": 68617885, + "step": 3179, + "time_per_iteration": 2.5137524604797363 + }, + { + "auxiliary_loss_clip": 0.01144295, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.05353594, + "balance_loss_mlp": 1.02280366, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 4.669816937423525, + "language_loss": 0.79197061, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81381577, + "num_input_tokens_seen": 68634550, + "step": 3180, + "time_per_iteration": 2.4949803352355957 + }, + { + "auxiliary_loss_clip": 0.01128206, + "auxiliary_loss_mlp": 0.01044132, + "balance_loss_clip": 1.05099142, + "balance_loss_mlp": 1.02493978, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.9737449335503987, + "language_loss": 0.79144049, + "learning_rate": 3.733379934486615e-06, + "loss": 0.81316388, + "num_input_tokens_seen": 68651895, + "step": 3181, + "time_per_iteration": 2.530320644378662 + }, + { + "auxiliary_loss_clip": 0.01140117, + "auxiliary_loss_mlp": 0.01046865, + "balance_loss_clip": 1.05099607, + "balance_loss_mlp": 1.02887654, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.9501726010691187, + "language_loss": 0.73871809, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76058793, + "num_input_tokens_seen": 68671500, + "step": 3182, + "time_per_iteration": 2.5142581462860107 + }, + { + "auxiliary_loss_clip": 0.01124323, + "auxiliary_loss_mlp": 0.01040924, + "balance_loss_clip": 1.0515548, + "balance_loss_mlp": 1.02261317, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.7390907652565806, + "language_loss": 0.64848828, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67014074, + "num_input_tokens_seen": 68690570, + "step": 3183, + "time_per_iteration": 2.5721189975738525 + }, + { + "auxiliary_loss_clip": 0.0112752, + "auxiliary_loss_mlp": 0.01045033, + "balance_loss_clip": 1.04731596, + "balance_loss_mlp": 1.02654326, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 3.1487972078892126, + "language_loss": 0.73467326, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.7563988, + "num_input_tokens_seen": 68709735, + "step": 3184, + "time_per_iteration": 2.5792317390441895 + }, + { + "auxiliary_loss_clip": 0.01121718, + "auxiliary_loss_mlp": 0.01049602, + "balance_loss_clip": 1.04897296, + "balance_loss_mlp": 1.02918148, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 1.8936875711150327, + "language_loss": 0.88044155, + "learning_rate": 3.732602281292598e-06, + "loss": 0.90215474, + "num_input_tokens_seen": 68727565, + "step": 3185, + "time_per_iteration": 2.5387885570526123 + }, + { + "auxiliary_loss_clip": 0.01150492, + "auxiliary_loss_mlp": 0.01041556, + "balance_loss_clip": 1.05008197, + "balance_loss_mlp": 1.02323329, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.1384034525308677, + "language_loss": 0.72902596, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.7509464, + "num_input_tokens_seen": 68748110, + "step": 3186, + "time_per_iteration": 2.483429193496704 + }, + { + "auxiliary_loss_clip": 0.01131003, + "auxiliary_loss_mlp": 0.01041689, + "balance_loss_clip": 1.05495715, + "balance_loss_mlp": 1.0213635, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 1.880594914617483, + "language_loss": 0.83779466, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.85952151, + "num_input_tokens_seen": 68769765, + "step": 3187, + "time_per_iteration": 2.6088030338287354 + }, + { + "auxiliary_loss_clip": 0.0106546, + "auxiliary_loss_mlp": 0.01004489, + "balance_loss_clip": 1.02995658, + "balance_loss_mlp": 1.00177085, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8501622468882478, + "language_loss": 0.5579688, + "learning_rate": 3.732018351516544e-06, + "loss": 0.57866824, + "num_input_tokens_seen": 68826815, + "step": 3188, + "time_per_iteration": 3.146822452545166 + }, + { + "auxiliary_loss_clip": 0.01135721, + "auxiliary_loss_mlp": 0.01047798, + "balance_loss_clip": 1.05115223, + "balance_loss_mlp": 1.03044116, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.75700400607208, + "language_loss": 0.7000708, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72190607, + "num_input_tokens_seen": 68847585, + "step": 3189, + "time_per_iteration": 2.594344139099121 + }, + { + "auxiliary_loss_clip": 0.01118277, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.04966748, + "balance_loss_mlp": 1.01957262, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 2.0413280908600098, + "language_loss": 0.74097604, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76251894, + "num_input_tokens_seen": 68866620, + "step": 3190, + "time_per_iteration": 2.5525827407836914 + }, + { + "auxiliary_loss_clip": 0.01108637, + "auxiliary_loss_mlp": 0.01061104, + "balance_loss_clip": 1.04744673, + "balance_loss_mlp": 1.04182839, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 2.570864697177665, + "language_loss": 0.84073281, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86243021, + "num_input_tokens_seen": 68885515, + "step": 3191, + "time_per_iteration": 2.561770439147949 + }, + { + "auxiliary_loss_clip": 0.01122262, + "auxiliary_loss_mlp": 0.01045366, + "balance_loss_clip": 1.04541671, + "balance_loss_mlp": 1.02804494, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.875468212231769, + "language_loss": 0.89403325, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91570956, + "num_input_tokens_seen": 68903225, + "step": 3192, + "time_per_iteration": 2.5288219451904297 + }, + { + "auxiliary_loss_clip": 0.0112405, + "auxiliary_loss_mlp": 0.01050428, + "balance_loss_clip": 1.0509758, + "balance_loss_mlp": 1.03139007, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 2.24748754906377, + "language_loss": 0.75186032, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77360511, + "num_input_tokens_seen": 68922860, + "step": 3193, + "time_per_iteration": 2.5830509662628174 + }, + { + "auxiliary_loss_clip": 0.01124003, + "auxiliary_loss_mlp": 0.00803102, + "balance_loss_clip": 1.04957592, + "balance_loss_mlp": 1.02353334, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 2.7244440915051524, + "language_loss": 0.74765766, + "learning_rate": 3.730848718849612e-06, + "loss": 0.76692867, + "num_input_tokens_seen": 68943000, + "step": 3194, + "time_per_iteration": 2.5639002323150635 + }, + { + "auxiliary_loss_clip": 0.01061011, + "auxiliary_loss_mlp": 0.01016301, + "balance_loss_clip": 1.02703571, + "balance_loss_mlp": 1.01308239, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.8596416944852697, + "language_loss": 0.68554509, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70631814, + "num_input_tokens_seen": 69000255, + "step": 3195, + "time_per_iteration": 3.041114568710327 + }, + { + "auxiliary_loss_clip": 0.01113685, + "auxiliary_loss_mlp": 0.01056241, + "balance_loss_clip": 1.04779851, + "balance_loss_mlp": 1.03671432, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 2.6900070510889202, + "language_loss": 0.73099351, + "learning_rate": 3.730458316143429e-06, + "loss": 0.7526927, + "num_input_tokens_seen": 69019665, + "step": 3196, + "time_per_iteration": 2.5798418521881104 + }, + { + "auxiliary_loss_clip": 0.01134024, + "auxiliary_loss_mlp": 0.01047124, + "balance_loss_clip": 1.05590618, + "balance_loss_mlp": 1.02932596, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 2.024231566370262, + "language_loss": 0.83458531, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85639679, + "num_input_tokens_seen": 69039055, + "step": 3197, + "time_per_iteration": 2.5140347480773926 + }, + { + "auxiliary_loss_clip": 0.01089955, + "auxiliary_loss_mlp": 0.01052249, + "balance_loss_clip": 1.05185485, + "balance_loss_mlp": 1.03244877, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.17408933000405, + "language_loss": 0.80037606, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82179815, + "num_input_tokens_seen": 69056370, + "step": 3198, + "time_per_iteration": 2.627760887145996 + }, + { + "auxiliary_loss_clip": 0.01130352, + "auxiliary_loss_mlp": 0.01051281, + "balance_loss_clip": 1.04923964, + "balance_loss_mlp": 1.03297019, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 2.340084961024336, + "language_loss": 0.78796262, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80977893, + "num_input_tokens_seen": 69075915, + "step": 3199, + "time_per_iteration": 5.352616786956787 + }, + { + "auxiliary_loss_clip": 0.01116488, + "auxiliary_loss_mlp": 0.01053525, + "balance_loss_clip": 1.0486263, + "balance_loss_mlp": 1.03492868, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 3.2914731610694616, + "language_loss": 0.84007913, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.86177927, + "num_input_tokens_seen": 69094145, + "step": 3200, + "time_per_iteration": 2.539914131164551 + }, + { + "auxiliary_loss_clip": 0.01153726, + "auxiliary_loss_mlp": 0.01051028, + "balance_loss_clip": 1.05207229, + "balance_loss_mlp": 1.03381407, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 2.039963542116121, + "language_loss": 0.79519033, + "learning_rate": 3.729481161172443e-06, + "loss": 0.81723785, + "num_input_tokens_seen": 69111110, + "step": 3201, + "time_per_iteration": 2.4444971084594727 + }, + { + "auxiliary_loss_clip": 0.01100117, + "auxiliary_loss_mlp": 0.01045062, + "balance_loss_clip": 1.04675531, + "balance_loss_mlp": 1.02678728, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 1.942535423653178, + "language_loss": 0.69227087, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71372271, + "num_input_tokens_seen": 69130280, + "step": 3202, + "time_per_iteration": 2.5836756229400635 + }, + { + "auxiliary_loss_clip": 0.01129837, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.05030489, + "balance_loss_mlp": 1.02079415, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.7872281834036303, + "language_loss": 0.91255069, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93423462, + "num_input_tokens_seen": 69149570, + "step": 3203, + "time_per_iteration": 2.5184192657470703 + }, + { + "auxiliary_loss_clip": 0.01141921, + "auxiliary_loss_mlp": 0.01048901, + "balance_loss_clip": 1.04880095, + "balance_loss_mlp": 1.03019667, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.9769886713882427, + "language_loss": 0.81776732, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83967555, + "num_input_tokens_seen": 69168190, + "step": 3204, + "time_per_iteration": 3.8514444828033447 + }, + { + "auxiliary_loss_clip": 0.01112742, + "auxiliary_loss_mlp": 0.01046104, + "balance_loss_clip": 1.04743505, + "balance_loss_mlp": 1.0284487, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 2.297398288266007, + "language_loss": 0.75651813, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77810657, + "num_input_tokens_seen": 69186950, + "step": 3205, + "time_per_iteration": 2.57621693611145 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.01049716, + "balance_loss_clip": 1.05310225, + "balance_loss_mlp": 1.03212118, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.917972205610784, + "language_loss": 0.83352101, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85537523, + "num_input_tokens_seen": 69204850, + "step": 3206, + "time_per_iteration": 3.939209461212158 + }, + { + "auxiliary_loss_clip": 0.01048644, + "auxiliary_loss_mlp": 0.010037, + "balance_loss_clip": 1.02158117, + "balance_loss_mlp": 1.00009978, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8419473442723836, + "language_loss": 0.60560095, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62612438, + "num_input_tokens_seen": 69259200, + "step": 3207, + "time_per_iteration": 2.947436571121216 + }, + { + "auxiliary_loss_clip": 0.01117425, + "auxiliary_loss_mlp": 0.01045364, + "balance_loss_clip": 1.050758, + "balance_loss_mlp": 1.02755451, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.4669124251459964, + "language_loss": 0.75260508, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77423298, + "num_input_tokens_seen": 69275835, + "step": 3208, + "time_per_iteration": 2.5378518104553223 + }, + { + "auxiliary_loss_clip": 0.01143752, + "auxiliary_loss_mlp": 0.00796186, + "balance_loss_clip": 1.04957008, + "balance_loss_mlp": 1.01260948, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 2.1296869941936944, + "language_loss": 0.60824847, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62764788, + "num_input_tokens_seen": 69294810, + "step": 3209, + "time_per_iteration": 2.492734432220459 + }, + { + "auxiliary_loss_clip": 0.01154999, + "auxiliary_loss_mlp": 0.01048067, + "balance_loss_clip": 1.05018878, + "balance_loss_mlp": 1.02804017, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 2.3063968620770763, + "language_loss": 0.79900527, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82103598, + "num_input_tokens_seen": 69316065, + "step": 3210, + "time_per_iteration": 2.6449081897735596 + }, + { + "auxiliary_loss_clip": 0.01114777, + "auxiliary_loss_mlp": 0.01039429, + "balance_loss_clip": 1.04499364, + "balance_loss_mlp": 1.02191734, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.1382958937281096, + "language_loss": 0.83120441, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.85274649, + "num_input_tokens_seen": 69332900, + "step": 3211, + "time_per_iteration": 2.5183794498443604 + }, + { + "auxiliary_loss_clip": 0.0106314, + "auxiliary_loss_mlp": 0.0100613, + "balance_loss_clip": 1.02081192, + "balance_loss_mlp": 1.00264883, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9640360044430135, + "language_loss": 0.6366058, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65729851, + "num_input_tokens_seen": 69382535, + "step": 3212, + "time_per_iteration": 2.9352073669433594 + }, + { + "auxiliary_loss_clip": 0.0112642, + "auxiliary_loss_mlp": 0.01047532, + "balance_loss_clip": 1.05067015, + "balance_loss_mlp": 1.03002012, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.6604524303089223, + "language_loss": 0.76276112, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.7845006, + "num_input_tokens_seen": 69400600, + "step": 3213, + "time_per_iteration": 2.520739793777466 + }, + { + "auxiliary_loss_clip": 0.01121666, + "auxiliary_loss_mlp": 0.01045108, + "balance_loss_clip": 1.05070162, + "balance_loss_mlp": 1.02645135, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 2.3578127155743105, + "language_loss": 0.70696962, + "learning_rate": 3.726932887459503e-06, + "loss": 0.72863734, + "num_input_tokens_seen": 69417350, + "step": 3214, + "time_per_iteration": 2.5383896827697754 + }, + { + "auxiliary_loss_clip": 0.01151288, + "auxiliary_loss_mlp": 0.01049499, + "balance_loss_clip": 1.04934692, + "balance_loss_mlp": 1.0300802, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.2375838986521313, + "language_loss": 0.75737697, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77938485, + "num_input_tokens_seen": 69431845, + "step": 3215, + "time_per_iteration": 2.4480459690093994 + }, + { + "auxiliary_loss_clip": 0.01113341, + "auxiliary_loss_mlp": 0.01052196, + "balance_loss_clip": 1.0507139, + "balance_loss_mlp": 1.03456461, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 2.0965926041438943, + "language_loss": 0.88312352, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90477896, + "num_input_tokens_seen": 69453275, + "step": 3216, + "time_per_iteration": 2.6002695560455322 + }, + { + "auxiliary_loss_clip": 0.01151288, + "auxiliary_loss_mlp": 0.01053993, + "balance_loss_clip": 1.05041397, + "balance_loss_mlp": 1.03610003, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.9844599361289452, + "language_loss": 0.80424595, + "learning_rate": 3.726343252048485e-06, + "loss": 0.82629871, + "num_input_tokens_seen": 69471830, + "step": 3217, + "time_per_iteration": 2.4625868797302246 + }, + { + "auxiliary_loss_clip": 0.01137242, + "auxiliary_loss_mlp": 0.01051701, + "balance_loss_clip": 1.05226493, + "balance_loss_mlp": 1.03210318, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.734041486537027, + "language_loss": 0.61750537, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.63939482, + "num_input_tokens_seen": 69489320, + "step": 3218, + "time_per_iteration": 2.5274431705474854 + }, + { + "auxiliary_loss_clip": 0.01153481, + "auxiliary_loss_mlp": 0.01045116, + "balance_loss_clip": 1.05086553, + "balance_loss_mlp": 1.02736545, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.7150437821310596, + "language_loss": 0.80289006, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82487607, + "num_input_tokens_seen": 69506665, + "step": 3219, + "time_per_iteration": 2.454946756362915 + }, + { + "auxiliary_loss_clip": 0.01100649, + "auxiliary_loss_mlp": 0.01048984, + "balance_loss_clip": 1.0488708, + "balance_loss_mlp": 1.0296247, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.2388463255854956, + "language_loss": 0.85946214, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.88095844, + "num_input_tokens_seen": 69523835, + "step": 3220, + "time_per_iteration": 2.6098787784576416 + }, + { + "auxiliary_loss_clip": 0.01145188, + "auxiliary_loss_mlp": 0.01039962, + "balance_loss_clip": 1.04949582, + "balance_loss_mlp": 1.02370811, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 1.9365119181076715, + "language_loss": 0.84057105, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86242253, + "num_input_tokens_seen": 69542620, + "step": 3221, + "time_per_iteration": 2.490363359451294 + }, + { + "auxiliary_loss_clip": 0.01140772, + "auxiliary_loss_mlp": 0.01045072, + "balance_loss_clip": 1.05050373, + "balance_loss_mlp": 1.02841878, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.4014845737807584, + "language_loss": 0.85819721, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88005567, + "num_input_tokens_seen": 69561130, + "step": 3222, + "time_per_iteration": 2.5092411041259766 + }, + { + "auxiliary_loss_clip": 0.01072055, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.04618192, + "balance_loss_mlp": 1.02085686, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 1.9088949455167903, + "language_loss": 0.78566861, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80678701, + "num_input_tokens_seen": 69580425, + "step": 3223, + "time_per_iteration": 2.6770312786102295 + }, + { + "auxiliary_loss_clip": 0.01141154, + "auxiliary_loss_mlp": 0.01045022, + "balance_loss_clip": 1.05101848, + "balance_loss_mlp": 1.0268302, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.9814483816718913, + "language_loss": 0.75482559, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77668738, + "num_input_tokens_seen": 69597085, + "step": 3224, + "time_per_iteration": 2.4869399070739746 + }, + { + "auxiliary_loss_clip": 0.01100465, + "auxiliary_loss_mlp": 0.01053385, + "balance_loss_clip": 1.04548621, + "balance_loss_mlp": 1.03297591, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.59646414124123, + "language_loss": 0.70715779, + "learning_rate": 3.7247680111229e-06, + "loss": 0.72869629, + "num_input_tokens_seen": 69618885, + "step": 3225, + "time_per_iteration": 2.8448405265808105 + }, + { + "auxiliary_loss_clip": 0.01123601, + "auxiliary_loss_mlp": 0.01046427, + "balance_loss_clip": 1.05631709, + "balance_loss_mlp": 1.02933264, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.26534696345421, + "language_loss": 0.69014382, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71184409, + "num_input_tokens_seen": 69638200, + "step": 3226, + "time_per_iteration": 2.6155080795288086 + }, + { + "auxiliary_loss_clip": 0.01122218, + "auxiliary_loss_mlp": 0.01043257, + "balance_loss_clip": 1.05097818, + "balance_loss_mlp": 1.0242548, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 2.034673115881795, + "language_loss": 0.75973368, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.7813884, + "num_input_tokens_seen": 69657550, + "step": 3227, + "time_per_iteration": 2.5929644107818604 + }, + { + "auxiliary_loss_clip": 0.01114558, + "auxiliary_loss_mlp": 0.01042988, + "balance_loss_clip": 1.050354, + "balance_loss_mlp": 1.02586937, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 3.0062666979892745, + "language_loss": 0.69543821, + "learning_rate": 3.724176216414662e-06, + "loss": 0.71701366, + "num_input_tokens_seen": 69675005, + "step": 3228, + "time_per_iteration": 2.5607962608337402 + }, + { + "auxiliary_loss_clip": 0.01139728, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.04944313, + "balance_loss_mlp": 1.02439046, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 1.934839726888585, + "language_loss": 0.74067461, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76249003, + "num_input_tokens_seen": 69696455, + "step": 3229, + "time_per_iteration": 2.5767550468444824 + }, + { + "auxiliary_loss_clip": 0.01116411, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.05182374, + "balance_loss_mlp": 1.02539039, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.820750970145502, + "language_loss": 0.65239704, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67399478, + "num_input_tokens_seen": 69714245, + "step": 3230, + "time_per_iteration": 2.533379554748535 + }, + { + "auxiliary_loss_clip": 0.01118705, + "auxiliary_loss_mlp": 0.00800352, + "balance_loss_clip": 1.04840612, + "balance_loss_mlp": 1.02206254, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.9641396353098515, + "language_loss": 0.81872278, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.83791339, + "num_input_tokens_seen": 69731515, + "step": 3231, + "time_per_iteration": 2.5613300800323486 + }, + { + "auxiliary_loss_clip": 0.01127194, + "auxiliary_loss_mlp": 0.01041467, + "balance_loss_clip": 1.05091345, + "balance_loss_mlp": 1.02258384, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.8562655932334644, + "language_loss": 0.86694682, + "learning_rate": 3.72338624150555e-06, + "loss": 0.88863337, + "num_input_tokens_seen": 69748885, + "step": 3232, + "time_per_iteration": 2.5626444816589355 + }, + { + "auxiliary_loss_clip": 0.01100382, + "auxiliary_loss_mlp": 0.01052225, + "balance_loss_clip": 1.05155921, + "balance_loss_mlp": 1.0325793, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.7543461861919698, + "language_loss": 0.85026813, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87179422, + "num_input_tokens_seen": 69767540, + "step": 3233, + "time_per_iteration": 2.6394073963165283 + }, + { + "auxiliary_loss_clip": 0.01143603, + "auxiliary_loss_mlp": 0.01049419, + "balance_loss_clip": 1.05011392, + "balance_loss_mlp": 1.03189552, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.7590725628999118, + "language_loss": 0.89402759, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91595781, + "num_input_tokens_seen": 69789340, + "step": 3234, + "time_per_iteration": 2.547166347503662 + }, + { + "auxiliary_loss_clip": 0.01132913, + "auxiliary_loss_mlp": 0.01046437, + "balance_loss_clip": 1.04710889, + "balance_loss_mlp": 1.02776885, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.3085519844802147, + "language_loss": 0.78336751, + "learning_rate": 3.722793074112234e-06, + "loss": 0.805161, + "num_input_tokens_seen": 69806470, + "step": 3235, + "time_per_iteration": 2.5068070888519287 + }, + { + "auxiliary_loss_clip": 0.0113062, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.05186641, + "balance_loss_mlp": 1.02716279, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 2.1354861256165307, + "language_loss": 0.79022926, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81197518, + "num_input_tokens_seen": 69822655, + "step": 3236, + "time_per_iteration": 2.5263383388519287 + }, + { + "auxiliary_loss_clip": 0.01151548, + "auxiliary_loss_mlp": 0.01040875, + "balance_loss_clip": 1.05211747, + "balance_loss_mlp": 1.0224098, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 1.7912458680857635, + "language_loss": 0.75773942, + "learning_rate": 3.72239730252843e-06, + "loss": 0.77966368, + "num_input_tokens_seen": 69841895, + "step": 3237, + "time_per_iteration": 3.936593770980835 + }, + { + "auxiliary_loss_clip": 0.01154743, + "auxiliary_loss_mlp": 0.0105121, + "balance_loss_clip": 1.05022216, + "balance_loss_mlp": 1.03385329, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 4.261608506761535, + "language_loss": 0.7522608, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77432036, + "num_input_tokens_seen": 69862220, + "step": 3238, + "time_per_iteration": 2.496584415435791 + }, + { + "auxiliary_loss_clip": 0.01105751, + "auxiliary_loss_mlp": 0.01040033, + "balance_loss_clip": 1.04669476, + "balance_loss_mlp": 1.02146006, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 3.2733538830422515, + "language_loss": 0.73250818, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75396597, + "num_input_tokens_seen": 69881830, + "step": 3239, + "time_per_iteration": 3.948404312133789 + }, + { + "auxiliary_loss_clip": 0.01125231, + "auxiliary_loss_mlp": 0.01047496, + "balance_loss_clip": 1.04728794, + "balance_loss_mlp": 1.02962613, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 2.3465884681937506, + "language_loss": 0.73420417, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75593144, + "num_input_tokens_seen": 69900515, + "step": 3240, + "time_per_iteration": 2.526747703552246 + }, + { + "auxiliary_loss_clip": 0.01127418, + "auxiliary_loss_mlp": 0.0103631, + "balance_loss_clip": 1.05483294, + "balance_loss_mlp": 1.01906013, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 2.0178538609134837, + "language_loss": 0.66272652, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68436372, + "num_input_tokens_seen": 69920060, + "step": 3241, + "time_per_iteration": 2.5215232372283936 + }, + { + "auxiliary_loss_clip": 0.01128532, + "auxiliary_loss_mlp": 0.01040839, + "balance_loss_clip": 1.05023754, + "balance_loss_mlp": 1.02332711, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.452531203227124, + "language_loss": 0.8304584, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85215211, + "num_input_tokens_seen": 69939820, + "step": 3242, + "time_per_iteration": 3.915907859802246 + }, + { + "auxiliary_loss_clip": 0.01066334, + "auxiliary_loss_mlp": 0.01012461, + "balance_loss_clip": 1.02512097, + "balance_loss_mlp": 1.00924253, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8419656905917792, + "language_loss": 0.57497168, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59575963, + "num_input_tokens_seen": 70002145, + "step": 3243, + "time_per_iteration": 3.1004536151885986 + }, + { + "auxiliary_loss_clip": 0.01131209, + "auxiliary_loss_mlp": 0.01052273, + "balance_loss_clip": 1.04534578, + "balance_loss_mlp": 1.03216207, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 2.0738155526155, + "language_loss": 0.83651114, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85834599, + "num_input_tokens_seen": 70020510, + "step": 3244, + "time_per_iteration": 3.8990097045898438 + }, + { + "auxiliary_loss_clip": 0.01137855, + "auxiliary_loss_mlp": 0.01047761, + "balance_loss_clip": 1.05025876, + "balance_loss_mlp": 1.03097618, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.733635934154372, + "language_loss": 0.7715047, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79336089, + "num_input_tokens_seen": 70040760, + "step": 3245, + "time_per_iteration": 2.496063232421875 + }, + { + "auxiliary_loss_clip": 0.0113642, + "auxiliary_loss_mlp": 0.01039543, + "balance_loss_clip": 1.04862273, + "balance_loss_mlp": 1.02148271, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.0709844248693745, + "language_loss": 0.84433544, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86609507, + "num_input_tokens_seen": 70058720, + "step": 3246, + "time_per_iteration": 2.4785845279693604 + }, + { + "auxiliary_loss_clip": 0.01139756, + "auxiliary_loss_mlp": 0.00796454, + "balance_loss_clip": 1.04889941, + "balance_loss_mlp": 1.01632953, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.1199074476513067, + "language_loss": 0.75920796, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.77857006, + "num_input_tokens_seen": 70076470, + "step": 3247, + "time_per_iteration": 2.4822113513946533 + }, + { + "auxiliary_loss_clip": 0.01116161, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_clip": 1.05190015, + "balance_loss_mlp": 1.0252353, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 2.009152373167641, + "language_loss": 0.75523454, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77682328, + "num_input_tokens_seen": 70096220, + "step": 3248, + "time_per_iteration": 2.604114055633545 + }, + { + "auxiliary_loss_clip": 0.01148099, + "auxiliary_loss_mlp": 0.01049609, + "balance_loss_clip": 1.0467279, + "balance_loss_mlp": 1.03176355, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 2.0496814784466566, + "language_loss": 0.78434885, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80632591, + "num_input_tokens_seen": 70114800, + "step": 3249, + "time_per_iteration": 2.4469313621520996 + }, + { + "auxiliary_loss_clip": 0.01141007, + "auxiliary_loss_mlp": 0.01049143, + "balance_loss_clip": 1.04769468, + "balance_loss_mlp": 1.0316906, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.5698944768530867, + "language_loss": 0.73113221, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.75303376, + "num_input_tokens_seen": 70134930, + "step": 3250, + "time_per_iteration": 2.488373279571533 + }, + { + "auxiliary_loss_clip": 0.0110173, + "auxiliary_loss_mlp": 0.0103971, + "balance_loss_clip": 1.04771435, + "balance_loss_mlp": 1.02213907, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 2.0664699263406376, + "language_loss": 0.78815746, + "learning_rate": 3.719619589699017e-06, + "loss": 0.80957186, + "num_input_tokens_seen": 70152045, + "step": 3251, + "time_per_iteration": 2.5392239093780518 + }, + { + "auxiliary_loss_clip": 0.01148771, + "auxiliary_loss_mlp": 0.01044434, + "balance_loss_clip": 1.04765558, + "balance_loss_mlp": 1.02679086, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 2.8671932463487555, + "language_loss": 0.83283484, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.85476685, + "num_input_tokens_seen": 70169240, + "step": 3252, + "time_per_iteration": 2.412708044052124 + }, + { + "auxiliary_loss_clip": 0.01136603, + "auxiliary_loss_mlp": 0.01052454, + "balance_loss_clip": 1.04738414, + "balance_loss_mlp": 1.03155637, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.6965419249853735, + "language_loss": 0.7343502, + "learning_rate": 3.719221729768117e-06, + "loss": 0.75624073, + "num_input_tokens_seen": 70192690, + "step": 3253, + "time_per_iteration": 2.55769419670105 + }, + { + "auxiliary_loss_clip": 0.01099952, + "auxiliary_loss_mlp": 0.01041051, + "balance_loss_clip": 1.05002451, + "balance_loss_mlp": 1.0229311, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.8238767103200455, + "language_loss": 0.76728785, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.7886979, + "num_input_tokens_seen": 70209685, + "step": 3254, + "time_per_iteration": 2.584441661834717 + }, + { + "auxiliary_loss_clip": 0.01023352, + "auxiliary_loss_mlp": 0.01007753, + "balance_loss_clip": 1.02680278, + "balance_loss_mlp": 1.0042721, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7612769350343391, + "language_loss": 0.55323118, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.5735423, + "num_input_tokens_seen": 70265050, + "step": 3255, + "time_per_iteration": 3.2127888202667236 + }, + { + "auxiliary_loss_clip": 0.01129108, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_clip": 1.0519352, + "balance_loss_mlp": 1.0245831, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.6828205080654324, + "language_loss": 0.70497632, + "learning_rate": 3.718624450942688e-06, + "loss": 0.72669524, + "num_input_tokens_seen": 70281830, + "step": 3256, + "time_per_iteration": 2.9588983058929443 + }, + { + "auxiliary_loss_clip": 0.01147786, + "auxiliary_loss_mlp": 0.01042627, + "balance_loss_clip": 1.04783881, + "balance_loss_mlp": 1.02528226, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.238339897395564, + "language_loss": 0.80067188, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82257605, + "num_input_tokens_seen": 70297420, + "step": 3257, + "time_per_iteration": 2.450559616088867 + }, + { + "auxiliary_loss_clip": 0.01101482, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.05151486, + "balance_loss_mlp": 1.02313638, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 2.01212194941165, + "language_loss": 0.74935007, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77076781, + "num_input_tokens_seen": 70319210, + "step": 3258, + "time_per_iteration": 2.690911293029785 + }, + { + "auxiliary_loss_clip": 0.01081562, + "auxiliary_loss_mlp": 0.01048564, + "balance_loss_clip": 1.04737234, + "balance_loss_mlp": 1.02835798, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 6.003969498498999, + "language_loss": 0.7383796, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.75968081, + "num_input_tokens_seen": 70339045, + "step": 3259, + "time_per_iteration": 2.665027618408203 + }, + { + "auxiliary_loss_clip": 0.01127066, + "auxiliary_loss_mlp": 0.0104349, + "balance_loss_clip": 1.04945016, + "balance_loss_mlp": 1.02467918, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.8940158183417366, + "language_loss": 0.77196652, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.79367214, + "num_input_tokens_seen": 70356505, + "step": 3260, + "time_per_iteration": 2.5723419189453125 + }, + { + "auxiliary_loss_clip": 0.01140416, + "auxiliary_loss_mlp": 0.01049443, + "balance_loss_clip": 1.04751873, + "balance_loss_mlp": 1.03125155, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.7742365845602373, + "language_loss": 0.82137764, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84327614, + "num_input_tokens_seen": 70375410, + "step": 3261, + "time_per_iteration": 2.4897830486297607 + }, + { + "auxiliary_loss_clip": 0.01117739, + "auxiliary_loss_mlp": 0.01042568, + "balance_loss_clip": 1.05407238, + "balance_loss_mlp": 1.02399564, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 1.789598330358096, + "language_loss": 0.76931751, + "learning_rate": 3.717428133894807e-06, + "loss": 0.7909205, + "num_input_tokens_seen": 70396315, + "step": 3262, + "time_per_iteration": 2.6371021270751953 + }, + { + "auxiliary_loss_clip": 0.01141717, + "auxiliary_loss_mlp": 0.01044113, + "balance_loss_clip": 1.05248868, + "balance_loss_mlp": 1.02661324, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.7665637847834788, + "language_loss": 0.8616538, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88351214, + "num_input_tokens_seen": 70417945, + "step": 3263, + "time_per_iteration": 2.5474720001220703 + }, + { + "auxiliary_loss_clip": 0.01128661, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.05128038, + "balance_loss_mlp": 1.02098322, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.7996355052996422, + "language_loss": 0.7414028, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76307428, + "num_input_tokens_seen": 70438690, + "step": 3264, + "time_per_iteration": 2.601285457611084 + }, + { + "auxiliary_loss_clip": 0.01137469, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.05127728, + "balance_loss_mlp": 1.028005, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 2.080231413100442, + "language_loss": 0.78564823, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.80747259, + "num_input_tokens_seen": 70455385, + "step": 3265, + "time_per_iteration": 2.4894556999206543 + }, + { + "auxiliary_loss_clip": 0.01015473, + "auxiliary_loss_mlp": 0.01007761, + "balance_loss_clip": 1.01893032, + "balance_loss_mlp": 1.00510299, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7864506441228578, + "language_loss": 0.53439766, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55462998, + "num_input_tokens_seen": 70514280, + "step": 3266, + "time_per_iteration": 3.1949355602264404 + }, + { + "auxiliary_loss_clip": 0.01125634, + "auxiliary_loss_mlp": 0.00802539, + "balance_loss_clip": 1.0506525, + "balance_loss_mlp": 1.02467942, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.9536652125328091, + "language_loss": 0.80348015, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82276195, + "num_input_tokens_seen": 70531800, + "step": 3267, + "time_per_iteration": 2.5653140544891357 + }, + { + "auxiliary_loss_clip": 0.0112795, + "auxiliary_loss_mlp": 0.01043352, + "balance_loss_clip": 1.04846478, + "balance_loss_mlp": 1.02557826, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 2.1332956642785974, + "language_loss": 0.86523467, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.88694769, + "num_input_tokens_seen": 70550615, + "step": 3268, + "time_per_iteration": 2.5538671016693115 + }, + { + "auxiliary_loss_clip": 0.01102881, + "auxiliary_loss_mlp": 0.01045207, + "balance_loss_clip": 1.05378771, + "balance_loss_mlp": 1.02756429, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 1.980292437854082, + "language_loss": 0.69060701, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71208787, + "num_input_tokens_seen": 70568690, + "step": 3269, + "time_per_iteration": 2.590402364730835 + }, + { + "auxiliary_loss_clip": 0.01115434, + "auxiliary_loss_mlp": 0.01059585, + "balance_loss_clip": 1.05122888, + "balance_loss_mlp": 1.04034424, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 1.8391123692930271, + "language_loss": 0.80823714, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82998729, + "num_input_tokens_seen": 70588665, + "step": 3270, + "time_per_iteration": 2.590343475341797 + }, + { + "auxiliary_loss_clip": 0.01136823, + "auxiliary_loss_mlp": 0.0104708, + "balance_loss_clip": 1.04876447, + "balance_loss_mlp": 1.02994931, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 2.0915679383222003, + "language_loss": 0.84022164, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86206067, + "num_input_tokens_seen": 70606900, + "step": 3271, + "time_per_iteration": 2.498685598373413 + }, + { + "auxiliary_loss_clip": 0.01136372, + "auxiliary_loss_mlp": 0.01057328, + "balance_loss_clip": 1.05152476, + "balance_loss_mlp": 1.03949463, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 2.0764233844289346, + "language_loss": 0.80308515, + "learning_rate": 3.715429062953087e-06, + "loss": 0.8250221, + "num_input_tokens_seen": 70625955, + "step": 3272, + "time_per_iteration": 2.5142102241516113 + }, + { + "auxiliary_loss_clip": 0.01118086, + "auxiliary_loss_mlp": 0.01065935, + "balance_loss_clip": 1.04689813, + "balance_loss_mlp": 1.04437006, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 1.8189130691948165, + "language_loss": 0.8047595, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.82659972, + "num_input_tokens_seen": 70646090, + "step": 3273, + "time_per_iteration": 2.547788381576538 + }, + { + "auxiliary_loss_clip": 0.01140584, + "auxiliary_loss_mlp": 0.01053516, + "balance_loss_clip": 1.05093765, + "balance_loss_mlp": 1.03636241, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 1.9101776131046144, + "language_loss": 0.77595425, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.79789531, + "num_input_tokens_seen": 70666065, + "step": 3274, + "time_per_iteration": 2.5332815647125244 + }, + { + "auxiliary_loss_clip": 0.01138913, + "auxiliary_loss_mlp": 0.01052428, + "balance_loss_clip": 1.05340278, + "balance_loss_mlp": 1.03376007, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 7.51699774298835, + "language_loss": 0.81107926, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83299267, + "num_input_tokens_seen": 70681580, + "step": 3275, + "time_per_iteration": 2.4692206382751465 + }, + { + "auxiliary_loss_clip": 0.01114769, + "auxiliary_loss_mlp": 0.01049236, + "balance_loss_clip": 1.05106986, + "balance_loss_mlp": 1.03051984, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 2.3296336930953836, + "language_loss": 0.81116921, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83280933, + "num_input_tokens_seen": 70697745, + "step": 3276, + "time_per_iteration": 3.912823438644409 + }, + { + "auxiliary_loss_clip": 0.01136738, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.04681683, + "balance_loss_mlp": 1.01937485, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.1966898592045774, + "language_loss": 0.89353323, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91527575, + "num_input_tokens_seen": 70715110, + "step": 3277, + "time_per_iteration": 3.8930046558380127 + }, + { + "auxiliary_loss_clip": 0.01105764, + "auxiliary_loss_mlp": 0.01050758, + "balance_loss_clip": 1.05055058, + "balance_loss_mlp": 1.03054023, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.1569819907952414, + "language_loss": 0.62501413, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64657938, + "num_input_tokens_seen": 70734715, + "step": 3278, + "time_per_iteration": 2.601461172103882 + }, + { + "auxiliary_loss_clip": 0.01113778, + "auxiliary_loss_mlp": 0.01055316, + "balance_loss_clip": 1.05154347, + "balance_loss_mlp": 1.03617048, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 2.262917658758903, + "language_loss": 0.73796356, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75965452, + "num_input_tokens_seen": 70752650, + "step": 3279, + "time_per_iteration": 2.5930287837982178 + }, + { + "auxiliary_loss_clip": 0.01141249, + "auxiliary_loss_mlp": 0.01042013, + "balance_loss_clip": 1.04807723, + "balance_loss_mlp": 1.02431083, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.8725884200075202, + "language_loss": 0.8259899, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84782255, + "num_input_tokens_seen": 70772365, + "step": 3280, + "time_per_iteration": 2.5292646884918213 + }, + { + "auxiliary_loss_clip": 0.01106847, + "auxiliary_loss_mlp": 0.01052886, + "balance_loss_clip": 1.0516752, + "balance_loss_mlp": 1.03366971, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 3.030054514917669, + "language_loss": 0.77674508, + "learning_rate": 3.713624337180536e-06, + "loss": 0.79834247, + "num_input_tokens_seen": 70790340, + "step": 3281, + "time_per_iteration": 4.0080695152282715 + }, + { + "auxiliary_loss_clip": 0.01122576, + "auxiliary_loss_mlp": 0.01047162, + "balance_loss_clip": 1.053738, + "balance_loss_mlp": 1.03012705, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.7199607620967226, + "language_loss": 0.79417372, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81587112, + "num_input_tokens_seen": 70809295, + "step": 3282, + "time_per_iteration": 2.536085605621338 + }, + { + "auxiliary_loss_clip": 0.01110383, + "auxiliary_loss_mlp": 0.01044473, + "balance_loss_clip": 1.05225313, + "balance_loss_mlp": 1.02683043, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 3.585271668549556, + "language_loss": 0.71985221, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.74140084, + "num_input_tokens_seen": 70828765, + "step": 3283, + "time_per_iteration": 4.006861209869385 + }, + { + "auxiliary_loss_clip": 0.01135635, + "auxiliary_loss_mlp": 0.01046812, + "balance_loss_clip": 1.05172777, + "balance_loss_mlp": 1.02934802, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 2.57439658783808, + "language_loss": 0.78905171, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81087625, + "num_input_tokens_seen": 70846805, + "step": 3284, + "time_per_iteration": 2.473994255065918 + }, + { + "auxiliary_loss_clip": 0.011254, + "auxiliary_loss_mlp": 0.00801549, + "balance_loss_clip": 1.05192971, + "balance_loss_mlp": 1.02437925, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.0083784989296305, + "language_loss": 0.86485088, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88412035, + "num_input_tokens_seen": 70863805, + "step": 3285, + "time_per_iteration": 2.5330419540405273 + }, + { + "auxiliary_loss_clip": 0.01122696, + "auxiliary_loss_mlp": 0.01044876, + "balance_loss_clip": 1.05545712, + "balance_loss_mlp": 1.02672076, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.064065742610555, + "language_loss": 0.88246822, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90414393, + "num_input_tokens_seen": 70882660, + "step": 3286, + "time_per_iteration": 2.5421111583709717 + }, + { + "auxiliary_loss_clip": 0.01117421, + "auxiliary_loss_mlp": 0.01053657, + "balance_loss_clip": 1.05129194, + "balance_loss_mlp": 1.03209245, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.149687026494698, + "language_loss": 0.77930146, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80101228, + "num_input_tokens_seen": 70898765, + "step": 3287, + "time_per_iteration": 2.529116153717041 + }, + { + "auxiliary_loss_clip": 0.0112764, + "auxiliary_loss_mlp": 0.01050901, + "balance_loss_clip": 1.05326104, + "balance_loss_mlp": 1.0316608, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 4.016206525924968, + "language_loss": 0.8148526, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83663797, + "num_input_tokens_seen": 70916370, + "step": 3288, + "time_per_iteration": 2.4900996685028076 + }, + { + "auxiliary_loss_clip": 0.01131263, + "auxiliary_loss_mlp": 0.01051415, + "balance_loss_clip": 1.05077195, + "balance_loss_mlp": 1.03343868, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.6543335769540386, + "language_loss": 0.72898316, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75080991, + "num_input_tokens_seen": 70934870, + "step": 3289, + "time_per_iteration": 2.4970452785491943 + }, + { + "auxiliary_loss_clip": 0.01131723, + "auxiliary_loss_mlp": 0.01045109, + "balance_loss_clip": 1.05309796, + "balance_loss_mlp": 1.02616692, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 1.7654481887142006, + "language_loss": 0.79606974, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81783807, + "num_input_tokens_seen": 70955140, + "step": 3290, + "time_per_iteration": 2.571993827819824 + }, + { + "auxiliary_loss_clip": 0.01045557, + "auxiliary_loss_mlp": 0.01011122, + "balance_loss_clip": 1.02707219, + "balance_loss_mlp": 1.00857115, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.8960846080928302, + "language_loss": 0.60342979, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62399656, + "num_input_tokens_seen": 71012005, + "step": 3291, + "time_per_iteration": 3.167348861694336 + }, + { + "auxiliary_loss_clip": 0.01155999, + "auxiliary_loss_mlp": 0.01043635, + "balance_loss_clip": 1.05201626, + "balance_loss_mlp": 1.02373922, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 1.7942676119591836, + "language_loss": 0.8133986, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83539498, + "num_input_tokens_seen": 71031140, + "step": 3292, + "time_per_iteration": 2.5083141326904297 + }, + { + "auxiliary_loss_clip": 0.01117678, + "auxiliary_loss_mlp": 0.00799688, + "balance_loss_clip": 1.05238008, + "balance_loss_mlp": 1.01527464, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 1.9341564156621003, + "language_loss": 0.81406951, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.83324325, + "num_input_tokens_seen": 71050250, + "step": 3293, + "time_per_iteration": 2.543415069580078 + }, + { + "auxiliary_loss_clip": 0.01136142, + "auxiliary_loss_mlp": 0.01049126, + "balance_loss_clip": 1.05260229, + "balance_loss_mlp": 1.02933717, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 1.982512192608721, + "language_loss": 0.61448193, + "learning_rate": 3.711008220265093e-06, + "loss": 0.6363346, + "num_input_tokens_seen": 71068665, + "step": 3294, + "time_per_iteration": 2.541515588760376 + }, + { + "auxiliary_loss_clip": 0.01129779, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.05084419, + "balance_loss_mlp": 1.02164388, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 1.9813279064381748, + "language_loss": 0.87191004, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89359784, + "num_input_tokens_seen": 71085320, + "step": 3295, + "time_per_iteration": 2.5219264030456543 + }, + { + "auxiliary_loss_clip": 0.01110873, + "auxiliary_loss_mlp": 0.01053328, + "balance_loss_clip": 1.04860854, + "balance_loss_mlp": 1.03541112, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 2.4546391364681766, + "language_loss": 0.80768919, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82933116, + "num_input_tokens_seen": 71102020, + "step": 3296, + "time_per_iteration": 2.5175559520721436 + }, + { + "auxiliary_loss_clip": 0.01118491, + "auxiliary_loss_mlp": 0.01048832, + "balance_loss_clip": 1.04787326, + "balance_loss_mlp": 1.02841187, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.084209941560928, + "language_loss": 0.68266779, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70434099, + "num_input_tokens_seen": 71123390, + "step": 3297, + "time_per_iteration": 2.59619402885437 + }, + { + "auxiliary_loss_clip": 0.01148442, + "auxiliary_loss_mlp": 0.01037363, + "balance_loss_clip": 1.05175829, + "balance_loss_mlp": 1.01985145, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.8559053624726098, + "language_loss": 0.81644136, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83829939, + "num_input_tokens_seen": 71141800, + "step": 3298, + "time_per_iteration": 2.447580337524414 + }, + { + "auxiliary_loss_clip": 0.01131663, + "auxiliary_loss_mlp": 0.01041661, + "balance_loss_clip": 1.05202651, + "balance_loss_mlp": 1.02133608, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 2.4752445228323525, + "language_loss": 0.85185754, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87359071, + "num_input_tokens_seen": 71159505, + "step": 3299, + "time_per_iteration": 2.513279676437378 + }, + { + "auxiliary_loss_clip": 0.01033989, + "auxiliary_loss_mlp": 0.01006097, + "balance_loss_clip": 1.02562237, + "balance_loss_mlp": 1.00346231, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7982187505892517, + "language_loss": 0.53235543, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55275631, + "num_input_tokens_seen": 71223265, + "step": 3300, + "time_per_iteration": 3.141616106033325 + }, + { + "auxiliary_loss_clip": 0.01105926, + "auxiliary_loss_mlp": 0.01063595, + "balance_loss_clip": 1.04684436, + "balance_loss_mlp": 1.04039681, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.6103771596426892, + "language_loss": 0.73596382, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75765902, + "num_input_tokens_seen": 71242385, + "step": 3301, + "time_per_iteration": 2.5656323432922363 + }, + { + "auxiliary_loss_clip": 0.01114868, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.05292201, + "balance_loss_mlp": 1.02411282, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.409301268279074, + "language_loss": 0.87857592, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90014637, + "num_input_tokens_seen": 71258990, + "step": 3302, + "time_per_iteration": 2.5389819145202637 + }, + { + "auxiliary_loss_clip": 0.01117342, + "auxiliary_loss_mlp": 0.01047402, + "balance_loss_clip": 1.05323255, + "balance_loss_mlp": 1.02878189, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 4.02163790287082, + "language_loss": 0.73582959, + "learning_rate": 3.709190638115111e-06, + "loss": 0.75747699, + "num_input_tokens_seen": 71282770, + "step": 3303, + "time_per_iteration": 2.763754367828369 + }, + { + "auxiliary_loss_clip": 0.01137524, + "auxiliary_loss_mlp": 0.01046956, + "balance_loss_clip": 1.051615, + "balance_loss_mlp": 1.02900338, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 2.1624385797327514, + "language_loss": 0.74912369, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.7709685, + "num_input_tokens_seen": 71301410, + "step": 3304, + "time_per_iteration": 2.605717420578003 + }, + { + "auxiliary_loss_clip": 0.01127848, + "auxiliary_loss_mlp": 0.0103643, + "balance_loss_clip": 1.04958153, + "balance_loss_mlp": 1.01901329, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 1.6697206178558528, + "language_loss": 0.86044621, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.88208902, + "num_input_tokens_seen": 71319670, + "step": 3305, + "time_per_iteration": 2.5096516609191895 + }, + { + "auxiliary_loss_clip": 0.01126886, + "auxiliary_loss_mlp": 0.01042047, + "balance_loss_clip": 1.04734612, + "balance_loss_mlp": 1.0237844, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.76553115385197, + "language_loss": 0.68208426, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70377362, + "num_input_tokens_seen": 71339850, + "step": 3306, + "time_per_iteration": 2.573091745376587 + }, + { + "auxiliary_loss_clip": 0.01116082, + "auxiliary_loss_mlp": 0.01039677, + "balance_loss_clip": 1.04718351, + "balance_loss_mlp": 1.0226059, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.562596760300182, + "language_loss": 0.76212156, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78367913, + "num_input_tokens_seen": 71359795, + "step": 3307, + "time_per_iteration": 2.540689706802368 + }, + { + "auxiliary_loss_clip": 0.01150609, + "auxiliary_loss_mlp": 0.01044448, + "balance_loss_clip": 1.05207109, + "balance_loss_mlp": 1.02725768, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 1.7432862006242638, + "language_loss": 0.75497031, + "learning_rate": 3.708178601452737e-06, + "loss": 0.77692086, + "num_input_tokens_seen": 71378885, + "step": 3308, + "time_per_iteration": 2.514305830001831 + }, + { + "auxiliary_loss_clip": 0.01105231, + "auxiliary_loss_mlp": 0.01043465, + "balance_loss_clip": 1.05255628, + "balance_loss_mlp": 1.02467752, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.7579892345177726, + "language_loss": 0.76092207, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.78240907, + "num_input_tokens_seen": 71397285, + "step": 3309, + "time_per_iteration": 2.571211099624634 + }, + { + "auxiliary_loss_clip": 0.01136556, + "auxiliary_loss_mlp": 0.01052244, + "balance_loss_clip": 1.04954743, + "balance_loss_mlp": 1.03232408, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.5608973085550537, + "language_loss": 0.87897718, + "learning_rate": 3.707773333313917e-06, + "loss": 0.9008652, + "num_input_tokens_seen": 71415775, + "step": 3310, + "time_per_iteration": 2.54250168800354 + }, + { + "auxiliary_loss_clip": 0.01147271, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.0494802, + "balance_loss_mlp": 1.0209806, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 2.3759238770042055, + "language_loss": 0.63846099, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66032207, + "num_input_tokens_seen": 71437315, + "step": 3311, + "time_per_iteration": 2.573270320892334 + }, + { + "auxiliary_loss_clip": 0.01111877, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.04895639, + "balance_loss_mlp": 1.02271628, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 3.023810423911428, + "language_loss": 0.74011111, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76163965, + "num_input_tokens_seen": 71456320, + "step": 3312, + "time_per_iteration": 2.5957820415496826 + }, + { + "auxiliary_loss_clip": 0.01137433, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_clip": 1.05104733, + "balance_loss_mlp": 1.03085732, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.005599126884347, + "language_loss": 0.834768, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.85662961, + "num_input_tokens_seen": 71475360, + "step": 3313, + "time_per_iteration": 2.5033764839172363 + }, + { + "auxiliary_loss_clip": 0.01138602, + "auxiliary_loss_mlp": 0.01045383, + "balance_loss_clip": 1.0513072, + "balance_loss_mlp": 1.02751374, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 2.01811917967288, + "language_loss": 0.80942428, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83126414, + "num_input_tokens_seen": 71496155, + "step": 3314, + "time_per_iteration": 2.5842645168304443 + }, + { + "auxiliary_loss_clip": 0.01109761, + "auxiliary_loss_mlp": 0.01046759, + "balance_loss_clip": 1.04544187, + "balance_loss_mlp": 1.02937865, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.665119470094462, + "language_loss": 0.87711012, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89867526, + "num_input_tokens_seen": 71517295, + "step": 3315, + "time_per_iteration": 4.028660535812378 + }, + { + "auxiliary_loss_clip": 0.01114626, + "auxiliary_loss_mlp": 0.00796958, + "balance_loss_clip": 1.05146539, + "balance_loss_mlp": 1.01423144, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.8894919509743227, + "language_loss": 0.71273398, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73184979, + "num_input_tokens_seen": 71540000, + "step": 3316, + "time_per_iteration": 3.9917256832122803 + }, + { + "auxiliary_loss_clip": 0.01016674, + "auxiliary_loss_mlp": 0.01008294, + "balance_loss_clip": 1.02884018, + "balance_loss_mlp": 1.00561166, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.835463578108911, + "language_loss": 0.66240323, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68265295, + "num_input_tokens_seen": 71607880, + "step": 3317, + "time_per_iteration": 3.3859307765960693 + }, + { + "auxiliary_loss_clip": 0.01139043, + "auxiliary_loss_mlp": 0.01049067, + "balance_loss_clip": 1.04822743, + "balance_loss_mlp": 1.02998209, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.3023488155875937, + "language_loss": 0.74178112, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76366222, + "num_input_tokens_seen": 71625695, + "step": 3318, + "time_per_iteration": 2.5921742916107178 + }, + { + "auxiliary_loss_clip": 0.01113032, + "auxiliary_loss_mlp": 0.01049382, + "balance_loss_clip": 1.04779887, + "balance_loss_mlp": 1.03201294, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 2.2028137995519024, + "language_loss": 0.79032218, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81194633, + "num_input_tokens_seen": 71648520, + "step": 3319, + "time_per_iteration": 2.7042741775512695 + }, + { + "auxiliary_loss_clip": 0.01132079, + "auxiliary_loss_mlp": 0.01042274, + "balance_loss_clip": 1.05224919, + "balance_loss_mlp": 1.0230453, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.5041305526812043, + "language_loss": 0.76276696, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.78451049, + "num_input_tokens_seen": 71672185, + "step": 3320, + "time_per_iteration": 4.230144739151001 + }, + { + "auxiliary_loss_clip": 0.01116677, + "auxiliary_loss_mlp": 0.01044356, + "balance_loss_clip": 1.04906225, + "balance_loss_mlp": 1.02662969, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 1.917012296278872, + "language_loss": 0.80064785, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82225817, + "num_input_tokens_seen": 71692890, + "step": 3321, + "time_per_iteration": 3.9482314586639404 + }, + { + "auxiliary_loss_clip": 0.01029903, + "auxiliary_loss_mlp": 0.01004601, + "balance_loss_clip": 1.02104378, + "balance_loss_mlp": 1.00187135, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.9025459696493671, + "language_loss": 0.65171719, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67206216, + "num_input_tokens_seen": 71745815, + "step": 3322, + "time_per_iteration": 2.945993423461914 + }, + { + "auxiliary_loss_clip": 0.01036975, + "auxiliary_loss_mlp": 0.01013504, + "balance_loss_clip": 1.02747512, + "balance_loss_mlp": 1.01085782, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7864191662094184, + "language_loss": 0.5695821, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59008694, + "num_input_tokens_seen": 71806915, + "step": 3323, + "time_per_iteration": 3.2409138679504395 + }, + { + "auxiliary_loss_clip": 0.0112959, + "auxiliary_loss_mlp": 0.00798671, + "balance_loss_clip": 1.05184698, + "balance_loss_mlp": 1.01703179, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 2.0090650435752044, + "language_loss": 0.80539191, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82467455, + "num_input_tokens_seen": 71824645, + "step": 3324, + "time_per_iteration": 2.5097837448120117 + }, + { + "auxiliary_loss_clip": 0.01131123, + "auxiliary_loss_mlp": 0.01045698, + "balance_loss_clip": 1.04686594, + "balance_loss_mlp": 1.02650571, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.7889441359760727, + "language_loss": 0.53707862, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.55884683, + "num_input_tokens_seen": 71845125, + "step": 3325, + "time_per_iteration": 2.555075168609619 + }, + { + "auxiliary_loss_clip": 0.01127579, + "auxiliary_loss_mlp": 0.01046714, + "balance_loss_clip": 1.05075932, + "balance_loss_mlp": 1.02959538, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 1.909021284812481, + "language_loss": 0.8619293, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88367224, + "num_input_tokens_seen": 71863500, + "step": 3326, + "time_per_iteration": 2.5059351921081543 + }, + { + "auxiliary_loss_clip": 0.01148041, + "auxiliary_loss_mlp": 0.01041268, + "balance_loss_clip": 1.05154228, + "balance_loss_mlp": 1.02360117, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 1.8952223124237986, + "language_loss": 0.71900713, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74090016, + "num_input_tokens_seen": 71881845, + "step": 3327, + "time_per_iteration": 2.4791247844696045 + }, + { + "auxiliary_loss_clip": 0.01125862, + "auxiliary_loss_mlp": 0.01046439, + "balance_loss_clip": 1.04959095, + "balance_loss_mlp": 1.0271033, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 2.305050613111692, + "language_loss": 0.76387143, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.7855944, + "num_input_tokens_seen": 71900940, + "step": 3328, + "time_per_iteration": 2.5441510677337646 + }, + { + "auxiliary_loss_clip": 0.01120335, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_clip": 1.05129337, + "balance_loss_mlp": 1.0263567, + "epoch": 0.20015030813166992, + "flos": 28111555440000.0, + "grad_norm": 1.7423102762457783, + "language_loss": 0.69776535, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.71940458, + "num_input_tokens_seen": 71921925, + "step": 3329, + "time_per_iteration": 2.653041362762451 + }, + { + "auxiliary_loss_clip": 0.01106715, + "auxiliary_loss_mlp": 0.0106203, + "balance_loss_clip": 1.04736459, + "balance_loss_mlp": 1.03905833, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.7803965756444502, + "language_loss": 0.81644595, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.83813334, + "num_input_tokens_seen": 71941855, + "step": 3330, + "time_per_iteration": 2.6207263469696045 + }, + { + "auxiliary_loss_clip": 0.01134305, + "auxiliary_loss_mlp": 0.01040921, + "balance_loss_clip": 1.05068576, + "balance_loss_mlp": 1.02270579, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.152395380354671, + "language_loss": 0.76646584, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78821808, + "num_input_tokens_seen": 71960915, + "step": 3331, + "time_per_iteration": 2.5592243671417236 + }, + { + "auxiliary_loss_clip": 0.01095142, + "auxiliary_loss_mlp": 0.01060408, + "balance_loss_clip": 1.04886901, + "balance_loss_mlp": 1.03905725, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 2.1085381614126235, + "language_loss": 0.79292601, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.8144815, + "num_input_tokens_seen": 71979220, + "step": 3332, + "time_per_iteration": 2.678664445877075 + }, + { + "auxiliary_loss_clip": 0.01048226, + "auxiliary_loss_mlp": 0.0101148, + "balance_loss_clip": 1.02209735, + "balance_loss_mlp": 1.00869048, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 1.4618236980177526, + "language_loss": 0.61940742, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64000452, + "num_input_tokens_seen": 72033950, + "step": 3333, + "time_per_iteration": 3.000027656555176 + }, + { + "auxiliary_loss_clip": 0.01112419, + "auxiliary_loss_mlp": 0.00799368, + "balance_loss_clip": 1.05128431, + "balance_loss_mlp": 1.01804841, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.0536790796252453, + "language_loss": 0.80988765, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.82900548, + "num_input_tokens_seen": 72051395, + "step": 3334, + "time_per_iteration": 2.615004301071167 + }, + { + "auxiliary_loss_clip": 0.01094649, + "auxiliary_loss_mlp": 0.01052167, + "balance_loss_clip": 1.0496279, + "balance_loss_mlp": 1.03225923, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 2.6302363892076444, + "language_loss": 0.74409306, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76556122, + "num_input_tokens_seen": 72071305, + "step": 3335, + "time_per_iteration": 2.680752992630005 + }, + { + "auxiliary_loss_clip": 0.01148995, + "auxiliary_loss_mlp": 0.01058225, + "balance_loss_clip": 1.05612075, + "balance_loss_mlp": 1.03980756, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.6423333662693385, + "language_loss": 0.80093241, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.8230046, + "num_input_tokens_seen": 72090165, + "step": 3336, + "time_per_iteration": 2.5767722129821777 + }, + { + "auxiliary_loss_clip": 0.01113488, + "auxiliary_loss_mlp": 0.01054803, + "balance_loss_clip": 1.04849029, + "balance_loss_mlp": 1.03441882, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 2.9492692045898976, + "language_loss": 0.77535897, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.79704189, + "num_input_tokens_seen": 72107210, + "step": 3337, + "time_per_iteration": 2.59751296043396 + }, + { + "auxiliary_loss_clip": 0.01149483, + "auxiliary_loss_mlp": 0.01046774, + "balance_loss_clip": 1.05023205, + "balance_loss_mlp": 1.02717578, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 2.2067088808026023, + "language_loss": 0.6936323, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.71559489, + "num_input_tokens_seen": 72126315, + "step": 3338, + "time_per_iteration": 2.5493454933166504 + }, + { + "auxiliary_loss_clip": 0.01119045, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_clip": 1.05071568, + "balance_loss_mlp": 1.03356171, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 2.063211476400424, + "language_loss": 0.69063288, + "learning_rate": 3.701867867326735e-06, + "loss": 0.7123459, + "num_input_tokens_seen": 72146470, + "step": 3339, + "time_per_iteration": 2.644824266433716 + }, + { + "auxiliary_loss_clip": 0.01119451, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.05320835, + "balance_loss_mlp": 1.02321076, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.0489630559883545, + "language_loss": 0.66515994, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68677247, + "num_input_tokens_seen": 72166600, + "step": 3340, + "time_per_iteration": 2.8311378955841064 + }, + { + "auxiliary_loss_clip": 0.0114176, + "auxiliary_loss_mlp": 0.01038486, + "balance_loss_clip": 1.05174339, + "balance_loss_mlp": 1.01934147, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 2.338644935214447, + "language_loss": 0.74338686, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76518929, + "num_input_tokens_seen": 72185160, + "step": 3341, + "time_per_iteration": 2.52592396736145 + }, + { + "auxiliary_loss_clip": 0.01111404, + "auxiliary_loss_mlp": 0.01051628, + "balance_loss_clip": 1.05327916, + "balance_loss_mlp": 1.03279305, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 1.84177369327494, + "language_loss": 0.71666443, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73829478, + "num_input_tokens_seen": 72205160, + "step": 3342, + "time_per_iteration": 2.670555591583252 + }, + { + "auxiliary_loss_clip": 0.01112322, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_clip": 1.0468415, + "balance_loss_mlp": 1.02909446, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 2.2208321943376967, + "language_loss": 0.71996355, + "learning_rate": 3.701049056727384e-06, + "loss": 0.74155855, + "num_input_tokens_seen": 72223555, + "step": 3343, + "time_per_iteration": 2.6304030418395996 + }, + { + "auxiliary_loss_clip": 0.01113556, + "auxiliary_loss_mlp": 0.01053421, + "balance_loss_clip": 1.04809737, + "balance_loss_mlp": 1.03414476, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 1.9789234633667807, + "language_loss": 0.8015092, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.82317901, + "num_input_tokens_seen": 72242465, + "step": 3344, + "time_per_iteration": 2.63647198677063 + }, + { + "auxiliary_loss_clip": 0.01149291, + "auxiliary_loss_mlp": 0.01044273, + "balance_loss_clip": 1.04955482, + "balance_loss_mlp": 1.02592683, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.1237143386125594, + "language_loss": 0.83716571, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85910136, + "num_input_tokens_seen": 72260655, + "step": 3345, + "time_per_iteration": 2.4709863662719727 + }, + { + "auxiliary_loss_clip": 0.01093822, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_clip": 1.04705751, + "balance_loss_mlp": 1.02477407, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.6355700774322997, + "language_loss": 0.67957461, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.70093071, + "num_input_tokens_seen": 72279055, + "step": 3346, + "time_per_iteration": 2.6116139888763428 + }, + { + "auxiliary_loss_clip": 0.01115378, + "auxiliary_loss_mlp": 0.01050152, + "balance_loss_clip": 1.04713345, + "balance_loss_mlp": 1.03124511, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.058296242408319, + "language_loss": 0.73106849, + "learning_rate": 3.70022921406487e-06, + "loss": 0.75272381, + "num_input_tokens_seen": 72297895, + "step": 3347, + "time_per_iteration": 2.581087112426758 + }, + { + "auxiliary_loss_clip": 0.01137605, + "auxiliary_loss_mlp": 0.01053706, + "balance_loss_clip": 1.05154061, + "balance_loss_mlp": 1.03670716, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.6423403697671757, + "language_loss": 0.86628306, + "learning_rate": 3.70002409219765e-06, + "loss": 0.88819611, + "num_input_tokens_seen": 72318385, + "step": 3348, + "time_per_iteration": 2.5450663566589355 + }, + { + "auxiliary_loss_clip": 0.01096207, + "auxiliary_loss_mlp": 0.01044217, + "balance_loss_clip": 1.04679334, + "balance_loss_mlp": 1.02471399, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.7288029526777509, + "language_loss": 0.71020436, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73160863, + "num_input_tokens_seen": 72338235, + "step": 3349, + "time_per_iteration": 2.627180814743042 + }, + { + "auxiliary_loss_clip": 0.01116914, + "auxiliary_loss_mlp": 0.01053902, + "balance_loss_clip": 1.04890513, + "balance_loss_mlp": 1.03435183, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.83893083213916, + "language_loss": 0.71196723, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73367536, + "num_input_tokens_seen": 72357825, + "step": 3350, + "time_per_iteration": 2.5875754356384277 + }, + { + "auxiliary_loss_clip": 0.01127812, + "auxiliary_loss_mlp": 0.01056028, + "balance_loss_clip": 1.05205274, + "balance_loss_mlp": 1.03286529, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 2.3414474234421454, + "language_loss": 0.75847638, + "learning_rate": 3.69940833983661e-06, + "loss": 0.7803148, + "num_input_tokens_seen": 72376335, + "step": 3351, + "time_per_iteration": 2.5649807453155518 + }, + { + "auxiliary_loss_clip": 0.01131007, + "auxiliary_loss_mlp": 0.01046896, + "balance_loss_clip": 1.05120897, + "balance_loss_mlp": 1.02697659, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.4758640867766386, + "language_loss": 0.80542445, + "learning_rate": 3.699202960155748e-06, + "loss": 0.82720339, + "num_input_tokens_seen": 72395440, + "step": 3352, + "time_per_iteration": 2.5943145751953125 + }, + { + "auxiliary_loss_clip": 0.01140844, + "auxiliary_loss_mlp": 0.01047548, + "balance_loss_clip": 1.05122375, + "balance_loss_mlp": 1.02913046, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 2.690596434950351, + "language_loss": 0.80268884, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82457274, + "num_input_tokens_seen": 72414670, + "step": 3353, + "time_per_iteration": 4.172781229019165 + }, + { + "auxiliary_loss_clip": 0.01118083, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_clip": 1.04795408, + "balance_loss_mlp": 1.02684593, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.320681021136594, + "language_loss": 0.89928472, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.92091012, + "num_input_tokens_seen": 72432210, + "step": 3354, + "time_per_iteration": 3.928866386413574 + }, + { + "auxiliary_loss_clip": 0.01048553, + "auxiliary_loss_mlp": 0.00978983, + "balance_loss_clip": 1.03141248, + "balance_loss_mlp": 1.33880997, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.8324849460296164, + "language_loss": 0.55855614, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57883149, + "num_input_tokens_seen": 72489225, + "step": 3355, + "time_per_iteration": 3.11468505859375 + }, + { + "auxiliary_loss_clip": 0.01127232, + "auxiliary_loss_mlp": 0.00801082, + "balance_loss_clip": 1.05213928, + "balance_loss_mlp": 1.02076769, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.61663359533939, + "language_loss": 0.84486943, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86415255, + "num_input_tokens_seen": 72508715, + "step": 3356, + "time_per_iteration": 2.584040641784668 + }, + { + "auxiliary_loss_clip": 0.0112414, + "auxiliary_loss_mlp": 0.01053547, + "balance_loss_clip": 1.04831445, + "balance_loss_mlp": 1.02968168, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 2.8516172506048583, + "language_loss": 0.68872714, + "learning_rate": 3.698175095398085e-06, + "loss": 0.71050406, + "num_input_tokens_seen": 72525135, + "step": 3357, + "time_per_iteration": 3.970665693283081 + }, + { + "auxiliary_loss_clip": 0.01134433, + "auxiliary_loss_mlp": 0.01047634, + "balance_loss_clip": 1.05090189, + "balance_loss_mlp": 1.02748775, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.7376036906702537, + "language_loss": 0.72135162, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.74317235, + "num_input_tokens_seen": 72543690, + "step": 3358, + "time_per_iteration": 2.5461273193359375 + }, + { + "auxiliary_loss_clip": 0.01134949, + "auxiliary_loss_mlp": 0.01054668, + "balance_loss_clip": 1.04839945, + "balance_loss_mlp": 1.0371207, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 1.741878173003893, + "language_loss": 0.83071709, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85261321, + "num_input_tokens_seen": 72560725, + "step": 3359, + "time_per_iteration": 3.858407497406006 + }, + { + "auxiliary_loss_clip": 0.01052243, + "auxiliary_loss_mlp": 0.01024896, + "balance_loss_clip": 1.02430153, + "balance_loss_mlp": 1.02177262, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 0.7847262736373956, + "language_loss": 0.58985806, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61062944, + "num_input_tokens_seen": 72621940, + "step": 3360, + "time_per_iteration": 3.0955324172973633 + }, + { + "auxiliary_loss_clip": 0.0110051, + "auxiliary_loss_mlp": 0.01057282, + "balance_loss_clip": 1.05665207, + "balance_loss_mlp": 1.03810108, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 2.257621568237434, + "language_loss": 0.62243432, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64401221, + "num_input_tokens_seen": 72639135, + "step": 3361, + "time_per_iteration": 2.674290418624878 + }, + { + "auxiliary_loss_clip": 0.01117051, + "auxiliary_loss_mlp": 0.01063685, + "balance_loss_clip": 1.05049837, + "balance_loss_mlp": 1.04370522, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 1.9952915689815687, + "language_loss": 0.75854146, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.7803489, + "num_input_tokens_seen": 72658525, + "step": 3362, + "time_per_iteration": 2.595655918121338 + }, + { + "auxiliary_loss_clip": 0.01137211, + "auxiliary_loss_mlp": 0.00826532, + "balance_loss_clip": 1.05080938, + "balance_loss_mlp": 1.06649733, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.6573573086285027, + "language_loss": 0.76777142, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78740883, + "num_input_tokens_seen": 72678085, + "step": 3363, + "time_per_iteration": 2.5721564292907715 + }, + { + "auxiliary_loss_clip": 0.01136478, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.04821301, + "balance_loss_mlp": 1.03129244, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 1.5058919481013322, + "language_loss": 0.7544868, + "learning_rate": 3.696733380367391e-06, + "loss": 0.77633607, + "num_input_tokens_seen": 72698695, + "step": 3364, + "time_per_iteration": 2.5697171688079834 + }, + { + "auxiliary_loss_clip": 0.01104986, + "auxiliary_loss_mlp": 0.01043433, + "balance_loss_clip": 1.05116248, + "balance_loss_mlp": 1.02373981, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 10.323511875801541, + "language_loss": 0.71414173, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73562586, + "num_input_tokens_seen": 72717880, + "step": 3365, + "time_per_iteration": 2.625694513320923 + }, + { + "auxiliary_loss_clip": 0.01104407, + "auxiliary_loss_mlp": 0.01047448, + "balance_loss_clip": 1.04990864, + "balance_loss_mlp": 1.02882802, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 2.704703847054694, + "language_loss": 0.86421734, + "learning_rate": 3.696320882607286e-06, + "loss": 0.88573587, + "num_input_tokens_seen": 72736410, + "step": 3366, + "time_per_iteration": 2.5391907691955566 + }, + { + "auxiliary_loss_clip": 0.01113048, + "auxiliary_loss_mlp": 0.01040418, + "balance_loss_clip": 1.05169654, + "balance_loss_mlp": 1.0220716, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 2.098987858712941, + "language_loss": 0.69654167, + "learning_rate": 3.696114537236335e-06, + "loss": 0.71807635, + "num_input_tokens_seen": 72758295, + "step": 3367, + "time_per_iteration": 2.6517255306243896 + }, + { + "auxiliary_loss_clip": 0.01140334, + "auxiliary_loss_mlp": 0.01045332, + "balance_loss_clip": 1.04980648, + "balance_loss_mlp": 1.02413702, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 1.9516157635528146, + "language_loss": 0.68892878, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.71078539, + "num_input_tokens_seen": 72782495, + "step": 3368, + "time_per_iteration": 2.6133289337158203 + }, + { + "auxiliary_loss_clip": 0.01118199, + "auxiliary_loss_mlp": 0.0105145, + "balance_loss_clip": 1.05357778, + "balance_loss_mlp": 1.03205431, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 2.0170485633658672, + "language_loss": 0.77734435, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79904079, + "num_input_tokens_seen": 72801885, + "step": 3369, + "time_per_iteration": 2.5903189182281494 + }, + { + "auxiliary_loss_clip": 0.01132787, + "auxiliary_loss_mlp": 0.01053794, + "balance_loss_clip": 1.04888058, + "balance_loss_mlp": 1.0350666, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 2.902896481472456, + "language_loss": 0.65016598, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67203176, + "num_input_tokens_seen": 72816990, + "step": 3370, + "time_per_iteration": 2.482893943786621 + }, + { + "auxiliary_loss_clip": 0.01053408, + "auxiliary_loss_mlp": 0.01000056, + "balance_loss_clip": 1.02631986, + "balance_loss_mlp": 0.99736196, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6814662131243528, + "language_loss": 0.58147174, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60200638, + "num_input_tokens_seen": 72879240, + "step": 3371, + "time_per_iteration": 3.182375192642212 + }, + { + "auxiliary_loss_clip": 0.01117187, + "auxiliary_loss_mlp": 0.01043003, + "balance_loss_clip": 1.0477736, + "balance_loss_mlp": 1.02487159, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 2.1524260264049198, + "language_loss": 0.91829985, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.93990177, + "num_input_tokens_seen": 72899030, + "step": 3372, + "time_per_iteration": 2.61246657371521 + }, + { + "auxiliary_loss_clip": 0.01133616, + "auxiliary_loss_mlp": 0.01051763, + "balance_loss_clip": 1.04825962, + "balance_loss_mlp": 1.03030539, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.60092624523118, + "language_loss": 0.78727973, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80913353, + "num_input_tokens_seen": 72919190, + "step": 3373, + "time_per_iteration": 2.5494368076324463 + }, + { + "auxiliary_loss_clip": 0.01089379, + "auxiliary_loss_mlp": 0.01045383, + "balance_loss_clip": 1.0457058, + "balance_loss_mlp": 1.02406788, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 1.7725651742733801, + "language_loss": 0.71680099, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73814863, + "num_input_tokens_seen": 72939720, + "step": 3374, + "time_per_iteration": 2.7234768867492676 + }, + { + "auxiliary_loss_clip": 0.01047409, + "auxiliary_loss_mlp": 0.01001296, + "balance_loss_clip": 1.0273037, + "balance_loss_mlp": 0.99822026, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9715764246786197, + "language_loss": 0.62442183, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64490891, + "num_input_tokens_seen": 73000015, + "step": 3375, + "time_per_iteration": 3.1200056076049805 + }, + { + "auxiliary_loss_clip": 0.01147906, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_clip": 1.05000818, + "balance_loss_mlp": 1.02793574, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.5463271629893487, + "language_loss": 0.8215481, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84348667, + "num_input_tokens_seen": 73017675, + "step": 3376, + "time_per_iteration": 2.480059862136841 + }, + { + "auxiliary_loss_clip": 0.01136414, + "auxiliary_loss_mlp": 0.01039819, + "balance_loss_clip": 1.04845893, + "balance_loss_mlp": 1.02012515, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.221221824254492, + "language_loss": 0.81850249, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.8402648, + "num_input_tokens_seen": 73036135, + "step": 3377, + "time_per_iteration": 2.5193910598754883 + }, + { + "auxiliary_loss_clip": 0.01120298, + "auxiliary_loss_mlp": 0.0105242, + "balance_loss_clip": 1.05216753, + "balance_loss_mlp": 1.03291714, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 1.95070036799559, + "language_loss": 0.7696861, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79141325, + "num_input_tokens_seen": 73054075, + "step": 3378, + "time_per_iteration": 2.5563714504241943 + }, + { + "auxiliary_loss_clip": 0.01088527, + "auxiliary_loss_mlp": 0.01054459, + "balance_loss_clip": 1.04987347, + "balance_loss_mlp": 1.03264368, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 2.3612405014591786, + "language_loss": 0.79645985, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.81788969, + "num_input_tokens_seen": 73073530, + "step": 3379, + "time_per_iteration": 2.6220247745513916 + }, + { + "auxiliary_loss_clip": 0.01134756, + "auxiliary_loss_mlp": 0.01044217, + "balance_loss_clip": 1.0528307, + "balance_loss_mlp": 1.02665746, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.9010308785150776, + "language_loss": 0.86655867, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.88834834, + "num_input_tokens_seen": 73092820, + "step": 3380, + "time_per_iteration": 2.5565786361694336 + }, + { + "auxiliary_loss_clip": 0.01151954, + "auxiliary_loss_mlp": 0.01048846, + "balance_loss_clip": 1.05479467, + "balance_loss_mlp": 1.03076172, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 2.227354497010493, + "language_loss": 0.75072646, + "learning_rate": 3.693218952340186e-06, + "loss": 0.7727344, + "num_input_tokens_seen": 73113385, + "step": 3381, + "time_per_iteration": 2.5147640705108643 + }, + { + "auxiliary_loss_clip": 0.01116513, + "auxiliary_loss_mlp": 0.01058212, + "balance_loss_clip": 1.04741263, + "balance_loss_mlp": 1.03953242, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.640897769490943, + "language_loss": 0.79330939, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81505668, + "num_input_tokens_seen": 73131195, + "step": 3382, + "time_per_iteration": 2.574390172958374 + }, + { + "auxiliary_loss_clip": 0.01111734, + "auxiliary_loss_mlp": 0.0080974, + "balance_loss_clip": 1.04907501, + "balance_loss_mlp": 1.03494632, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 2.9480296338349135, + "language_loss": 0.80346853, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82268327, + "num_input_tokens_seen": 73148850, + "step": 3383, + "time_per_iteration": 2.567694664001465 + }, + { + "auxiliary_loss_clip": 0.01097145, + "auxiliary_loss_mlp": 0.01039841, + "balance_loss_clip": 1.04496074, + "balance_loss_mlp": 1.02094591, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 2.023666106868807, + "language_loss": 0.74414682, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76551664, + "num_input_tokens_seen": 73166775, + "step": 3384, + "time_per_iteration": 2.577505111694336 + }, + { + "auxiliary_loss_clip": 0.01139534, + "auxiliary_loss_mlp": 0.01048769, + "balance_loss_clip": 1.04862809, + "balance_loss_mlp": 1.0288018, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 6.119993973992206, + "language_loss": 0.76722944, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78911245, + "num_input_tokens_seen": 73183215, + "step": 3385, + "time_per_iteration": 2.5399293899536133 + }, + { + "auxiliary_loss_clip": 0.01107435, + "auxiliary_loss_mlp": 0.01064399, + "balance_loss_clip": 1.05225682, + "balance_loss_mlp": 1.04445577, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.5005830569086331, + "language_loss": 0.68473536, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70645368, + "num_input_tokens_seen": 73203290, + "step": 3386, + "time_per_iteration": 2.642139196395874 + }, + { + "auxiliary_loss_clip": 0.01101352, + "auxiliary_loss_mlp": 0.01059582, + "balance_loss_clip": 1.04929352, + "balance_loss_mlp": 1.03945947, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.357919103899785, + "language_loss": 0.81207776, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83368713, + "num_input_tokens_seen": 73226185, + "step": 3387, + "time_per_iteration": 2.7188615798950195 + }, + { + "auxiliary_loss_clip": 0.01115115, + "auxiliary_loss_mlp": 0.01044149, + "balance_loss_clip": 1.04717243, + "balance_loss_mlp": 1.02575457, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 3.344659379482522, + "language_loss": 0.79439509, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.81598771, + "num_input_tokens_seen": 73243300, + "step": 3388, + "time_per_iteration": 2.5606720447540283 + }, + { + "auxiliary_loss_clip": 0.01149461, + "auxiliary_loss_mlp": 0.01044902, + "balance_loss_clip": 1.05050468, + "balance_loss_mlp": 1.02574515, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.7902276710875138, + "language_loss": 0.71979403, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.7417376, + "num_input_tokens_seen": 73261490, + "step": 3389, + "time_per_iteration": 2.488412618637085 + }, + { + "auxiliary_loss_clip": 0.01134344, + "auxiliary_loss_mlp": 0.01047232, + "balance_loss_clip": 1.04932392, + "balance_loss_mlp": 1.02894545, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 2.1677307904641396, + "language_loss": 0.86921829, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89103413, + "num_input_tokens_seen": 73280180, + "step": 3390, + "time_per_iteration": 2.5605275630950928 + }, + { + "auxiliary_loss_clip": 0.01117723, + "auxiliary_loss_mlp": 0.0105723, + "balance_loss_clip": 1.04772973, + "balance_loss_mlp": 1.03716719, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 2.3006120129509293, + "language_loss": 0.70977181, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73152137, + "num_input_tokens_seen": 73300680, + "step": 3391, + "time_per_iteration": 4.018676042556763 + }, + { + "auxiliary_loss_clip": 0.01112741, + "auxiliary_loss_mlp": 0.01048196, + "balance_loss_clip": 1.0483191, + "balance_loss_mlp": 1.02993274, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.4592445544482304, + "language_loss": 0.86422455, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88583386, + "num_input_tokens_seen": 73316760, + "step": 3392, + "time_per_iteration": 4.007593631744385 + }, + { + "auxiliary_loss_clip": 0.01137903, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_clip": 1.04854882, + "balance_loss_mlp": 1.03138924, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 2.1471200240705466, + "language_loss": 0.80460072, + "learning_rate": 3.69072700532013e-06, + "loss": 0.8264696, + "num_input_tokens_seen": 73339385, + "step": 3393, + "time_per_iteration": 2.566272258758545 + }, + { + "auxiliary_loss_clip": 0.01117143, + "auxiliary_loss_mlp": 0.01037235, + "balance_loss_clip": 1.04702795, + "balance_loss_mlp": 1.01971138, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.804341172107247, + "language_loss": 0.86395991, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88550371, + "num_input_tokens_seen": 73357235, + "step": 3394, + "time_per_iteration": 2.545711040496826 + }, + { + "auxiliary_loss_clip": 0.01131237, + "auxiliary_loss_mlp": 0.01048289, + "balance_loss_clip": 1.05103517, + "balance_loss_mlp": 1.03128982, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 2.4221781663366357, + "language_loss": 0.84440231, + "learning_rate": 3.69031078287345e-06, + "loss": 0.86619753, + "num_input_tokens_seen": 73374435, + "step": 3395, + "time_per_iteration": 2.475126266479492 + }, + { + "auxiliary_loss_clip": 0.01136062, + "auxiliary_loss_mlp": 0.01037693, + "balance_loss_clip": 1.04932463, + "balance_loss_mlp": 1.01914358, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.048376689197412, + "language_loss": 0.84105361, + "learning_rate": 3.690102575501033e-06, + "loss": 0.86279112, + "num_input_tokens_seen": 73391025, + "step": 3396, + "time_per_iteration": 3.9363181591033936 + }, + { + "auxiliary_loss_clip": 0.01108087, + "auxiliary_loss_mlp": 0.01044157, + "balance_loss_clip": 1.04722571, + "balance_loss_mlp": 1.02519059, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 2.1598500764415394, + "language_loss": 0.77469385, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79621625, + "num_input_tokens_seen": 73409270, + "step": 3397, + "time_per_iteration": 2.6185953617095947 + }, + { + "auxiliary_loss_clip": 0.0112147, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_clip": 1.04895675, + "balance_loss_mlp": 1.02849424, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 3.192592019943527, + "language_loss": 0.87949473, + "learning_rate": 3.689685968497518e-06, + "loss": 0.90116513, + "num_input_tokens_seen": 73425225, + "step": 3398, + "time_per_iteration": 3.8984038829803467 + }, + { + "auxiliary_loss_clip": 0.01118541, + "auxiliary_loss_mlp": 0.01049934, + "balance_loss_clip": 1.05268598, + "balance_loss_mlp": 1.03157556, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.181160693169387, + "language_loss": 0.78011405, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.80179876, + "num_input_tokens_seen": 73440940, + "step": 3399, + "time_per_iteration": 2.513711929321289 + }, + { + "auxiliary_loss_clip": 0.01135573, + "auxiliary_loss_mlp": 0.01038007, + "balance_loss_clip": 1.04842651, + "balance_loss_mlp": 1.0204711, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 2.125540109030586, + "language_loss": 0.76421916, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78595495, + "num_input_tokens_seen": 73458805, + "step": 3400, + "time_per_iteration": 2.5543415546417236 + }, + { + "auxiliary_loss_clip": 0.01113306, + "auxiliary_loss_mlp": 0.00801861, + "balance_loss_clip": 1.05056381, + "balance_loss_mlp": 1.02480555, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.8670433078103281, + "language_loss": 0.7933653, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81251699, + "num_input_tokens_seen": 73479380, + "step": 3401, + "time_per_iteration": 2.6463019847869873 + }, + { + "auxiliary_loss_clip": 0.01123329, + "auxiliary_loss_mlp": 0.01047433, + "balance_loss_clip": 1.04622209, + "balance_loss_mlp": 1.02881217, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.7273024926589577, + "language_loss": 0.69289565, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71460325, + "num_input_tokens_seen": 73505105, + "step": 3402, + "time_per_iteration": 2.692523241043091 + }, + { + "auxiliary_loss_clip": 0.01109087, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.04714298, + "balance_loss_mlp": 1.0247575, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 3.172747727781626, + "language_loss": 0.80823809, + "learning_rate": 3.688643329848496e-06, + "loss": 0.8297587, + "num_input_tokens_seen": 73523700, + "step": 3403, + "time_per_iteration": 2.570510149002075 + }, + { + "auxiliary_loss_clip": 0.01137626, + "auxiliary_loss_mlp": 0.01045465, + "balance_loss_clip": 1.05118382, + "balance_loss_mlp": 1.02814448, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 2.03621676229595, + "language_loss": 0.83449239, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.8563233, + "num_input_tokens_seen": 73542625, + "step": 3404, + "time_per_iteration": 2.513256549835205 + }, + { + "auxiliary_loss_clip": 0.01130006, + "auxiliary_loss_mlp": 0.01047823, + "balance_loss_clip": 1.0448885, + "balance_loss_mlp": 1.02935696, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.7714311089115549, + "language_loss": 0.85973078, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88150907, + "num_input_tokens_seen": 73561450, + "step": 3405, + "time_per_iteration": 2.545488119125366 + }, + { + "auxiliary_loss_clip": 0.01107884, + "auxiliary_loss_mlp": 0.01043908, + "balance_loss_clip": 1.04916358, + "balance_loss_mlp": 1.02655089, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.144152749989942, + "language_loss": 0.84130383, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.8628217, + "num_input_tokens_seen": 73577155, + "step": 3406, + "time_per_iteration": 2.5298378467559814 + }, + { + "auxiliary_loss_clip": 0.01147453, + "auxiliary_loss_mlp": 0.01041267, + "balance_loss_clip": 1.05124187, + "balance_loss_mlp": 1.02444661, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 2.0364324443209885, + "language_loss": 0.67775559, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.69964284, + "num_input_tokens_seen": 73594900, + "step": 3407, + "time_per_iteration": 2.473159074783325 + }, + { + "auxiliary_loss_clip": 0.01144244, + "auxiliary_loss_mlp": 0.01047457, + "balance_loss_clip": 1.04869151, + "balance_loss_mlp": 1.02981377, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.1749953592653455, + "language_loss": 0.83954334, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86146033, + "num_input_tokens_seen": 73613810, + "step": 3408, + "time_per_iteration": 2.5187361240386963 + }, + { + "auxiliary_loss_clip": 0.01148888, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.05118895, + "balance_loss_mlp": 1.02538431, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 3.421354100479567, + "language_loss": 0.63954228, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66145504, + "num_input_tokens_seen": 73631495, + "step": 3409, + "time_per_iteration": 2.466261148452759 + }, + { + "auxiliary_loss_clip": 0.01129922, + "auxiliary_loss_mlp": 0.01045837, + "balance_loss_clip": 1.04741991, + "balance_loss_mlp": 1.02853978, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.6486648578703644, + "language_loss": 0.8011961, + "learning_rate": 3.687180946553745e-06, + "loss": 0.8229537, + "num_input_tokens_seen": 73652840, + "step": 3410, + "time_per_iteration": 2.520407199859619 + }, + { + "auxiliary_loss_clip": 0.01092687, + "auxiliary_loss_mlp": 0.01046492, + "balance_loss_clip": 1.04999197, + "balance_loss_mlp": 1.02932596, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.3483070692916925, + "language_loss": 0.76493222, + "learning_rate": 3.686971778678803e-06, + "loss": 0.78632396, + "num_input_tokens_seen": 73672150, + "step": 3411, + "time_per_iteration": 2.6909637451171875 + }, + { + "auxiliary_loss_clip": 0.01128703, + "auxiliary_loss_mlp": 0.01048376, + "balance_loss_clip": 1.04897308, + "balance_loss_mlp": 1.03116262, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 1.9353367243453121, + "language_loss": 0.73537368, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75714445, + "num_input_tokens_seen": 73691940, + "step": 3412, + "time_per_iteration": 2.5275936126708984 + }, + { + "auxiliary_loss_clip": 0.01119609, + "auxiliary_loss_mlp": 0.01055457, + "balance_loss_clip": 1.04464936, + "balance_loss_mlp": 1.03485715, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.484238108751062, + "language_loss": 0.77972448, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.80147517, + "num_input_tokens_seen": 73709080, + "step": 3413, + "time_per_iteration": 2.5265417098999023 + }, + { + "auxiliary_loss_clip": 0.0110087, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_clip": 1.04630864, + "balance_loss_mlp": 1.03316784, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 3.2611945975981427, + "language_loss": 0.84530413, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.86683428, + "num_input_tokens_seen": 73727670, + "step": 3414, + "time_per_iteration": 2.5221221446990967 + }, + { + "auxiliary_loss_clip": 0.01132518, + "auxiliary_loss_mlp": 0.01045839, + "balance_loss_clip": 1.04634273, + "balance_loss_mlp": 1.02723074, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 1.8671865703084682, + "language_loss": 0.81041133, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.83219486, + "num_input_tokens_seen": 73747170, + "step": 3415, + "time_per_iteration": 2.514615535736084 + }, + { + "auxiliary_loss_clip": 0.01077865, + "auxiliary_loss_mlp": 0.01039683, + "balance_loss_clip": 1.0477798, + "balance_loss_mlp": 1.02348268, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 2.024348109670658, + "language_loss": 0.72763002, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.7488054, + "num_input_tokens_seen": 73767690, + "step": 3416, + "time_per_iteration": 2.6654114723205566 + }, + { + "auxiliary_loss_clip": 0.01138136, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.0500977, + "balance_loss_mlp": 1.0232203, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.2291375570070593, + "language_loss": 0.78635758, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80815721, + "num_input_tokens_seen": 73786900, + "step": 3417, + "time_per_iteration": 2.527583360671997 + }, + { + "auxiliary_loss_clip": 0.01136522, + "auxiliary_loss_mlp": 0.01045099, + "balance_loss_clip": 1.04886508, + "balance_loss_mlp": 1.02687216, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.9451885098947512, + "language_loss": 0.8769511, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89876729, + "num_input_tokens_seen": 73804515, + "step": 3418, + "time_per_iteration": 2.4742672443389893 + }, + { + "auxiliary_loss_clip": 0.01128303, + "auxiliary_loss_mlp": 0.0104354, + "balance_loss_clip": 1.0485692, + "balance_loss_mlp": 1.02556312, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.4939992866261087, + "language_loss": 0.6285187, + "learning_rate": 3.685296133421035e-06, + "loss": 0.65023714, + "num_input_tokens_seen": 73822910, + "step": 3419, + "time_per_iteration": 2.542381525039673 + }, + { + "auxiliary_loss_clip": 0.01134023, + "auxiliary_loss_mlp": 0.01052252, + "balance_loss_clip": 1.05439067, + "balance_loss_mlp": 1.03264165, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 3.658493039100118, + "language_loss": 0.86184001, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88370275, + "num_input_tokens_seen": 73841160, + "step": 3420, + "time_per_iteration": 2.548982620239258 + }, + { + "auxiliary_loss_clip": 0.01099273, + "auxiliary_loss_mlp": 0.00805346, + "balance_loss_clip": 1.0454874, + "balance_loss_mlp": 1.01867366, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 3.7589481241773663, + "language_loss": 0.71296239, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73200864, + "num_input_tokens_seen": 73862795, + "step": 3421, + "time_per_iteration": 2.711793899536133 + }, + { + "auxiliary_loss_clip": 0.01144457, + "auxiliary_loss_mlp": 0.01038638, + "balance_loss_clip": 1.04959691, + "balance_loss_mlp": 1.02018464, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 1.909110658739985, + "language_loss": 0.70873851, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.73056948, + "num_input_tokens_seen": 73881525, + "step": 3422, + "time_per_iteration": 2.4635732173919678 + }, + { + "auxiliary_loss_clip": 0.01054264, + "auxiliary_loss_mlp": 0.01023029, + "balance_loss_clip": 1.02866721, + "balance_loss_mlp": 1.0205611, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7476236925960988, + "language_loss": 0.55444312, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57521605, + "num_input_tokens_seen": 73937775, + "step": 3423, + "time_per_iteration": 3.1671698093414307 + }, + { + "auxiliary_loss_clip": 0.01096711, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.04828513, + "balance_loss_mlp": 1.02071428, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.8761180469436132, + "language_loss": 0.72157598, + "learning_rate": 3.684246777912353e-06, + "loss": 0.74293303, + "num_input_tokens_seen": 73958250, + "step": 3424, + "time_per_iteration": 2.6815574169158936 + }, + { + "auxiliary_loss_clip": 0.01125484, + "auxiliary_loss_mlp": 0.00804547, + "balance_loss_clip": 1.06058967, + "balance_loss_mlp": 1.02473688, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.8806108843194673, + "language_loss": 0.75443572, + "learning_rate": 3.684036715178351e-06, + "loss": 0.77373606, + "num_input_tokens_seen": 73977775, + "step": 3425, + "time_per_iteration": 2.570366621017456 + }, + { + "auxiliary_loss_clip": 0.01096457, + "auxiliary_loss_mlp": 0.01055216, + "balance_loss_clip": 1.05013728, + "balance_loss_mlp": 1.03726315, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.9586510448334638, + "language_loss": 0.88175142, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90326822, + "num_input_tokens_seen": 73996590, + "step": 3426, + "time_per_iteration": 2.6074695587158203 + }, + { + "auxiliary_loss_clip": 0.01136003, + "auxiliary_loss_mlp": 0.01046014, + "balance_loss_clip": 1.05319548, + "balance_loss_mlp": 1.02827549, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.4894599063659826, + "language_loss": 0.76708519, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.78890538, + "num_input_tokens_seen": 74015935, + "step": 3427, + "time_per_iteration": 2.541537284851074 + }, + { + "auxiliary_loss_clip": 0.01150178, + "auxiliary_loss_mlp": 0.01045321, + "balance_loss_clip": 1.05271578, + "balance_loss_mlp": 1.02662897, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.5560277648909873, + "language_loss": 0.73727775, + "learning_rate": 3.683406143855174e-06, + "loss": 0.7592327, + "num_input_tokens_seen": 74036575, + "step": 3428, + "time_per_iteration": 2.5128917694091797 + }, + { + "auxiliary_loss_clip": 0.01123697, + "auxiliary_loss_mlp": 0.01046349, + "balance_loss_clip": 1.04781461, + "balance_loss_mlp": 1.02729905, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 2.309891846703111, + "language_loss": 0.73496026, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.7566607, + "num_input_tokens_seen": 74055365, + "step": 3429, + "time_per_iteration": 2.5427491664886475 + }, + { + "auxiliary_loss_clip": 0.01133198, + "auxiliary_loss_mlp": 0.01052995, + "balance_loss_clip": 1.0532074, + "balance_loss_mlp": 1.03361142, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.9319490438952744, + "language_loss": 0.85432172, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87618363, + "num_input_tokens_seen": 74074875, + "step": 3430, + "time_per_iteration": 3.859550952911377 + }, + { + "auxiliary_loss_clip": 0.01077918, + "auxiliary_loss_mlp": 0.01054753, + "balance_loss_clip": 1.04829836, + "balance_loss_mlp": 1.03625226, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.669157853186351, + "language_loss": 0.68975449, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.71108127, + "num_input_tokens_seen": 74094505, + "step": 3431, + "time_per_iteration": 4.034096002578735 + }, + { + "auxiliary_loss_clip": 0.0102156, + "auxiliary_loss_mlp": 0.01015087, + "balance_loss_clip": 1.03331041, + "balance_loss_mlp": 1.01258373, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.814326978795341, + "language_loss": 0.60230935, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62267584, + "num_input_tokens_seen": 74158500, + "step": 3432, + "time_per_iteration": 3.3279008865356445 + }, + { + "auxiliary_loss_clip": 0.01135479, + "auxiliary_loss_mlp": 0.01041714, + "balance_loss_clip": 1.05207968, + "balance_loss_mlp": 1.02470255, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 2.3646516822946135, + "language_loss": 0.72169495, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74346685, + "num_input_tokens_seen": 74176685, + "step": 3433, + "time_per_iteration": 2.5453410148620605 + }, + { + "auxiliary_loss_clip": 0.01089425, + "auxiliary_loss_mlp": 0.01049824, + "balance_loss_clip": 1.04712129, + "balance_loss_mlp": 1.03023744, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 1.8661193936042433, + "language_loss": 0.86720085, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.88859338, + "num_input_tokens_seen": 74194935, + "step": 3434, + "time_per_iteration": 2.6232481002807617 + }, + { + "auxiliary_loss_clip": 0.01140222, + "auxiliary_loss_mlp": 0.01038424, + "balance_loss_clip": 1.05029297, + "balance_loss_mlp": 1.02134156, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 2.0777164564453456, + "language_loss": 0.69520593, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71699238, + "num_input_tokens_seen": 74215400, + "step": 3435, + "time_per_iteration": 4.008605718612671 + }, + { + "auxiliary_loss_clip": 0.01113363, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.04786062, + "balance_loss_mlp": 1.02230573, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.7668872442427106, + "language_loss": 0.89547193, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91701502, + "num_input_tokens_seen": 74234090, + "step": 3436, + "time_per_iteration": 3.9448013305664062 + }, + { + "auxiliary_loss_clip": 0.01110239, + "auxiliary_loss_mlp": 0.01044392, + "balance_loss_clip": 1.04932547, + "balance_loss_mlp": 1.02453148, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.9541562884935573, + "language_loss": 0.76485837, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.78640467, + "num_input_tokens_seen": 74253345, + "step": 3437, + "time_per_iteration": 2.6133923530578613 + }, + { + "auxiliary_loss_clip": 0.01140275, + "auxiliary_loss_mlp": 0.0103964, + "balance_loss_clip": 1.05196643, + "balance_loss_mlp": 1.02293873, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 2.5154564032676867, + "language_loss": 0.7757346, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.79753375, + "num_input_tokens_seen": 74271615, + "step": 3438, + "time_per_iteration": 2.512479782104492 + }, + { + "auxiliary_loss_clip": 0.01053206, + "auxiliary_loss_mlp": 0.01020467, + "balance_loss_clip": 1.02777362, + "balance_loss_mlp": 1.01803529, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8408839266320254, + "language_loss": 0.67097104, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69170785, + "num_input_tokens_seen": 74331390, + "step": 3439, + "time_per_iteration": 3.07629656791687 + }, + { + "auxiliary_loss_clip": 0.01136868, + "auxiliary_loss_mlp": 0.01037327, + "balance_loss_clip": 1.05002785, + "balance_loss_mlp": 1.02005339, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 2.3200109302949055, + "language_loss": 0.83991206, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86165398, + "num_input_tokens_seen": 74347335, + "step": 3440, + "time_per_iteration": 2.471365213394165 + }, + { + "auxiliary_loss_clip": 0.01136674, + "auxiliary_loss_mlp": 0.0104189, + "balance_loss_clip": 1.04990864, + "balance_loss_mlp": 1.02522492, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 2.019098512322724, + "language_loss": 0.85298049, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87476611, + "num_input_tokens_seen": 74366310, + "step": 3441, + "time_per_iteration": 2.5025110244750977 + }, + { + "auxiliary_loss_clip": 0.01103761, + "auxiliary_loss_mlp": 0.01043418, + "balance_loss_clip": 1.05373669, + "balance_loss_mlp": 1.02551305, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.9933889923050232, + "language_loss": 0.85565841, + "learning_rate": 3.680455884806959e-06, + "loss": 0.87713015, + "num_input_tokens_seen": 74387100, + "step": 3442, + "time_per_iteration": 2.6477363109588623 + }, + { + "auxiliary_loss_clip": 0.01073285, + "auxiliary_loss_mlp": 0.01040219, + "balance_loss_clip": 1.05398464, + "balance_loss_mlp": 1.0221827, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 2.3055245752461437, + "language_loss": 0.73040771, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75154275, + "num_input_tokens_seen": 74404460, + "step": 3443, + "time_per_iteration": 2.6486024856567383 + }, + { + "auxiliary_loss_clip": 0.01118892, + "auxiliary_loss_mlp": 0.00796983, + "balance_loss_clip": 1.05034018, + "balance_loss_mlp": 1.0132705, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 2.9927212050547807, + "language_loss": 0.85267627, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87183499, + "num_input_tokens_seen": 74423790, + "step": 3444, + "time_per_iteration": 2.5475831031799316 + }, + { + "auxiliary_loss_clip": 0.01024825, + "auxiliary_loss_mlp": 0.01000165, + "balance_loss_clip": 1.03623033, + "balance_loss_mlp": 0.99786395, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6949683184703506, + "language_loss": 0.57120758, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59145755, + "num_input_tokens_seen": 74488130, + "step": 3445, + "time_per_iteration": 3.1430788040161133 + }, + { + "auxiliary_loss_clip": 0.01145526, + "auxiliary_loss_mlp": 0.00799872, + "balance_loss_clip": 1.05042887, + "balance_loss_mlp": 1.0176065, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.6276757943805396, + "language_loss": 0.78120536, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.8006593, + "num_input_tokens_seen": 74506720, + "step": 3446, + "time_per_iteration": 2.483214855194092 + }, + { + "auxiliary_loss_clip": 0.0114348, + "auxiliary_loss_mlp": 0.01044322, + "balance_loss_clip": 1.05233717, + "balance_loss_mlp": 1.02381754, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 3.682693485183034, + "language_loss": 0.62167156, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64354956, + "num_input_tokens_seen": 74525330, + "step": 3447, + "time_per_iteration": 2.5261948108673096 + }, + { + "auxiliary_loss_clip": 0.01095314, + "auxiliary_loss_mlp": 0.01052544, + "balance_loss_clip": 1.04403543, + "balance_loss_mlp": 1.03258884, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 2.5452545842051646, + "language_loss": 0.86180961, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88328815, + "num_input_tokens_seen": 74544535, + "step": 3448, + "time_per_iteration": 2.600186586380005 + }, + { + "auxiliary_loss_clip": 0.01120032, + "auxiliary_loss_mlp": 0.01041015, + "balance_loss_clip": 1.04606879, + "balance_loss_mlp": 1.02104735, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 3.1937407915631537, + "language_loss": 0.75697815, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77858865, + "num_input_tokens_seen": 74562300, + "step": 3449, + "time_per_iteration": 2.539281129837036 + }, + { + "auxiliary_loss_clip": 0.0113192, + "auxiliary_loss_mlp": 0.01050816, + "balance_loss_clip": 1.04943419, + "balance_loss_mlp": 1.03146803, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 1.8821350292083072, + "language_loss": 0.76393133, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78575867, + "num_input_tokens_seen": 74580080, + "step": 3450, + "time_per_iteration": 2.516465425491333 + }, + { + "auxiliary_loss_clip": 0.01116767, + "auxiliary_loss_mlp": 0.01044907, + "balance_loss_clip": 1.04848289, + "balance_loss_mlp": 1.02700186, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 2.526487579965727, + "language_loss": 0.8224951, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.8441118, + "num_input_tokens_seen": 74598980, + "step": 3451, + "time_per_iteration": 2.590925455093384 + }, + { + "auxiliary_loss_clip": 0.01059186, + "auxiliary_loss_mlp": 0.01003685, + "balance_loss_clip": 1.02382946, + "balance_loss_mlp": 1.00122893, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7851238299338253, + "language_loss": 0.56603658, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58666527, + "num_input_tokens_seen": 74655275, + "step": 3452, + "time_per_iteration": 3.0009827613830566 + }, + { + "auxiliary_loss_clip": 0.01117395, + "auxiliary_loss_mlp": 0.00799869, + "balance_loss_clip": 1.05144322, + "balance_loss_mlp": 1.01940608, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 1.919990072486053, + "language_loss": 0.88029069, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.8994633, + "num_input_tokens_seen": 74674560, + "step": 3453, + "time_per_iteration": 2.5796854496002197 + }, + { + "auxiliary_loss_clip": 0.01144416, + "auxiliary_loss_mlp": 0.01044419, + "balance_loss_clip": 1.05596519, + "balance_loss_mlp": 1.02517867, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 3.998931972470843, + "language_loss": 0.79963958, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82152796, + "num_input_tokens_seen": 74694500, + "step": 3454, + "time_per_iteration": 2.556161642074585 + }, + { + "auxiliary_loss_clip": 0.01102688, + "auxiliary_loss_mlp": 0.00800396, + "balance_loss_clip": 1.0480746, + "balance_loss_mlp": 1.0173912, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 3.145513064037283, + "language_loss": 0.76947969, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.7885105, + "num_input_tokens_seen": 74710485, + "step": 3455, + "time_per_iteration": 2.528885841369629 + }, + { + "auxiliary_loss_clip": 0.01116421, + "auxiliary_loss_mlp": 0.01046977, + "balance_loss_clip": 1.05324399, + "balance_loss_mlp": 1.02922714, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 1.8715827029451806, + "language_loss": 0.80668479, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82831877, + "num_input_tokens_seen": 74727450, + "step": 3456, + "time_per_iteration": 2.5601418018341064 + }, + { + "auxiliary_loss_clip": 0.01107812, + "auxiliary_loss_mlp": 0.00799771, + "balance_loss_clip": 1.05171108, + "balance_loss_mlp": 1.01674557, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.5918944848958074, + "language_loss": 0.7773484, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.79642427, + "num_input_tokens_seen": 74746725, + "step": 3457, + "time_per_iteration": 2.6408443450927734 + }, + { + "auxiliary_loss_clip": 0.0107924, + "auxiliary_loss_mlp": 0.01054229, + "balance_loss_clip": 1.04806256, + "balance_loss_mlp": 1.03279519, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 1.838500144260174, + "language_loss": 0.83441734, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85575211, + "num_input_tokens_seen": 74765255, + "step": 3458, + "time_per_iteration": 2.641970157623291 + }, + { + "auxiliary_loss_clip": 0.01134122, + "auxiliary_loss_mlp": 0.00800162, + "balance_loss_clip": 1.04950285, + "balance_loss_mlp": 1.01817131, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 2.7009939442045257, + "language_loss": 0.75923985, + "learning_rate": 3.676856638489272e-06, + "loss": 0.77858269, + "num_input_tokens_seen": 74785710, + "step": 3459, + "time_per_iteration": 2.538094997406006 + }, + { + "auxiliary_loss_clip": 0.01081538, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.04664731, + "balance_loss_mlp": 1.01684225, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 2.6800614517237182, + "language_loss": 0.77361226, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79476798, + "num_input_tokens_seen": 74804490, + "step": 3460, + "time_per_iteration": 2.662203311920166 + }, + { + "auxiliary_loss_clip": 0.01085865, + "auxiliary_loss_mlp": 0.01043899, + "balance_loss_clip": 1.04721487, + "balance_loss_mlp": 1.02557683, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 1.9339180019431084, + "language_loss": 0.75754774, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77884531, + "num_input_tokens_seen": 74826340, + "step": 3461, + "time_per_iteration": 2.682154893875122 + }, + { + "auxiliary_loss_clip": 0.01122288, + "auxiliary_loss_mlp": 0.01041392, + "balance_loss_clip": 1.04963934, + "balance_loss_mlp": 1.02255654, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 1.7561725225975717, + "language_loss": 0.88405389, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90569067, + "num_input_tokens_seen": 74844960, + "step": 3462, + "time_per_iteration": 2.5886764526367188 + }, + { + "auxiliary_loss_clip": 0.01020584, + "auxiliary_loss_mlp": 0.01120873, + "balance_loss_clip": 1.02479291, + "balance_loss_mlp": 1.60749757, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.8640369271827509, + "language_loss": 0.59079254, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.61220706, + "num_input_tokens_seen": 74909075, + "step": 3463, + "time_per_iteration": 3.3037097454071045 + }, + { + "auxiliary_loss_clip": 0.01125607, + "auxiliary_loss_mlp": 0.01047717, + "balance_loss_clip": 1.04668713, + "balance_loss_mlp": 1.02945399, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.5837400195383924, + "language_loss": 0.66200614, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68373942, + "num_input_tokens_seen": 74928125, + "step": 3464, + "time_per_iteration": 2.5874416828155518 + }, + { + "auxiliary_loss_clip": 0.01112719, + "auxiliary_loss_mlp": 0.01041402, + "balance_loss_clip": 1.04595304, + "balance_loss_mlp": 1.02272153, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 2.442729049794554, + "language_loss": 0.83733582, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.85887706, + "num_input_tokens_seen": 74945090, + "step": 3465, + "time_per_iteration": 2.5352060794830322 + }, + { + "auxiliary_loss_clip": 0.01092991, + "auxiliary_loss_mlp": 0.01039456, + "balance_loss_clip": 1.04790854, + "balance_loss_mlp": 1.02151537, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 2.2527418712445355, + "language_loss": 0.82081485, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.84213924, + "num_input_tokens_seen": 74963630, + "step": 3466, + "time_per_iteration": 2.617039203643799 + }, + { + "auxiliary_loss_clip": 0.01133547, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.05000508, + "balance_loss_mlp": 1.02287698, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 1.9085569339368604, + "language_loss": 0.81773984, + "learning_rate": 3.675156514448716e-06, + "loss": 0.8394599, + "num_input_tokens_seen": 74981875, + "step": 3467, + "time_per_iteration": 2.4802989959716797 + }, + { + "auxiliary_loss_clip": 0.01142471, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.05050147, + "balance_loss_mlp": 1.02357268, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 1.9800416984159306, + "language_loss": 0.8179723, + "learning_rate": 3.674943713009518e-06, + "loss": 0.8397907, + "num_input_tokens_seen": 74999155, + "step": 3468, + "time_per_iteration": 3.8251712322235107 + }, + { + "auxiliary_loss_clip": 0.01136866, + "auxiliary_loss_mlp": 0.01047679, + "balance_loss_clip": 1.04933393, + "balance_loss_mlp": 1.02747309, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 2.1381579531805297, + "language_loss": 0.90033662, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92218208, + "num_input_tokens_seen": 75017850, + "step": 3469, + "time_per_iteration": 2.5651025772094727 + }, + { + "auxiliary_loss_clip": 0.01112522, + "auxiliary_loss_mlp": 0.0104036, + "balance_loss_clip": 1.05138671, + "balance_loss_mlp": 1.02283657, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 2.275679910968917, + "language_loss": 0.76328593, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78481472, + "num_input_tokens_seen": 75039270, + "step": 3470, + "time_per_iteration": 4.0824267864227295 + }, + { + "auxiliary_loss_clip": 0.01124286, + "auxiliary_loss_mlp": 0.01044249, + "balance_loss_clip": 1.0514462, + "balance_loss_mlp": 1.02652287, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 2.895830467034021, + "language_loss": 0.75791216, + "learning_rate": 3.674304927640011e-06, + "loss": 0.77959746, + "num_input_tokens_seen": 75059350, + "step": 3471, + "time_per_iteration": 2.594482660293579 + }, + { + "auxiliary_loss_clip": 0.01116278, + "auxiliary_loss_mlp": 0.01048773, + "balance_loss_clip": 1.04725099, + "balance_loss_mlp": 1.02955639, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.7431905605982951, + "language_loss": 0.75649774, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77814823, + "num_input_tokens_seen": 75080150, + "step": 3472, + "time_per_iteration": 2.6478099822998047 + }, + { + "auxiliary_loss_clip": 0.01140492, + "auxiliary_loss_mlp": 0.01038848, + "balance_loss_clip": 1.05323529, + "balance_loss_mlp": 1.02189624, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 2.1396975986549878, + "language_loss": 0.84804034, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86983377, + "num_input_tokens_seen": 75097920, + "step": 3473, + "time_per_iteration": 3.877782106399536 + }, + { + "auxiliary_loss_clip": 0.01066398, + "auxiliary_loss_mlp": 0.01000987, + "balance_loss_clip": 1.06408632, + "balance_loss_mlp": 0.99860305, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.884051610262058, + "language_loss": 0.63648605, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65715992, + "num_input_tokens_seen": 75152410, + "step": 3474, + "time_per_iteration": 3.0930633544921875 + }, + { + "auxiliary_loss_clip": 0.01127642, + "auxiliary_loss_mlp": 0.01043503, + "balance_loss_clip": 1.04923463, + "balance_loss_mlp": 1.02613401, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 2.3989175930908684, + "language_loss": 0.70152605, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72323751, + "num_input_tokens_seen": 75173265, + "step": 3475, + "time_per_iteration": 4.052665948867798 + }, + { + "auxiliary_loss_clip": 0.01149788, + "auxiliary_loss_mlp": 0.01044503, + "balance_loss_clip": 1.05270267, + "balance_loss_mlp": 1.02620435, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 2.8976780131969218, + "language_loss": 0.69911337, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72105634, + "num_input_tokens_seen": 75193640, + "step": 3476, + "time_per_iteration": 2.4820446968078613 + }, + { + "auxiliary_loss_clip": 0.0112398, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.04962373, + "balance_loss_mlp": 1.02624524, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 2.08871852711378, + "language_loss": 0.89302534, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.91469771, + "num_input_tokens_seen": 75212545, + "step": 3477, + "time_per_iteration": 2.5700876712799072 + }, + { + "auxiliary_loss_clip": 0.01089968, + "auxiliary_loss_mlp": 0.01048953, + "balance_loss_clip": 1.05047798, + "balance_loss_mlp": 1.03116715, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 2.424945033212532, + "language_loss": 0.68271601, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70410526, + "num_input_tokens_seen": 75230865, + "step": 3478, + "time_per_iteration": 2.630680561065674 + }, + { + "auxiliary_loss_clip": 0.01096439, + "auxiliary_loss_mlp": 0.01050505, + "balance_loss_clip": 1.04820132, + "balance_loss_mlp": 1.03079998, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 3.9872498620182726, + "language_loss": 0.84516299, + "learning_rate": 3.672598707029127e-06, + "loss": 0.86663246, + "num_input_tokens_seen": 75248285, + "step": 3479, + "time_per_iteration": 2.5455973148345947 + }, + { + "auxiliary_loss_clip": 0.01111411, + "auxiliary_loss_mlp": 0.01057407, + "balance_loss_clip": 1.05196404, + "balance_loss_mlp": 1.03761852, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 3.0052169094925927, + "language_loss": 0.74061263, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76230085, + "num_input_tokens_seen": 75266310, + "step": 3480, + "time_per_iteration": 2.5335938930511475 + }, + { + "auxiliary_loss_clip": 0.01104067, + "auxiliary_loss_mlp": 0.01047917, + "balance_loss_clip": 1.04887486, + "balance_loss_mlp": 1.03188896, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.1338674047506117, + "language_loss": 0.75857693, + "learning_rate": 3.67217151746346e-06, + "loss": 0.78009677, + "num_input_tokens_seen": 75284175, + "step": 3481, + "time_per_iteration": 2.570148229598999 + }, + { + "auxiliary_loss_clip": 0.01088691, + "auxiliary_loss_mlp": 0.01051203, + "balance_loss_clip": 1.0499928, + "balance_loss_mlp": 1.03358376, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 1.9284863691121146, + "language_loss": 0.84975195, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87115085, + "num_input_tokens_seen": 75303465, + "step": 3482, + "time_per_iteration": 2.611280918121338 + }, + { + "auxiliary_loss_clip": 0.01095507, + "auxiliary_loss_mlp": 0.01047784, + "balance_loss_clip": 1.05016088, + "balance_loss_mlp": 1.03027201, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 2.0939955795071117, + "language_loss": 0.70713842, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.7285713, + "num_input_tokens_seen": 75325290, + "step": 3483, + "time_per_iteration": 2.7167322635650635 + }, + { + "auxiliary_loss_clip": 0.01121781, + "auxiliary_loss_mlp": 0.01052635, + "balance_loss_clip": 1.05267239, + "balance_loss_mlp": 1.03487253, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.6948069290669017, + "language_loss": 0.75003344, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77177751, + "num_input_tokens_seen": 75343895, + "step": 3484, + "time_per_iteration": 2.531038999557495 + }, + { + "auxiliary_loss_clip": 0.01111533, + "auxiliary_loss_mlp": 0.01045161, + "balance_loss_clip": 1.05212474, + "balance_loss_mlp": 1.0281024, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 2.297391168367102, + "language_loss": 0.70412374, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.72569072, + "num_input_tokens_seen": 75367100, + "step": 3485, + "time_per_iteration": 2.6345601081848145 + }, + { + "auxiliary_loss_clip": 0.01084735, + "auxiliary_loss_mlp": 0.00995497, + "balance_loss_clip": 1.05218363, + "balance_loss_mlp": 1.38515329, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 2.007452180720724, + "language_loss": 0.82957196, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85037428, + "num_input_tokens_seen": 75389925, + "step": 3486, + "time_per_iteration": 2.677536964416504 + }, + { + "auxiliary_loss_clip": 0.01132715, + "auxiliary_loss_mlp": 0.01050763, + "balance_loss_clip": 1.04821658, + "balance_loss_mlp": 1.03393042, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 1.9165150088301315, + "language_loss": 0.86945432, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89128917, + "num_input_tokens_seen": 75408575, + "step": 3487, + "time_per_iteration": 2.607020378112793 + }, + { + "auxiliary_loss_clip": 0.01110931, + "auxiliary_loss_mlp": 0.01043318, + "balance_loss_clip": 1.05018532, + "balance_loss_mlp": 1.02475691, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 4.584333532155842, + "language_loss": 0.72927707, + "learning_rate": 3.670674357028504e-06, + "loss": 0.75081956, + "num_input_tokens_seen": 75427155, + "step": 3488, + "time_per_iteration": 2.618422746658325 + }, + { + "auxiliary_loss_clip": 0.01115491, + "auxiliary_loss_mlp": 0.01037127, + "balance_loss_clip": 1.05096412, + "balance_loss_mlp": 1.02142715, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.627647097816227, + "language_loss": 0.81081307, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.83233923, + "num_input_tokens_seen": 75444450, + "step": 3489, + "time_per_iteration": 2.525465726852417 + }, + { + "auxiliary_loss_clip": 0.01148387, + "auxiliary_loss_mlp": 0.01043874, + "balance_loss_clip": 1.05097902, + "balance_loss_mlp": 1.02657723, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 2.393596439976296, + "language_loss": 0.7305547, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75247729, + "num_input_tokens_seen": 75462625, + "step": 3490, + "time_per_iteration": 2.5032074451446533 + }, + { + "auxiliary_loss_clip": 0.01122455, + "auxiliary_loss_mlp": 0.01051087, + "balance_loss_clip": 1.0522908, + "balance_loss_mlp": 1.03444529, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 1.9587252042989014, + "language_loss": 0.70764756, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72938299, + "num_input_tokens_seen": 75480640, + "step": 3491, + "time_per_iteration": 2.5132462978363037 + }, + { + "auxiliary_loss_clip": 0.01135317, + "auxiliary_loss_mlp": 0.00902845, + "balance_loss_clip": 1.04903293, + "balance_loss_mlp": 1.21283817, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 7.961475652932225, + "language_loss": 0.79656792, + "learning_rate": 3.669817442854444e-06, + "loss": 0.81694949, + "num_input_tokens_seen": 75494900, + "step": 3492, + "time_per_iteration": 2.5188467502593994 + }, + { + "auxiliary_loss_clip": 0.01138289, + "auxiliary_loss_mlp": 0.00909516, + "balance_loss_clip": 1.05258656, + "balance_loss_mlp": 1.22501016, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 2.0505421184242154, + "language_loss": 0.86747658, + "learning_rate": 3.669603055991502e-06, + "loss": 0.88795471, + "num_input_tokens_seen": 75513370, + "step": 3493, + "time_per_iteration": 2.505835771560669 + }, + { + "auxiliary_loss_clip": 0.01109751, + "auxiliary_loss_mlp": 0.01044329, + "balance_loss_clip": 1.04797983, + "balance_loss_mlp": 1.02771163, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 1.679712941880243, + "language_loss": 0.68770188, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.7092427, + "num_input_tokens_seen": 75532480, + "step": 3494, + "time_per_iteration": 2.5466177463531494 + }, + { + "auxiliary_loss_clip": 0.01138635, + "auxiliary_loss_mlp": 0.01041421, + "balance_loss_clip": 1.04966712, + "balance_loss_mlp": 1.02437425, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 2.2017907207780367, + "language_loss": 0.79248595, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81428647, + "num_input_tokens_seen": 75552745, + "step": 3495, + "time_per_iteration": 2.6143958568573 + }, + { + "auxiliary_loss_clip": 0.01113852, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.048033, + "balance_loss_mlp": 1.02498865, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.63401207119576, + "language_loss": 0.77094662, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79251218, + "num_input_tokens_seen": 75574355, + "step": 3496, + "time_per_iteration": 2.563825845718384 + }, + { + "auxiliary_loss_clip": 0.0112954, + "auxiliary_loss_mlp": 0.01046132, + "balance_loss_clip": 1.04858029, + "balance_loss_mlp": 1.02794075, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 2.9252521514504894, + "language_loss": 0.82226348, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84402025, + "num_input_tokens_seen": 75592215, + "step": 3497, + "time_per_iteration": 2.5160553455352783 + }, + { + "auxiliary_loss_clip": 0.01139699, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.05287313, + "balance_loss_mlp": 1.02833104, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 1.8937955832744184, + "language_loss": 0.67519683, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69705033, + "num_input_tokens_seen": 75610740, + "step": 3498, + "time_per_iteration": 2.56754207611084 + }, + { + "auxiliary_loss_clip": 0.01123172, + "auxiliary_loss_mlp": 0.01039989, + "balance_loss_clip": 1.05134702, + "balance_loss_mlp": 1.02221465, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 1.9410323864550467, + "language_loss": 0.80622566, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.82785726, + "num_input_tokens_seen": 75631005, + "step": 3499, + "time_per_iteration": 2.5532495975494385 + }, + { + "auxiliary_loss_clip": 0.01137393, + "auxiliary_loss_mlp": 0.01048415, + "balance_loss_clip": 1.05228949, + "balance_loss_mlp": 1.03213739, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 2.616716500037735, + "language_loss": 0.78575373, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80761182, + "num_input_tokens_seen": 75650655, + "step": 3500, + "time_per_iteration": 2.5483734607696533 + }, + { + "auxiliary_loss_clip": 0.01125127, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.05135477, + "balance_loss_mlp": 1.0255903, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.7942351813245856, + "language_loss": 0.74132001, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76300621, + "num_input_tokens_seen": 75669895, + "step": 3501, + "time_per_iteration": 2.5845720767974854 + }, + { + "auxiliary_loss_clip": 0.01131982, + "auxiliary_loss_mlp": 0.01035957, + "balance_loss_clip": 1.0493896, + "balance_loss_mlp": 1.01890969, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.6162144415791853, + "language_loss": 0.75322044, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77489978, + "num_input_tokens_seen": 75689535, + "step": 3502, + "time_per_iteration": 2.543369770050049 + }, + { + "auxiliary_loss_clip": 0.01095476, + "auxiliary_loss_mlp": 0.01037258, + "balance_loss_clip": 1.04675198, + "balance_loss_mlp": 1.01898336, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 1.8790328207653966, + "language_loss": 0.77159524, + "learning_rate": 3.667455706571316e-06, + "loss": 0.7929225, + "num_input_tokens_seen": 75709265, + "step": 3503, + "time_per_iteration": 2.604196786880493 + }, + { + "auxiliary_loss_clip": 0.01085188, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.04552209, + "balance_loss_mlp": 1.02364945, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 3.260555619688053, + "language_loss": 0.78570557, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80699515, + "num_input_tokens_seen": 75727050, + "step": 3504, + "time_per_iteration": 2.6142737865448 + }, + { + "auxiliary_loss_clip": 0.01112166, + "auxiliary_loss_mlp": 0.01047676, + "balance_loss_clip": 1.0468111, + "balance_loss_mlp": 1.02925849, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.5690313262937263, + "language_loss": 0.76610136, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.78769982, + "num_input_tokens_seen": 75747175, + "step": 3505, + "time_per_iteration": 2.5818111896514893 + }, + { + "auxiliary_loss_clip": 0.01114932, + "auxiliary_loss_mlp": 0.01045907, + "balance_loss_clip": 1.04806197, + "balance_loss_mlp": 1.02863383, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.8588529483175944, + "language_loss": 0.64074957, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.66235799, + "num_input_tokens_seen": 75767690, + "step": 3506, + "time_per_iteration": 2.5834028720855713 + }, + { + "auxiliary_loss_clip": 0.01134723, + "auxiliary_loss_mlp": 0.01046698, + "balance_loss_clip": 1.04999852, + "balance_loss_mlp": 1.02891171, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.7451141039053462, + "language_loss": 0.81971765, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84153187, + "num_input_tokens_seen": 75787255, + "step": 3507, + "time_per_iteration": 3.9661033153533936 + }, + { + "auxiliary_loss_clip": 0.01133643, + "auxiliary_loss_mlp": 0.01045131, + "balance_loss_clip": 1.04855299, + "balance_loss_mlp": 1.02744079, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 1.692461244655297, + "language_loss": 0.7549082, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77669597, + "num_input_tokens_seen": 75805890, + "step": 3508, + "time_per_iteration": 2.4953200817108154 + }, + { + "auxiliary_loss_clip": 0.01147844, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.04835439, + "balance_loss_mlp": 1.01986527, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 12.334128264821654, + "language_loss": 0.85571086, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.8775571, + "num_input_tokens_seen": 75821620, + "step": 3509, + "time_per_iteration": 3.8399300575256348 + }, + { + "auxiliary_loss_clip": 0.01113698, + "auxiliary_loss_mlp": 0.01048644, + "balance_loss_clip": 1.05233073, + "balance_loss_mlp": 1.02966559, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 10.447453394036526, + "language_loss": 0.68193603, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.70355952, + "num_input_tokens_seen": 75842490, + "step": 3510, + "time_per_iteration": 2.6425082683563232 + }, + { + "auxiliary_loss_clip": 0.01147457, + "auxiliary_loss_mlp": 0.0104395, + "balance_loss_clip": 1.04872108, + "balance_loss_mlp": 1.02656949, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 2.1970924158165226, + "language_loss": 0.72436196, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74627602, + "num_input_tokens_seen": 75865985, + "step": 3511, + "time_per_iteration": 2.5369577407836914 + }, + { + "auxiliary_loss_clip": 0.01064589, + "auxiliary_loss_mlp": 0.01044876, + "balance_loss_clip": 1.04728663, + "balance_loss_mlp": 1.0253135, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 6.465513340256305, + "language_loss": 0.69501364, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71610832, + "num_input_tokens_seen": 75882745, + "step": 3512, + "time_per_iteration": 4.399074077606201 + }, + { + "auxiliary_loss_clip": 0.01135645, + "auxiliary_loss_mlp": 0.01052916, + "balance_loss_clip": 1.04727292, + "balance_loss_mlp": 1.03397346, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 1.80625678094807, + "language_loss": 0.73233694, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.75422257, + "num_input_tokens_seen": 75904305, + "step": 3513, + "time_per_iteration": 4.506604433059692 + }, + { + "auxiliary_loss_clip": 0.01119423, + "auxiliary_loss_mlp": 0.01039118, + "balance_loss_clip": 1.04878569, + "balance_loss_mlp": 1.02223778, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 10.452643770281146, + "language_loss": 0.74398744, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76557285, + "num_input_tokens_seen": 75923710, + "step": 3514, + "time_per_iteration": 2.5936524868011475 + }, + { + "auxiliary_loss_clip": 0.01131753, + "auxiliary_loss_mlp": 0.01044279, + "balance_loss_clip": 1.05141842, + "balance_loss_mlp": 1.02629066, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.7857522943942439, + "language_loss": 0.76661825, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.7883786, + "num_input_tokens_seen": 75942625, + "step": 3515, + "time_per_iteration": 2.5500011444091797 + }, + { + "auxiliary_loss_clip": 0.01124767, + "auxiliary_loss_mlp": 0.01046555, + "balance_loss_clip": 1.05254245, + "balance_loss_mlp": 1.02868533, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 2.728392262374262, + "language_loss": 0.68636107, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70807433, + "num_input_tokens_seen": 75959930, + "step": 3516, + "time_per_iteration": 2.501235008239746 + }, + { + "auxiliary_loss_clip": 0.01110838, + "auxiliary_loss_mlp": 0.01051828, + "balance_loss_clip": 1.0548209, + "balance_loss_mlp": 1.03368449, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 2.0239280009973957, + "language_loss": 0.85076702, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87239373, + "num_input_tokens_seen": 75980335, + "step": 3517, + "time_per_iteration": 2.6136910915374756 + }, + { + "auxiliary_loss_clip": 0.01123624, + "auxiliary_loss_mlp": 0.01039693, + "balance_loss_clip": 1.04714513, + "balance_loss_mlp": 1.02191842, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 2.3317184645642417, + "language_loss": 0.6218003, + "learning_rate": 3.664222829354512e-06, + "loss": 0.64343351, + "num_input_tokens_seen": 76002095, + "step": 3518, + "time_per_iteration": 2.6331639289855957 + }, + { + "auxiliary_loss_clip": 0.01083533, + "auxiliary_loss_mlp": 0.01049899, + "balance_loss_clip": 1.05089772, + "balance_loss_mlp": 1.03326988, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 2.022232231062709, + "language_loss": 0.8900677, + "learning_rate": 3.664006799041303e-06, + "loss": 0.91140199, + "num_input_tokens_seen": 76020425, + "step": 3519, + "time_per_iteration": 2.6855738162994385 + }, + { + "auxiliary_loss_clip": 0.01131311, + "auxiliary_loss_mlp": 0.01050373, + "balance_loss_clip": 1.05117714, + "balance_loss_mlp": 1.0321219, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 2.012051516283834, + "language_loss": 0.81462771, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.8364445, + "num_input_tokens_seen": 76041210, + "step": 3520, + "time_per_iteration": 2.5636672973632812 + }, + { + "auxiliary_loss_clip": 0.01120394, + "auxiliary_loss_mlp": 0.01047989, + "balance_loss_clip": 1.04906666, + "balance_loss_mlp": 1.03066802, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.7195554210974848, + "language_loss": 0.75815171, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.77983558, + "num_input_tokens_seen": 76062685, + "step": 3521, + "time_per_iteration": 2.581467390060425 + }, + { + "auxiliary_loss_clip": 0.01097727, + "auxiliary_loss_mlp": 0.0104762, + "balance_loss_clip": 1.05080175, + "balance_loss_mlp": 1.03075171, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 1.8091106869172928, + "language_loss": 0.75670564, + "learning_rate": 3.663358329538626e-06, + "loss": 0.77815902, + "num_input_tokens_seen": 76082300, + "step": 3522, + "time_per_iteration": 2.6482675075531006 + }, + { + "auxiliary_loss_clip": 0.01146532, + "auxiliary_loss_mlp": 0.01050601, + "balance_loss_clip": 1.04895616, + "balance_loss_mlp": 1.03228998, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 2.4183802005068222, + "language_loss": 0.70668262, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72865397, + "num_input_tokens_seen": 76101135, + "step": 3523, + "time_per_iteration": 2.5337729454040527 + }, + { + "auxiliary_loss_clip": 0.01135335, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_clip": 1.05021763, + "balance_loss_mlp": 1.03043699, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.1312922476985507, + "language_loss": 0.76564622, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.78747678, + "num_input_tokens_seen": 76119320, + "step": 3524, + "time_per_iteration": 2.4848852157592773 + }, + { + "auxiliary_loss_clip": 0.01122453, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.04438567, + "balance_loss_mlp": 1.02727914, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 2.210570732177641, + "language_loss": 0.81818151, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83985209, + "num_input_tokens_seen": 76137445, + "step": 3525, + "time_per_iteration": 2.548069477081299 + }, + { + "auxiliary_loss_clip": 0.01087691, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.04595518, + "balance_loss_mlp": 1.0220108, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 5.024190721970387, + "language_loss": 0.75098658, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77226442, + "num_input_tokens_seen": 76159500, + "step": 3526, + "time_per_iteration": 2.6984596252441406 + }, + { + "auxiliary_loss_clip": 0.01148353, + "auxiliary_loss_mlp": 0.01039429, + "balance_loss_clip": 1.04897857, + "balance_loss_mlp": 1.02151132, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 2.5588782712728046, + "language_loss": 0.76503098, + "learning_rate": 3.662276285649284e-06, + "loss": 0.78690875, + "num_input_tokens_seen": 76177990, + "step": 3527, + "time_per_iteration": 2.4965102672576904 + }, + { + "auxiliary_loss_clip": 0.01146794, + "auxiliary_loss_mlp": 0.01046128, + "balance_loss_clip": 1.04921532, + "balance_loss_mlp": 1.02751935, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 3.708553563873243, + "language_loss": 0.77640057, + "learning_rate": 3.662059687737528e-06, + "loss": 0.79832971, + "num_input_tokens_seen": 76197125, + "step": 3528, + "time_per_iteration": 2.4879233837127686 + }, + { + "auxiliary_loss_clip": 0.01134044, + "auxiliary_loss_mlp": 0.01047466, + "balance_loss_clip": 1.04934907, + "balance_loss_mlp": 1.03009689, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 2.1194008089650396, + "language_loss": 0.81657863, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83839375, + "num_input_tokens_seen": 76216215, + "step": 3529, + "time_per_iteration": 2.5959057807922363 + }, + { + "auxiliary_loss_clip": 0.01128169, + "auxiliary_loss_mlp": 0.00897232, + "balance_loss_clip": 1.04792738, + "balance_loss_mlp": 1.19988227, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 3.2401218420683318, + "language_loss": 0.76727259, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78752661, + "num_input_tokens_seen": 76237010, + "step": 3530, + "time_per_iteration": 2.5688374042510986 + }, + { + "auxiliary_loss_clip": 0.01145752, + "auxiliary_loss_mlp": 0.01042723, + "balance_loss_clip": 1.04892933, + "balance_loss_mlp": 1.0253545, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.3607547477897066, + "language_loss": 0.83353627, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85542107, + "num_input_tokens_seen": 76255965, + "step": 3531, + "time_per_iteration": 2.4826738834381104 + }, + { + "auxiliary_loss_clip": 0.0111902, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.04983568, + "balance_loss_mlp": 1.02335012, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.2670635914066213, + "language_loss": 0.73539811, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75701845, + "num_input_tokens_seen": 76272150, + "step": 3532, + "time_per_iteration": 2.5092530250549316 + }, + { + "auxiliary_loss_clip": 0.01117113, + "auxiliary_loss_mlp": 0.01044131, + "balance_loss_clip": 1.04752171, + "balance_loss_mlp": 1.02518892, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 2.0012802607863214, + "language_loss": 0.74065185, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76226437, + "num_input_tokens_seen": 76291425, + "step": 3533, + "time_per_iteration": 2.5427908897399902 + }, + { + "auxiliary_loss_clip": 0.01140397, + "auxiliary_loss_mlp": 0.01044, + "balance_loss_clip": 1.04975021, + "balance_loss_mlp": 1.02580905, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 2.166718897009126, + "language_loss": 0.7107389, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73258287, + "num_input_tokens_seen": 76313975, + "step": 3534, + "time_per_iteration": 2.626420259475708 + }, + { + "auxiliary_loss_clip": 0.01127606, + "auxiliary_loss_mlp": 0.01044416, + "balance_loss_clip": 1.05046082, + "balance_loss_mlp": 1.02523518, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 2.058375921061891, + "language_loss": 0.72007889, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.74179912, + "num_input_tokens_seen": 76330955, + "step": 3535, + "time_per_iteration": 2.522433042526245 + }, + { + "auxiliary_loss_clip": 0.01132182, + "auxiliary_loss_mlp": 0.01050127, + "balance_loss_clip": 1.04867887, + "balance_loss_mlp": 1.03235292, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 2.285220402853848, + "language_loss": 0.70483172, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72665483, + "num_input_tokens_seen": 76352680, + "step": 3536, + "time_per_iteration": 2.575897693634033 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01044795, + "balance_loss_clip": 1.04894304, + "balance_loss_mlp": 1.02585316, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 1.9038349311812022, + "language_loss": 0.87563848, + "learning_rate": 3.660107471371981e-06, + "loss": 0.89758146, + "num_input_tokens_seen": 76370750, + "step": 3537, + "time_per_iteration": 2.467878580093384 + }, + { + "auxiliary_loss_clip": 0.01132142, + "auxiliary_loss_mlp": 0.00865601, + "balance_loss_clip": 1.04708505, + "balance_loss_mlp": 1.14377642, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.6774962333781593, + "language_loss": 0.80186081, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82183826, + "num_input_tokens_seen": 76390610, + "step": 3538, + "time_per_iteration": 2.556644916534424 + }, + { + "auxiliary_loss_clip": 0.01081363, + "auxiliary_loss_mlp": 0.01052669, + "balance_loss_clip": 1.04765666, + "balance_loss_mlp": 1.03268981, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 2.196758473514269, + "language_loss": 0.86939758, + "learning_rate": 3.659672952835863e-06, + "loss": 0.89073795, + "num_input_tokens_seen": 76408860, + "step": 3539, + "time_per_iteration": 2.684224843978882 + }, + { + "auxiliary_loss_clip": 0.01118533, + "auxiliary_loss_mlp": 0.01049966, + "balance_loss_clip": 1.04890287, + "balance_loss_mlp": 1.03190613, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 3.319083503043055, + "language_loss": 0.58322334, + "learning_rate": 3.659455599161237e-06, + "loss": 0.60490835, + "num_input_tokens_seen": 76424980, + "step": 3540, + "time_per_iteration": 2.5448131561279297 + }, + { + "auxiliary_loss_clip": 0.01148271, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.05048776, + "balance_loss_mlp": 1.02158225, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 2.1940855215041424, + "language_loss": 0.7543779, + "learning_rate": 3.659238182559888e-06, + "loss": 0.77625507, + "num_input_tokens_seen": 76443135, + "step": 3541, + "time_per_iteration": 2.465139389038086 + }, + { + "auxiliary_loss_clip": 0.01103574, + "auxiliary_loss_mlp": 0.01040576, + "balance_loss_clip": 1.048949, + "balance_loss_mlp": 1.02308798, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 2.1784990222271023, + "language_loss": 0.69723195, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71867347, + "num_input_tokens_seen": 76462470, + "step": 3542, + "time_per_iteration": 2.687246084213257 + }, + { + "auxiliary_loss_clip": 0.01144672, + "auxiliary_loss_mlp": 0.01038039, + "balance_loss_clip": 1.05009389, + "balance_loss_mlp": 1.02102828, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 2.5608506730969682, + "language_loss": 0.75720978, + "learning_rate": 3.658803160610004e-06, + "loss": 0.77903694, + "num_input_tokens_seen": 76481995, + "step": 3543, + "time_per_iteration": 2.5482211112976074 + }, + { + "auxiliary_loss_clip": 0.01127329, + "auxiliary_loss_mlp": 0.01041429, + "balance_loss_clip": 1.05554926, + "balance_loss_mlp": 1.02359533, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 1.7051623241506713, + "language_loss": 0.66536796, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68705559, + "num_input_tokens_seen": 76500245, + "step": 3544, + "time_per_iteration": 2.618896722793579 + }, + { + "auxiliary_loss_clip": 0.01115115, + "auxiliary_loss_mlp": 0.01044765, + "balance_loss_clip": 1.04891706, + "balance_loss_mlp": 1.02722967, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 2.9855450697737744, + "language_loss": 0.7116369, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73323572, + "num_input_tokens_seen": 76519535, + "step": 3545, + "time_per_iteration": 2.5950546264648438 + }, + { + "auxiliary_loss_clip": 0.01124133, + "auxiliary_loss_mlp": 0.01045233, + "balance_loss_clip": 1.05325949, + "balance_loss_mlp": 1.02772141, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 2.0413454537432214, + "language_loss": 0.72064364, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74233735, + "num_input_tokens_seen": 76542065, + "step": 3546, + "time_per_iteration": 4.089262008666992 + }, + { + "auxiliary_loss_clip": 0.01109243, + "auxiliary_loss_mlp": 0.01043388, + "balance_loss_clip": 1.05168939, + "balance_loss_mlp": 1.02544689, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 2.153903416027601, + "language_loss": 0.80337214, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82489848, + "num_input_tokens_seen": 76560540, + "step": 3547, + "time_per_iteration": 2.657205820083618 + }, + { + "auxiliary_loss_clip": 0.01152404, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.05268919, + "balance_loss_mlp": 1.0277698, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 2.8293262658533975, + "language_loss": 0.75012743, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.77211177, + "num_input_tokens_seen": 76581760, + "step": 3548, + "time_per_iteration": 4.0355565547943115 + }, + { + "auxiliary_loss_clip": 0.01109683, + "auxiliary_loss_mlp": 0.01047839, + "balance_loss_clip": 1.05108166, + "balance_loss_mlp": 1.02846718, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 2.042064977488972, + "language_loss": 0.73910099, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76067615, + "num_input_tokens_seen": 76599940, + "step": 3549, + "time_per_iteration": 2.6543850898742676 + }, + { + "auxiliary_loss_clip": 0.01115846, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_clip": 1.05375147, + "balance_loss_mlp": 1.02874088, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.6798699983268182, + "language_loss": 0.80563468, + "learning_rate": 3.657278602806357e-06, + "loss": 0.82725626, + "num_input_tokens_seen": 76619580, + "step": 3550, + "time_per_iteration": 2.6947054862976074 + }, + { + "auxiliary_loss_clip": 0.01147647, + "auxiliary_loss_mlp": 0.01045743, + "balance_loss_clip": 1.05299544, + "balance_loss_mlp": 1.02837431, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.6597811502683146, + "language_loss": 0.8793155, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90124941, + "num_input_tokens_seen": 76638195, + "step": 3551, + "time_per_iteration": 5.267691135406494 + }, + { + "auxiliary_loss_clip": 0.01146252, + "auxiliary_loss_mlp": 0.01047582, + "balance_loss_clip": 1.04950821, + "balance_loss_mlp": 1.02990377, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 2.205509168664224, + "language_loss": 0.8337431, + "learning_rate": 3.656842449140983e-06, + "loss": 0.85568142, + "num_input_tokens_seen": 76656695, + "step": 3552, + "time_per_iteration": 2.5200064182281494 + }, + { + "auxiliary_loss_clip": 0.01129664, + "auxiliary_loss_mlp": 0.01049758, + "balance_loss_clip": 1.04717696, + "balance_loss_mlp": 1.03154278, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.7875544228582751, + "language_loss": 0.76520455, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78699875, + "num_input_tokens_seen": 76677430, + "step": 3553, + "time_per_iteration": 2.5613672733306885 + }, + { + "auxiliary_loss_clip": 0.01137171, + "auxiliary_loss_mlp": 0.01042219, + "balance_loss_clip": 1.05327797, + "balance_loss_mlp": 1.02586341, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.9114710269566297, + "language_loss": 0.72625482, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.74804872, + "num_input_tokens_seen": 76697615, + "step": 3554, + "time_per_iteration": 2.523019313812256 + }, + { + "auxiliary_loss_clip": 0.0109325, + "auxiliary_loss_mlp": 0.00818965, + "balance_loss_clip": 1.04906988, + "balance_loss_mlp": 1.05583858, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 2.1388018540589306, + "language_loss": 0.67226726, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69138944, + "num_input_tokens_seen": 76715685, + "step": 3555, + "time_per_iteration": 2.5881736278533936 + }, + { + "auxiliary_loss_clip": 0.01108682, + "auxiliary_loss_mlp": 0.01037701, + "balance_loss_clip": 1.05199397, + "balance_loss_mlp": 1.0193553, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 1.9306632932422454, + "language_loss": 0.64941859, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.67088246, + "num_input_tokens_seen": 76735405, + "step": 3556, + "time_per_iteration": 2.603736639022827 + }, + { + "auxiliary_loss_clip": 0.01135656, + "auxiliary_loss_mlp": 0.01044587, + "balance_loss_clip": 1.05126572, + "balance_loss_mlp": 1.02649093, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.7214054380470214, + "language_loss": 0.72507924, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.74688166, + "num_input_tokens_seen": 76754395, + "step": 3557, + "time_per_iteration": 2.539219856262207 + }, + { + "auxiliary_loss_clip": 0.01131707, + "auxiliary_loss_mlp": 0.00811328, + "balance_loss_clip": 1.048563, + "balance_loss_mlp": 1.04187727, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 2.0584989530610804, + "language_loss": 0.67162126, + "learning_rate": 3.655532480546528e-06, + "loss": 0.6910516, + "num_input_tokens_seen": 76777210, + "step": 3558, + "time_per_iteration": 2.595909833908081 + }, + { + "auxiliary_loss_clip": 0.01149393, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.0481329, + "balance_loss_mlp": 1.02294409, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.8805590958814418, + "language_loss": 0.79990041, + "learning_rate": 3.655313932676286e-06, + "loss": 0.82180524, + "num_input_tokens_seen": 76795830, + "step": 3559, + "time_per_iteration": 2.448824882507324 + }, + { + "auxiliary_loss_clip": 0.01145098, + "auxiliary_loss_mlp": 0.01046943, + "balance_loss_clip": 1.04833996, + "balance_loss_mlp": 1.03015876, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.6901631277413696, + "language_loss": 0.67429304, + "learning_rate": 3.655095322036373e-06, + "loss": 0.69621342, + "num_input_tokens_seen": 76814700, + "step": 3560, + "time_per_iteration": 2.505565643310547 + }, + { + "auxiliary_loss_clip": 0.01137738, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_clip": 1.05141854, + "balance_loss_mlp": 1.02884078, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 2.110417602365459, + "language_loss": 0.73226529, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75410998, + "num_input_tokens_seen": 76833400, + "step": 3561, + "time_per_iteration": 2.474303960800171 + }, + { + "auxiliary_loss_clip": 0.01125456, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_clip": 1.04931509, + "balance_loss_mlp": 1.02996838, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 3.030118217089213, + "language_loss": 0.77171111, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79343975, + "num_input_tokens_seen": 76850645, + "step": 3562, + "time_per_iteration": 2.5068187713623047 + }, + { + "auxiliary_loss_clip": 0.01147017, + "auxiliary_loss_mlp": 0.01040073, + "balance_loss_clip": 1.0510819, + "balance_loss_mlp": 1.02322865, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.5796076621576305, + "language_loss": 0.84619159, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.8680625, + "num_input_tokens_seen": 76870135, + "step": 3563, + "time_per_iteration": 2.4753429889678955 + }, + { + "auxiliary_loss_clip": 0.01149568, + "auxiliary_loss_mlp": 0.01037953, + "balance_loss_clip": 1.05320954, + "balance_loss_mlp": 1.02059627, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.7254249634391472, + "language_loss": 0.76565719, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78753233, + "num_input_tokens_seen": 76893905, + "step": 3564, + "time_per_iteration": 2.6045608520507812 + }, + { + "auxiliary_loss_clip": 0.01122165, + "auxiliary_loss_mlp": 0.01043474, + "balance_loss_clip": 1.05441475, + "balance_loss_mlp": 1.02567601, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.7289576392983124, + "language_loss": 0.8846035, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90625989, + "num_input_tokens_seen": 76914205, + "step": 3565, + "time_per_iteration": 2.555635690689087 + }, + { + "auxiliary_loss_clip": 0.01053285, + "auxiliary_loss_mlp": 0.0102457, + "balance_loss_clip": 1.04297137, + "balance_loss_mlp": 1.0224123, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8361525736553671, + "language_loss": 0.52211046, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54288906, + "num_input_tokens_seen": 76975650, + "step": 3566, + "time_per_iteration": 3.108618974685669 + }, + { + "auxiliary_loss_clip": 0.01134714, + "auxiliary_loss_mlp": 0.01039465, + "balance_loss_clip": 1.05268884, + "balance_loss_mlp": 1.02297843, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 2.914095857656631, + "language_loss": 0.67008561, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69182742, + "num_input_tokens_seen": 76992615, + "step": 3567, + "time_per_iteration": 2.5077567100524902 + }, + { + "auxiliary_loss_clip": 0.01116864, + "auxiliary_loss_mlp": 0.01047414, + "balance_loss_clip": 1.04811716, + "balance_loss_mlp": 1.03009319, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.6907695347484843, + "language_loss": 0.74213696, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.76377976, + "num_input_tokens_seen": 77017005, + "step": 3568, + "time_per_iteration": 2.6291964054107666 + }, + { + "auxiliary_loss_clip": 0.01133623, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.05056894, + "balance_loss_mlp": 1.02701938, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 2.2628178475000893, + "language_loss": 0.77461112, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.79639536, + "num_input_tokens_seen": 77034990, + "step": 3569, + "time_per_iteration": 2.497225522994995 + }, + { + "auxiliary_loss_clip": 0.01142836, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.05266833, + "balance_loss_mlp": 1.02233577, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 3.0238814510944194, + "language_loss": 0.69888031, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.72072095, + "num_input_tokens_seen": 77052610, + "step": 3570, + "time_per_iteration": 2.48427677154541 + }, + { + "auxiliary_loss_clip": 0.0115146, + "auxiliary_loss_mlp": 0.01041125, + "balance_loss_clip": 1.05305922, + "balance_loss_mlp": 1.02373242, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 3.0780937022495483, + "language_loss": 0.78282779, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80475366, + "num_input_tokens_seen": 77072475, + "step": 3571, + "time_per_iteration": 2.505051374435425 + }, + { + "auxiliary_loss_clip": 0.0113204, + "auxiliary_loss_mlp": 0.01047956, + "balance_loss_clip": 1.05005336, + "balance_loss_mlp": 1.02736831, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 2.6421838889047486, + "language_loss": 0.83038414, + "learning_rate": 3.652467101342991e-06, + "loss": 0.85218406, + "num_input_tokens_seen": 77089930, + "step": 3572, + "time_per_iteration": 2.4663052558898926 + }, + { + "auxiliary_loss_clip": 0.01129119, + "auxiliary_loss_mlp": 0.01040181, + "balance_loss_clip": 1.05488324, + "balance_loss_mlp": 1.02251458, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.4343403785774145, + "language_loss": 0.64892638, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67061937, + "num_input_tokens_seen": 77108970, + "step": 3573, + "time_per_iteration": 2.5658020973205566 + }, + { + "auxiliary_loss_clip": 0.01145547, + "auxiliary_loss_mlp": 0.01039631, + "balance_loss_clip": 1.05213535, + "balance_loss_mlp": 1.0225004, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 2.204526760609019, + "language_loss": 0.75294143, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77479321, + "num_input_tokens_seen": 77126045, + "step": 3574, + "time_per_iteration": 2.4684271812438965 + }, + { + "auxiliary_loss_clip": 0.01134938, + "auxiliary_loss_mlp": 0.01043702, + "balance_loss_clip": 1.05164385, + "balance_loss_mlp": 1.02501035, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.9355297413220018, + "language_loss": 0.71793222, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.73971856, + "num_input_tokens_seen": 77144600, + "step": 3575, + "time_per_iteration": 2.5125670433044434 + }, + { + "auxiliary_loss_clip": 0.01123089, + "auxiliary_loss_mlp": 0.01039568, + "balance_loss_clip": 1.05268288, + "balance_loss_mlp": 1.0223304, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 1.7854097914937606, + "language_loss": 0.68127233, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70289892, + "num_input_tokens_seen": 77162965, + "step": 3576, + "time_per_iteration": 2.5161149501800537 + }, + { + "auxiliary_loss_clip": 0.01134813, + "auxiliary_loss_mlp": 0.01049038, + "balance_loss_clip": 1.04968536, + "balance_loss_mlp": 1.02850974, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.0608078147134896, + "language_loss": 0.88615036, + "learning_rate": 3.651369345440292e-06, + "loss": 0.90798879, + "num_input_tokens_seen": 77179960, + "step": 3577, + "time_per_iteration": 2.509291172027588 + }, + { + "auxiliary_loss_clip": 0.01046768, + "auxiliary_loss_mlp": 0.01007262, + "balance_loss_clip": 1.02912784, + "balance_loss_mlp": 1.00444913, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8100274562646331, + "language_loss": 0.56144679, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58198714, + "num_input_tokens_seen": 77239500, + "step": 3578, + "time_per_iteration": 3.0738818645477295 + }, + { + "auxiliary_loss_clip": 0.01136932, + "auxiliary_loss_mlp": 0.00808638, + "balance_loss_clip": 1.05210137, + "balance_loss_mlp": 1.03821027, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.909030279065131, + "language_loss": 0.88764989, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90710562, + "num_input_tokens_seen": 77254680, + "step": 3579, + "time_per_iteration": 2.5111289024353027 + }, + { + "auxiliary_loss_clip": 0.01137204, + "auxiliary_loss_mlp": 0.01043034, + "balance_loss_clip": 1.04990077, + "balance_loss_mlp": 1.02481866, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.7060487932929371, + "language_loss": 0.77799904, + "learning_rate": 3.650709940390972e-06, + "loss": 0.79980147, + "num_input_tokens_seen": 77274060, + "step": 3580, + "time_per_iteration": 2.481167793273926 + }, + { + "auxiliary_loss_clip": 0.01138319, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.05239749, + "balance_loss_mlp": 1.02313352, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.8217980800183968, + "language_loss": 0.729801, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.75160086, + "num_input_tokens_seen": 77293255, + "step": 3581, + "time_per_iteration": 2.556138753890991 + }, + { + "auxiliary_loss_clip": 0.01133302, + "auxiliary_loss_mlp": 0.01045406, + "balance_loss_clip": 1.05028224, + "balance_loss_mlp": 1.02562904, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.5782708857365937, + "language_loss": 0.71116096, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.73294806, + "num_input_tokens_seen": 77312390, + "step": 3582, + "time_per_iteration": 2.4891724586486816 + }, + { + "auxiliary_loss_clip": 0.01145904, + "auxiliary_loss_mlp": 0.01045244, + "balance_loss_clip": 1.04980874, + "balance_loss_mlp": 1.02706432, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.4113552932567783, + "language_loss": 0.84687179, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86878324, + "num_input_tokens_seen": 77330985, + "step": 3583, + "time_per_iteration": 2.4601118564605713 + }, + { + "auxiliary_loss_clip": 0.01128106, + "auxiliary_loss_mlp": 0.01045333, + "balance_loss_clip": 1.05209923, + "balance_loss_mlp": 1.0281074, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 8.407530463569582, + "language_loss": 0.8243736, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.84610802, + "num_input_tokens_seen": 77350770, + "step": 3584, + "time_per_iteration": 2.5542404651641846 + }, + { + "auxiliary_loss_clip": 0.01115106, + "auxiliary_loss_mlp": 0.0080351, + "balance_loss_clip": 1.05234396, + "balance_loss_mlp": 1.02839208, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 2.1090004363276216, + "language_loss": 0.90104049, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.92022657, + "num_input_tokens_seen": 77370510, + "step": 3585, + "time_per_iteration": 3.924785852432251 + }, + { + "auxiliary_loss_clip": 0.01137176, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.05354142, + "balance_loss_mlp": 1.02735233, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 1.6972166177471275, + "language_loss": 0.74319875, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76502025, + "num_input_tokens_seen": 77390645, + "step": 3586, + "time_per_iteration": 4.003892660140991 + }, + { + "auxiliary_loss_clip": 0.01110896, + "auxiliary_loss_mlp": 0.01044621, + "balance_loss_clip": 1.04933465, + "balance_loss_mlp": 1.0273242, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 1.7853426327330457, + "language_loss": 0.83296454, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85451972, + "num_input_tokens_seen": 77409655, + "step": 3587, + "time_per_iteration": 2.6137540340423584 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.00800988, + "balance_loss_clip": 1.05073643, + "balance_loss_mlp": 1.02053881, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.8657007056620702, + "language_loss": 0.76136464, + "learning_rate": 3.648948773354224e-06, + "loss": 0.78042507, + "num_input_tokens_seen": 77430560, + "step": 3588, + "time_per_iteration": 2.688819408416748 + }, + { + "auxiliary_loss_clip": 0.01131567, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.04824424, + "balance_loss_mlp": 1.02288771, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 2.3306016428580763, + "language_loss": 0.80879414, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83052492, + "num_input_tokens_seen": 77455000, + "step": 3589, + "time_per_iteration": 2.585813522338867 + }, + { + "auxiliary_loss_clip": 0.01149942, + "auxiliary_loss_mlp": 0.0103952, + "balance_loss_clip": 1.0518074, + "balance_loss_mlp": 1.02304578, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 2.1890025284576025, + "language_loss": 0.72366339, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74555802, + "num_input_tokens_seen": 77475075, + "step": 3590, + "time_per_iteration": 3.8883090019226074 + }, + { + "auxiliary_loss_clip": 0.01130697, + "auxiliary_loss_mlp": 0.01050389, + "balance_loss_clip": 1.05158377, + "balance_loss_mlp": 1.03131568, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 1.7728891718536726, + "language_loss": 0.84079504, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86260587, + "num_input_tokens_seen": 77495945, + "step": 3591, + "time_per_iteration": 2.5746867656707764 + }, + { + "auxiliary_loss_clip": 0.01119767, + "auxiliary_loss_mlp": 0.01048465, + "balance_loss_clip": 1.05147052, + "balance_loss_mlp": 1.02858067, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 2.318526811264304, + "language_loss": 0.69247723, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71415955, + "num_input_tokens_seen": 77517140, + "step": 3592, + "time_per_iteration": 2.618065595626831 + }, + { + "auxiliary_loss_clip": 0.01115936, + "auxiliary_loss_mlp": 0.01051988, + "balance_loss_clip": 1.05275309, + "balance_loss_mlp": 1.03308094, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.47167497682882, + "language_loss": 0.83627129, + "learning_rate": 3.647846011515108e-06, + "loss": 0.85795045, + "num_input_tokens_seen": 77536085, + "step": 3593, + "time_per_iteration": 2.5784521102905273 + }, + { + "auxiliary_loss_clip": 0.01116557, + "auxiliary_loss_mlp": 0.01056681, + "balance_loss_clip": 1.04956639, + "balance_loss_mlp": 1.03806067, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 2.508969465027594, + "language_loss": 0.75576353, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.77749598, + "num_input_tokens_seen": 77553675, + "step": 3594, + "time_per_iteration": 2.550729990005493 + }, + { + "auxiliary_loss_clip": 0.01134937, + "auxiliary_loss_mlp": 0.01047679, + "balance_loss_clip": 1.05028081, + "balance_loss_mlp": 1.02882028, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 1.533239206269284, + "language_loss": 0.8076545, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82948065, + "num_input_tokens_seen": 77573360, + "step": 3595, + "time_per_iteration": 2.522955894470215 + }, + { + "auxiliary_loss_clip": 0.01113763, + "auxiliary_loss_mlp": 0.01044816, + "balance_loss_clip": 1.05004334, + "balance_loss_mlp": 1.02674389, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 1.9991981450478784, + "language_loss": 0.78598517, + "learning_rate": 3.647183604506897e-06, + "loss": 0.80757093, + "num_input_tokens_seen": 77591865, + "step": 3596, + "time_per_iteration": 2.525251626968384 + }, + { + "auxiliary_loss_clip": 0.01078748, + "auxiliary_loss_mlp": 0.01048531, + "balance_loss_clip": 1.05259264, + "balance_loss_mlp": 1.03166294, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.6465626831070728, + "language_loss": 0.83148664, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85275942, + "num_input_tokens_seen": 77611600, + "step": 3597, + "time_per_iteration": 2.630080461502075 + }, + { + "auxiliary_loss_clip": 0.01126917, + "auxiliary_loss_mlp": 0.00800216, + "balance_loss_clip": 1.04959917, + "balance_loss_mlp": 1.02266216, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 2.7008891878634236, + "language_loss": 0.80419767, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82346904, + "num_input_tokens_seen": 77630665, + "step": 3598, + "time_per_iteration": 2.529641628265381 + }, + { + "auxiliary_loss_clip": 0.01118858, + "auxiliary_loss_mlp": 0.01059012, + "balance_loss_clip": 1.05036426, + "balance_loss_mlp": 1.04061818, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 1.9085715166264319, + "language_loss": 0.81976426, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.8415429, + "num_input_tokens_seen": 77650835, + "step": 3599, + "time_per_iteration": 2.6024491786956787 + }, + { + "auxiliary_loss_clip": 0.01104995, + "auxiliary_loss_mlp": 0.008039, + "balance_loss_clip": 1.04930496, + "balance_loss_mlp": 1.02896309, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 2.2273973555042486, + "language_loss": 0.76484621, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78393513, + "num_input_tokens_seen": 77669000, + "step": 3600, + "time_per_iteration": 2.570035219192505 + }, + { + "auxiliary_loss_clip": 0.01105758, + "auxiliary_loss_mlp": 0.01042576, + "balance_loss_clip": 1.04898727, + "balance_loss_mlp": 1.02709079, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 1.9243463688617086, + "language_loss": 0.79993558, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82141888, + "num_input_tokens_seen": 77688745, + "step": 3601, + "time_per_iteration": 2.5895750522613525 + }, + { + "auxiliary_loss_clip": 0.01150803, + "auxiliary_loss_mlp": 0.01054429, + "balance_loss_clip": 1.05374146, + "balance_loss_mlp": 1.03690505, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 9.25625967608789, + "language_loss": 0.83348989, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85554224, + "num_input_tokens_seen": 77708445, + "step": 3602, + "time_per_iteration": 2.4903314113616943 + }, + { + "auxiliary_loss_clip": 0.0114789, + "auxiliary_loss_mlp": 0.0105512, + "balance_loss_clip": 1.0510596, + "balance_loss_mlp": 1.03737032, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 2.2000130581161375, + "language_loss": 0.74751723, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76954734, + "num_input_tokens_seen": 77728465, + "step": 3603, + "time_per_iteration": 2.502960681915283 + }, + { + "auxiliary_loss_clip": 0.01113705, + "auxiliary_loss_mlp": 0.0105609, + "balance_loss_clip": 1.05088377, + "balance_loss_mlp": 1.03729033, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 2.2288464709677553, + "language_loss": 0.73801547, + "learning_rate": 3.645414438132855e-06, + "loss": 0.75971341, + "num_input_tokens_seen": 77746735, + "step": 3604, + "time_per_iteration": 2.543144702911377 + }, + { + "auxiliary_loss_clip": 0.01132347, + "auxiliary_loss_mlp": 0.01040691, + "balance_loss_clip": 1.04970789, + "balance_loss_mlp": 1.02344179, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 1.7348700177317176, + "language_loss": 0.79835272, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82008314, + "num_input_tokens_seen": 77768105, + "step": 3605, + "time_per_iteration": 2.5470573902130127 + }, + { + "auxiliary_loss_clip": 0.01066368, + "auxiliary_loss_mlp": 0.01012644, + "balance_loss_clip": 1.02939415, + "balance_loss_mlp": 1.01006889, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.7137675923000804, + "language_loss": 0.58322209, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60401219, + "num_input_tokens_seen": 77833750, + "step": 3606, + "time_per_iteration": 3.194392204284668 + }, + { + "auxiliary_loss_clip": 0.01149821, + "auxiliary_loss_mlp": 0.01048093, + "balance_loss_clip": 1.05104542, + "balance_loss_mlp": 1.02962756, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.6439050529593415, + "language_loss": 0.73129213, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75327122, + "num_input_tokens_seen": 77853780, + "step": 3607, + "time_per_iteration": 2.4901204109191895 + }, + { + "auxiliary_loss_clip": 0.01131375, + "auxiliary_loss_mlp": 0.01045014, + "balance_loss_clip": 1.05294013, + "balance_loss_mlp": 1.02626204, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 2.3461093127086, + "language_loss": 0.76791602, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78967988, + "num_input_tokens_seen": 77872575, + "step": 3608, + "time_per_iteration": 2.5419533252716064 + }, + { + "auxiliary_loss_clip": 0.01079412, + "auxiliary_loss_mlp": 0.01053179, + "balance_loss_clip": 1.05168915, + "balance_loss_mlp": 1.0354054, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.7218208369904888, + "language_loss": 0.74195898, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76328492, + "num_input_tokens_seen": 77892700, + "step": 3609, + "time_per_iteration": 2.660496950149536 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01051499, + "balance_loss_clip": 1.05560172, + "balance_loss_mlp": 1.033463, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 2.3557164844240845, + "language_loss": 0.89041537, + "learning_rate": 3.6440849425579e-06, + "loss": 0.91228849, + "num_input_tokens_seen": 77911060, + "step": 3610, + "time_per_iteration": 2.539219856262207 + }, + { + "auxiliary_loss_clip": 0.01152, + "auxiliary_loss_mlp": 0.01049602, + "balance_loss_clip": 1.05507493, + "balance_loss_mlp": 1.03181565, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 1.8178916587516818, + "language_loss": 0.77619052, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79820657, + "num_input_tokens_seen": 77929930, + "step": 3611, + "time_per_iteration": 2.478644847869873 + }, + { + "auxiliary_loss_clip": 0.01085074, + "auxiliary_loss_mlp": 0.01049446, + "balance_loss_clip": 1.04899311, + "balance_loss_mlp": 1.03166044, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 1.853776679390499, + "language_loss": 0.63276482, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65411007, + "num_input_tokens_seen": 77949060, + "step": 3612, + "time_per_iteration": 2.6393883228302 + }, + { + "auxiliary_loss_clip": 0.01089425, + "auxiliary_loss_mlp": 0.01048292, + "balance_loss_clip": 1.04954159, + "balance_loss_mlp": 1.02843165, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.9334842430400216, + "language_loss": 0.75866097, + "learning_rate": 3.643419353014776e-06, + "loss": 0.78003812, + "num_input_tokens_seen": 77967920, + "step": 3613, + "time_per_iteration": 2.583744764328003 + }, + { + "auxiliary_loss_clip": 0.01097335, + "auxiliary_loss_mlp": 0.01051229, + "balance_loss_clip": 1.04688537, + "balance_loss_mlp": 1.03117812, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 1.9151284413011302, + "language_loss": 0.70878899, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73027456, + "num_input_tokens_seen": 77985330, + "step": 3614, + "time_per_iteration": 2.559095859527588 + }, + { + "auxiliary_loss_clip": 0.01138105, + "auxiliary_loss_mlp": 0.01048119, + "balance_loss_clip": 1.05303538, + "balance_loss_mlp": 1.03005886, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 2.0413031836150988, + "language_loss": 0.73730886, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75917107, + "num_input_tokens_seen": 78003105, + "step": 3615, + "time_per_iteration": 2.469573497772217 + }, + { + "auxiliary_loss_clip": 0.01140979, + "auxiliary_loss_mlp": 0.01050005, + "balance_loss_clip": 1.05161059, + "balance_loss_mlp": 1.02978766, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.8248249920994124, + "language_loss": 0.8995797, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92148954, + "num_input_tokens_seen": 78019655, + "step": 3616, + "time_per_iteration": 2.5177431106567383 + }, + { + "auxiliary_loss_clip": 0.01102302, + "auxiliary_loss_mlp": 0.01046514, + "balance_loss_clip": 1.05242956, + "balance_loss_mlp": 1.02748847, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.268065659368606, + "language_loss": 0.81489325, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83638144, + "num_input_tokens_seen": 78036025, + "step": 3617, + "time_per_iteration": 2.571014404296875 + }, + { + "auxiliary_loss_clip": 0.01128367, + "auxiliary_loss_mlp": 0.01043254, + "balance_loss_clip": 1.05191994, + "balance_loss_mlp": 1.02573013, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.7111843680517322, + "language_loss": 0.75594056, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77765679, + "num_input_tokens_seen": 78055645, + "step": 3618, + "time_per_iteration": 2.5710856914520264 + }, + { + "auxiliary_loss_clip": 0.01140793, + "auxiliary_loss_mlp": 0.01049809, + "balance_loss_clip": 1.05120873, + "balance_loss_mlp": 1.0310334, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 2.0285575099806317, + "language_loss": 0.69402003, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71592605, + "num_input_tokens_seen": 78071660, + "step": 3619, + "time_per_iteration": 2.4710757732391357 + }, + { + "auxiliary_loss_clip": 0.01141697, + "auxiliary_loss_mlp": 0.01040816, + "balance_loss_clip": 1.05565464, + "balance_loss_mlp": 1.02294636, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.6533332513566865, + "language_loss": 0.78564417, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80746937, + "num_input_tokens_seen": 78091265, + "step": 3620, + "time_per_iteration": 2.521606683731079 + }, + { + "auxiliary_loss_clip": 0.01145987, + "auxiliary_loss_mlp": 0.01041969, + "balance_loss_clip": 1.05142391, + "balance_loss_mlp": 1.02479053, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 1.7652946494592838, + "language_loss": 0.79635417, + "learning_rate": 3.641641706164509e-06, + "loss": 0.81823373, + "num_input_tokens_seen": 78110095, + "step": 3621, + "time_per_iteration": 2.4824273586273193 + }, + { + "auxiliary_loss_clip": 0.01137506, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.05007041, + "balance_loss_mlp": 1.02287781, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.6107048326882176, + "language_loss": 0.87598693, + "learning_rate": 3.641419220089221e-06, + "loss": 0.89776194, + "num_input_tokens_seen": 78129475, + "step": 3622, + "time_per_iteration": 2.5649521350860596 + }, + { + "auxiliary_loss_clip": 0.01142004, + "auxiliary_loss_mlp": 0.01040682, + "balance_loss_clip": 1.05157495, + "balance_loss_mlp": 1.02095306, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 1.8932488482814362, + "language_loss": 0.76937318, + "learning_rate": 3.641196671771152e-06, + "loss": 0.7912001, + "num_input_tokens_seen": 78146880, + "step": 3623, + "time_per_iteration": 3.834122657775879 + }, + { + "auxiliary_loss_clip": 0.01122576, + "auxiliary_loss_mlp": 0.01045708, + "balance_loss_clip": 1.05417061, + "balance_loss_mlp": 1.02678919, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 1.9053231016610352, + "language_loss": 0.84546435, + "learning_rate": 3.640974061218741e-06, + "loss": 0.86714721, + "num_input_tokens_seen": 78165065, + "step": 3624, + "time_per_iteration": 2.5513288974761963 + }, + { + "auxiliary_loss_clip": 0.01135224, + "auxiliary_loss_mlp": 0.01057046, + "balance_loss_clip": 1.05255723, + "balance_loss_mlp": 1.03894997, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.5074937833666224, + "language_loss": 0.76958084, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79150355, + "num_input_tokens_seen": 78180005, + "step": 3625, + "time_per_iteration": 3.8914358615875244 + }, + { + "auxiliary_loss_clip": 0.0105276, + "auxiliary_loss_mlp": 0.01005708, + "balance_loss_clip": 1.02740872, + "balance_loss_mlp": 1.00270355, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8141305506567106, + "language_loss": 0.6069662, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62755084, + "num_input_tokens_seen": 78245350, + "step": 3626, + "time_per_iteration": 3.2012975215911865 + }, + { + "auxiliary_loss_clip": 0.01121764, + "auxiliary_loss_mlp": 0.00804473, + "balance_loss_clip": 1.05081189, + "balance_loss_mlp": 1.02917778, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 1.8559892417672197, + "language_loss": 0.90313089, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92239332, + "num_input_tokens_seen": 78264165, + "step": 3627, + "time_per_iteration": 2.5755021572113037 + }, + { + "auxiliary_loss_clip": 0.01095613, + "auxiliary_loss_mlp": 0.01040345, + "balance_loss_clip": 1.0499599, + "balance_loss_mlp": 1.02191544, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.5902262841530739, + "language_loss": 0.73553944, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.756899, + "num_input_tokens_seen": 78283745, + "step": 3628, + "time_per_iteration": 3.9810009002685547 + }, + { + "auxiliary_loss_clip": 0.01147435, + "auxiliary_loss_mlp": 0.010426, + "balance_loss_clip": 1.0501852, + "balance_loss_mlp": 1.0247426, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 1.7837663822211787, + "language_loss": 0.77245831, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.79435873, + "num_input_tokens_seen": 78302900, + "step": 3629, + "time_per_iteration": 3.853085994720459 + }, + { + "auxiliary_loss_clip": 0.01139222, + "auxiliary_loss_mlp": 0.01039834, + "balance_loss_clip": 1.05215132, + "balance_loss_mlp": 1.02235818, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.5901199793897753, + "language_loss": 0.71160948, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73340005, + "num_input_tokens_seen": 78326470, + "step": 3630, + "time_per_iteration": 2.614562511444092 + }, + { + "auxiliary_loss_clip": 0.01086432, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.04956794, + "balance_loss_mlp": 1.02390695, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 1.6981907148029305, + "language_loss": 0.76549131, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78675562, + "num_input_tokens_seen": 78345810, + "step": 3631, + "time_per_iteration": 2.673088312149048 + }, + { + "auxiliary_loss_clip": 0.01148815, + "auxiliary_loss_mlp": 0.01040357, + "balance_loss_clip": 1.05059958, + "balance_loss_mlp": 1.02329779, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 2.1952068645162144, + "language_loss": 0.75077474, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77266645, + "num_input_tokens_seen": 78364085, + "step": 3632, + "time_per_iteration": 2.492011785507202 + }, + { + "auxiliary_loss_clip": 0.01146546, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.05101633, + "balance_loss_mlp": 1.02160406, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 2.0216932807285284, + "language_loss": 0.83486921, + "learning_rate": 3.638967767095249e-06, + "loss": 0.85671401, + "num_input_tokens_seen": 78381385, + "step": 3633, + "time_per_iteration": 2.4654104709625244 + }, + { + "auxiliary_loss_clip": 0.01124123, + "auxiliary_loss_mlp": 0.01057729, + "balance_loss_clip": 1.05485952, + "balance_loss_mlp": 1.04073024, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.7246689420399512, + "language_loss": 0.81501102, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83682954, + "num_input_tokens_seen": 78400500, + "step": 3634, + "time_per_iteration": 2.5703341960906982 + }, + { + "auxiliary_loss_clip": 0.01143004, + "auxiliary_loss_mlp": 0.01042236, + "balance_loss_clip": 1.05510712, + "balance_loss_mlp": 1.02504647, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 2.5500429620687934, + "language_loss": 0.75350702, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77535939, + "num_input_tokens_seen": 78418340, + "step": 3635, + "time_per_iteration": 2.4803948402404785 + }, + { + "auxiliary_loss_clip": 0.01124189, + "auxiliary_loss_mlp": 0.01054297, + "balance_loss_clip": 1.05287313, + "balance_loss_mlp": 1.03717875, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 2.559445710355263, + "language_loss": 0.88346934, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90525424, + "num_input_tokens_seen": 78434375, + "step": 3636, + "time_per_iteration": 2.503364324569702 + }, + { + "auxiliary_loss_clip": 0.01114252, + "auxiliary_loss_mlp": 0.00793966, + "balance_loss_clip": 1.05357218, + "balance_loss_mlp": 1.01198399, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 2.0331074691652375, + "language_loss": 0.75864869, + "learning_rate": 3.638074464556311e-06, + "loss": 0.77773088, + "num_input_tokens_seen": 78451735, + "step": 3637, + "time_per_iteration": 2.592402219772339 + }, + { + "auxiliary_loss_clip": 0.01135702, + "auxiliary_loss_mlp": 0.0104715, + "balance_loss_clip": 1.05674338, + "balance_loss_mlp": 1.02862453, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 3.3966209260532887, + "language_loss": 0.90023845, + "learning_rate": 3.63785098361053e-06, + "loss": 0.92206699, + "num_input_tokens_seen": 78462730, + "step": 3638, + "time_per_iteration": 2.4894158840179443 + }, + { + "auxiliary_loss_clip": 0.01138474, + "auxiliary_loss_mlp": 0.01052528, + "balance_loss_clip": 1.05254579, + "balance_loss_mlp": 1.03452778, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.6170814372675655, + "language_loss": 0.89828521, + "learning_rate": 3.637627440557275e-06, + "loss": 0.92019522, + "num_input_tokens_seen": 78476300, + "step": 3639, + "time_per_iteration": 2.4726271629333496 + }, + { + "auxiliary_loss_clip": 0.01130485, + "auxiliary_loss_mlp": 0.00798461, + "balance_loss_clip": 1.05296171, + "balance_loss_mlp": 1.01790214, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.790890356880474, + "language_loss": 0.79418796, + "learning_rate": 3.637403835405024e-06, + "loss": 0.8134774, + "num_input_tokens_seen": 78496135, + "step": 3640, + "time_per_iteration": 2.5811336040496826 + }, + { + "auxiliary_loss_clip": 0.01145089, + "auxiliary_loss_mlp": 0.01051979, + "balance_loss_clip": 1.05585778, + "balance_loss_mlp": 1.03235674, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.188627488397322, + "language_loss": 0.72094464, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74291539, + "num_input_tokens_seen": 78513855, + "step": 3641, + "time_per_iteration": 2.5281214714050293 + }, + { + "auxiliary_loss_clip": 0.01128554, + "auxiliary_loss_mlp": 0.01044531, + "balance_loss_clip": 1.05507803, + "balance_loss_mlp": 1.02685237, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 2.32847833119919, + "language_loss": 0.80929869, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83102953, + "num_input_tokens_seen": 78531740, + "step": 3642, + "time_per_iteration": 2.511493682861328 + }, + { + "auxiliary_loss_clip": 0.01143124, + "auxiliary_loss_mlp": 0.01045009, + "balance_loss_clip": 1.05282879, + "balance_loss_mlp": 1.02636445, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 1.8034267510644242, + "language_loss": 0.71855789, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.74043918, + "num_input_tokens_seen": 78549600, + "step": 3643, + "time_per_iteration": 2.5203025341033936 + }, + { + "auxiliary_loss_clip": 0.0115046, + "auxiliary_loss_mlp": 0.01052039, + "balance_loss_clip": 1.05209136, + "balance_loss_mlp": 1.033288, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 1.8703035119232514, + "language_loss": 0.68581176, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70783675, + "num_input_tokens_seen": 78573350, + "step": 3644, + "time_per_iteration": 2.7120461463928223 + }, + { + "auxiliary_loss_clip": 0.01153036, + "auxiliary_loss_mlp": 0.0104401, + "balance_loss_clip": 1.05227304, + "balance_loss_mlp": 1.02618814, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.0012820553989954, + "language_loss": 0.78040653, + "learning_rate": 3.636284878455669e-06, + "loss": 0.80237699, + "num_input_tokens_seen": 78591005, + "step": 3645, + "time_per_iteration": 2.5040066242218018 + }, + { + "auxiliary_loss_clip": 0.01139268, + "auxiliary_loss_mlp": 0.01049955, + "balance_loss_clip": 1.05549979, + "balance_loss_mlp": 1.03303885, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.7210858963471396, + "language_loss": 0.82403046, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84592271, + "num_input_tokens_seen": 78610645, + "step": 3646, + "time_per_iteration": 2.513605833053589 + }, + { + "auxiliary_loss_clip": 0.01135278, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.05169165, + "balance_loss_mlp": 1.02376974, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.7081906475808488, + "language_loss": 0.82634318, + "learning_rate": 3.635836861279901e-06, + "loss": 0.84809995, + "num_input_tokens_seen": 78628340, + "step": 3647, + "time_per_iteration": 2.4788360595703125 + }, + { + "auxiliary_loss_clip": 0.01145978, + "auxiliary_loss_mlp": 0.01052497, + "balance_loss_clip": 1.05057263, + "balance_loss_mlp": 1.03539014, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.7749354060663207, + "language_loss": 0.7323482, + "learning_rate": 3.635612759641123e-06, + "loss": 0.75433296, + "num_input_tokens_seen": 78649355, + "step": 3648, + "time_per_iteration": 2.557774066925049 + }, + { + "auxiliary_loss_clip": 0.01104952, + "auxiliary_loss_mlp": 0.01058232, + "balance_loss_clip": 1.04666889, + "balance_loss_mlp": 1.0370959, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.471219577721094, + "language_loss": 0.73945308, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76108491, + "num_input_tokens_seen": 78664915, + "step": 3649, + "time_per_iteration": 2.5654475688934326 + }, + { + "auxiliary_loss_clip": 0.01130715, + "auxiliary_loss_mlp": 0.01047321, + "balance_loss_clip": 1.04958594, + "balance_loss_mlp": 1.03069162, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 2.121476514384449, + "language_loss": 0.86315417, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88493454, + "num_input_tokens_seen": 78681475, + "step": 3650, + "time_per_iteration": 2.482602119445801 + }, + { + "auxiliary_loss_clip": 0.0112541, + "auxiliary_loss_mlp": 0.01045072, + "balance_loss_clip": 1.04912126, + "balance_loss_mlp": 1.02690494, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 1.9734162715994552, + "language_loss": 0.8403163, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.86202115, + "num_input_tokens_seen": 78702300, + "step": 3651, + "time_per_iteration": 2.5919017791748047 + }, + { + "auxiliary_loss_clip": 0.01137732, + "auxiliary_loss_mlp": 0.01048977, + "balance_loss_clip": 1.05056357, + "balance_loss_mlp": 1.03121483, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 1.8554151170090756, + "language_loss": 0.74118459, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76305175, + "num_input_tokens_seen": 78720230, + "step": 3652, + "time_per_iteration": 2.4985525608062744 + }, + { + "auxiliary_loss_clip": 0.01026228, + "auxiliary_loss_mlp": 0.01015326, + "balance_loss_clip": 1.03012836, + "balance_loss_mlp": 1.01265609, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.736240809805818, + "language_loss": 0.5153445, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53576005, + "num_input_tokens_seen": 78780200, + "step": 3653, + "time_per_iteration": 3.2133591175079346 + }, + { + "auxiliary_loss_clip": 0.01122818, + "auxiliary_loss_mlp": 0.01057069, + "balance_loss_clip": 1.05662787, + "balance_loss_mlp": 1.03911567, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.819961328054342, + "language_loss": 0.75238478, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77418363, + "num_input_tokens_seen": 78800575, + "step": 3654, + "time_per_iteration": 2.568155527114868 + }, + { + "auxiliary_loss_clip": 0.01148011, + "auxiliary_loss_mlp": 0.01045644, + "balance_loss_clip": 1.05581641, + "balance_loss_mlp": 1.02748823, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 2.8254009014777206, + "language_loss": 0.72672415, + "learning_rate": 3.634042312013064e-06, + "loss": 0.74866062, + "num_input_tokens_seen": 78819585, + "step": 3655, + "time_per_iteration": 2.517900228500366 + }, + { + "auxiliary_loss_clip": 0.01125767, + "auxiliary_loss_mlp": 0.0104805, + "balance_loss_clip": 1.05538416, + "balance_loss_mlp": 1.0304184, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.6535189588574208, + "language_loss": 0.80887252, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.83061069, + "num_input_tokens_seen": 78837330, + "step": 3656, + "time_per_iteration": 2.5567877292633057 + }, + { + "auxiliary_loss_clip": 0.01113478, + "auxiliary_loss_mlp": 0.00798333, + "balance_loss_clip": 1.05636406, + "balance_loss_mlp": 1.01843941, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.417120527051926, + "language_loss": 0.84727514, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86639321, + "num_input_tokens_seen": 78854955, + "step": 3657, + "time_per_iteration": 2.568197011947632 + }, + { + "auxiliary_loss_clip": 0.01132715, + "auxiliary_loss_mlp": 0.0103984, + "balance_loss_clip": 1.05129385, + "balance_loss_mlp": 1.02212572, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 2.9191119643299333, + "language_loss": 0.80479985, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.82652545, + "num_input_tokens_seen": 78874965, + "step": 3658, + "time_per_iteration": 2.5533339977264404 + }, + { + "auxiliary_loss_clip": 0.01040369, + "auxiliary_loss_mlp": 0.01011592, + "balance_loss_clip": 1.03445244, + "balance_loss_mlp": 1.00846851, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.8046671494574841, + "language_loss": 0.58160567, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60212529, + "num_input_tokens_seen": 78937740, + "step": 3659, + "time_per_iteration": 3.254161834716797 + }, + { + "auxiliary_loss_clip": 0.01106953, + "auxiliary_loss_mlp": 0.01042364, + "balance_loss_clip": 1.04993725, + "balance_loss_mlp": 1.0243752, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.27163333025809, + "language_loss": 0.73858714, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76008034, + "num_input_tokens_seen": 78955055, + "step": 3660, + "time_per_iteration": 2.5775833129882812 + }, + { + "auxiliary_loss_clip": 0.01139356, + "auxiliary_loss_mlp": 0.01042564, + "balance_loss_clip": 1.05317688, + "balance_loss_mlp": 1.02477825, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 1.738165395136322, + "language_loss": 0.81423593, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83605516, + "num_input_tokens_seen": 78974895, + "step": 3661, + "time_per_iteration": 2.5280983448028564 + }, + { + "auxiliary_loss_clip": 0.0111834, + "auxiliary_loss_mlp": 0.01049401, + "balance_loss_clip": 1.04847193, + "balance_loss_mlp": 1.03149545, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.6748471946560908, + "language_loss": 0.73097098, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75264841, + "num_input_tokens_seen": 78994990, + "step": 3662, + "time_per_iteration": 3.9510345458984375 + }, + { + "auxiliary_loss_clip": 0.011277, + "auxiliary_loss_mlp": 0.01051458, + "balance_loss_clip": 1.05445957, + "balance_loss_mlp": 1.03554368, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.6042297878706626, + "language_loss": 0.78651249, + "learning_rate": 3.632243797111929e-06, + "loss": 0.80830407, + "num_input_tokens_seen": 79014405, + "step": 3663, + "time_per_iteration": 2.5583837032318115 + }, + { + "auxiliary_loss_clip": 0.0113569, + "auxiliary_loss_mlp": 0.01049792, + "balance_loss_clip": 1.05353284, + "balance_loss_mlp": 1.02995598, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 1.9359029100779455, + "language_loss": 0.80439568, + "learning_rate": 3.632018704132908e-06, + "loss": 0.82625055, + "num_input_tokens_seen": 79032375, + "step": 3664, + "time_per_iteration": 3.91141414642334 + }, + { + "auxiliary_loss_clip": 0.01130591, + "auxiliary_loss_mlp": 0.01044625, + "balance_loss_clip": 1.05584669, + "balance_loss_mlp": 1.02501488, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.6942624598886393, + "language_loss": 0.76719534, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.78894752, + "num_input_tokens_seen": 79049635, + "step": 3665, + "time_per_iteration": 2.5394208431243896 + }, + { + "auxiliary_loss_clip": 0.01118303, + "auxiliary_loss_mlp": 0.01051174, + "balance_loss_clip": 1.05206954, + "balance_loss_mlp": 1.03386521, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.8606731258114464, + "language_loss": 0.98223019, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00392509, + "num_input_tokens_seen": 79062890, + "step": 3666, + "time_per_iteration": 2.4899590015411377 + }, + { + "auxiliary_loss_clip": 0.01137924, + "auxiliary_loss_mlp": 0.00796288, + "balance_loss_clip": 1.05316877, + "balance_loss_mlp": 1.01559734, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 1.7645787496672711, + "language_loss": 0.80873024, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82807231, + "num_input_tokens_seen": 79085495, + "step": 3667, + "time_per_iteration": 2.7084782123565674 + }, + { + "auxiliary_loss_clip": 0.01138773, + "auxiliary_loss_mlp": 0.0105124, + "balance_loss_clip": 1.0560801, + "balance_loss_mlp": 1.03166616, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.9930794879695934, + "language_loss": 0.77217424, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79407442, + "num_input_tokens_seen": 79101820, + "step": 3668, + "time_per_iteration": 3.941887855529785 + }, + { + "auxiliary_loss_clip": 0.01134343, + "auxiliary_loss_mlp": 0.0104531, + "balance_loss_clip": 1.0558877, + "balance_loss_mlp": 1.02775013, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.703702445548757, + "language_loss": 0.71556854, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73736507, + "num_input_tokens_seen": 79123320, + "step": 3669, + "time_per_iteration": 2.571268320083618 + }, + { + "auxiliary_loss_clip": 0.01149285, + "auxiliary_loss_mlp": 0.0103946, + "balance_loss_clip": 1.05244219, + "balance_loss_mlp": 1.02286577, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 1.7612693055331046, + "language_loss": 0.8569867, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87887418, + "num_input_tokens_seen": 79141615, + "step": 3670, + "time_per_iteration": 2.477865695953369 + }, + { + "auxiliary_loss_clip": 0.01130697, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.05380821, + "balance_loss_mlp": 1.02308989, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.7464844974907983, + "language_loss": 0.7684592, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79017496, + "num_input_tokens_seen": 79164910, + "step": 3671, + "time_per_iteration": 2.655616521835327 + }, + { + "auxiliary_loss_clip": 0.01121889, + "auxiliary_loss_mlp": 0.01038324, + "balance_loss_clip": 1.05446315, + "balance_loss_mlp": 1.02138424, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.911520609074442, + "language_loss": 0.81266344, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.83426559, + "num_input_tokens_seen": 79179685, + "step": 3672, + "time_per_iteration": 2.490204334259033 + }, + { + "auxiliary_loss_clip": 0.01143399, + "auxiliary_loss_mlp": 0.01047814, + "balance_loss_clip": 1.05505991, + "balance_loss_mlp": 1.03031433, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.9924927758171853, + "language_loss": 0.73793137, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75984353, + "num_input_tokens_seen": 79196285, + "step": 3673, + "time_per_iteration": 2.508039712905884 + }, + { + "auxiliary_loss_clip": 0.01119155, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_clip": 1.05900717, + "balance_loss_mlp": 1.02364051, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 2.117045710983467, + "language_loss": 0.76393187, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78555125, + "num_input_tokens_seen": 79216060, + "step": 3674, + "time_per_iteration": 2.693067789077759 + }, + { + "auxiliary_loss_clip": 0.01152796, + "auxiliary_loss_mlp": 0.01043986, + "balance_loss_clip": 1.05672407, + "balance_loss_mlp": 1.02574718, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 2.202236180231166, + "language_loss": 0.74304384, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76501167, + "num_input_tokens_seen": 79235145, + "step": 3675, + "time_per_iteration": 2.4624598026275635 + }, + { + "auxiliary_loss_clip": 0.0115188, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_clip": 1.05466366, + "balance_loss_mlp": 1.02716303, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.772344629817283, + "language_loss": 0.80377364, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82573569, + "num_input_tokens_seen": 79256960, + "step": 3676, + "time_per_iteration": 2.5380258560180664 + }, + { + "auxiliary_loss_clip": 0.01128244, + "auxiliary_loss_mlp": 0.01050583, + "balance_loss_clip": 1.0536648, + "balance_loss_mlp": 1.03392899, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 2.1110987433072115, + "language_loss": 0.7507714, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.7725597, + "num_input_tokens_seen": 79274860, + "step": 3677, + "time_per_iteration": 2.521003007888794 + }, + { + "auxiliary_loss_clip": 0.01116669, + "auxiliary_loss_mlp": 0.01043728, + "balance_loss_clip": 1.05517185, + "balance_loss_mlp": 1.0265981, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 1.8953834310992432, + "language_loss": 0.83286393, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85446787, + "num_input_tokens_seen": 79294005, + "step": 3678, + "time_per_iteration": 2.6081435680389404 + }, + { + "auxiliary_loss_clip": 0.010937, + "auxiliary_loss_mlp": 0.01052785, + "balance_loss_clip": 1.05674231, + "balance_loss_mlp": 1.03386664, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.88170368729481, + "language_loss": 0.89115268, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91261756, + "num_input_tokens_seen": 79314005, + "step": 3679, + "time_per_iteration": 2.6403069496154785 + }, + { + "auxiliary_loss_clip": 0.0114709, + "auxiliary_loss_mlp": 0.01056913, + "balance_loss_clip": 1.05789602, + "balance_loss_mlp": 1.03835249, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.316225277391517, + "language_loss": 0.86878163, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.8908217, + "num_input_tokens_seen": 79331030, + "step": 3680, + "time_per_iteration": 2.497048854827881 + }, + { + "auxiliary_loss_clip": 0.01106552, + "auxiliary_loss_mlp": 0.01044452, + "balance_loss_clip": 1.05266786, + "balance_loss_mlp": 1.02716684, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 1.9952225144895441, + "language_loss": 0.81557775, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.83708787, + "num_input_tokens_seen": 79348560, + "step": 3681, + "time_per_iteration": 2.5601754188537598 + }, + { + "auxiliary_loss_clip": 0.01148144, + "auxiliary_loss_mlp": 0.00799366, + "balance_loss_clip": 1.0557065, + "balance_loss_mlp": 1.02368104, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.4025946647542833, + "language_loss": 0.79279947, + "learning_rate": 3.62795645623335e-06, + "loss": 0.81227458, + "num_input_tokens_seen": 79367175, + "step": 3682, + "time_per_iteration": 2.5056543350219727 + }, + { + "auxiliary_loss_clip": 0.01120446, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_clip": 1.05120373, + "balance_loss_mlp": 1.02979112, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 1.7884064455188728, + "language_loss": 0.77415484, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79584527, + "num_input_tokens_seen": 79388435, + "step": 3683, + "time_per_iteration": 2.5760154724121094 + }, + { + "auxiliary_loss_clip": 0.01126291, + "auxiliary_loss_mlp": 0.01047339, + "balance_loss_clip": 1.05156612, + "balance_loss_mlp": 1.0295887, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 1.8300147832470628, + "language_loss": 0.7275818, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74931812, + "num_input_tokens_seen": 79407910, + "step": 3684, + "time_per_iteration": 2.608365774154663 + }, + { + "auxiliary_loss_clip": 0.01086047, + "auxiliary_loss_mlp": 0.01048394, + "balance_loss_clip": 1.05391777, + "balance_loss_mlp": 1.02902293, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 3.4272035833049133, + "language_loss": 0.80310142, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82444578, + "num_input_tokens_seen": 79424020, + "step": 3685, + "time_per_iteration": 2.5939111709594727 + }, + { + "auxiliary_loss_clip": 0.01144405, + "auxiliary_loss_mlp": 0.01043214, + "balance_loss_clip": 1.05217791, + "balance_loss_mlp": 1.02704895, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.6102466415262626, + "language_loss": 0.8747015, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89657766, + "num_input_tokens_seen": 79445605, + "step": 3686, + "time_per_iteration": 2.5217339992523193 + }, + { + "auxiliary_loss_clip": 0.01133111, + "auxiliary_loss_mlp": 0.01046641, + "balance_loss_clip": 1.05104399, + "balance_loss_mlp": 1.02866387, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 1.859886786993196, + "language_loss": 0.77638704, + "learning_rate": 3.626824502298707e-06, + "loss": 0.79818451, + "num_input_tokens_seen": 79463850, + "step": 3687, + "time_per_iteration": 2.5138392448425293 + }, + { + "auxiliary_loss_clip": 0.01123905, + "auxiliary_loss_mlp": 0.01055128, + "balance_loss_clip": 1.05339372, + "balance_loss_mlp": 1.03631687, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.9452480644119738, + "language_loss": 0.84887278, + "learning_rate": 3.626597926409383e-06, + "loss": 0.87066317, + "num_input_tokens_seen": 79482845, + "step": 3688, + "time_per_iteration": 2.5564327239990234 + }, + { + "auxiliary_loss_clip": 0.01106523, + "auxiliary_loss_mlp": 0.01044934, + "balance_loss_clip": 1.05073452, + "balance_loss_mlp": 1.02617025, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 1.7931838952577865, + "language_loss": 0.81072587, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83224046, + "num_input_tokens_seen": 79501550, + "step": 3689, + "time_per_iteration": 2.57620906829834 + }, + { + "auxiliary_loss_clip": 0.01122873, + "auxiliary_loss_mlp": 0.01045406, + "balance_loss_clip": 1.05291712, + "balance_loss_mlp": 1.02758455, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 1.868179416728469, + "language_loss": 0.70150167, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72318447, + "num_input_tokens_seen": 79519680, + "step": 3690, + "time_per_iteration": 2.542576789855957 + }, + { + "auxiliary_loss_clip": 0.01142259, + "auxiliary_loss_mlp": 0.00795306, + "balance_loss_clip": 1.05227363, + "balance_loss_mlp": 1.01354825, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.707299056919887, + "language_loss": 0.72108769, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74046338, + "num_input_tokens_seen": 79539000, + "step": 3691, + "time_per_iteration": 2.5178792476654053 + }, + { + "auxiliary_loss_clip": 0.01136474, + "auxiliary_loss_mlp": 0.01045627, + "balance_loss_clip": 1.05352926, + "balance_loss_mlp": 1.0269351, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 1.9451877437974838, + "language_loss": 0.71676397, + "learning_rate": 3.625691006130477e-06, + "loss": 0.738585, + "num_input_tokens_seen": 79559695, + "step": 3692, + "time_per_iteration": 2.5641558170318604 + }, + { + "auxiliary_loss_clip": 0.01144915, + "auxiliary_loss_mlp": 0.0104927, + "balance_loss_clip": 1.05261433, + "balance_loss_mlp": 1.03159142, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 1.7647431416611872, + "language_loss": 0.87377274, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89571464, + "num_input_tokens_seen": 79579095, + "step": 3693, + "time_per_iteration": 2.504927635192871 + }, + { + "auxiliary_loss_clip": 0.01136663, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.0533886, + "balance_loss_mlp": 1.02430308, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 1.967173840771144, + "language_loss": 0.85631371, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.87808108, + "num_input_tokens_seen": 79596430, + "step": 3694, + "time_per_iteration": 2.4905030727386475 + }, + { + "auxiliary_loss_clip": 0.01107239, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.05040383, + "balance_loss_mlp": 1.0256418, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 2.383949844894555, + "language_loss": 0.68895102, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71046221, + "num_input_tokens_seen": 79615825, + "step": 3695, + "time_per_iteration": 2.5621185302734375 + }, + { + "auxiliary_loss_clip": 0.0111475, + "auxiliary_loss_mlp": 0.0104375, + "balance_loss_clip": 1.05534911, + "balance_loss_mlp": 1.02731109, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.4445092076790509, + "language_loss": 0.7184068, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73999178, + "num_input_tokens_seen": 79637875, + "step": 3696, + "time_per_iteration": 2.6141748428344727 + }, + { + "auxiliary_loss_clip": 0.01134903, + "auxiliary_loss_mlp": 0.01045553, + "balance_loss_clip": 1.0508194, + "balance_loss_mlp": 1.0276835, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 2.1061061338910796, + "language_loss": 0.87669015, + "learning_rate": 3.624555968803217e-06, + "loss": 0.89849466, + "num_input_tokens_seen": 79656970, + "step": 3697, + "time_per_iteration": 2.554702043533325 + }, + { + "auxiliary_loss_clip": 0.01117085, + "auxiliary_loss_mlp": 0.01044974, + "balance_loss_clip": 1.05005264, + "balance_loss_mlp": 1.02879679, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.468307383163174, + "language_loss": 0.65969241, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68131304, + "num_input_tokens_seen": 79680275, + "step": 3698, + "time_per_iteration": 2.7362568378448486 + }, + { + "auxiliary_loss_clip": 0.01134301, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.05388296, + "balance_loss_mlp": 1.02392554, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.9497695506261954, + "language_loss": 0.8225503, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84431171, + "num_input_tokens_seen": 79701255, + "step": 3699, + "time_per_iteration": 2.6239187717437744 + }, + { + "auxiliary_loss_clip": 0.01122347, + "auxiliary_loss_mlp": 0.0104384, + "balance_loss_clip": 1.04951191, + "balance_loss_mlp": 1.02591109, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.4906034976712705, + "language_loss": 0.79643065, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81809253, + "num_input_tokens_seen": 79721315, + "step": 3700, + "time_per_iteration": 2.548328161239624 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01043715, + "balance_loss_clip": 1.05244303, + "balance_loss_mlp": 1.02575028, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 2.1701972428060765, + "language_loss": 0.71958733, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74113107, + "num_input_tokens_seen": 79742705, + "step": 3701, + "time_per_iteration": 4.185058355331421 + }, + { + "auxiliary_loss_clip": 0.01138335, + "auxiliary_loss_mlp": 0.01042571, + "balance_loss_clip": 1.05032611, + "balance_loss_mlp": 1.02567923, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.8292653964622356, + "language_loss": 0.80229568, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.82410473, + "num_input_tokens_seen": 79763000, + "step": 3702, + "time_per_iteration": 3.9101614952087402 + }, + { + "auxiliary_loss_clip": 0.01125835, + "auxiliary_loss_mlp": 0.01042105, + "balance_loss_clip": 1.04734778, + "balance_loss_mlp": 1.02462888, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 1.7873417466984012, + "language_loss": 0.77789557, + "learning_rate": 3.623191891195723e-06, + "loss": 0.79957497, + "num_input_tokens_seen": 79781335, + "step": 3703, + "time_per_iteration": 2.5154995918273926 + }, + { + "auxiliary_loss_clip": 0.01137204, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_clip": 1.05200076, + "balance_loss_mlp": 1.02293038, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 2.040490302330314, + "language_loss": 0.74058306, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.76237017, + "num_input_tokens_seen": 79800150, + "step": 3704, + "time_per_iteration": 2.4972083568573 + }, + { + "auxiliary_loss_clip": 0.01101439, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_clip": 1.05286741, + "balance_loss_mlp": 1.02985883, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.9054870683182117, + "language_loss": 0.64360976, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66507983, + "num_input_tokens_seen": 79822390, + "step": 3705, + "time_per_iteration": 2.795229196548462 + }, + { + "auxiliary_loss_clip": 0.01039548, + "auxiliary_loss_mlp": 0.01005406, + "balance_loss_clip": 1.051301, + "balance_loss_mlp": 1.0027951, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.2214577721489877, + "language_loss": 0.6515553, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67200482, + "num_input_tokens_seen": 79873350, + "step": 3706, + "time_per_iteration": 4.421535968780518 + }, + { + "auxiliary_loss_clip": 0.01112825, + "auxiliary_loss_mlp": 0.01042955, + "balance_loss_clip": 1.05370426, + "balance_loss_mlp": 1.02633715, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 2.002099612311545, + "language_loss": 0.80582047, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82737827, + "num_input_tokens_seen": 79891715, + "step": 3707, + "time_per_iteration": 3.9179625511169434 + }, + { + "auxiliary_loss_clip": 0.01147586, + "auxiliary_loss_mlp": 0.01037651, + "balance_loss_clip": 1.05311418, + "balance_loss_mlp": 1.02096164, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 2.2031729429852986, + "language_loss": 0.78673404, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80858642, + "num_input_tokens_seen": 79911175, + "step": 3708, + "time_per_iteration": 2.5233566761016846 + }, + { + "auxiliary_loss_clip": 0.0112594, + "auxiliary_loss_mlp": 0.01041713, + "balance_loss_clip": 1.05324852, + "balance_loss_mlp": 1.02409399, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 1.9819335441606962, + "language_loss": 0.80459082, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82626742, + "num_input_tokens_seen": 79931875, + "step": 3709, + "time_per_iteration": 2.629119873046875 + }, + { + "auxiliary_loss_clip": 0.01134206, + "auxiliary_loss_mlp": 0.0079543, + "balance_loss_clip": 1.0529294, + "balance_loss_mlp": 1.01735461, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 1.7690933724008897, + "language_loss": 0.68834007, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70763642, + "num_input_tokens_seen": 79952445, + "step": 3710, + "time_per_iteration": 2.558711051940918 + }, + { + "auxiliary_loss_clip": 0.01112245, + "auxiliary_loss_mlp": 0.01049539, + "balance_loss_clip": 1.05570114, + "balance_loss_mlp": 1.03184795, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.199210471652172, + "language_loss": 0.91312939, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.93474722, + "num_input_tokens_seen": 79971030, + "step": 3711, + "time_per_iteration": 2.5677599906921387 + }, + { + "auxiliary_loss_clip": 0.01115644, + "auxiliary_loss_mlp": 0.01052688, + "balance_loss_clip": 1.05211163, + "balance_loss_mlp": 1.03456831, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 2.949339642494492, + "language_loss": 0.89604026, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.9177236, + "num_input_tokens_seen": 79982085, + "step": 3712, + "time_per_iteration": 2.4656765460968018 + }, + { + "auxiliary_loss_clip": 0.01148422, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_clip": 1.05564618, + "balance_loss_mlp": 1.0310744, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 2.7479700435099805, + "language_loss": 0.75716138, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77912951, + "num_input_tokens_seen": 79997460, + "step": 3713, + "time_per_iteration": 2.4481799602508545 + }, + { + "auxiliary_loss_clip": 0.0110451, + "auxiliary_loss_mlp": 0.01045457, + "balance_loss_clip": 1.05525827, + "balance_loss_mlp": 1.02851689, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 3.099933834760393, + "language_loss": 0.62832606, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64982569, + "num_input_tokens_seen": 80022450, + "step": 3714, + "time_per_iteration": 2.8249099254608154 + }, + { + "auxiliary_loss_clip": 0.0112088, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.05601287, + "balance_loss_mlp": 1.01599741, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.7796132430405904, + "language_loss": 0.7952776, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81681007, + "num_input_tokens_seen": 80042100, + "step": 3715, + "time_per_iteration": 2.6272099018096924 + }, + { + "auxiliary_loss_clip": 0.01107979, + "auxiliary_loss_mlp": 0.01052724, + "balance_loss_clip": 1.05758274, + "balance_loss_mlp": 1.03586733, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 1.92139650124282, + "language_loss": 0.77189517, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79350215, + "num_input_tokens_seen": 80059690, + "step": 3716, + "time_per_iteration": 2.566612482070923 + }, + { + "auxiliary_loss_clip": 0.01121539, + "auxiliary_loss_mlp": 0.01047567, + "balance_loss_clip": 1.0515188, + "balance_loss_mlp": 1.03056788, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 2.44785926085456, + "language_loss": 0.7860806, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.80777168, + "num_input_tokens_seen": 80076060, + "step": 3717, + "time_per_iteration": 2.5313844680786133 + }, + { + "auxiliary_loss_clip": 0.01079823, + "auxiliary_loss_mlp": 0.01045187, + "balance_loss_clip": 1.05552602, + "balance_loss_mlp": 1.02686501, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 2.476336774018428, + "language_loss": 0.6784547, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.69970489, + "num_input_tokens_seen": 80094760, + "step": 3718, + "time_per_iteration": 2.6621789932250977 + }, + { + "auxiliary_loss_clip": 0.01129563, + "auxiliary_loss_mlp": 0.01041298, + "balance_loss_clip": 1.05359626, + "balance_loss_mlp": 1.02198601, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 2.55547153783583, + "language_loss": 0.80822825, + "learning_rate": 3.619543522896045e-06, + "loss": 0.82993686, + "num_input_tokens_seen": 80114475, + "step": 3719, + "time_per_iteration": 2.627722978591919 + }, + { + "auxiliary_loss_clip": 0.01123064, + "auxiliary_loss_mlp": 0.0105337, + "balance_loss_clip": 1.05091822, + "balance_loss_mlp": 1.03435588, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 2.1903534207644846, + "language_loss": 0.8648268, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88659114, + "num_input_tokens_seen": 80132920, + "step": 3720, + "time_per_iteration": 2.516124963760376 + }, + { + "auxiliary_loss_clip": 0.0112258, + "auxiliary_loss_mlp": 0.0104038, + "balance_loss_clip": 1.0591023, + "balance_loss_mlp": 1.02198577, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.8042786776808566, + "language_loss": 0.74390185, + "learning_rate": 3.619086370692945e-06, + "loss": 0.76553142, + "num_input_tokens_seen": 80152845, + "step": 3721, + "time_per_iteration": 2.5827584266662598 + }, + { + "auxiliary_loss_clip": 0.01154226, + "auxiliary_loss_mlp": 0.01048118, + "balance_loss_clip": 1.05574286, + "balance_loss_mlp": 1.03053451, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.050108648041648, + "language_loss": 0.79282176, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.8148452, + "num_input_tokens_seen": 80170680, + "step": 3722, + "time_per_iteration": 2.470303535461426 + }, + { + "auxiliary_loss_clip": 0.01113211, + "auxiliary_loss_mlp": 0.01039803, + "balance_loss_clip": 1.05506945, + "balance_loss_mlp": 1.02316117, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.1204642292089178, + "language_loss": 0.820885, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84241509, + "num_input_tokens_seen": 80189030, + "step": 3723, + "time_per_iteration": 2.6186423301696777 + }, + { + "auxiliary_loss_clip": 0.01154455, + "auxiliary_loss_mlp": 0.01049802, + "balance_loss_clip": 1.05710661, + "balance_loss_mlp": 1.03201628, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 1.8990467955239643, + "language_loss": 0.84467667, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86671919, + "num_input_tokens_seen": 80208365, + "step": 3724, + "time_per_iteration": 2.5040199756622314 + }, + { + "auxiliary_loss_clip": 0.01128582, + "auxiliary_loss_mlp": 0.0103929, + "balance_loss_clip": 1.05890274, + "balance_loss_mlp": 1.02188528, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 2.0217039822730336, + "language_loss": 0.79168761, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81336629, + "num_input_tokens_seen": 80228685, + "step": 3725, + "time_per_iteration": 2.599083662033081 + }, + { + "auxiliary_loss_clip": 0.01093103, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.05671811, + "balance_loss_mlp": 1.0236187, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.8041644331285458, + "language_loss": 0.76951665, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79085743, + "num_input_tokens_seen": 80247635, + "step": 3726, + "time_per_iteration": 2.653315782546997 + }, + { + "auxiliary_loss_clip": 0.01153221, + "auxiliary_loss_mlp": 0.0104902, + "balance_loss_clip": 1.05950832, + "balance_loss_mlp": 1.02943349, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 2.367939704489406, + "language_loss": 0.7255193, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74754167, + "num_input_tokens_seen": 80260045, + "step": 3727, + "time_per_iteration": 2.4843571186065674 + }, + { + "auxiliary_loss_clip": 0.01156054, + "auxiliary_loss_mlp": 0.0104632, + "balance_loss_clip": 1.05639589, + "balance_loss_mlp": 1.02661514, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.3823967530206778, + "language_loss": 0.87001526, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.89203906, + "num_input_tokens_seen": 80277680, + "step": 3728, + "time_per_iteration": 2.4720895290374756 + }, + { + "auxiliary_loss_clip": 0.01125501, + "auxiliary_loss_mlp": 0.01054499, + "balance_loss_clip": 1.05643523, + "balance_loss_mlp": 1.03327978, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.154243072499331, + "language_loss": 0.8043108, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82611078, + "num_input_tokens_seen": 80294795, + "step": 3729, + "time_per_iteration": 2.5827672481536865 + }, + { + "auxiliary_loss_clip": 0.01127683, + "auxiliary_loss_mlp": 0.01045647, + "balance_loss_clip": 1.05689728, + "balance_loss_mlp": 1.02861166, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 2.873095235551979, + "language_loss": 0.86536056, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.88709378, + "num_input_tokens_seen": 80315425, + "step": 3730, + "time_per_iteration": 2.607928514480591 + }, + { + "auxiliary_loss_clip": 0.01128276, + "auxiliary_loss_mlp": 0.00793415, + "balance_loss_clip": 1.05574715, + "balance_loss_mlp": 1.01317406, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.732173958391391, + "language_loss": 0.73223889, + "learning_rate": 3.616796927310559e-06, + "loss": 0.75145578, + "num_input_tokens_seen": 80333905, + "step": 3731, + "time_per_iteration": 2.5272226333618164 + }, + { + "auxiliary_loss_clip": 0.01124866, + "auxiliary_loss_mlp": 0.01041073, + "balance_loss_clip": 1.05654836, + "balance_loss_mlp": 1.02332306, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 1.6487240139844148, + "language_loss": 0.75365263, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77531207, + "num_input_tokens_seen": 80352165, + "step": 3732, + "time_per_iteration": 2.5582118034362793 + }, + { + "auxiliary_loss_clip": 0.01156782, + "auxiliary_loss_mlp": 0.01059765, + "balance_loss_clip": 1.05967486, + "balance_loss_mlp": 1.04205072, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 2.0987299000437676, + "language_loss": 0.87827933, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90044481, + "num_input_tokens_seen": 80371305, + "step": 3733, + "time_per_iteration": 2.505507230758667 + }, + { + "auxiliary_loss_clip": 0.01117646, + "auxiliary_loss_mlp": 0.01047589, + "balance_loss_clip": 1.05846262, + "balance_loss_mlp": 1.02942109, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 2.0415789591078233, + "language_loss": 0.84686708, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86851943, + "num_input_tokens_seen": 80391020, + "step": 3734, + "time_per_iteration": 2.6194024085998535 + }, + { + "auxiliary_loss_clip": 0.01133598, + "auxiliary_loss_mlp": 0.01049437, + "balance_loss_clip": 1.05867219, + "balance_loss_mlp": 1.03131711, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.6212796510693572, + "language_loss": 0.76850319, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.79033351, + "num_input_tokens_seen": 80411365, + "step": 3735, + "time_per_iteration": 2.6044516563415527 + }, + { + "auxiliary_loss_clip": 0.01132194, + "auxiliary_loss_mlp": 0.01048712, + "balance_loss_clip": 1.05861056, + "balance_loss_mlp": 1.0317719, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 1.7063430501289794, + "language_loss": 0.84645194, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86826098, + "num_input_tokens_seen": 80431075, + "step": 3736, + "time_per_iteration": 2.604153633117676 + }, + { + "auxiliary_loss_clip": 0.01115852, + "auxiliary_loss_mlp": 0.010427, + "balance_loss_clip": 1.06077898, + "balance_loss_mlp": 1.02512896, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 1.8256528208951255, + "language_loss": 0.86427402, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88585955, + "num_input_tokens_seen": 80449240, + "step": 3737, + "time_per_iteration": 2.562758684158325 + }, + { + "auxiliary_loss_clip": 0.01155111, + "auxiliary_loss_mlp": 0.01051877, + "balance_loss_clip": 1.05735016, + "balance_loss_mlp": 1.03324461, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 1.8985445200545226, + "language_loss": 0.79017413, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.812244, + "num_input_tokens_seen": 80467900, + "step": 3738, + "time_per_iteration": 2.5640101432800293 + }, + { + "auxiliary_loss_clip": 0.01122355, + "auxiliary_loss_mlp": 0.01045226, + "balance_loss_clip": 1.05492139, + "balance_loss_mlp": 1.02767837, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 1.6454489790048001, + "language_loss": 0.76213765, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78381348, + "num_input_tokens_seen": 80487100, + "step": 3739, + "time_per_iteration": 3.9595401287078857 + }, + { + "auxiliary_loss_clip": 0.01115621, + "auxiliary_loss_mlp": 0.01048596, + "balance_loss_clip": 1.05171108, + "balance_loss_mlp": 1.02777028, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 2.731918035872734, + "language_loss": 0.74390483, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76554704, + "num_input_tokens_seen": 80508625, + "step": 3740, + "time_per_iteration": 2.614309310913086 + }, + { + "auxiliary_loss_clip": 0.01152084, + "auxiliary_loss_mlp": 0.01040033, + "balance_loss_clip": 1.0576247, + "balance_loss_mlp": 1.02271223, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 1.705388842671718, + "language_loss": 0.75909168, + "learning_rate": 3.614501353019939e-06, + "loss": 0.78101289, + "num_input_tokens_seen": 80527345, + "step": 3741, + "time_per_iteration": 3.8693768978118896 + }, + { + "auxiliary_loss_clip": 0.01138823, + "auxiliary_loss_mlp": 0.01040924, + "balance_loss_clip": 1.06210303, + "balance_loss_mlp": 1.02342367, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.762251101844035, + "language_loss": 0.87479824, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89659572, + "num_input_tokens_seen": 80545545, + "step": 3742, + "time_per_iteration": 2.5557687282562256 + }, + { + "auxiliary_loss_clip": 0.01104291, + "auxiliary_loss_mlp": 0.01045521, + "balance_loss_clip": 1.05937445, + "balance_loss_mlp": 1.02701962, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 1.6656173382157309, + "language_loss": 0.81803781, + "learning_rate": 3.614041503218444e-06, + "loss": 0.83953595, + "num_input_tokens_seen": 80565040, + "step": 3743, + "time_per_iteration": 2.6294338703155518 + }, + { + "auxiliary_loss_clip": 0.011429, + "auxiliary_loss_mlp": 0.01040597, + "balance_loss_clip": 1.0567863, + "balance_loss_mlp": 1.02353764, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 2.1803182555173977, + "language_loss": 0.6400159, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.66185087, + "num_input_tokens_seen": 80582815, + "step": 3744, + "time_per_iteration": 2.491536855697632 + }, + { + "auxiliary_loss_clip": 0.01134607, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.0559665, + "balance_loss_mlp": 1.02249432, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 2.9263769970283593, + "language_loss": 0.76143813, + "learning_rate": 3.613581408598489e-06, + "loss": 0.78318381, + "num_input_tokens_seen": 80600865, + "step": 3745, + "time_per_iteration": 3.8623361587524414 + }, + { + "auxiliary_loss_clip": 0.01114466, + "auxiliary_loss_mlp": 0.01037907, + "balance_loss_clip": 1.05591094, + "balance_loss_mlp": 1.02112269, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.9839853955523465, + "language_loss": 0.81164122, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.83316493, + "num_input_tokens_seen": 80617455, + "step": 3746, + "time_per_iteration": 3.975831985473633 + }, + { + "auxiliary_loss_clip": 0.01139914, + "auxiliary_loss_mlp": 0.01046998, + "balance_loss_clip": 1.0541811, + "balance_loss_mlp": 1.02934313, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.504937815084682, + "language_loss": 0.86532533, + "learning_rate": 3.613121069229862e-06, + "loss": 0.88719451, + "num_input_tokens_seen": 80635125, + "step": 3747, + "time_per_iteration": 2.5332112312316895 + }, + { + "auxiliary_loss_clip": 0.01139014, + "auxiliary_loss_mlp": 0.00795194, + "balance_loss_clip": 1.05376601, + "balance_loss_mlp": 1.01527572, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.6151594690233029, + "language_loss": 0.76640296, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78574502, + "num_input_tokens_seen": 80656370, + "step": 3748, + "time_per_iteration": 2.5779221057891846 + }, + { + "auxiliary_loss_clip": 0.01155238, + "auxiliary_loss_mlp": 0.01046584, + "balance_loss_clip": 1.05752385, + "balance_loss_mlp": 1.02865505, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.5768548681255479, + "language_loss": 0.79707325, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.8190915, + "num_input_tokens_seen": 80676495, + "step": 3749, + "time_per_iteration": 2.4995920658111572 + }, + { + "auxiliary_loss_clip": 0.01124719, + "auxiliary_loss_mlp": 0.01042427, + "balance_loss_clip": 1.05538118, + "balance_loss_mlp": 1.02621508, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.6579203690508102, + "language_loss": 0.79747832, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.81914985, + "num_input_tokens_seen": 80694755, + "step": 3750, + "time_per_iteration": 2.536231517791748 + }, + { + "auxiliary_loss_clip": 0.01093638, + "auxiliary_loss_mlp": 0.01048462, + "balance_loss_clip": 1.0541209, + "balance_loss_mlp": 1.03101015, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 1.7315916059129992, + "language_loss": 0.8184216, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.83984256, + "num_input_tokens_seen": 80713670, + "step": 3751, + "time_per_iteration": 2.625833511352539 + }, + { + "auxiliary_loss_clip": 0.01117888, + "auxiliary_loss_mlp": 0.01044173, + "balance_loss_clip": 1.05664742, + "balance_loss_mlp": 1.02673233, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 2.1538779973864095, + "language_loss": 0.83622253, + "learning_rate": 3.611969150491165e-06, + "loss": 0.85784316, + "num_input_tokens_seen": 80731450, + "step": 3752, + "time_per_iteration": 2.5240583419799805 + }, + { + "auxiliary_loss_clip": 0.01147899, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.05460346, + "balance_loss_mlp": 1.02103317, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.8891383814068339, + "language_loss": 0.78549361, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80734175, + "num_input_tokens_seen": 80748415, + "step": 3753, + "time_per_iteration": 2.4651238918304443 + }, + { + "auxiliary_loss_clip": 0.01126926, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.05602217, + "balance_loss_mlp": 1.02325141, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 2.0229061658016514, + "language_loss": 0.78974921, + "learning_rate": 3.611507955052295e-06, + "loss": 0.81142646, + "num_input_tokens_seen": 80770835, + "step": 3754, + "time_per_iteration": 2.6487679481506348 + }, + { + "auxiliary_loss_clip": 0.01129785, + "auxiliary_loss_mlp": 0.01047974, + "balance_loss_clip": 1.05975795, + "balance_loss_mlp": 1.03025913, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.748644262666529, + "language_loss": 0.70582604, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72760355, + "num_input_tokens_seen": 80787840, + "step": 3755, + "time_per_iteration": 2.5304758548736572 + }, + { + "auxiliary_loss_clip": 0.01123953, + "auxiliary_loss_mlp": 0.01054906, + "balance_loss_clip": 1.05806553, + "balance_loss_mlp": 1.03770423, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 1.9980308594808178, + "language_loss": 0.77365804, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79544663, + "num_input_tokens_seen": 80806335, + "step": 3756, + "time_per_iteration": 2.602304220199585 + }, + { + "auxiliary_loss_clip": 0.01135464, + "auxiliary_loss_mlp": 0.01047441, + "balance_loss_clip": 1.06435215, + "balance_loss_mlp": 1.03006041, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.9232337083973643, + "language_loss": 0.82484859, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.84667766, + "num_input_tokens_seen": 80825355, + "step": 3757, + "time_per_iteration": 2.549470901489258 + }, + { + "auxiliary_loss_clip": 0.01137561, + "auxiliary_loss_mlp": 0.01043829, + "balance_loss_clip": 1.05552542, + "balance_loss_mlp": 1.02522063, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 2.143962827059905, + "language_loss": 0.73425412, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.75606805, + "num_input_tokens_seen": 80842570, + "step": 3758, + "time_per_iteration": 2.5137217044830322 + }, + { + "auxiliary_loss_clip": 0.01133726, + "auxiliary_loss_mlp": 0.01050141, + "balance_loss_clip": 1.05688894, + "balance_loss_mlp": 1.03198552, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.0577999837570955, + "language_loss": 0.77360106, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79543972, + "num_input_tokens_seen": 80858745, + "step": 3759, + "time_per_iteration": 2.553312063217163 + }, + { + "auxiliary_loss_clip": 0.01108294, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_clip": 1.0554961, + "balance_loss_mlp": 1.02536106, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.5948788167608698, + "language_loss": 0.78859085, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.81010228, + "num_input_tokens_seen": 80880085, + "step": 3760, + "time_per_iteration": 2.738917112350464 + }, + { + "auxiliary_loss_clip": 0.01036548, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.03145194, + "balance_loss_mlp": 1.02503574, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9526893316136346, + "language_loss": 0.60113287, + "learning_rate": 3.609891846556569e-06, + "loss": 0.6217795, + "num_input_tokens_seen": 80937660, + "step": 3761, + "time_per_iteration": 3.125507354736328 + }, + { + "auxiliary_loss_clip": 0.01115378, + "auxiliary_loss_mlp": 0.01045978, + "balance_loss_clip": 1.05494428, + "balance_loss_mlp": 1.02785814, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 1.9898634493671348, + "language_loss": 0.7740075, + "learning_rate": 3.609660729655211e-06, + "loss": 0.7956211, + "num_input_tokens_seen": 80956265, + "step": 3762, + "time_per_iteration": 2.5678603649139404 + }, + { + "auxiliary_loss_clip": 0.01128585, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_clip": 1.05545199, + "balance_loss_mlp": 1.02662432, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 1.8136800334848775, + "language_loss": 0.78906608, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.81081784, + "num_input_tokens_seen": 80975185, + "step": 3763, + "time_per_iteration": 2.555022716522217 + }, + { + "auxiliary_loss_clip": 0.01136289, + "auxiliary_loss_mlp": 0.01048194, + "balance_loss_clip": 1.05460322, + "balance_loss_mlp": 1.02995539, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.6216548913829765, + "language_loss": 0.9167918, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93863666, + "num_input_tokens_seen": 80992830, + "step": 3764, + "time_per_iteration": 2.480250835418701 + }, + { + "auxiliary_loss_clip": 0.0113231, + "auxiliary_loss_mlp": 0.01056994, + "balance_loss_clip": 1.0565393, + "balance_loss_mlp": 1.03882706, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.9123978371485788, + "language_loss": 0.75482583, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.7767188, + "num_input_tokens_seen": 81013675, + "step": 3765, + "time_per_iteration": 2.5780951976776123 + }, + { + "auxiliary_loss_clip": 0.01139471, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_clip": 1.05705738, + "balance_loss_mlp": 1.03052366, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 2.0299482261223263, + "language_loss": 0.89814973, + "learning_rate": 3.608735651752494e-06, + "loss": 0.92001951, + "num_input_tokens_seen": 81030345, + "step": 3766, + "time_per_iteration": 2.487154006958008 + }, + { + "auxiliary_loss_clip": 0.01123454, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.05778956, + "balance_loss_mlp": 1.02401042, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.6620599595742342, + "language_loss": 0.74797285, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76961774, + "num_input_tokens_seen": 81051000, + "step": 3767, + "time_per_iteration": 2.6078543663024902 + }, + { + "auxiliary_loss_clip": 0.01139284, + "auxiliary_loss_mlp": 0.01043375, + "balance_loss_clip": 1.05449152, + "balance_loss_mlp": 1.02509987, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.499791435453702, + "language_loss": 0.71517402, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.73700058, + "num_input_tokens_seen": 81071205, + "step": 3768, + "time_per_iteration": 2.5094616413116455 + }, + { + "auxiliary_loss_clip": 0.01144258, + "auxiliary_loss_mlp": 0.010608, + "balance_loss_clip": 1.06025076, + "balance_loss_mlp": 1.04262006, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.7445075945653372, + "language_loss": 0.78516221, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80721271, + "num_input_tokens_seen": 81091880, + "step": 3769, + "time_per_iteration": 2.5666685104370117 + }, + { + "auxiliary_loss_clip": 0.01122675, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.04986739, + "balance_loss_mlp": 1.02429152, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 3.047596141797272, + "language_loss": 0.68240666, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70405889, + "num_input_tokens_seen": 81113290, + "step": 3770, + "time_per_iteration": 2.591033697128296 + }, + { + "auxiliary_loss_clip": 0.01151815, + "auxiliary_loss_mlp": 0.01041108, + "balance_loss_clip": 1.05489385, + "balance_loss_mlp": 1.02372742, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.5677721500753763, + "language_loss": 0.80212295, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82405221, + "num_input_tokens_seen": 81133535, + "step": 3771, + "time_per_iteration": 2.5482959747314453 + }, + { + "auxiliary_loss_clip": 0.01116447, + "auxiliary_loss_mlp": 0.01053853, + "balance_loss_clip": 1.05637538, + "balance_loss_mlp": 1.03626966, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.5389980514277168, + "language_loss": 0.79163587, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81333888, + "num_input_tokens_seen": 81154650, + "step": 3772, + "time_per_iteration": 2.589308738708496 + }, + { + "auxiliary_loss_clip": 0.01026231, + "auxiliary_loss_mlp": 0.01005537, + "balance_loss_clip": 1.0462656, + "balance_loss_mlp": 1.00321257, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6504675534785376, + "language_loss": 0.5434041, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56372178, + "num_input_tokens_seen": 81221240, + "step": 3773, + "time_per_iteration": 3.3104448318481445 + }, + { + "auxiliary_loss_clip": 0.01123458, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.0590663, + "balance_loss_mlp": 1.02059782, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 1.976611121060325, + "language_loss": 0.70752025, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72913176, + "num_input_tokens_seen": 81241520, + "step": 3774, + "time_per_iteration": 2.5703842639923096 + }, + { + "auxiliary_loss_clip": 0.011201, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.05472612, + "balance_loss_mlp": 1.02590048, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.6200067706996495, + "language_loss": 0.75023556, + "learning_rate": 3.606650658627658e-06, + "loss": 0.77187002, + "num_input_tokens_seen": 81256825, + "step": 3775, + "time_per_iteration": 2.5320749282836914 + }, + { + "auxiliary_loss_clip": 0.01150662, + "auxiliary_loss_mlp": 0.01044462, + "balance_loss_clip": 1.05545807, + "balance_loss_mlp": 1.02770162, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 2.0404101651530966, + "language_loss": 0.81924534, + "learning_rate": 3.606418687985928e-06, + "loss": 0.8411966, + "num_input_tokens_seen": 81275695, + "step": 3776, + "time_per_iteration": 2.4616827964782715 + }, + { + "auxiliary_loss_clip": 0.01128242, + "auxiliary_loss_mlp": 0.01039943, + "balance_loss_clip": 1.05182219, + "balance_loss_mlp": 1.02306354, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 1.743916181644247, + "language_loss": 0.82684362, + "learning_rate": 3.606186656428641e-06, + "loss": 0.84852546, + "num_input_tokens_seen": 81294920, + "step": 3777, + "time_per_iteration": 2.5795741081237793 + }, + { + "auxiliary_loss_clip": 0.0111951, + "auxiliary_loss_mlp": 0.0104024, + "balance_loss_clip": 1.05260158, + "balance_loss_mlp": 1.02366972, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 2.307429378772807, + "language_loss": 0.72486413, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74646163, + "num_input_tokens_seen": 81314275, + "step": 3778, + "time_per_iteration": 3.912975788116455 + }, + { + "auxiliary_loss_clip": 0.01106393, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.05404961, + "balance_loss_mlp": 1.0185461, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 1.9243797342374056, + "language_loss": 0.63862342, + "learning_rate": 3.605722410602591e-06, + "loss": 0.6600405, + "num_input_tokens_seen": 81333890, + "step": 3779, + "time_per_iteration": 2.6188840866088867 + }, + { + "auxiliary_loss_clip": 0.01130198, + "auxiliary_loss_mlp": 0.01048303, + "balance_loss_clip": 1.0551579, + "balance_loss_mlp": 1.03074396, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.7425441176422654, + "language_loss": 0.70515758, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.72694254, + "num_input_tokens_seen": 81353640, + "step": 3780, + "time_per_iteration": 3.9290170669555664 + }, + { + "auxiliary_loss_clip": 0.01140331, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_clip": 1.05665541, + "balance_loss_mlp": 1.02715755, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 1.6627238788805716, + "language_loss": 0.89534253, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91719496, + "num_input_tokens_seen": 81371595, + "step": 3781, + "time_per_iteration": 2.5489003658294678 + }, + { + "auxiliary_loss_clip": 0.01149515, + "auxiliary_loss_mlp": 0.01044791, + "balance_loss_clip": 1.053177, + "balance_loss_mlp": 1.02638555, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 2.0101787472979775, + "language_loss": 0.74393141, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76587445, + "num_input_tokens_seen": 81388435, + "step": 3782, + "time_per_iteration": 2.4542014598846436 + }, + { + "auxiliary_loss_clip": 0.01124379, + "auxiliary_loss_mlp": 0.01043988, + "balance_loss_clip": 1.05123305, + "balance_loss_mlp": 1.02804935, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.7115388504181877, + "language_loss": 0.82752621, + "learning_rate": 3.604793188351095e-06, + "loss": 0.8492099, + "num_input_tokens_seen": 81410195, + "step": 3783, + "time_per_iteration": 2.588388442993164 + }, + { + "auxiliary_loss_clip": 0.01124848, + "auxiliary_loss_mlp": 0.01044816, + "balance_loss_clip": 1.05692506, + "balance_loss_mlp": 1.02766192, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 1.7221119712286108, + "language_loss": 0.75809181, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.77978849, + "num_input_tokens_seen": 81430060, + "step": 3784, + "time_per_iteration": 3.943814516067505 + }, + { + "auxiliary_loss_clip": 0.01141807, + "auxiliary_loss_mlp": 0.01041504, + "balance_loss_clip": 1.04939377, + "balance_loss_mlp": 1.02432573, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.781926831492604, + "language_loss": 0.70745474, + "learning_rate": 3.604328212066594e-06, + "loss": 0.72928786, + "num_input_tokens_seen": 81447375, + "step": 3785, + "time_per_iteration": 3.862828254699707 + }, + { + "auxiliary_loss_clip": 0.01039577, + "auxiliary_loss_mlp": 0.01005126, + "balance_loss_clip": 1.03658938, + "balance_loss_mlp": 1.00245571, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8211894122112678, + "language_loss": 0.61853504, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63898206, + "num_input_tokens_seen": 81505235, + "step": 3786, + "time_per_iteration": 3.1266160011291504 + }, + { + "auxiliary_loss_clip": 0.01129208, + "auxiliary_loss_mlp": 0.01042675, + "balance_loss_clip": 1.05298257, + "balance_loss_mlp": 1.02484155, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 3.0265495398563766, + "language_loss": 0.86368495, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88540375, + "num_input_tokens_seen": 81518685, + "step": 3787, + "time_per_iteration": 2.520141124725342 + }, + { + "auxiliary_loss_clip": 0.01125065, + "auxiliary_loss_mlp": 0.01040742, + "balance_loss_clip": 1.05323696, + "balance_loss_mlp": 1.0246253, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.261092090482919, + "language_loss": 0.72619998, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74785805, + "num_input_tokens_seen": 81538940, + "step": 3788, + "time_per_iteration": 2.591799736022949 + }, + { + "auxiliary_loss_clip": 0.01123089, + "auxiliary_loss_mlp": 0.01033214, + "balance_loss_clip": 1.05295992, + "balance_loss_mlp": 1.0164535, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.7498066098273535, + "language_loss": 0.67842734, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69999033, + "num_input_tokens_seen": 81555525, + "step": 3789, + "time_per_iteration": 2.5040812492370605 + }, + { + "auxiliary_loss_clip": 0.01114815, + "auxiliary_loss_mlp": 0.01045765, + "balance_loss_clip": 1.05153263, + "balance_loss_mlp": 1.02850389, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 2.000027847611294, + "language_loss": 0.75813323, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.77973902, + "num_input_tokens_seen": 81576305, + "step": 3790, + "time_per_iteration": 2.5927207469940186 + }, + { + "auxiliary_loss_clip": 0.01087043, + "auxiliary_loss_mlp": 0.01046705, + "balance_loss_clip": 1.04702282, + "balance_loss_mlp": 1.02871609, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 1.9269623444165649, + "language_loss": 0.91096443, + "learning_rate": 3.602931823424522e-06, + "loss": 0.93230188, + "num_input_tokens_seen": 81594115, + "step": 3791, + "time_per_iteration": 2.59020733833313 + }, + { + "auxiliary_loss_clip": 0.01136919, + "auxiliary_loss_mlp": 0.01037113, + "balance_loss_clip": 1.0494771, + "balance_loss_mlp": 1.01991153, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.681830033947346, + "language_loss": 0.82667577, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.84841609, + "num_input_tokens_seen": 81615355, + "step": 3792, + "time_per_iteration": 2.6188700199127197 + }, + { + "auxiliary_loss_clip": 0.01062697, + "auxiliary_loss_mlp": 0.01006751, + "balance_loss_clip": 1.02853334, + "balance_loss_mlp": 1.00411665, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1334899212718617, + "language_loss": 0.65643871, + "learning_rate": 3.602465874182981e-06, + "loss": 0.6771332, + "num_input_tokens_seen": 81662075, + "step": 3793, + "time_per_iteration": 2.8329532146453857 + }, + { + "auxiliary_loss_clip": 0.01152335, + "auxiliary_loss_mlp": 0.01050868, + "balance_loss_clip": 1.05391026, + "balance_loss_mlp": 1.03253388, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 3.139828160915369, + "language_loss": 0.77072072, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79275274, + "num_input_tokens_seen": 81681625, + "step": 3794, + "time_per_iteration": 2.5295193195343018 + }, + { + "auxiliary_loss_clip": 0.01107156, + "auxiliary_loss_mlp": 0.01048466, + "balance_loss_clip": 1.05119777, + "balance_loss_mlp": 1.02934504, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 1.768792930368063, + "language_loss": 0.81050599, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.83206218, + "num_input_tokens_seen": 81701170, + "step": 3795, + "time_per_iteration": 2.5996851921081543 + }, + { + "auxiliary_loss_clip": 0.01133733, + "auxiliary_loss_mlp": 0.01046029, + "balance_loss_clip": 1.05219698, + "balance_loss_mlp": 1.02917266, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 2.089163903812295, + "language_loss": 0.76966131, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79145896, + "num_input_tokens_seen": 81721265, + "step": 3796, + "time_per_iteration": 2.5228569507598877 + }, + { + "auxiliary_loss_clip": 0.01103359, + "auxiliary_loss_mlp": 0.00795969, + "balance_loss_clip": 1.04818046, + "balance_loss_mlp": 1.01720834, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.1580096836122515, + "language_loss": 0.95989692, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.97889018, + "num_input_tokens_seen": 81736565, + "step": 3797, + "time_per_iteration": 2.5574705600738525 + }, + { + "auxiliary_loss_clip": 0.01137323, + "auxiliary_loss_mlp": 0.00795131, + "balance_loss_clip": 1.05348182, + "balance_loss_mlp": 1.01597154, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.5757044299367566, + "language_loss": 0.81202567, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83135021, + "num_input_tokens_seen": 81756240, + "step": 3798, + "time_per_iteration": 2.5479326248168945 + }, + { + "auxiliary_loss_clip": 0.01113112, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.05063128, + "balance_loss_mlp": 1.01900554, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.452795832126358, + "language_loss": 0.79113203, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.81263524, + "num_input_tokens_seen": 81775720, + "step": 3799, + "time_per_iteration": 2.5948526859283447 + }, + { + "auxiliary_loss_clip": 0.01118759, + "auxiliary_loss_mlp": 0.01054222, + "balance_loss_clip": 1.05095744, + "balance_loss_mlp": 1.03586364, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.8155651871840301, + "language_loss": 0.75414002, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77586985, + "num_input_tokens_seen": 81795830, + "step": 3800, + "time_per_iteration": 2.561786651611328 + }, + { + "auxiliary_loss_clip": 0.01124746, + "auxiliary_loss_mlp": 0.0104176, + "balance_loss_clip": 1.05498254, + "balance_loss_mlp": 1.02583408, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.6691442648886738, + "language_loss": 0.63720864, + "learning_rate": 3.600599647297484e-06, + "loss": 0.65887368, + "num_input_tokens_seen": 81815745, + "step": 3801, + "time_per_iteration": 2.5844504833221436 + }, + { + "auxiliary_loss_clip": 0.01127055, + "auxiliary_loss_mlp": 0.01041513, + "balance_loss_clip": 1.05653346, + "balance_loss_mlp": 1.0257175, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 2.3050139798167013, + "language_loss": 0.81792516, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83961082, + "num_input_tokens_seen": 81835155, + "step": 3802, + "time_per_iteration": 2.611619710922241 + }, + { + "auxiliary_loss_clip": 0.011281, + "auxiliary_loss_mlp": 0.0105442, + "balance_loss_clip": 1.06025064, + "balance_loss_mlp": 1.0367893, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.7905081121109456, + "language_loss": 0.78735512, + "learning_rate": 3.600132483450114e-06, + "loss": 0.80918032, + "num_input_tokens_seen": 81855655, + "step": 3803, + "time_per_iteration": 2.628741502761841 + }, + { + "auxiliary_loss_clip": 0.01113139, + "auxiliary_loss_mlp": 0.01044992, + "balance_loss_clip": 1.05113792, + "balance_loss_mlp": 1.02787316, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.5833700352435451, + "language_loss": 0.84969807, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87127936, + "num_input_tokens_seen": 81876385, + "step": 3804, + "time_per_iteration": 2.586489200592041 + }, + { + "auxiliary_loss_clip": 0.01138806, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.05276096, + "balance_loss_mlp": 1.02493906, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.242384506574323, + "language_loss": 0.76627123, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78807509, + "num_input_tokens_seen": 81893225, + "step": 3805, + "time_per_iteration": 2.5013389587402344 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.00795506, + "balance_loss_clip": 1.05583358, + "balance_loss_mlp": 1.01592612, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 4.644412032126836, + "language_loss": 0.7903083, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.80955958, + "num_input_tokens_seen": 81911350, + "step": 3806, + "time_per_iteration": 2.525341510772705 + }, + { + "auxiliary_loss_clip": 0.01121625, + "auxiliary_loss_mlp": 0.01050926, + "balance_loss_clip": 1.05316019, + "balance_loss_mlp": 1.03235292, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 2.091225503063303, + "language_loss": 0.69822854, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.71995407, + "num_input_tokens_seen": 81935420, + "step": 3807, + "time_per_iteration": 2.7038092613220215 + }, + { + "auxiliary_loss_clip": 0.0114706, + "auxiliary_loss_mlp": 0.01050295, + "balance_loss_clip": 1.06095433, + "balance_loss_mlp": 1.03198469, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 2.200221131365818, + "language_loss": 0.65599227, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.67796582, + "num_input_tokens_seen": 81953845, + "step": 3808, + "time_per_iteration": 2.5318214893341064 + }, + { + "auxiliary_loss_clip": 0.0109752, + "auxiliary_loss_mlp": 0.01054022, + "balance_loss_clip": 1.05052137, + "balance_loss_mlp": 1.03624737, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 1.8949715307955886, + "language_loss": 0.74956495, + "learning_rate": 3.598729535939222e-06, + "loss": 0.77108037, + "num_input_tokens_seen": 81972100, + "step": 3809, + "time_per_iteration": 2.5904080867767334 + }, + { + "auxiliary_loss_clip": 0.01125718, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.05510354, + "balance_loss_mlp": 1.02287471, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.614383918462754, + "language_loss": 0.8155936, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83723968, + "num_input_tokens_seen": 81992760, + "step": 3810, + "time_per_iteration": 2.5499823093414307 + }, + { + "auxiliary_loss_clip": 0.01126784, + "auxiliary_loss_mlp": 0.01032894, + "balance_loss_clip": 1.05243492, + "balance_loss_mlp": 1.01790977, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.198637753390495, + "language_loss": 0.79057354, + "learning_rate": 3.598261401682441e-06, + "loss": 0.81217039, + "num_input_tokens_seen": 82009080, + "step": 3811, + "time_per_iteration": 2.5312952995300293 + }, + { + "auxiliary_loss_clip": 0.01135288, + "auxiliary_loss_mlp": 0.00795858, + "balance_loss_clip": 1.06011629, + "balance_loss_mlp": 1.01558495, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 5.739862218808734, + "language_loss": 0.82732236, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.84663379, + "num_input_tokens_seen": 82026705, + "step": 3812, + "time_per_iteration": 2.5240819454193115 + }, + { + "auxiliary_loss_clip": 0.01086842, + "auxiliary_loss_mlp": 0.01054779, + "balance_loss_clip": 1.05046439, + "balance_loss_mlp": 1.03642094, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 2.4629318187352154, + "language_loss": 0.82635909, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84777534, + "num_input_tokens_seen": 82043245, + "step": 3813, + "time_per_iteration": 2.6057562828063965 + }, + { + "auxiliary_loss_clip": 0.01135033, + "auxiliary_loss_mlp": 0.01044464, + "balance_loss_clip": 1.05851972, + "balance_loss_mlp": 1.02757156, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 1.888609972105612, + "language_loss": 0.69965899, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72145396, + "num_input_tokens_seen": 82066870, + "step": 3814, + "time_per_iteration": 2.6102564334869385 + }, + { + "auxiliary_loss_clip": 0.01138264, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_clip": 1.05333614, + "balance_loss_mlp": 1.02714503, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.742251065343541, + "language_loss": 0.67091596, + "learning_rate": 3.597324405965139e-06, + "loss": 0.6927411, + "num_input_tokens_seen": 82083180, + "step": 3815, + "time_per_iteration": 2.53525710105896 + }, + { + "auxiliary_loss_clip": 0.01148441, + "auxiliary_loss_mlp": 0.01049799, + "balance_loss_clip": 1.06018019, + "balance_loss_mlp": 1.03381324, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.708018138405201, + "language_loss": 0.83380258, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85578495, + "num_input_tokens_seen": 82102950, + "step": 3816, + "time_per_iteration": 2.566465139389038 + }, + { + "auxiliary_loss_clip": 0.01140657, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.058828, + "balance_loss_mlp": 1.01695037, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 4.240854013106063, + "language_loss": 0.87123549, + "learning_rate": 3.596855544646742e-06, + "loss": 0.89298475, + "num_input_tokens_seen": 82119510, + "step": 3817, + "time_per_iteration": 3.866034984588623 + }, + { + "auxiliary_loss_clip": 0.01125226, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.05549395, + "balance_loss_mlp": 1.02880073, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 1.6005545995053239, + "language_loss": 0.74556327, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.76727587, + "num_input_tokens_seen": 82140095, + "step": 3818, + "time_per_iteration": 3.9936180114746094 + }, + { + "auxiliary_loss_clip": 0.01139507, + "auxiliary_loss_mlp": 0.0103809, + "balance_loss_clip": 1.05679381, + "balance_loss_mlp": 1.02143693, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.6731713417265093, + "language_loss": 0.74736351, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76913953, + "num_input_tokens_seen": 82159510, + "step": 3819, + "time_per_iteration": 2.5997955799102783 + }, + { + "auxiliary_loss_clip": 0.01139898, + "auxiliary_loss_mlp": 0.01042427, + "balance_loss_clip": 1.05681157, + "balance_loss_mlp": 1.02629173, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.7319723288135025, + "language_loss": 0.80956084, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.83138406, + "num_input_tokens_seen": 82179580, + "step": 3820, + "time_per_iteration": 2.5874264240264893 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.05561662, + "balance_loss_mlp": 1.02642, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.2280449423555724, + "language_loss": 0.69402218, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71575373, + "num_input_tokens_seen": 82195585, + "step": 3821, + "time_per_iteration": 2.5137147903442383 + }, + { + "auxiliary_loss_clip": 0.01099532, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.05981803, + "balance_loss_mlp": 1.0186882, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.4825531544775898, + "language_loss": 0.83136272, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.85271722, + "num_input_tokens_seen": 82217530, + "step": 3822, + "time_per_iteration": 2.657780647277832 + }, + { + "auxiliary_loss_clip": 0.01149851, + "auxiliary_loss_mlp": 0.01044512, + "balance_loss_clip": 1.05641174, + "balance_loss_mlp": 1.02677333, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.5632386111803027, + "language_loss": 0.66070205, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68264568, + "num_input_tokens_seen": 82237980, + "step": 3823, + "time_per_iteration": 3.971372127532959 + }, + { + "auxiliary_loss_clip": 0.01051588, + "auxiliary_loss_mlp": 0.01001549, + "balance_loss_clip": 1.0325247, + "balance_loss_mlp": 0.9988429, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8171841913192446, + "language_loss": 0.56839406, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58892548, + "num_input_tokens_seen": 82301785, + "step": 3824, + "time_per_iteration": 3.1776232719421387 + }, + { + "auxiliary_loss_clip": 0.01122805, + "auxiliary_loss_mlp": 0.0103955, + "balance_loss_clip": 1.0534029, + "balance_loss_mlp": 1.02327752, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.0750467084293063, + "language_loss": 0.72595191, + "learning_rate": 3.594977677968009e-06, + "loss": 0.74757546, + "num_input_tokens_seen": 82317355, + "step": 3825, + "time_per_iteration": 2.515634775161743 + }, + { + "auxiliary_loss_clip": 0.01139412, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_clip": 1.05473948, + "balance_loss_mlp": 1.03153777, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.82366078216413, + "language_loss": 0.87964362, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.9015373, + "num_input_tokens_seen": 82336645, + "step": 3826, + "time_per_iteration": 2.5402939319610596 + }, + { + "auxiliary_loss_clip": 0.01126882, + "auxiliary_loss_mlp": 0.01046266, + "balance_loss_clip": 1.05608273, + "balance_loss_mlp": 1.02764535, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 3.0688260964426632, + "language_loss": 0.81612957, + "learning_rate": 3.594507606303083e-06, + "loss": 0.83786106, + "num_input_tokens_seen": 82354225, + "step": 3827, + "time_per_iteration": 2.523902177810669 + }, + { + "auxiliary_loss_clip": 0.01076117, + "auxiliary_loss_mlp": 0.01045603, + "balance_loss_clip": 1.04851866, + "balance_loss_mlp": 1.02756619, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 2.3892060762369685, + "language_loss": 0.86546439, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88668156, + "num_input_tokens_seen": 82370240, + "step": 3828, + "time_per_iteration": 2.589344024658203 + }, + { + "auxiliary_loss_clip": 0.01125152, + "auxiliary_loss_mlp": 0.01048758, + "balance_loss_clip": 1.0567863, + "balance_loss_mlp": 1.03039944, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 2.0139500624780555, + "language_loss": 0.70714861, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72888768, + "num_input_tokens_seen": 82389145, + "step": 3829, + "time_per_iteration": 2.5454485416412354 + }, + { + "auxiliary_loss_clip": 0.01084924, + "auxiliary_loss_mlp": 0.01041512, + "balance_loss_clip": 1.05200577, + "balance_loss_mlp": 1.02558529, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.5972425233474026, + "language_loss": 0.84025955, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86152393, + "num_input_tokens_seen": 82409185, + "step": 3830, + "time_per_iteration": 2.6867055892944336 + }, + { + "auxiliary_loss_clip": 0.01127956, + "auxiliary_loss_mlp": 0.01049759, + "balance_loss_clip": 1.05144382, + "balance_loss_mlp": 1.03277194, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.6028440664985804, + "language_loss": 0.67325199, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69502914, + "num_input_tokens_seen": 82432070, + "step": 3831, + "time_per_iteration": 2.707850456237793 + }, + { + "auxiliary_loss_clip": 0.01109291, + "auxiliary_loss_mlp": 0.0105248, + "balance_loss_clip": 1.0569694, + "balance_loss_mlp": 1.03405035, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 3.0344446314632174, + "language_loss": 0.76166391, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.78328156, + "num_input_tokens_seen": 82450625, + "step": 3832, + "time_per_iteration": 2.5985777378082275 + }, + { + "auxiliary_loss_clip": 0.01099045, + "auxiliary_loss_mlp": 0.01047682, + "balance_loss_clip": 1.05293441, + "balance_loss_mlp": 1.02928817, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 1.829429721008068, + "language_loss": 0.87619793, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89766526, + "num_input_tokens_seen": 82468575, + "step": 3833, + "time_per_iteration": 2.5935466289520264 + }, + { + "auxiliary_loss_clip": 0.01112984, + "auxiliary_loss_mlp": 0.01048962, + "balance_loss_clip": 1.05210924, + "balance_loss_mlp": 1.03080678, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 1.780049537790239, + "language_loss": 0.74901283, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77063227, + "num_input_tokens_seen": 82488655, + "step": 3834, + "time_per_iteration": 2.610978126525879 + }, + { + "auxiliary_loss_clip": 0.01103886, + "auxiliary_loss_mlp": 0.01061426, + "balance_loss_clip": 1.05283308, + "balance_loss_mlp": 1.04125547, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.7578101009021503, + "language_loss": 0.85852575, + "learning_rate": 3.592624901801432e-06, + "loss": 0.88017881, + "num_input_tokens_seen": 82507220, + "step": 3835, + "time_per_iteration": 2.586960792541504 + }, + { + "auxiliary_loss_clip": 0.01112557, + "auxiliary_loss_mlp": 0.01055955, + "balance_loss_clip": 1.05020642, + "balance_loss_mlp": 1.03644037, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.491410226603742, + "language_loss": 0.8185432, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84022826, + "num_input_tokens_seen": 82527920, + "step": 3836, + "time_per_iteration": 2.6100287437438965 + }, + { + "auxiliary_loss_clip": 0.01136285, + "auxiliary_loss_mlp": 0.01052514, + "balance_loss_clip": 1.05724311, + "balance_loss_mlp": 1.03534746, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.605544813887528, + "language_loss": 0.79533976, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81722772, + "num_input_tokens_seen": 82549040, + "step": 3837, + "time_per_iteration": 2.5394492149353027 + }, + { + "auxiliary_loss_clip": 0.01046184, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.04074001, + "balance_loss_mlp": 1.02447975, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9346560746316889, + "language_loss": 0.65387332, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67460603, + "num_input_tokens_seen": 82604070, + "step": 3838, + "time_per_iteration": 3.08566951751709 + }, + { + "auxiliary_loss_clip": 0.01132201, + "auxiliary_loss_mlp": 0.01047361, + "balance_loss_clip": 1.05306172, + "balance_loss_mlp": 1.03054023, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 1.9156261730714912, + "language_loss": 0.75731093, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77910656, + "num_input_tokens_seen": 82619665, + "step": 3839, + "time_per_iteration": 2.495717763900757 + }, + { + "auxiliary_loss_clip": 0.01120692, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_clip": 1.05538511, + "balance_loss_mlp": 1.02440643, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 2.035411630268444, + "language_loss": 0.6884886, + "learning_rate": 3.591446248441752e-06, + "loss": 0.71011364, + "num_input_tokens_seen": 82637530, + "step": 3840, + "time_per_iteration": 2.5268445014953613 + }, + { + "auxiliary_loss_clip": 0.01151082, + "auxiliary_loss_mlp": 0.0104394, + "balance_loss_clip": 1.05637467, + "balance_loss_mlp": 1.02520072, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 2.025841668078174, + "language_loss": 0.7971912, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81914151, + "num_input_tokens_seen": 82656130, + "step": 3841, + "time_per_iteration": 2.4816672801971436 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.05647695, + "balance_loss_mlp": 1.02391839, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 1.8662482736974775, + "language_loss": 0.83445776, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85624921, + "num_input_tokens_seen": 82675295, + "step": 3842, + "time_per_iteration": 2.5472657680511475 + }, + { + "auxiliary_loss_clip": 0.01137663, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_clip": 1.05154705, + "balance_loss_mlp": 1.02445161, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.4360807069558068, + "language_loss": 0.6650964, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68690115, + "num_input_tokens_seen": 82703260, + "step": 3843, + "time_per_iteration": 2.7052645683288574 + }, + { + "auxiliary_loss_clip": 0.01133906, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.05309486, + "balance_loss_mlp": 1.02802229, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.615161002875236, + "language_loss": 0.77084875, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79264402, + "num_input_tokens_seen": 82725060, + "step": 3844, + "time_per_iteration": 2.606454610824585 + }, + { + "auxiliary_loss_clip": 0.01134577, + "auxiliary_loss_mlp": 0.0104704, + "balance_loss_clip": 1.05273199, + "balance_loss_mlp": 1.02871752, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.5631763753814163, + "language_loss": 0.78123415, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80305028, + "num_input_tokens_seen": 82742960, + "step": 3845, + "time_per_iteration": 2.5240631103515625 + }, + { + "auxiliary_loss_clip": 0.01107608, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.04875386, + "balance_loss_mlp": 1.01993954, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.0441653389253376, + "language_loss": 0.75972998, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78116357, + "num_input_tokens_seen": 82760205, + "step": 3846, + "time_per_iteration": 2.5820088386535645 + }, + { + "auxiliary_loss_clip": 0.01126932, + "auxiliary_loss_mlp": 0.01044352, + "balance_loss_clip": 1.05349731, + "balance_loss_mlp": 1.02772248, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 2.4425711810497073, + "language_loss": 0.70110404, + "learning_rate": 3.589793599381304e-06, + "loss": 0.72281688, + "num_input_tokens_seen": 82778590, + "step": 3847, + "time_per_iteration": 2.5345842838287354 + }, + { + "auxiliary_loss_clip": 0.01060946, + "auxiliary_loss_mlp": 0.01006553, + "balance_loss_clip": 1.0418359, + "balance_loss_mlp": 1.00420427, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7940712764160209, + "language_loss": 0.61086202, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63153702, + "num_input_tokens_seen": 82833925, + "step": 3848, + "time_per_iteration": 3.030233144760132 + }, + { + "auxiliary_loss_clip": 0.01137148, + "auxiliary_loss_mlp": 0.01048688, + "balance_loss_clip": 1.05557525, + "balance_loss_mlp": 1.03066349, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 2.094396965048532, + "language_loss": 0.78266042, + "learning_rate": 3.589320871234923e-06, + "loss": 0.80451882, + "num_input_tokens_seen": 82850625, + "step": 3849, + "time_per_iteration": 2.4994523525238037 + }, + { + "auxiliary_loss_clip": 0.01133121, + "auxiliary_loss_mlp": 0.01042574, + "balance_loss_clip": 1.05249631, + "balance_loss_mlp": 1.0250268, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 1.8935979820710251, + "language_loss": 0.71224451, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73400152, + "num_input_tokens_seen": 82872105, + "step": 3850, + "time_per_iteration": 2.652895927429199 + }, + { + "auxiliary_loss_clip": 0.01121514, + "auxiliary_loss_mlp": 0.00810802, + "balance_loss_clip": 1.05772889, + "balance_loss_mlp": 1.0419687, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 2.011395541614, + "language_loss": 0.76376027, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78308338, + "num_input_tokens_seen": 82890595, + "step": 3851, + "time_per_iteration": 2.545243740081787 + }, + { + "auxiliary_loss_clip": 0.01146663, + "auxiliary_loss_mlp": 0.01042639, + "balance_loss_clip": 1.05388856, + "balance_loss_mlp": 1.02479386, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.8115928421958385, + "language_loss": 0.69626987, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71816289, + "num_input_tokens_seen": 82908910, + "step": 3852, + "time_per_iteration": 2.5057969093322754 + }, + { + "auxiliary_loss_clip": 0.01113036, + "auxiliary_loss_mlp": 0.01045179, + "balance_loss_clip": 1.06081414, + "balance_loss_mlp": 1.02649903, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 2.650874525338518, + "language_loss": 0.67401052, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69559258, + "num_input_tokens_seen": 82925405, + "step": 3853, + "time_per_iteration": 2.522764205932617 + }, + { + "auxiliary_loss_clip": 0.01138854, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.05338836, + "balance_loss_mlp": 1.01857686, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 2.4119632688366015, + "language_loss": 0.80101323, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82276726, + "num_input_tokens_seen": 82945615, + "step": 3854, + "time_per_iteration": 2.5823771953582764 + }, + { + "auxiliary_loss_clip": 0.01115098, + "auxiliary_loss_mlp": 0.01051409, + "balance_loss_clip": 1.05072403, + "balance_loss_mlp": 1.0300827, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 3.252832112311624, + "language_loss": 0.65646672, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67813182, + "num_input_tokens_seen": 82967570, + "step": 3855, + "time_per_iteration": 4.036926984786987 + }, + { + "auxiliary_loss_clip": 0.0114918, + "auxiliary_loss_mlp": 0.01043817, + "balance_loss_clip": 1.05324948, + "balance_loss_mlp": 1.02685332, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 2.0111640426151682, + "language_loss": 0.71261704, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.73454702, + "num_input_tokens_seen": 82987435, + "step": 3856, + "time_per_iteration": 2.5408313274383545 + }, + { + "auxiliary_loss_clip": 0.011005, + "auxiliary_loss_mlp": 0.01039564, + "balance_loss_clip": 1.05382097, + "balance_loss_mlp": 1.02360165, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 1.5260220099513135, + "language_loss": 0.77132583, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79272652, + "num_input_tokens_seen": 83010505, + "step": 3857, + "time_per_iteration": 4.0918004512786865 + }, + { + "auxiliary_loss_clip": 0.01129478, + "auxiliary_loss_mlp": 0.00799768, + "balance_loss_clip": 1.04996872, + "balance_loss_mlp": 1.02118623, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 2.4106730137185237, + "language_loss": 0.9106437, + "learning_rate": 3.587190612385584e-06, + "loss": 0.92993617, + "num_input_tokens_seen": 83026705, + "step": 3858, + "time_per_iteration": 2.5487334728240967 + }, + { + "auxiliary_loss_clip": 0.01090704, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.05095077, + "balance_loss_mlp": 1.02216983, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 1.7309755194288259, + "language_loss": 0.76138818, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78268087, + "num_input_tokens_seen": 83046500, + "step": 3859, + "time_per_iteration": 2.6166861057281494 + }, + { + "auxiliary_loss_clip": 0.0113217, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.04978406, + "balance_loss_mlp": 1.01588821, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.656541366781877, + "language_loss": 0.83928525, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86092997, + "num_input_tokens_seen": 83065280, + "step": 3860, + "time_per_iteration": 2.539497137069702 + }, + { + "auxiliary_loss_clip": 0.01091533, + "auxiliary_loss_mlp": 0.01040985, + "balance_loss_clip": 1.05061531, + "balance_loss_mlp": 1.02250767, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 1.822780011957372, + "language_loss": 0.83020008, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85152531, + "num_input_tokens_seen": 83082310, + "step": 3861, + "time_per_iteration": 2.5872466564178467 + }, + { + "auxiliary_loss_clip": 0.01128088, + "auxiliary_loss_mlp": 0.00796424, + "balance_loss_clip": 1.05216801, + "balance_loss_mlp": 1.01638377, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.6843751149850719, + "language_loss": 0.85838985, + "learning_rate": 3.586242265438576e-06, + "loss": 0.877635, + "num_input_tokens_seen": 83102065, + "step": 3862, + "time_per_iteration": 4.0203776359558105 + }, + { + "auxiliary_loss_clip": 0.01117841, + "auxiliary_loss_mlp": 0.01039902, + "balance_loss_clip": 1.05903065, + "balance_loss_mlp": 1.02445257, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.7204047758070597, + "language_loss": 0.74910867, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.77068615, + "num_input_tokens_seen": 83121445, + "step": 3863, + "time_per_iteration": 2.571600914001465 + }, + { + "auxiliary_loss_clip": 0.01107992, + "auxiliary_loss_mlp": 0.01045304, + "balance_loss_clip": 1.05544806, + "balance_loss_mlp": 1.02888846, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 1.8189940122783201, + "language_loss": 0.74842155, + "learning_rate": 3.58576773102631e-06, + "loss": 0.76995444, + "num_input_tokens_seen": 83138175, + "step": 3864, + "time_per_iteration": 2.5869011878967285 + }, + { + "auxiliary_loss_clip": 0.01144826, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.05242896, + "balance_loss_mlp": 1.02127969, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 2.058784779144382, + "language_loss": 0.70547283, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72730464, + "num_input_tokens_seen": 83161975, + "step": 3865, + "time_per_iteration": 2.5978221893310547 + }, + { + "auxiliary_loss_clip": 0.01155092, + "auxiliary_loss_mlp": 0.01048915, + "balance_loss_clip": 1.05587006, + "balance_loss_mlp": 1.0303539, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 1.8017779123713527, + "language_loss": 0.94576895, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.96780908, + "num_input_tokens_seen": 83180905, + "step": 3866, + "time_per_iteration": 2.5655124187469482 + }, + { + "auxiliary_loss_clip": 0.01128886, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.04990268, + "balance_loss_mlp": 1.0221982, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 2.493588391839717, + "language_loss": 0.73201525, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.7536962, + "num_input_tokens_seen": 83196390, + "step": 3867, + "time_per_iteration": 2.482191801071167 + }, + { + "auxiliary_loss_clip": 0.01127142, + "auxiliary_loss_mlp": 0.01041987, + "balance_loss_clip": 1.05097556, + "balance_loss_mlp": 1.02509546, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.765781478004179, + "language_loss": 0.82719493, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84888625, + "num_input_tokens_seen": 83216165, + "step": 3868, + "time_per_iteration": 2.593801736831665 + }, + { + "auxiliary_loss_clip": 0.01124397, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.0515275, + "balance_loss_mlp": 1.02725494, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.9521774646481656, + "language_loss": 0.73168361, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75337046, + "num_input_tokens_seen": 83233845, + "step": 3869, + "time_per_iteration": 2.4829907417297363 + }, + { + "auxiliary_loss_clip": 0.01132829, + "auxiliary_loss_mlp": 0.0104785, + "balance_loss_clip": 1.05462408, + "balance_loss_mlp": 1.03108931, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 15.293602618480255, + "language_loss": 0.79206967, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81387651, + "num_input_tokens_seen": 83254930, + "step": 3870, + "time_per_iteration": 2.619703531265259 + }, + { + "auxiliary_loss_clip": 0.01147897, + "auxiliary_loss_mlp": 0.01043658, + "balance_loss_clip": 1.05296874, + "balance_loss_mlp": 1.02651596, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 1.8384305671757024, + "language_loss": 0.70410174, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72601736, + "num_input_tokens_seen": 83272095, + "step": 3871, + "time_per_iteration": 2.493462085723877 + }, + { + "auxiliary_loss_clip": 0.01136869, + "auxiliary_loss_mlp": 0.01052224, + "balance_loss_clip": 1.05817842, + "balance_loss_mlp": 1.03324604, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 6.688714128283287, + "language_loss": 0.69049025, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71238124, + "num_input_tokens_seen": 83290980, + "step": 3872, + "time_per_iteration": 2.577817678451538 + }, + { + "auxiliary_loss_clip": 0.01142374, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.05421102, + "balance_loss_mlp": 1.0249989, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.525020017001597, + "language_loss": 0.7780028, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.79985988, + "num_input_tokens_seen": 83315175, + "step": 3873, + "time_per_iteration": 2.689356565475464 + }, + { + "auxiliary_loss_clip": 0.01051988, + "auxiliary_loss_mlp": 0.01010075, + "balance_loss_clip": 1.03918886, + "balance_loss_mlp": 1.00741696, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.8490383026407595, + "language_loss": 0.605214, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.62583464, + "num_input_tokens_seen": 83372060, + "step": 3874, + "time_per_iteration": 3.062441110610962 + }, + { + "auxiliary_loss_clip": 0.01124961, + "auxiliary_loss_mlp": 0.01042183, + "balance_loss_clip": 1.05330908, + "balance_loss_mlp": 1.02436161, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 3.2240305411525663, + "language_loss": 0.80500668, + "learning_rate": 3.583153494218927e-06, + "loss": 0.82667816, + "num_input_tokens_seen": 83389795, + "step": 3875, + "time_per_iteration": 2.571101665496826 + }, + { + "auxiliary_loss_clip": 0.01147783, + "auxiliary_loss_mlp": 0.00798837, + "balance_loss_clip": 1.05613542, + "balance_loss_mlp": 1.02486992, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.64310276657619, + "language_loss": 0.61345834, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.6329245, + "num_input_tokens_seen": 83410005, + "step": 3876, + "time_per_iteration": 2.5708000659942627 + }, + { + "auxiliary_loss_clip": 0.01118109, + "auxiliary_loss_mlp": 0.01042884, + "balance_loss_clip": 1.05915928, + "balance_loss_mlp": 1.02537227, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.931757293229752, + "language_loss": 0.71062177, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.73223174, + "num_input_tokens_seen": 83430250, + "step": 3877, + "time_per_iteration": 2.6000571250915527 + }, + { + "auxiliary_loss_clip": 0.011399, + "auxiliary_loss_mlp": 0.01051045, + "balance_loss_clip": 1.05654049, + "balance_loss_mlp": 1.03340197, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.2813554562185407, + "language_loss": 0.81070411, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83261347, + "num_input_tokens_seen": 83447950, + "step": 3878, + "time_per_iteration": 2.506186008453369 + }, + { + "auxiliary_loss_clip": 0.01084864, + "auxiliary_loss_mlp": 0.01045473, + "balance_loss_clip": 1.04644668, + "balance_loss_mlp": 1.02608931, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 1.7316012393998932, + "language_loss": 0.74919564, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.77049899, + "num_input_tokens_seen": 83467785, + "step": 3879, + "time_per_iteration": 2.7952187061309814 + }, + { + "auxiliary_loss_clip": 0.01095582, + "auxiliary_loss_mlp": 0.01039258, + "balance_loss_clip": 1.05416429, + "balance_loss_mlp": 1.02093565, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.321457071640085, + "language_loss": 0.8972072, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.91855556, + "num_input_tokens_seen": 83485390, + "step": 3880, + "time_per_iteration": 2.6010191440582275 + }, + { + "auxiliary_loss_clip": 0.01127161, + "auxiliary_loss_mlp": 0.01044466, + "balance_loss_clip": 1.05499732, + "balance_loss_mlp": 1.02744257, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.6927385304705507, + "language_loss": 0.72089016, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74260652, + "num_input_tokens_seen": 83504890, + "step": 3881, + "time_per_iteration": 2.575547695159912 + }, + { + "auxiliary_loss_clip": 0.01146485, + "auxiliary_loss_mlp": 0.01044773, + "balance_loss_clip": 1.05325842, + "balance_loss_mlp": 1.02764237, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.6269802434737304, + "language_loss": 0.67780507, + "learning_rate": 3.581486106120537e-06, + "loss": 0.69971764, + "num_input_tokens_seen": 83526475, + "step": 3882, + "time_per_iteration": 2.5681276321411133 + }, + { + "auxiliary_loss_clip": 0.01106051, + "auxiliary_loss_mlp": 0.01051955, + "balance_loss_clip": 1.0474664, + "balance_loss_mlp": 1.03308427, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 1.9675810578556254, + "language_loss": 0.76881325, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.79039335, + "num_input_tokens_seen": 83546620, + "step": 3883, + "time_per_iteration": 2.661811590194702 + }, + { + "auxiliary_loss_clip": 0.0104927, + "auxiliary_loss_mlp": 0.01005505, + "balance_loss_clip": 1.03467298, + "balance_loss_mlp": 1.00277507, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7777488366157249, + "language_loss": 0.59151244, + "learning_rate": 3.58100916965445e-06, + "loss": 0.61206019, + "num_input_tokens_seen": 83616160, + "step": 3884, + "time_per_iteration": 3.3538589477539062 + }, + { + "auxiliary_loss_clip": 0.01120725, + "auxiliary_loss_mlp": 0.01036242, + "balance_loss_clip": 1.05468392, + "balance_loss_mlp": 1.0197196, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 2.6143479579132314, + "language_loss": 0.80295408, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82452375, + "num_input_tokens_seen": 83636795, + "step": 3885, + "time_per_iteration": 2.618666172027588 + }, + { + "auxiliary_loss_clip": 0.01131021, + "auxiliary_loss_mlp": 0.01038131, + "balance_loss_clip": 1.05429745, + "balance_loss_mlp": 1.02084613, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.346821077523392, + "language_loss": 0.88331831, + "learning_rate": 3.580531993380261e-06, + "loss": 0.90500987, + "num_input_tokens_seen": 83654050, + "step": 3886, + "time_per_iteration": 2.5157597064971924 + }, + { + "auxiliary_loss_clip": 0.01149729, + "auxiliary_loss_mlp": 0.01041353, + "balance_loss_clip": 1.0557071, + "balance_loss_mlp": 1.02419889, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 1.8732007155560442, + "language_loss": 0.73695683, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75886762, + "num_input_tokens_seen": 83673720, + "step": 3887, + "time_per_iteration": 2.577000379562378 + }, + { + "auxiliary_loss_clip": 0.01134754, + "auxiliary_loss_mlp": 0.01039545, + "balance_loss_clip": 1.05111885, + "balance_loss_mlp": 1.02322495, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 1.8635191725501978, + "language_loss": 0.84497476, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.8667177, + "num_input_tokens_seen": 83693470, + "step": 3888, + "time_per_iteration": 2.6348724365234375 + }, + { + "auxiliary_loss_clip": 0.01119564, + "auxiliary_loss_mlp": 0.01058134, + "balance_loss_clip": 1.05093253, + "balance_loss_mlp": 1.0385952, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 2.159306801968494, + "language_loss": 0.87320733, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89498425, + "num_input_tokens_seen": 83711620, + "step": 3889, + "time_per_iteration": 2.526057004928589 + }, + { + "auxiliary_loss_clip": 0.01137155, + "auxiliary_loss_mlp": 0.01040449, + "balance_loss_clip": 1.05318761, + "balance_loss_mlp": 1.02359271, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 4.653730390801575, + "language_loss": 0.7707029, + "learning_rate": 3.579576921697125e-06, + "loss": 0.79247892, + "num_input_tokens_seen": 83727890, + "step": 3890, + "time_per_iteration": 2.539618492126465 + }, + { + "auxiliary_loss_clip": 0.01108339, + "auxiliary_loss_mlp": 0.00795451, + "balance_loss_clip": 1.0527426, + "balance_loss_mlp": 1.01678765, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 1.6717155662623473, + "language_loss": 0.73291975, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75195765, + "num_input_tokens_seen": 83749370, + "step": 3891, + "time_per_iteration": 2.8195302486419678 + }, + { + "auxiliary_loss_clip": 0.01143654, + "auxiliary_loss_mlp": 0.01039023, + "balance_loss_clip": 1.05354166, + "balance_loss_mlp": 1.02258372, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.5871962329691258, + "language_loss": 0.82981378, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.85164046, + "num_input_tokens_seen": 83769560, + "step": 3892, + "time_per_iteration": 2.5688040256500244 + }, + { + "auxiliary_loss_clip": 0.01105339, + "auxiliary_loss_mlp": 0.0104989, + "balance_loss_clip": 1.04963768, + "balance_loss_mlp": 1.02951741, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 1.453913861858147, + "language_loss": 0.65050375, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67205608, + "num_input_tokens_seen": 83795635, + "step": 3893, + "time_per_iteration": 2.7584879398345947 + }, + { + "auxiliary_loss_clip": 0.01113923, + "auxiliary_loss_mlp": 0.0104016, + "balance_loss_clip": 1.06200314, + "balance_loss_mlp": 1.02176595, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 2.868089135926816, + "language_loss": 0.79227817, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81381899, + "num_input_tokens_seen": 83814090, + "step": 3894, + "time_per_iteration": 4.0035107135772705 + }, + { + "auxiliary_loss_clip": 0.0113356, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.05239773, + "balance_loss_mlp": 1.02382612, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 2.1046401457706976, + "language_loss": 0.82053024, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.84227312, + "num_input_tokens_seen": 83836870, + "step": 3895, + "time_per_iteration": 2.6201205253601074 + }, + { + "auxiliary_loss_clip": 0.01139076, + "auxiliary_loss_mlp": 0.01046976, + "balance_loss_clip": 1.05710959, + "balance_loss_mlp": 1.02923775, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 2.0008996418892937, + "language_loss": 0.80568826, + "learning_rate": 3.578142517422292e-06, + "loss": 0.82754874, + "num_input_tokens_seen": 83853275, + "step": 3896, + "time_per_iteration": 3.876016616821289 + }, + { + "auxiliary_loss_clip": 0.01127004, + "auxiliary_loss_mlp": 0.01043088, + "balance_loss_clip": 1.05199289, + "balance_loss_mlp": 1.02403855, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.5764381503053442, + "language_loss": 0.82902706, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85072803, + "num_input_tokens_seen": 83872340, + "step": 3897, + "time_per_iteration": 2.566791296005249 + }, + { + "auxiliary_loss_clip": 0.01133539, + "auxiliary_loss_mlp": 0.01042952, + "balance_loss_clip": 1.05464339, + "balance_loss_mlp": 1.02530861, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 1.5646035849299171, + "language_loss": 0.79256725, + "learning_rate": 3.577663903820705e-06, + "loss": 0.81433213, + "num_input_tokens_seen": 83888795, + "step": 3898, + "time_per_iteration": 2.478588581085205 + }, + { + "auxiliary_loss_clip": 0.01112753, + "auxiliary_loss_mlp": 0.01043387, + "balance_loss_clip": 1.05270946, + "balance_loss_mlp": 1.02628088, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 2.064936566284087, + "language_loss": 0.7366392, + "learning_rate": 3.577424507277614e-06, + "loss": 0.75820053, + "num_input_tokens_seen": 83906820, + "step": 3899, + "time_per_iteration": 2.608407497406006 + }, + { + "auxiliary_loss_clip": 0.01111163, + "auxiliary_loss_mlp": 0.01047782, + "balance_loss_clip": 1.05194843, + "balance_loss_mlp": 1.03019893, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.6676316870048196, + "language_loss": 0.7541697, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77575922, + "num_input_tokens_seen": 83926370, + "step": 3900, + "time_per_iteration": 2.567404270172119 + }, + { + "auxiliary_loss_clip": 0.01107523, + "auxiliary_loss_mlp": 0.01047892, + "balance_loss_clip": 1.0601145, + "balance_loss_mlp": 1.03026068, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 2.012355218575242, + "language_loss": 0.6691407, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69069481, + "num_input_tokens_seen": 83944600, + "step": 3901, + "time_per_iteration": 5.293537378311157 + }, + { + "auxiliary_loss_clip": 0.01026844, + "auxiliary_loss_mlp": 0.00999459, + "balance_loss_clip": 1.03124702, + "balance_loss_mlp": 0.99714655, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7527031580649051, + "language_loss": 0.58220869, + "learning_rate": 3.576705958788091e-06, + "loss": 0.60247171, + "num_input_tokens_seen": 84005100, + "step": 3902, + "time_per_iteration": 3.174189567565918 + }, + { + "auxiliary_loss_clip": 0.01130002, + "auxiliary_loss_mlp": 0.01050458, + "balance_loss_clip": 1.05637133, + "balance_loss_mlp": 1.03175402, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 2.3244849516837096, + "language_loss": 0.80499339, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82679808, + "num_input_tokens_seen": 84023775, + "step": 3903, + "time_per_iteration": 2.5738298892974854 + }, + { + "auxiliary_loss_clip": 0.01099335, + "auxiliary_loss_mlp": 0.01043526, + "balance_loss_clip": 1.05619013, + "balance_loss_mlp": 1.02515554, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 1.7923512067004113, + "language_loss": 0.8232035, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84463209, + "num_input_tokens_seen": 84042605, + "step": 3904, + "time_per_iteration": 2.635223388671875 + }, + { + "auxiliary_loss_clip": 0.01148598, + "auxiliary_loss_mlp": 0.01047905, + "balance_loss_clip": 1.05604887, + "balance_loss_mlp": 1.03087008, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 1.8997540249887228, + "language_loss": 0.71538502, + "learning_rate": 3.57598687219895e-06, + "loss": 0.73735005, + "num_input_tokens_seen": 84061520, + "step": 3905, + "time_per_iteration": 2.5212044715881348 + }, + { + "auxiliary_loss_clip": 0.01147522, + "auxiliary_loss_mlp": 0.01036524, + "balance_loss_clip": 1.05702829, + "balance_loss_mlp": 1.01994205, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 1.761662216246446, + "language_loss": 0.71436769, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73620814, + "num_input_tokens_seen": 84081800, + "step": 3906, + "time_per_iteration": 2.5279879570007324 + }, + { + "auxiliary_loss_clip": 0.01141072, + "auxiliary_loss_mlp": 0.01041947, + "balance_loss_clip": 1.05402184, + "balance_loss_mlp": 1.02281404, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 1.9646820758690333, + "language_loss": 0.72994077, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75177097, + "num_input_tokens_seen": 84102340, + "step": 3907, + "time_per_iteration": 2.6095364093780518 + }, + { + "auxiliary_loss_clip": 0.01138728, + "auxiliary_loss_mlp": 0.01054018, + "balance_loss_clip": 1.05558634, + "balance_loss_mlp": 1.03568316, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.7842776017656081, + "language_loss": 0.72878927, + "learning_rate": 3.575267247755601e-06, + "loss": 0.75071669, + "num_input_tokens_seen": 84120370, + "step": 3908, + "time_per_iteration": 2.511951446533203 + }, + { + "auxiliary_loss_clip": 0.01049448, + "auxiliary_loss_mlp": 0.01012821, + "balance_loss_clip": 1.03424811, + "balance_loss_mlp": 1.01041305, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0197950639386366, + "language_loss": 0.73358488, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75420761, + "num_input_tokens_seen": 84165515, + "step": 3909, + "time_per_iteration": 2.897395372390747 + }, + { + "auxiliary_loss_clip": 0.01137857, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_clip": 1.05374527, + "balance_loss_mlp": 1.02565455, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.7041253606887201, + "language_loss": 0.88087815, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.90268385, + "num_input_tokens_seen": 84184540, + "step": 3910, + "time_per_iteration": 2.5438942909240723 + }, + { + "auxiliary_loss_clip": 0.01139193, + "auxiliary_loss_mlp": 0.01040831, + "balance_loss_clip": 1.0567801, + "balance_loss_mlp": 1.02406979, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.0276179915671215, + "language_loss": 0.76595032, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.78775054, + "num_input_tokens_seen": 84202025, + "step": 3911, + "time_per_iteration": 2.5497519969940186 + }, + { + "auxiliary_loss_clip": 0.01132305, + "auxiliary_loss_mlp": 0.01041499, + "balance_loss_clip": 1.05387557, + "balance_loss_mlp": 1.02560854, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.5003743110299204, + "language_loss": 0.81658304, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83832109, + "num_input_tokens_seen": 84221895, + "step": 3912, + "time_per_iteration": 2.529372453689575 + }, + { + "auxiliary_loss_clip": 0.01127451, + "auxiliary_loss_mlp": 0.01048356, + "balance_loss_clip": 1.05657291, + "balance_loss_mlp": 1.03071332, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 1.7967396908625806, + "language_loss": 0.71360731, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73536545, + "num_input_tokens_seen": 84240455, + "step": 3913, + "time_per_iteration": 2.5951547622680664 + }, + { + "auxiliary_loss_clip": 0.01141361, + "auxiliary_loss_mlp": 0.00793969, + "balance_loss_clip": 1.05409336, + "balance_loss_mlp": 1.01293683, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.7296641080301844, + "language_loss": 0.76219821, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78155148, + "num_input_tokens_seen": 84261605, + "step": 3914, + "time_per_iteration": 2.5661492347717285 + }, + { + "auxiliary_loss_clip": 0.01085353, + "auxiliary_loss_mlp": 0.01043631, + "balance_loss_clip": 1.05228651, + "balance_loss_mlp": 1.02567792, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.8790178696119533, + "language_loss": 0.89877123, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.92006105, + "num_input_tokens_seen": 84278675, + "step": 3915, + "time_per_iteration": 2.62831449508667 + }, + { + "auxiliary_loss_clip": 0.01042616, + "auxiliary_loss_mlp": 0.01000677, + "balance_loss_clip": 1.02608991, + "balance_loss_mlp": 0.9979946, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8074520255887843, + "language_loss": 0.59380126, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61423421, + "num_input_tokens_seen": 84329765, + "step": 3916, + "time_per_iteration": 3.0832412242889404 + }, + { + "auxiliary_loss_clip": 0.01015193, + "auxiliary_loss_mlp": 0.01002176, + "balance_loss_clip": 1.03043211, + "balance_loss_mlp": 0.99936277, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7702746569586227, + "language_loss": 0.49471438, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51488805, + "num_input_tokens_seen": 84393680, + "step": 3917, + "time_per_iteration": 3.2329819202423096 + }, + { + "auxiliary_loss_clip": 0.01116677, + "auxiliary_loss_mlp": 0.01051243, + "balance_loss_clip": 1.05844116, + "balance_loss_mlp": 1.03436255, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 1.8752454051907863, + "language_loss": 0.76419842, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78587759, + "num_input_tokens_seen": 84412640, + "step": 3918, + "time_per_iteration": 2.5890719890594482 + }, + { + "auxiliary_loss_clip": 0.01096085, + "auxiliary_loss_mlp": 0.0103914, + "balance_loss_clip": 1.04773235, + "balance_loss_mlp": 1.02267718, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 10.554131542572643, + "language_loss": 0.69566238, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.71701461, + "num_input_tokens_seen": 84431605, + "step": 3919, + "time_per_iteration": 2.5676207542419434 + }, + { + "auxiliary_loss_clip": 0.01110378, + "auxiliary_loss_mlp": 0.01039202, + "balance_loss_clip": 1.05501628, + "balance_loss_mlp": 1.02245307, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.9173113006695695, + "language_loss": 0.70360518, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72510099, + "num_input_tokens_seen": 84454210, + "step": 3920, + "time_per_iteration": 2.710014820098877 + }, + { + "auxiliary_loss_clip": 0.01121519, + "auxiliary_loss_mlp": 0.01048638, + "balance_loss_clip": 1.05253398, + "balance_loss_mlp": 1.03183007, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.665346823779534, + "language_loss": 0.77063107, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79233265, + "num_input_tokens_seen": 84475540, + "step": 3921, + "time_per_iteration": 2.587679624557495 + }, + { + "auxiliary_loss_clip": 0.01112469, + "auxiliary_loss_mlp": 0.01045842, + "balance_loss_clip": 1.05206418, + "balance_loss_mlp": 1.02816379, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.351883693085046, + "language_loss": 0.75070214, + "learning_rate": 3.571901895946612e-06, + "loss": 0.77228528, + "num_input_tokens_seen": 84494580, + "step": 3922, + "time_per_iteration": 2.576209783554077 + }, + { + "auxiliary_loss_clip": 0.01114793, + "auxiliary_loss_mlp": 0.01040974, + "balance_loss_clip": 1.0519383, + "balance_loss_mlp": 1.02489233, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 1.9961203350925847, + "language_loss": 0.80343777, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82499546, + "num_input_tokens_seen": 84513850, + "step": 3923, + "time_per_iteration": 2.5762481689453125 + }, + { + "auxiliary_loss_clip": 0.01086276, + "auxiliary_loss_mlp": 0.01049699, + "balance_loss_clip": 1.04736614, + "balance_loss_mlp": 1.03242564, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 1.973201472019396, + "language_loss": 0.74515307, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76651281, + "num_input_tokens_seen": 84532315, + "step": 3924, + "time_per_iteration": 2.592226266860962 + }, + { + "auxiliary_loss_clip": 0.01146611, + "auxiliary_loss_mlp": 0.01045704, + "balance_loss_clip": 1.05387878, + "balance_loss_mlp": 1.02980185, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 1.7950237453265165, + "language_loss": 0.83280671, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.85472989, + "num_input_tokens_seen": 84550970, + "step": 3925, + "time_per_iteration": 2.4800949096679688 + }, + { + "auxiliary_loss_clip": 0.0112533, + "auxiliary_loss_mlp": 0.01054382, + "balance_loss_clip": 1.05168962, + "balance_loss_mlp": 1.03751397, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 2.1336688701653137, + "language_loss": 0.59609914, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61789626, + "num_input_tokens_seen": 84571655, + "step": 3926, + "time_per_iteration": 2.5812859535217285 + }, + { + "auxiliary_loss_clip": 0.01118433, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.04771316, + "balance_loss_mlp": 1.02655411, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 1.941478080921335, + "language_loss": 0.71280909, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73440969, + "num_input_tokens_seen": 84593130, + "step": 3927, + "time_per_iteration": 2.5648488998413086 + }, + { + "auxiliary_loss_clip": 0.01118156, + "auxiliary_loss_mlp": 0.01046976, + "balance_loss_clip": 1.04995942, + "balance_loss_mlp": 1.03168738, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.7763875038189096, + "language_loss": 0.75305879, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77471006, + "num_input_tokens_seen": 84612410, + "step": 3928, + "time_per_iteration": 2.594587564468384 + }, + { + "auxiliary_loss_clip": 0.01117135, + "auxiliary_loss_mlp": 0.01051785, + "balance_loss_clip": 1.05286551, + "balance_loss_mlp": 1.0336771, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.3426110973547973, + "language_loss": 0.82109511, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.84278429, + "num_input_tokens_seen": 84627610, + "step": 3929, + "time_per_iteration": 2.512575149536133 + }, + { + "auxiliary_loss_clip": 0.01151447, + "auxiliary_loss_mlp": 0.01052083, + "balance_loss_clip": 1.05433226, + "balance_loss_mlp": 1.03330731, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 1.8325965753586955, + "language_loss": 0.71781456, + "learning_rate": 3.569973590777789e-06, + "loss": 0.73984987, + "num_input_tokens_seen": 84648415, + "step": 3930, + "time_per_iteration": 2.5138580799102783 + }, + { + "auxiliary_loss_clip": 0.01143317, + "auxiliary_loss_mlp": 0.01040959, + "balance_loss_clip": 1.05108714, + "balance_loss_mlp": 1.02410316, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 1.8178673741239038, + "language_loss": 0.7397995, + "learning_rate": 3.569732284634665e-06, + "loss": 0.76164222, + "num_input_tokens_seen": 84670080, + "step": 3931, + "time_per_iteration": 2.6234030723571777 + }, + { + "auxiliary_loss_clip": 0.01133728, + "auxiliary_loss_mlp": 0.01043585, + "balance_loss_clip": 1.05262184, + "balance_loss_mlp": 1.02609658, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 2.040284562163236, + "language_loss": 0.80665421, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82842731, + "num_input_tokens_seen": 84686465, + "step": 3932, + "time_per_iteration": 2.549920082092285 + }, + { + "auxiliary_loss_clip": 0.01108696, + "auxiliary_loss_mlp": 0.01041547, + "balance_loss_clip": 1.05572343, + "balance_loss_mlp": 1.02644277, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.5914436875852116, + "language_loss": 0.85524285, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87674534, + "num_input_tokens_seen": 84708825, + "step": 3933, + "time_per_iteration": 4.058252573013306 + }, + { + "auxiliary_loss_clip": 0.01108514, + "auxiliary_loss_mlp": 0.01045578, + "balance_loss_clip": 1.05562901, + "balance_loss_mlp": 1.02611136, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 2.0669364058690505, + "language_loss": 0.83576977, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.85731071, + "num_input_tokens_seen": 84726165, + "step": 3934, + "time_per_iteration": 2.5937938690185547 + }, + { + "auxiliary_loss_clip": 0.01146281, + "auxiliary_loss_mlp": 0.01045086, + "balance_loss_clip": 1.05272651, + "balance_loss_mlp": 1.02794361, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.6780617916493208, + "language_loss": 0.78549421, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.80740786, + "num_input_tokens_seen": 84745815, + "step": 3935, + "time_per_iteration": 3.953526735305786 + }, + { + "auxiliary_loss_clip": 0.01134349, + "auxiliary_loss_mlp": 0.01039475, + "balance_loss_clip": 1.05590248, + "balance_loss_mlp": 1.02364421, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.7477941677946933, + "language_loss": 0.79014444, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81188273, + "num_input_tokens_seen": 84765415, + "step": 3936, + "time_per_iteration": 2.5164499282836914 + }, + { + "auxiliary_loss_clip": 0.01124909, + "auxiliary_loss_mlp": 0.0103769, + "balance_loss_clip": 1.05274773, + "balance_loss_mlp": 1.02038085, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.567388653470774, + "language_loss": 0.79189163, + "learning_rate": 3.568283198083826e-06, + "loss": 0.81351763, + "num_input_tokens_seen": 84787080, + "step": 3937, + "time_per_iteration": 2.5767908096313477 + }, + { + "auxiliary_loss_clip": 0.01135977, + "auxiliary_loss_mlp": 0.01039394, + "balance_loss_clip": 1.05879116, + "balance_loss_mlp": 1.02423072, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 2.3619585504561544, + "language_loss": 0.85389519, + "learning_rate": 3.568041475462147e-06, + "loss": 0.87564886, + "num_input_tokens_seen": 84805395, + "step": 3938, + "time_per_iteration": 2.5066115856170654 + }, + { + "auxiliary_loss_clip": 0.01142166, + "auxiliary_loss_mlp": 0.01045209, + "balance_loss_clip": 1.05242527, + "balance_loss_mlp": 1.02867436, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.81832367575203, + "language_loss": 0.93989241, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.96176612, + "num_input_tokens_seen": 84818090, + "step": 3939, + "time_per_iteration": 3.929412603378296 + }, + { + "auxiliary_loss_clip": 0.0114944, + "auxiliary_loss_mlp": 0.01046233, + "balance_loss_clip": 1.05516481, + "balance_loss_mlp": 1.02879238, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.6538038438310103, + "language_loss": 0.82099277, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84294945, + "num_input_tokens_seen": 84837695, + "step": 3940, + "time_per_iteration": 3.9966132640838623 + }, + { + "auxiliary_loss_clip": 0.01132047, + "auxiliary_loss_mlp": 0.00798548, + "balance_loss_clip": 1.05700278, + "balance_loss_mlp": 1.02392912, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.2433879758523916, + "language_loss": 0.89842021, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91772616, + "num_input_tokens_seen": 84854630, + "step": 3941, + "time_per_iteration": 2.546996593475342 + }, + { + "auxiliary_loss_clip": 0.01146248, + "auxiliary_loss_mlp": 0.01046029, + "balance_loss_clip": 1.05281997, + "balance_loss_mlp": 1.02810001, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.0250797705190244, + "language_loss": 0.84929091, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.87121361, + "num_input_tokens_seen": 84871805, + "step": 3942, + "time_per_iteration": 2.458131790161133 + }, + { + "auxiliary_loss_clip": 0.01110135, + "auxiliary_loss_mlp": 0.01046236, + "balance_loss_clip": 1.05741894, + "balance_loss_mlp": 1.02792573, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.8123145761167945, + "language_loss": 0.81469262, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83625627, + "num_input_tokens_seen": 84889815, + "step": 3943, + "time_per_iteration": 2.6367430686950684 + }, + { + "auxiliary_loss_clip": 0.01114141, + "auxiliary_loss_mlp": 0.01041359, + "balance_loss_clip": 1.05388308, + "balance_loss_mlp": 1.02393079, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 8.398771358507812, + "language_loss": 0.67402655, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69558156, + "num_input_tokens_seen": 84904380, + "step": 3944, + "time_per_iteration": 2.5157546997070312 + }, + { + "auxiliary_loss_clip": 0.01123193, + "auxiliary_loss_mlp": 0.01044875, + "balance_loss_clip": 1.05832136, + "balance_loss_mlp": 1.02681518, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 1.655459837638198, + "language_loss": 0.75408065, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77576137, + "num_input_tokens_seen": 84922935, + "step": 3945, + "time_per_iteration": 2.5526230335235596 + }, + { + "auxiliary_loss_clip": 0.01129758, + "auxiliary_loss_mlp": 0.01042504, + "balance_loss_clip": 1.05482817, + "balance_loss_mlp": 1.02566004, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.4868559536124195, + "language_loss": 0.63482881, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.65655136, + "num_input_tokens_seen": 84943685, + "step": 3946, + "time_per_iteration": 2.5533552169799805 + }, + { + "auxiliary_loss_clip": 0.01133242, + "auxiliary_loss_mlp": 0.01040841, + "balance_loss_clip": 1.05282092, + "balance_loss_mlp": 1.02303088, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 4.513801338709062, + "language_loss": 0.77118856, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79292941, + "num_input_tokens_seen": 84959505, + "step": 3947, + "time_per_iteration": 2.519551992416382 + }, + { + "auxiliary_loss_clip": 0.0114485, + "auxiliary_loss_mlp": 0.01039472, + "balance_loss_clip": 1.05992222, + "balance_loss_mlp": 1.02204359, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.573190202833209, + "language_loss": 0.80595517, + "learning_rate": 3.565620980442944e-06, + "loss": 0.82779837, + "num_input_tokens_seen": 84982130, + "step": 3948, + "time_per_iteration": 2.606762409210205 + }, + { + "auxiliary_loss_clip": 0.01129309, + "auxiliary_loss_mlp": 0.01042978, + "balance_loss_clip": 1.05615067, + "balance_loss_mlp": 1.02593112, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 1.7539736960923562, + "language_loss": 0.80401385, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82573676, + "num_input_tokens_seen": 85000640, + "step": 3949, + "time_per_iteration": 2.5637118816375732 + }, + { + "auxiliary_loss_clip": 0.01120531, + "auxiliary_loss_mlp": 0.01035791, + "balance_loss_clip": 1.05336428, + "balance_loss_mlp": 1.01897025, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.876055983493169, + "language_loss": 0.72634459, + "learning_rate": 3.565136168723163e-06, + "loss": 0.74790788, + "num_input_tokens_seen": 85018970, + "step": 3950, + "time_per_iteration": 2.5554633140563965 + }, + { + "auxiliary_loss_clip": 0.01146244, + "auxiliary_loss_mlp": 0.01041176, + "balance_loss_clip": 1.05509329, + "balance_loss_mlp": 1.02499962, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 1.8977149808335352, + "language_loss": 0.73071659, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75259078, + "num_input_tokens_seen": 85035905, + "step": 3951, + "time_per_iteration": 2.505060911178589 + }, + { + "auxiliary_loss_clip": 0.01121671, + "auxiliary_loss_mlp": 0.01035646, + "balance_loss_clip": 1.05608916, + "balance_loss_mlp": 1.01815772, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.708736064361557, + "language_loss": 0.73918974, + "learning_rate": 3.564651119602903e-06, + "loss": 0.76076287, + "num_input_tokens_seen": 85054560, + "step": 3952, + "time_per_iteration": 2.545238971710205 + }, + { + "auxiliary_loss_clip": 0.01100061, + "auxiliary_loss_mlp": 0.01046257, + "balance_loss_clip": 1.04890037, + "balance_loss_mlp": 1.02861452, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.613676097310807, + "language_loss": 0.71019685, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73166007, + "num_input_tokens_seen": 85074425, + "step": 3953, + "time_per_iteration": 2.6189072132110596 + }, + { + "auxiliary_loss_clip": 0.01153195, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.05823743, + "balance_loss_mlp": 1.02792048, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 2.04090701906158, + "language_loss": 0.81270242, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.83470124, + "num_input_tokens_seen": 85092865, + "step": 3954, + "time_per_iteration": 2.5225765705108643 + }, + { + "auxiliary_loss_clip": 0.01129264, + "auxiliary_loss_mlp": 0.01041099, + "balance_loss_clip": 1.05720687, + "balance_loss_mlp": 1.02206087, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 2.2900810058210026, + "language_loss": 0.65865612, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.68035972, + "num_input_tokens_seen": 85110175, + "step": 3955, + "time_per_iteration": 2.522770643234253 + }, + { + "auxiliary_loss_clip": 0.01149625, + "auxiliary_loss_mlp": 0.01053921, + "balance_loss_clip": 1.05615497, + "balance_loss_mlp": 1.0367074, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.4747629343234192, + "language_loss": 0.84014642, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.86218184, + "num_input_tokens_seen": 85129925, + "step": 3956, + "time_per_iteration": 2.506817102432251 + }, + { + "auxiliary_loss_clip": 0.0110464, + "auxiliary_loss_mlp": 0.01040082, + "balance_loss_clip": 1.05486178, + "balance_loss_mlp": 1.02334487, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 2.1075646781951805, + "language_loss": 0.84762716, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.8690744, + "num_input_tokens_seen": 85147755, + "step": 3957, + "time_per_iteration": 2.5956637859344482 + }, + { + "auxiliary_loss_clip": 0.01087032, + "auxiliary_loss_mlp": 0.01040214, + "balance_loss_clip": 1.05194855, + "balance_loss_mlp": 1.02419281, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.1316937281406236, + "language_loss": 0.70316124, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72443366, + "num_input_tokens_seen": 85165270, + "step": 3958, + "time_per_iteration": 2.6443376541137695 + }, + { + "auxiliary_loss_clip": 0.01102296, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_clip": 1.05144227, + "balance_loss_mlp": 1.02600503, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 2.5360525781370313, + "language_loss": 0.66396874, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68544722, + "num_input_tokens_seen": 85181555, + "step": 3959, + "time_per_iteration": 2.5484368801116943 + }, + { + "auxiliary_loss_clip": 0.01108935, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_clip": 1.05629826, + "balance_loss_mlp": 1.02838922, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 2.496213390672979, + "language_loss": 0.72332668, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.7448715, + "num_input_tokens_seen": 85199455, + "step": 3960, + "time_per_iteration": 2.6329147815704346 + }, + { + "auxiliary_loss_clip": 0.01059826, + "auxiliary_loss_mlp": 0.01040472, + "balance_loss_clip": 1.05036616, + "balance_loss_mlp": 1.0229125, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 3.4008596017174764, + "language_loss": 0.74073315, + "learning_rate": 3.562465462704307e-06, + "loss": 0.76173615, + "num_input_tokens_seen": 85219170, + "step": 3961, + "time_per_iteration": 2.9737634658813477 + }, + { + "auxiliary_loss_clip": 0.01148111, + "auxiliary_loss_mlp": 0.01051909, + "balance_loss_clip": 1.05256987, + "balance_loss_mlp": 1.03200102, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 1.8528453161028737, + "language_loss": 0.66160566, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68360579, + "num_input_tokens_seen": 85238480, + "step": 3962, + "time_per_iteration": 2.7489497661590576 + }, + { + "auxiliary_loss_clip": 0.01118965, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_clip": 1.04937363, + "balance_loss_mlp": 1.02945185, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.6462070877065913, + "language_loss": 0.74941844, + "learning_rate": 3.561979109197483e-06, + "loss": 0.77107334, + "num_input_tokens_seen": 85259180, + "step": 3963, + "time_per_iteration": 2.62774920463562 + }, + { + "auxiliary_loss_clip": 0.01117072, + "auxiliary_loss_mlp": 0.01045797, + "balance_loss_clip": 1.05496264, + "balance_loss_mlp": 1.0284996, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 1.9815962611336266, + "language_loss": 0.76946729, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79109597, + "num_input_tokens_seen": 85278550, + "step": 3964, + "time_per_iteration": 2.5967934131622314 + }, + { + "auxiliary_loss_clip": 0.01110521, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_clip": 1.05301023, + "balance_loss_mlp": 1.02573013, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 2.680608052499513, + "language_loss": 0.71604174, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73757648, + "num_input_tokens_seen": 85297345, + "step": 3965, + "time_per_iteration": 2.5965473651885986 + }, + { + "auxiliary_loss_clip": 0.01114531, + "auxiliary_loss_mlp": 0.0104336, + "balance_loss_clip": 1.05380714, + "balance_loss_mlp": 1.02611041, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.7881388585451299, + "language_loss": 0.78071404, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80229294, + "num_input_tokens_seen": 85315105, + "step": 3966, + "time_per_iteration": 2.595555305480957 + }, + { + "auxiliary_loss_clip": 0.01119392, + "auxiliary_loss_mlp": 0.01043908, + "balance_loss_clip": 1.05082631, + "balance_loss_mlp": 1.02708793, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 1.9932713058676443, + "language_loss": 0.68601161, + "learning_rate": 3.561005691492797e-06, + "loss": 0.70764464, + "num_input_tokens_seen": 85334735, + "step": 3967, + "time_per_iteration": 2.5344583988189697 + }, + { + "auxiliary_loss_clip": 0.01116554, + "auxiliary_loss_mlp": 0.01050236, + "balance_loss_clip": 1.05269158, + "balance_loss_mlp": 1.03185439, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 1.857153899585276, + "language_loss": 0.67995822, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70162618, + "num_input_tokens_seen": 85352875, + "step": 3968, + "time_per_iteration": 2.575070381164551 + }, + { + "auxiliary_loss_clip": 0.0108929, + "auxiliary_loss_mlp": 0.01042932, + "balance_loss_clip": 1.04837632, + "balance_loss_mlp": 1.02618313, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 1.7939964888525397, + "language_loss": 0.76515961, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78648186, + "num_input_tokens_seen": 85372205, + "step": 3969, + "time_per_iteration": 2.673492193222046 + }, + { + "auxiliary_loss_clip": 0.01125796, + "auxiliary_loss_mlp": 0.01040259, + "balance_loss_clip": 1.05352545, + "balance_loss_mlp": 1.0237484, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.0149077588297466, + "language_loss": 0.76286101, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78452158, + "num_input_tokens_seen": 85389705, + "step": 3970, + "time_per_iteration": 2.5753135681152344 + }, + { + "auxiliary_loss_clip": 0.01108071, + "auxiliary_loss_mlp": 0.01050822, + "balance_loss_clip": 1.05032849, + "balance_loss_mlp": 1.03272593, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 1.880230780578347, + "language_loss": 0.84621048, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.86779946, + "num_input_tokens_seen": 85407855, + "step": 3971, + "time_per_iteration": 2.583514451980591 + }, + { + "auxiliary_loss_clip": 0.0105642, + "auxiliary_loss_mlp": 0.01020033, + "balance_loss_clip": 1.03157544, + "balance_loss_mlp": 1.017959, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.7445835779215032, + "language_loss": 0.62784207, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.64860666, + "num_input_tokens_seen": 85470885, + "step": 3972, + "time_per_iteration": 4.796634197235107 + }, + { + "auxiliary_loss_clip": 0.01121312, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.05333877, + "balance_loss_mlp": 1.02275538, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 2.061079039934424, + "language_loss": 0.8213827, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.84299242, + "num_input_tokens_seen": 85488460, + "step": 3973, + "time_per_iteration": 3.9478888511657715 + }, + { + "auxiliary_loss_clip": 0.01113116, + "auxiliary_loss_mlp": 0.010504, + "balance_loss_clip": 1.05104899, + "balance_loss_mlp": 1.03207803, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.590800791719403, + "language_loss": 0.79374391, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81537902, + "num_input_tokens_seen": 85508590, + "step": 3974, + "time_per_iteration": 2.595546245574951 + }, + { + "auxiliary_loss_clip": 0.0113084, + "auxiliary_loss_mlp": 0.01046006, + "balance_loss_clip": 1.05526972, + "balance_loss_mlp": 1.02750468, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 5.030532161550108, + "language_loss": 0.84937775, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.8711462, + "num_input_tokens_seen": 85525970, + "step": 3975, + "time_per_iteration": 2.5252346992492676 + }, + { + "auxiliary_loss_clip": 0.01125148, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_clip": 1.05138302, + "balance_loss_mlp": 1.03055716, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.656566288042889, + "language_loss": 0.83713007, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.85885811, + "num_input_tokens_seen": 85543700, + "step": 3976, + "time_per_iteration": 2.5289478302001953 + }, + { + "auxiliary_loss_clip": 0.01073264, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.05000925, + "balance_loss_mlp": 1.01777828, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.8123498407703187, + "language_loss": 0.74369967, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76476556, + "num_input_tokens_seen": 85562765, + "step": 3977, + "time_per_iteration": 2.683412551879883 + }, + { + "auxiliary_loss_clip": 0.01146041, + "auxiliary_loss_mlp": 0.01048486, + "balance_loss_clip": 1.05231786, + "balance_loss_mlp": 1.02975798, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.7846829962029425, + "language_loss": 0.72064781, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74259317, + "num_input_tokens_seen": 85581755, + "step": 3978, + "time_per_iteration": 5.251256465911865 + }, + { + "auxiliary_loss_clip": 0.01123007, + "auxiliary_loss_mlp": 0.01049753, + "balance_loss_clip": 1.05084777, + "balance_loss_mlp": 1.03174043, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.390737430637487, + "language_loss": 0.78904504, + "learning_rate": 3.558079758168997e-06, + "loss": 0.81077266, + "num_input_tokens_seen": 85599455, + "step": 3979, + "time_per_iteration": 2.536644220352173 + }, + { + "auxiliary_loss_clip": 0.01121063, + "auxiliary_loss_mlp": 0.01053439, + "balance_loss_clip": 1.04953194, + "balance_loss_mlp": 1.03448439, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.6173414134200046, + "language_loss": 0.81639117, + "learning_rate": 3.557835546134977e-06, + "loss": 0.8381362, + "num_input_tokens_seen": 85619970, + "step": 3980, + "time_per_iteration": 2.606170654296875 + }, + { + "auxiliary_loss_clip": 0.0109376, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.05324113, + "balance_loss_mlp": 1.02167594, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.928465144549385, + "language_loss": 0.83992606, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86125469, + "num_input_tokens_seen": 85638850, + "step": 3981, + "time_per_iteration": 2.5904626846313477 + }, + { + "auxiliary_loss_clip": 0.01122951, + "auxiliary_loss_mlp": 0.01046126, + "balance_loss_clip": 1.05135012, + "balance_loss_mlp": 1.02787542, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 1.7854685389408194, + "language_loss": 0.76793194, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.78962278, + "num_input_tokens_seen": 85656285, + "step": 3982, + "time_per_iteration": 2.6381239891052246 + }, + { + "auxiliary_loss_clip": 0.01108236, + "auxiliary_loss_mlp": 0.01043153, + "balance_loss_clip": 1.05375719, + "balance_loss_mlp": 1.02628481, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 1.6924185059848798, + "language_loss": 0.78154534, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80305922, + "num_input_tokens_seen": 85673020, + "step": 3983, + "time_per_iteration": 2.583866834640503 + }, + { + "auxiliary_loss_clip": 0.0112964, + "auxiliary_loss_mlp": 0.00796828, + "balance_loss_clip": 1.04988647, + "balance_loss_mlp": 1.01839876, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.8534213558903507, + "language_loss": 0.735425, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75468969, + "num_input_tokens_seen": 85692565, + "step": 3984, + "time_per_iteration": 2.525561809539795 + }, + { + "auxiliary_loss_clip": 0.01096018, + "auxiliary_loss_mlp": 0.0104866, + "balance_loss_clip": 1.04637766, + "balance_loss_mlp": 1.02978873, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 2.100088824649878, + "language_loss": 0.79106486, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81251168, + "num_input_tokens_seen": 85709730, + "step": 3985, + "time_per_iteration": 2.5827388763427734 + }, + { + "auxiliary_loss_clip": 0.0109926, + "auxiliary_loss_mlp": 0.01050179, + "balance_loss_clip": 1.04866457, + "balance_loss_mlp": 1.03183222, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 1.926566711464529, + "language_loss": 0.73119307, + "learning_rate": 3.556369033716254e-06, + "loss": 0.75268745, + "num_input_tokens_seen": 85730045, + "step": 3986, + "time_per_iteration": 2.647474765777588 + }, + { + "auxiliary_loss_clip": 0.01135635, + "auxiliary_loss_mlp": 0.01046966, + "balance_loss_clip": 1.05052495, + "balance_loss_mlp": 1.02983558, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 1.838263713942384, + "language_loss": 0.87676966, + "learning_rate": 3.556124408363871e-06, + "loss": 0.89859557, + "num_input_tokens_seen": 85747590, + "step": 3987, + "time_per_iteration": 2.535912275314331 + }, + { + "auxiliary_loss_clip": 0.01124002, + "auxiliary_loss_mlp": 0.01044331, + "balance_loss_clip": 1.04817116, + "balance_loss_mlp": 1.02847576, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 2.754663722620775, + "language_loss": 0.83041346, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.8520968, + "num_input_tokens_seen": 85763460, + "step": 3988, + "time_per_iteration": 2.50909161567688 + }, + { + "auxiliary_loss_clip": 0.01130129, + "auxiliary_loss_mlp": 0.01042058, + "balance_loss_clip": 1.05050838, + "balance_loss_mlp": 1.02414083, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.6467212333622046, + "language_loss": 0.85133708, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87305892, + "num_input_tokens_seen": 85782050, + "step": 3989, + "time_per_iteration": 2.5063977241516113 + }, + { + "auxiliary_loss_clip": 0.01139613, + "auxiliary_loss_mlp": 0.010406, + "balance_loss_clip": 1.0482645, + "balance_loss_mlp": 1.02360058, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.1014965764309927, + "language_loss": 0.84938669, + "learning_rate": 3.555390178293477e-06, + "loss": 0.87118882, + "num_input_tokens_seen": 85797400, + "step": 3990, + "time_per_iteration": 2.467935800552368 + }, + { + "auxiliary_loss_clip": 0.01126497, + "auxiliary_loss_mlp": 0.01040871, + "balance_loss_clip": 1.04710507, + "balance_loss_mlp": 1.02455068, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 1.4217277974439448, + "language_loss": 0.7575894, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.77926302, + "num_input_tokens_seen": 85818995, + "step": 3991, + "time_per_iteration": 2.6114790439605713 + }, + { + "auxiliary_loss_clip": 0.01041511, + "auxiliary_loss_mlp": 0.01009693, + "balance_loss_clip": 1.03682637, + "balance_loss_mlp": 1.00698686, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.9105204939209784, + "language_loss": 0.63676965, + "learning_rate": 3.554900396661656e-06, + "loss": 0.6572817, + "num_input_tokens_seen": 85876695, + "step": 3992, + "time_per_iteration": 3.058872938156128 + }, + { + "auxiliary_loss_clip": 0.01048008, + "auxiliary_loss_mlp": 0.01012253, + "balance_loss_clip": 1.02257085, + "balance_loss_mlp": 1.00989234, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7584167552123384, + "language_loss": 0.62989819, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65050083, + "num_input_tokens_seen": 85940990, + "step": 3993, + "time_per_iteration": 3.191061496734619 + }, + { + "auxiliary_loss_clip": 0.01105097, + "auxiliary_loss_mlp": 0.01046402, + "balance_loss_clip": 1.05188179, + "balance_loss_mlp": 1.02844906, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.7762757403255023, + "language_loss": 0.76852882, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.79004377, + "num_input_tokens_seen": 85961165, + "step": 3994, + "time_per_iteration": 2.614126205444336 + }, + { + "auxiliary_loss_clip": 0.01117814, + "auxiliary_loss_mlp": 0.01046809, + "balance_loss_clip": 1.04799819, + "balance_loss_mlp": 1.02786684, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.5583007865951757, + "language_loss": 0.78603429, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80768055, + "num_input_tokens_seen": 85982710, + "step": 3995, + "time_per_iteration": 2.610560178756714 + }, + { + "auxiliary_loss_clip": 0.01028891, + "auxiliary_loss_mlp": 0.0100644, + "balance_loss_clip": 1.02648234, + "balance_loss_mlp": 1.00379372, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 1.0258291369588604, + "language_loss": 0.63426208, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.6546154, + "num_input_tokens_seen": 86046935, + "step": 3996, + "time_per_iteration": 3.216639280319214 + }, + { + "auxiliary_loss_clip": 0.0112158, + "auxiliary_loss_mlp": 0.01042444, + "balance_loss_clip": 1.05151033, + "balance_loss_mlp": 1.02509856, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.8416082661475457, + "language_loss": 0.69843972, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72007996, + "num_input_tokens_seen": 86064355, + "step": 3997, + "time_per_iteration": 2.5486128330230713 + }, + { + "auxiliary_loss_clip": 0.01131272, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_clip": 1.04793882, + "balance_loss_mlp": 1.02838349, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.7538254988423188, + "language_loss": 0.87081623, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89258707, + "num_input_tokens_seen": 86081340, + "step": 3998, + "time_per_iteration": 2.5194218158721924 + }, + { + "auxiliary_loss_clip": 0.01123567, + "auxiliary_loss_mlp": 0.01036504, + "balance_loss_clip": 1.04552388, + "balance_loss_mlp": 1.01900411, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.532200940516686, + "language_loss": 0.75738704, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.77898777, + "num_input_tokens_seen": 86102260, + "step": 3999, + "time_per_iteration": 2.5257408618927 + }, + { + "auxiliary_loss_clip": 0.01118399, + "auxiliary_loss_mlp": 0.01045206, + "balance_loss_clip": 1.04893708, + "balance_loss_mlp": 1.02818251, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 2.17245677645293, + "language_loss": 0.7202577, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74189377, + "num_input_tokens_seen": 86123400, + "step": 4000, + "time_per_iteration": 2.6110410690307617 + }, + { + "auxiliary_loss_clip": 0.01133737, + "auxiliary_loss_mlp": 0.0104128, + "balance_loss_clip": 1.05354083, + "balance_loss_mlp": 1.02380371, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 1.8111866156153111, + "language_loss": 0.67109108, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.69284129, + "num_input_tokens_seen": 86144060, + "step": 4001, + "time_per_iteration": 2.640087127685547 + }, + { + "auxiliary_loss_clip": 0.011416, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.04832053, + "balance_loss_mlp": 1.02420866, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 2.999521716088347, + "language_loss": 0.8269186, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.8487556, + "num_input_tokens_seen": 86163005, + "step": 4002, + "time_per_iteration": 2.623082399368286 + }, + { + "auxiliary_loss_clip": 0.01105717, + "auxiliary_loss_mlp": 0.01045616, + "balance_loss_clip": 1.05172229, + "balance_loss_mlp": 1.02812767, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.832958449229557, + "language_loss": 0.83070654, + "learning_rate": 3.552202383898897e-06, + "loss": 0.85221988, + "num_input_tokens_seen": 86182580, + "step": 4003, + "time_per_iteration": 2.6036951541900635 + }, + { + "auxiliary_loss_clip": 0.01114653, + "auxiliary_loss_mlp": 0.01046296, + "balance_loss_clip": 1.05497539, + "balance_loss_mlp": 1.02833104, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 2.3594217118290124, + "language_loss": 0.8724547, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89406419, + "num_input_tokens_seen": 86200665, + "step": 4004, + "time_per_iteration": 2.588740587234497 + }, + { + "auxiliary_loss_clip": 0.01112984, + "auxiliary_loss_mlp": 0.01055246, + "balance_loss_clip": 1.04746306, + "balance_loss_mlp": 1.03773427, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 1.9481871846160752, + "language_loss": 0.78054601, + "learning_rate": 3.551711070585177e-06, + "loss": 0.80222833, + "num_input_tokens_seen": 86221640, + "step": 4005, + "time_per_iteration": 2.6189208030700684 + }, + { + "auxiliary_loss_clip": 0.01093523, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_clip": 1.0552001, + "balance_loss_mlp": 1.02361369, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.759248228701162, + "language_loss": 0.78972691, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81108022, + "num_input_tokens_seen": 86240795, + "step": 4006, + "time_per_iteration": 2.617008686065674 + }, + { + "auxiliary_loss_clip": 0.01125168, + "auxiliary_loss_mlp": 0.00802104, + "balance_loss_clip": 1.04796863, + "balance_loss_mlp": 1.02371728, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 1.8154514527029975, + "language_loss": 0.71510124, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73437405, + "num_input_tokens_seen": 86262000, + "step": 4007, + "time_per_iteration": 2.6243021488189697 + }, + { + "auxiliary_loss_clip": 0.01099673, + "auxiliary_loss_mlp": 0.01051721, + "balance_loss_clip": 1.04841959, + "balance_loss_mlp": 1.03467393, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.7124084580579062, + "language_loss": 0.76213837, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.78365231, + "num_input_tokens_seen": 86279680, + "step": 4008, + "time_per_iteration": 2.5774104595184326 + }, + { + "auxiliary_loss_clip": 0.01132232, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.05102634, + "balance_loss_mlp": 1.01781178, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 3.103374244293146, + "language_loss": 0.74867499, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.77034354, + "num_input_tokens_seen": 86297180, + "step": 4009, + "time_per_iteration": 2.50140380859375 + }, + { + "auxiliary_loss_clip": 0.01134969, + "auxiliary_loss_mlp": 0.01042157, + "balance_loss_clip": 1.05507278, + "balance_loss_mlp": 1.02599192, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 2.41954597874374, + "language_loss": 0.8038913, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82566249, + "num_input_tokens_seen": 86317660, + "step": 4010, + "time_per_iteration": 4.076771974563599 + }, + { + "auxiliary_loss_clip": 0.01112707, + "auxiliary_loss_mlp": 0.01047437, + "balance_loss_clip": 1.0464623, + "balance_loss_mlp": 1.02810097, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 1.8691953210459238, + "language_loss": 0.70688415, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.72848558, + "num_input_tokens_seen": 86338325, + "step": 4011, + "time_per_iteration": 2.614694118499756 + }, + { + "auxiliary_loss_clip": 0.0106501, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.04901385, + "balance_loss_mlp": 1.0272795, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.547327249856818, + "language_loss": 0.69206476, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71317208, + "num_input_tokens_seen": 86357615, + "step": 4012, + "time_per_iteration": 4.117403268814087 + }, + { + "auxiliary_loss_clip": 0.01136374, + "auxiliary_loss_mlp": 0.01039232, + "balance_loss_clip": 1.05388105, + "balance_loss_mlp": 1.02054036, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 2.546662546725643, + "language_loss": 0.73377073, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75552678, + "num_input_tokens_seen": 86380355, + "step": 4013, + "time_per_iteration": 2.7572360038757324 + }, + { + "auxiliary_loss_clip": 0.01145021, + "auxiliary_loss_mlp": 0.01036892, + "balance_loss_clip": 1.05215931, + "balance_loss_mlp": 1.0200243, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.8766334035211751, + "language_loss": 0.88072759, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.90254664, + "num_input_tokens_seen": 86399125, + "step": 4014, + "time_per_iteration": 2.584608316421509 + }, + { + "auxiliary_loss_clip": 0.01112331, + "auxiliary_loss_mlp": 0.01045714, + "balance_loss_clip": 1.04684353, + "balance_loss_mlp": 1.02728415, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 3.963401747993656, + "language_loss": 0.94899535, + "learning_rate": 3.549250975045952e-06, + "loss": 0.97057581, + "num_input_tokens_seen": 86418625, + "step": 4015, + "time_per_iteration": 2.6074790954589844 + }, + { + "auxiliary_loss_clip": 0.01118639, + "auxiliary_loss_mlp": 0.01041119, + "balance_loss_clip": 1.04913712, + "balance_loss_mlp": 1.0239768, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 1.5974701962007962, + "language_loss": 0.83014596, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.85174346, + "num_input_tokens_seen": 86438375, + "step": 4016, + "time_per_iteration": 3.981218099594116 + }, + { + "auxiliary_loss_clip": 0.01098541, + "auxiliary_loss_mlp": 0.01045398, + "balance_loss_clip": 1.04932272, + "balance_loss_mlp": 1.02787423, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 1.7830423585222162, + "language_loss": 0.69282365, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71426308, + "num_input_tokens_seen": 86463230, + "step": 4017, + "time_per_iteration": 4.1247382164001465 + }, + { + "auxiliary_loss_clip": 0.01136839, + "auxiliary_loss_mlp": 0.01050121, + "balance_loss_clip": 1.05055165, + "balance_loss_mlp": 1.03202534, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 1.6593854001122648, + "language_loss": 0.85084385, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87271345, + "num_input_tokens_seen": 86481230, + "step": 4018, + "time_per_iteration": 2.4732067584991455 + }, + { + "auxiliary_loss_clip": 0.0104496, + "auxiliary_loss_mlp": 0.01027712, + "balance_loss_clip": 1.02184987, + "balance_loss_mlp": 1.02508914, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8923993929372823, + "language_loss": 0.60657042, + "learning_rate": 3.548265291370558e-06, + "loss": 0.6272971, + "num_input_tokens_seen": 86541260, + "step": 4019, + "time_per_iteration": 3.180785894393921 + }, + { + "auxiliary_loss_clip": 0.01114539, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.04849195, + "balance_loss_mlp": 1.02413881, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.815298460670813, + "language_loss": 0.72977412, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75134283, + "num_input_tokens_seen": 86559580, + "step": 4020, + "time_per_iteration": 2.5616390705108643 + }, + { + "auxiliary_loss_clip": 0.01109041, + "auxiliary_loss_mlp": 0.01043272, + "balance_loss_clip": 1.05300891, + "balance_loss_mlp": 1.02651095, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 2.0251892274081134, + "language_loss": 0.8179847, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.83950788, + "num_input_tokens_seen": 86577560, + "step": 4021, + "time_per_iteration": 2.6048834323883057 + }, + { + "auxiliary_loss_clip": 0.01146623, + "auxiliary_loss_mlp": 0.01044586, + "balance_loss_clip": 1.05097163, + "balance_loss_mlp": 1.02570295, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 3.275335023822472, + "language_loss": 0.76536536, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78727746, + "num_input_tokens_seen": 86595350, + "step": 4022, + "time_per_iteration": 2.5886471271514893 + }, + { + "auxiliary_loss_clip": 0.01102041, + "auxiliary_loss_mlp": 0.01055398, + "balance_loss_clip": 1.04630566, + "balance_loss_mlp": 1.03450108, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 3.08554944544278, + "language_loss": 0.7518698, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77344418, + "num_input_tokens_seen": 86614805, + "step": 4023, + "time_per_iteration": 2.647296190261841 + }, + { + "auxiliary_loss_clip": 0.01116421, + "auxiliary_loss_mlp": 0.0104822, + "balance_loss_clip": 1.04828644, + "balance_loss_mlp": 1.03141165, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 1.8786778732475815, + "language_loss": 0.82211012, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.84375656, + "num_input_tokens_seen": 86633700, + "step": 4024, + "time_per_iteration": 2.5474495887756348 + }, + { + "auxiliary_loss_clip": 0.01130209, + "auxiliary_loss_mlp": 0.01048, + "balance_loss_clip": 1.05083799, + "balance_loss_mlp": 1.03023815, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 1.8363904911614037, + "language_loss": 0.86203313, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88381511, + "num_input_tokens_seen": 86650905, + "step": 4025, + "time_per_iteration": 2.5017287731170654 + }, + { + "auxiliary_loss_clip": 0.01088591, + "auxiliary_loss_mlp": 0.01060871, + "balance_loss_clip": 1.04398167, + "balance_loss_mlp": 1.03986657, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 2.291313034439703, + "language_loss": 0.71767557, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73917019, + "num_input_tokens_seen": 86669185, + "step": 4026, + "time_per_iteration": 2.5372469425201416 + }, + { + "auxiliary_loss_clip": 0.01130568, + "auxiliary_loss_mlp": 0.01044265, + "balance_loss_clip": 1.04912758, + "balance_loss_mlp": 1.02765906, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.932385070122097, + "language_loss": 0.64375663, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66550493, + "num_input_tokens_seen": 86686805, + "step": 4027, + "time_per_iteration": 2.497695207595825 + }, + { + "auxiliary_loss_clip": 0.01130882, + "auxiliary_loss_mlp": 0.00796493, + "balance_loss_clip": 1.05233431, + "balance_loss_mlp": 1.01803195, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.0747926514283717, + "language_loss": 0.70681512, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72608888, + "num_input_tokens_seen": 86705520, + "step": 4028, + "time_per_iteration": 2.4857304096221924 + }, + { + "auxiliary_loss_clip": 0.01047859, + "auxiliary_loss_mlp": 0.01002631, + "balance_loss_clip": 1.02258825, + "balance_loss_mlp": 0.99987727, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8596866466051839, + "language_loss": 0.55335605, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57386094, + "num_input_tokens_seen": 86767320, + "step": 4029, + "time_per_iteration": 3.1139018535614014 + }, + { + "auxiliary_loss_clip": 0.01128465, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.04789257, + "balance_loss_mlp": 1.02534914, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 2.0070647139598807, + "language_loss": 0.74112433, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76284033, + "num_input_tokens_seen": 86788110, + "step": 4030, + "time_per_iteration": 2.5590341091156006 + }, + { + "auxiliary_loss_clip": 0.01143192, + "auxiliary_loss_mlp": 0.01047872, + "balance_loss_clip": 1.05018294, + "balance_loss_mlp": 1.03051519, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 2.308389388341383, + "language_loss": 0.76584744, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.78775811, + "num_input_tokens_seen": 86807640, + "step": 4031, + "time_per_iteration": 2.477060556411743 + }, + { + "auxiliary_loss_clip": 0.01126788, + "auxiliary_loss_mlp": 0.00794528, + "balance_loss_clip": 1.05056179, + "balance_loss_mlp": 1.01542914, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 2.0302274532160993, + "language_loss": 0.65499628, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.67420948, + "num_input_tokens_seen": 86826795, + "step": 4032, + "time_per_iteration": 2.539369821548462 + }, + { + "auxiliary_loss_clip": 0.01128533, + "auxiliary_loss_mlp": 0.0104903, + "balance_loss_clip": 1.04847717, + "balance_loss_mlp": 1.03108931, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 2.0138608647254097, + "language_loss": 0.81748688, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83926249, + "num_input_tokens_seen": 86843175, + "step": 4033, + "time_per_iteration": 2.47478985786438 + }, + { + "auxiliary_loss_clip": 0.01101277, + "auxiliary_loss_mlp": 0.01040001, + "balance_loss_clip": 1.04493499, + "balance_loss_mlp": 1.02263224, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 1.9399888389233693, + "language_loss": 0.69009948, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71151227, + "num_input_tokens_seen": 86863185, + "step": 4034, + "time_per_iteration": 2.6520516872406006 + }, + { + "auxiliary_loss_clip": 0.01121607, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.05028987, + "balance_loss_mlp": 1.01788962, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.257829122812066, + "language_loss": 0.96469712, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.98626763, + "num_input_tokens_seen": 86880040, + "step": 4035, + "time_per_iteration": 2.524733066558838 + }, + { + "auxiliary_loss_clip": 0.01109954, + "auxiliary_loss_mlp": 0.01043292, + "balance_loss_clip": 1.0469228, + "balance_loss_mlp": 1.02697241, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 1.7216986085237018, + "language_loss": 0.77888983, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80042225, + "num_input_tokens_seen": 86900610, + "step": 4036, + "time_per_iteration": 2.594829797744751 + }, + { + "auxiliary_loss_clip": 0.01131105, + "auxiliary_loss_mlp": 0.01045309, + "balance_loss_clip": 1.0494417, + "balance_loss_mlp": 1.02780914, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.660775597199867, + "language_loss": 0.74402273, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76578683, + "num_input_tokens_seen": 86919385, + "step": 4037, + "time_per_iteration": 2.5148277282714844 + }, + { + "auxiliary_loss_clip": 0.01100191, + "auxiliary_loss_mlp": 0.01044572, + "balance_loss_clip": 1.04295444, + "balance_loss_mlp": 1.02609491, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.5828722719838333, + "language_loss": 0.76590365, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78735125, + "num_input_tokens_seen": 86938885, + "step": 4038, + "time_per_iteration": 2.56931471824646 + }, + { + "auxiliary_loss_clip": 0.01120516, + "auxiliary_loss_mlp": 0.01043683, + "balance_loss_clip": 1.04519093, + "balance_loss_mlp": 1.02588558, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 1.8754543105885366, + "language_loss": 0.719172, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74081397, + "num_input_tokens_seen": 86957705, + "step": 4039, + "time_per_iteration": 2.5069429874420166 + }, + { + "auxiliary_loss_clip": 0.01122641, + "auxiliary_loss_mlp": 0.01047942, + "balance_loss_clip": 1.04563642, + "balance_loss_mlp": 1.02970314, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.6458444554386125, + "language_loss": 0.78198588, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80369174, + "num_input_tokens_seen": 86975845, + "step": 4040, + "time_per_iteration": 2.532717704772949 + }, + { + "auxiliary_loss_clip": 0.0108711, + "auxiliary_loss_mlp": 0.01038578, + "balance_loss_clip": 1.04925156, + "balance_loss_mlp": 1.0232476, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.745798873066938, + "language_loss": 0.80524886, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82650578, + "num_input_tokens_seen": 86994800, + "step": 4041, + "time_per_iteration": 2.6474568843841553 + }, + { + "auxiliary_loss_clip": 0.01102459, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_clip": 1.04431391, + "balance_loss_mlp": 1.03077054, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 2.3407593573194627, + "language_loss": 0.77206039, + "learning_rate": 3.542579399075957e-06, + "loss": 0.79355717, + "num_input_tokens_seen": 87016845, + "step": 4042, + "time_per_iteration": 2.6184520721435547 + }, + { + "auxiliary_loss_clip": 0.01057189, + "auxiliary_loss_mlp": 0.01034831, + "balance_loss_clip": 1.04875016, + "balance_loss_mlp": 1.01901221, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.7904267078337692, + "language_loss": 0.81468761, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83560789, + "num_input_tokens_seen": 87036270, + "step": 4043, + "time_per_iteration": 2.9986908435821533 + }, + { + "auxiliary_loss_clip": 0.01120177, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.04476559, + "balance_loss_mlp": 1.01786816, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.538662212436332, + "language_loss": 0.73247063, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75402856, + "num_input_tokens_seen": 87049920, + "step": 4044, + "time_per_iteration": 2.6385445594787598 + }, + { + "auxiliary_loss_clip": 0.0113084, + "auxiliary_loss_mlp": 0.01043514, + "balance_loss_clip": 1.0503329, + "balance_loss_mlp": 1.02607369, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.7314338749369147, + "language_loss": 0.8323797, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85412323, + "num_input_tokens_seen": 87068230, + "step": 4045, + "time_per_iteration": 2.548265218734741 + }, + { + "auxiliary_loss_clip": 0.01069807, + "auxiliary_loss_mlp": 0.01046837, + "balance_loss_clip": 1.04905891, + "balance_loss_mlp": 1.02946854, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 1.6540377309510772, + "language_loss": 0.86619663, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88736308, + "num_input_tokens_seen": 87086435, + "step": 4046, + "time_per_iteration": 2.665343999862671 + }, + { + "auxiliary_loss_clip": 0.01113428, + "auxiliary_loss_mlp": 0.01040588, + "balance_loss_clip": 1.04386544, + "balance_loss_mlp": 1.02295732, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 2.8255504732731103, + "language_loss": 0.72904557, + "learning_rate": 3.5413392369578e-06, + "loss": 0.75058573, + "num_input_tokens_seen": 87105340, + "step": 4047, + "time_per_iteration": 2.5578064918518066 + }, + { + "auxiliary_loss_clip": 0.0111971, + "auxiliary_loss_mlp": 0.01044445, + "balance_loss_clip": 1.04478192, + "balance_loss_mlp": 1.02528834, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 1.9350315978084784, + "language_loss": 0.73341578, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75505728, + "num_input_tokens_seen": 87125780, + "step": 4048, + "time_per_iteration": 3.9934537410736084 + }, + { + "auxiliary_loss_clip": 0.01105718, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_clip": 1.04889297, + "balance_loss_mlp": 1.02975655, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 2.0939950210655387, + "language_loss": 0.73073602, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75225365, + "num_input_tokens_seen": 87144470, + "step": 4049, + "time_per_iteration": 2.582179069519043 + }, + { + "auxiliary_loss_clip": 0.01094697, + "auxiliary_loss_mlp": 0.01045239, + "balance_loss_clip": 1.0494746, + "balance_loss_mlp": 1.0281086, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 1.6451586132693121, + "language_loss": 0.73533195, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.75673133, + "num_input_tokens_seen": 87162830, + "step": 4050, + "time_per_iteration": 2.5592751502990723 + }, + { + "auxiliary_loss_clip": 0.01112302, + "auxiliary_loss_mlp": 0.01039314, + "balance_loss_clip": 1.04630661, + "balance_loss_mlp": 1.02400756, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 3.552410221398559, + "language_loss": 0.75445408, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77597028, + "num_input_tokens_seen": 87180905, + "step": 4051, + "time_per_iteration": 4.261563539505005 + }, + { + "auxiliary_loss_clip": 0.01088596, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.04402041, + "balance_loss_mlp": 1.02309132, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 1.964121489417537, + "language_loss": 0.70954454, + "learning_rate": 3.540097613646296e-06, + "loss": 0.73083627, + "num_input_tokens_seen": 87202290, + "step": 4052, + "time_per_iteration": 2.6766762733459473 + }, + { + "auxiliary_loss_clip": 0.01111934, + "auxiliary_loss_mlp": 0.01047906, + "balance_loss_clip": 1.04940224, + "balance_loss_mlp": 1.03091884, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 2.0452220700371297, + "language_loss": 0.80944443, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83104277, + "num_input_tokens_seen": 87221650, + "step": 4053, + "time_per_iteration": 2.546811819076538 + }, + { + "auxiliary_loss_clip": 0.011419, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.04839897, + "balance_loss_mlp": 1.0166961, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.486131170278907, + "language_loss": 0.78313071, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80488706, + "num_input_tokens_seen": 87238515, + "step": 4054, + "time_per_iteration": 3.9412338733673096 + }, + { + "auxiliary_loss_clip": 0.01092707, + "auxiliary_loss_mlp": 0.01050961, + "balance_loss_clip": 1.04194045, + "balance_loss_mlp": 1.03441453, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.7563621562144864, + "language_loss": 0.83513236, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.85656905, + "num_input_tokens_seen": 87256290, + "step": 4055, + "time_per_iteration": 2.584815263748169 + }, + { + "auxiliary_loss_clip": 0.01107682, + "auxiliary_loss_mlp": 0.01042001, + "balance_loss_clip": 1.0457263, + "balance_loss_mlp": 1.02473927, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 2.578589499630782, + "language_loss": 0.54837275, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.56986952, + "num_input_tokens_seen": 87277085, + "step": 4056, + "time_per_iteration": 4.116791486740112 + }, + { + "auxiliary_loss_clip": 0.01131829, + "auxiliary_loss_mlp": 0.01045558, + "balance_loss_clip": 1.04693413, + "balance_loss_mlp": 1.02851152, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.3497757278599054, + "language_loss": 0.80489063, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82666445, + "num_input_tokens_seen": 87293020, + "step": 4057, + "time_per_iteration": 2.555865526199341 + }, + { + "auxiliary_loss_clip": 0.01129609, + "auxiliary_loss_mlp": 0.01042933, + "balance_loss_clip": 1.04953766, + "balance_loss_mlp": 1.02571964, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.857725121557648, + "language_loss": 0.79824817, + "learning_rate": 3.538605738554673e-06, + "loss": 0.81997359, + "num_input_tokens_seen": 87311445, + "step": 4058, + "time_per_iteration": 2.508112907409668 + }, + { + "auxiliary_loss_clip": 0.01143594, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_clip": 1.04854465, + "balance_loss_mlp": 1.02873147, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.6853755783741657, + "language_loss": 0.85349518, + "learning_rate": 3.538356888446756e-06, + "loss": 0.8753801, + "num_input_tokens_seen": 87332055, + "step": 4059, + "time_per_iteration": 2.5445706844329834 + }, + { + "auxiliary_loss_clip": 0.01119939, + "auxiliary_loss_mlp": 0.01040853, + "balance_loss_clip": 1.04820955, + "balance_loss_mlp": 1.02480769, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.5792654594178297, + "language_loss": 0.74005854, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.76166648, + "num_input_tokens_seen": 87351295, + "step": 4060, + "time_per_iteration": 2.533621311187744 + }, + { + "auxiliary_loss_clip": 0.01111929, + "auxiliary_loss_mlp": 0.01051804, + "balance_loss_clip": 1.05061626, + "balance_loss_mlp": 1.032444, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 1.6863196326097842, + "language_loss": 0.73267519, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75431252, + "num_input_tokens_seen": 87370650, + "step": 4061, + "time_per_iteration": 2.6412577629089355 + }, + { + "auxiliary_loss_clip": 0.01141652, + "auxiliary_loss_mlp": 0.01043399, + "balance_loss_clip": 1.0519197, + "balance_loss_mlp": 1.02722239, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.6919656136647816, + "language_loss": 0.75998342, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78183395, + "num_input_tokens_seen": 87389020, + "step": 4062, + "time_per_iteration": 2.489513635635376 + }, + { + "auxiliary_loss_clip": 0.010969, + "auxiliary_loss_mlp": 0.01040973, + "balance_loss_clip": 1.04417419, + "balance_loss_mlp": 1.0239265, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 2.8224101995053212, + "language_loss": 0.85319197, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87457067, + "num_input_tokens_seen": 87409695, + "step": 4063, + "time_per_iteration": 2.6377546787261963 + }, + { + "auxiliary_loss_clip": 0.01115781, + "auxiliary_loss_mlp": 0.0104536, + "balance_loss_clip": 1.04801059, + "balance_loss_mlp": 1.02638149, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 2.330424838905176, + "language_loss": 0.68731987, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70893133, + "num_input_tokens_seen": 87428250, + "step": 4064, + "time_per_iteration": 2.5396385192871094 + }, + { + "auxiliary_loss_clip": 0.01131381, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.04562259, + "balance_loss_mlp": 1.01938248, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.5977548026969297, + "language_loss": 0.6986832, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72036737, + "num_input_tokens_seen": 87449380, + "step": 4065, + "time_per_iteration": 2.5446460247039795 + }, + { + "auxiliary_loss_clip": 0.01144215, + "auxiliary_loss_mlp": 0.01050484, + "balance_loss_clip": 1.04831016, + "balance_loss_mlp": 1.03099322, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 1.8400148046646443, + "language_loss": 0.84147298, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86342001, + "num_input_tokens_seen": 87465365, + "step": 4066, + "time_per_iteration": 2.465567111968994 + }, + { + "auxiliary_loss_clip": 0.01054569, + "auxiliary_loss_mlp": 0.01006142, + "balance_loss_clip": 1.0221839, + "balance_loss_mlp": 1.00363851, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.735897067806973, + "language_loss": 0.52292848, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54353559, + "num_input_tokens_seen": 87522525, + "step": 4067, + "time_per_iteration": 2.9932448863983154 + }, + { + "auxiliary_loss_clip": 0.01115961, + "auxiliary_loss_mlp": 0.01043709, + "balance_loss_clip": 1.04977655, + "balance_loss_mlp": 1.02629256, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 5.6492349172965355, + "language_loss": 0.72757614, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74917281, + "num_input_tokens_seen": 87539170, + "step": 4068, + "time_per_iteration": 2.5147202014923096 + }, + { + "auxiliary_loss_clip": 0.01081586, + "auxiliary_loss_mlp": 0.01049372, + "balance_loss_clip": 1.04547429, + "balance_loss_mlp": 1.03122818, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.4615960965478452, + "language_loss": 0.77617294, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79748249, + "num_input_tokens_seen": 87558875, + "step": 4069, + "time_per_iteration": 2.658329725265503 + }, + { + "auxiliary_loss_clip": 0.01109904, + "auxiliary_loss_mlp": 0.01046352, + "balance_loss_clip": 1.04953909, + "balance_loss_mlp": 1.02866125, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 1.9226532412656518, + "language_loss": 0.80536497, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82692748, + "num_input_tokens_seen": 87576485, + "step": 4070, + "time_per_iteration": 2.513995885848999 + }, + { + "auxiliary_loss_clip": 0.01115447, + "auxiliary_loss_mlp": 0.01044847, + "balance_loss_clip": 1.04194021, + "balance_loss_mlp": 1.02782345, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.4795252715687568, + "language_loss": 0.84157377, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.8631767, + "num_input_tokens_seen": 87598620, + "step": 4071, + "time_per_iteration": 2.5597803592681885 + }, + { + "auxiliary_loss_clip": 0.01111101, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_clip": 1.04430556, + "balance_loss_mlp": 1.03756237, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 3.848192691463926, + "language_loss": 0.79799104, + "learning_rate": 3.535116532028798e-06, + "loss": 0.81968808, + "num_input_tokens_seen": 87616595, + "step": 4072, + "time_per_iteration": 2.5026051998138428 + }, + { + "auxiliary_loss_clip": 0.01125416, + "auxiliary_loss_mlp": 0.01042288, + "balance_loss_clip": 1.0474143, + "balance_loss_mlp": 1.02625394, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 1.8016006957344795, + "language_loss": 0.70453238, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7262094, + "num_input_tokens_seen": 87635755, + "step": 4073, + "time_per_iteration": 2.5232815742492676 + }, + { + "auxiliary_loss_clip": 0.01104844, + "auxiliary_loss_mlp": 0.01044772, + "balance_loss_clip": 1.04434156, + "balance_loss_mlp": 1.02796352, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.2536831441202634, + "language_loss": 0.67355007, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69504625, + "num_input_tokens_seen": 87652885, + "step": 4074, + "time_per_iteration": 2.559296131134033 + }, + { + "auxiliary_loss_clip": 0.01052857, + "auxiliary_loss_mlp": 0.01002022, + "balance_loss_clip": 1.02016497, + "balance_loss_mlp": 0.99955481, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.9704253833200763, + "language_loss": 0.68699533, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70754409, + "num_input_tokens_seen": 87713220, + "step": 4075, + "time_per_iteration": 3.171764850616455 + }, + { + "auxiliary_loss_clip": 0.01137022, + "auxiliary_loss_mlp": 0.01039808, + "balance_loss_clip": 1.04742277, + "balance_loss_mlp": 1.02258253, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 1.9466558058527834, + "language_loss": 0.79569173, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81746, + "num_input_tokens_seen": 87732680, + "step": 4076, + "time_per_iteration": 2.5383574962615967 + }, + { + "auxiliary_loss_clip": 0.01117743, + "auxiliary_loss_mlp": 0.00799091, + "balance_loss_clip": 1.04427421, + "balance_loss_mlp": 1.01950848, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 1.9212466110137036, + "language_loss": 0.82518983, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84435821, + "num_input_tokens_seen": 87751880, + "step": 4077, + "time_per_iteration": 2.530695676803589 + }, + { + "auxiliary_loss_clip": 0.01139553, + "auxiliary_loss_mlp": 0.01050621, + "balance_loss_clip": 1.04718626, + "balance_loss_mlp": 1.03211927, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 2.119799103636231, + "language_loss": 0.61810875, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64001048, + "num_input_tokens_seen": 87771795, + "step": 4078, + "time_per_iteration": 2.584671974182129 + }, + { + "auxiliary_loss_clip": 0.01106838, + "auxiliary_loss_mlp": 0.01038334, + "balance_loss_clip": 1.0468055, + "balance_loss_mlp": 1.02164483, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.541652739336377, + "language_loss": 0.7580663, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77951795, + "num_input_tokens_seen": 87793640, + "step": 4079, + "time_per_iteration": 2.600055456161499 + }, + { + "auxiliary_loss_clip": 0.01135982, + "auxiliary_loss_mlp": 0.01048022, + "balance_loss_clip": 1.04581106, + "balance_loss_mlp": 1.02946115, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.9531432142201848, + "language_loss": 0.75416845, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77600849, + "num_input_tokens_seen": 87812390, + "step": 4080, + "time_per_iteration": 2.4901764392852783 + }, + { + "auxiliary_loss_clip": 0.01110847, + "auxiliary_loss_mlp": 0.01047747, + "balance_loss_clip": 1.04658723, + "balance_loss_mlp": 1.03007984, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 1.8816592413859812, + "language_loss": 0.8332175, + "learning_rate": 3.532867444142186e-06, + "loss": 0.85480344, + "num_input_tokens_seen": 87830640, + "step": 4081, + "time_per_iteration": 2.522614002227783 + }, + { + "auxiliary_loss_clip": 0.01111238, + "auxiliary_loss_mlp": 0.01040632, + "balance_loss_clip": 1.04979253, + "balance_loss_mlp": 1.02455103, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 2.2200492557024103, + "language_loss": 0.73291588, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75443459, + "num_input_tokens_seen": 87850450, + "step": 4082, + "time_per_iteration": 2.6994357109069824 + }, + { + "auxiliary_loss_clip": 0.01098353, + "auxiliary_loss_mlp": 0.01042265, + "balance_loss_clip": 1.04277372, + "balance_loss_mlp": 1.02613568, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 2.008526572057144, + "language_loss": 0.71873218, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.74013841, + "num_input_tokens_seen": 87868810, + "step": 4083, + "time_per_iteration": 2.5573463439941406 + }, + { + "auxiliary_loss_clip": 0.01114321, + "auxiliary_loss_mlp": 0.0104478, + "balance_loss_clip": 1.04943562, + "balance_loss_mlp": 1.02573085, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 2.3963013424583175, + "language_loss": 0.74696505, + "learning_rate": 3.532116701561919e-06, + "loss": 0.768556, + "num_input_tokens_seen": 87885685, + "step": 4084, + "time_per_iteration": 2.5822641849517822 + }, + { + "auxiliary_loss_clip": 0.01121863, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.0455904, + "balance_loss_mlp": 1.02102089, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 1.837033736313879, + "language_loss": 0.853522, + "learning_rate": 3.531866337826471e-06, + "loss": 0.87512779, + "num_input_tokens_seen": 87903715, + "step": 4085, + "time_per_iteration": 2.4841411113739014 + }, + { + "auxiliary_loss_clip": 0.01106379, + "auxiliary_loss_mlp": 0.01050924, + "balance_loss_clip": 1.04965615, + "balance_loss_mlp": 1.03422308, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 2.17373593527973, + "language_loss": 0.7919153, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.81348836, + "num_input_tokens_seen": 87923375, + "step": 4086, + "time_per_iteration": 2.5669467449188232 + }, + { + "auxiliary_loss_clip": 0.01084263, + "auxiliary_loss_mlp": 0.01041736, + "balance_loss_clip": 1.05082965, + "balance_loss_mlp": 1.02468944, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.9327504568533138, + "language_loss": 0.74810493, + "learning_rate": 3.531365436099496e-06, + "loss": 0.76936489, + "num_input_tokens_seen": 87943115, + "step": 4087, + "time_per_iteration": 2.6562256813049316 + }, + { + "auxiliary_loss_clip": 0.01080461, + "auxiliary_loss_mlp": 0.01041709, + "balance_loss_clip": 1.04579067, + "balance_loss_mlp": 1.02373266, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.301462338837411, + "language_loss": 0.79594231, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81716394, + "num_input_tokens_seen": 87959505, + "step": 4088, + "time_per_iteration": 4.188084602355957 + }, + { + "auxiliary_loss_clip": 0.01095449, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.04577804, + "balance_loss_mlp": 1.01668727, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 2.7966761098355954, + "language_loss": 0.77486593, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79614425, + "num_input_tokens_seen": 87979725, + "step": 4089, + "time_per_iteration": 4.057018518447876 + }, + { + "auxiliary_loss_clip": 0.01117572, + "auxiliary_loss_mlp": 0.01045765, + "balance_loss_clip": 1.04462266, + "balance_loss_mlp": 1.02884936, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 2.4218106904578476, + "language_loss": 0.81431168, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83594513, + "num_input_tokens_seen": 87998270, + "step": 4090, + "time_per_iteration": 2.7036917209625244 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.01044538, + "balance_loss_clip": 1.04748869, + "balance_loss_mlp": 1.02670467, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.6863877575491022, + "language_loss": 0.73427761, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.75588834, + "num_input_tokens_seen": 88016760, + "step": 4091, + "time_per_iteration": 2.564976692199707 + }, + { + "auxiliary_loss_clip": 0.01105134, + "auxiliary_loss_mlp": 0.01047622, + "balance_loss_clip": 1.05047441, + "balance_loss_mlp": 1.03002667, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.9259740970569186, + "language_loss": 0.76788026, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.78940785, + "num_input_tokens_seen": 88036465, + "step": 4092, + "time_per_iteration": 2.602738618850708 + }, + { + "auxiliary_loss_clip": 0.01111087, + "auxiliary_loss_mlp": 0.01038944, + "balance_loss_clip": 1.04835844, + "balance_loss_mlp": 1.02093172, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 2.6535042907297384, + "language_loss": 0.81777549, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83927578, + "num_input_tokens_seen": 88053270, + "step": 4093, + "time_per_iteration": 2.565294027328491 + }, + { + "auxiliary_loss_clip": 0.0112937, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_clip": 1.04659247, + "balance_loss_mlp": 1.02832544, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 2.763278392957555, + "language_loss": 0.87075579, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89251095, + "num_input_tokens_seen": 88072305, + "step": 4094, + "time_per_iteration": 5.4436821937561035 + }, + { + "auxiliary_loss_clip": 0.01005841, + "auxiliary_loss_mlp": 0.01022095, + "balance_loss_clip": 1.03511453, + "balance_loss_mlp": 1.01962698, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.761623042357955, + "language_loss": 0.57535207, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59563142, + "num_input_tokens_seen": 88137995, + "step": 4095, + "time_per_iteration": 3.380734920501709 + }, + { + "auxiliary_loss_clip": 0.01045027, + "auxiliary_loss_mlp": 0.01020194, + "balance_loss_clip": 1.02984977, + "balance_loss_mlp": 1.01764345, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.6468204870944155, + "language_loss": 0.56254905, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58320123, + "num_input_tokens_seen": 88208490, + "step": 4096, + "time_per_iteration": 3.386735439300537 + }, + { + "auxiliary_loss_clip": 0.01124918, + "auxiliary_loss_mlp": 0.01037995, + "balance_loss_clip": 1.05461454, + "balance_loss_mlp": 1.02146661, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 1.78537155839086, + "language_loss": 0.77265137, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79428053, + "num_input_tokens_seen": 88228050, + "step": 4097, + "time_per_iteration": 2.62400484085083 + }, + { + "auxiliary_loss_clip": 0.01107265, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_clip": 1.0481807, + "balance_loss_mlp": 1.02796006, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 5.137470019199943, + "language_loss": 0.76399124, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78553665, + "num_input_tokens_seen": 88248090, + "step": 4098, + "time_per_iteration": 2.634230375289917 + }, + { + "auxiliary_loss_clip": 0.01119, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.04897356, + "balance_loss_mlp": 1.02609622, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 2.05460511762004, + "language_loss": 0.6777097, + "learning_rate": 3.528355150558764e-06, + "loss": 0.69932169, + "num_input_tokens_seen": 88267545, + "step": 4099, + "time_per_iteration": 2.6085078716278076 + }, + { + "auxiliary_loss_clip": 0.01121189, + "auxiliary_loss_mlp": 0.01040258, + "balance_loss_clip": 1.04561186, + "balance_loss_mlp": 1.02422416, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.5931873230484506, + "language_loss": 0.6583665, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.67998099, + "num_input_tokens_seen": 88289785, + "step": 4100, + "time_per_iteration": 2.6501948833465576 + }, + { + "auxiliary_loss_clip": 0.01037679, + "auxiliary_loss_mlp": 0.01011164, + "balance_loss_clip": 1.0251236, + "balance_loss_mlp": 1.00838697, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7163803846660457, + "language_loss": 0.6150279, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63551641, + "num_input_tokens_seen": 88357320, + "step": 4101, + "time_per_iteration": 3.335144281387329 + }, + { + "auxiliary_loss_clip": 0.01133851, + "auxiliary_loss_mlp": 0.01040431, + "balance_loss_clip": 1.04597139, + "balance_loss_mlp": 1.02371776, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 1.6621046723805768, + "language_loss": 0.73425752, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75600034, + "num_input_tokens_seen": 88377040, + "step": 4102, + "time_per_iteration": 2.502042531967163 + }, + { + "auxiliary_loss_clip": 0.01122108, + "auxiliary_loss_mlp": 0.01034935, + "balance_loss_clip": 1.05115032, + "balance_loss_mlp": 1.01866329, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.0846927384935237, + "language_loss": 0.76391196, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.78548235, + "num_input_tokens_seen": 88395085, + "step": 4103, + "time_per_iteration": 2.628417491912842 + }, + { + "auxiliary_loss_clip": 0.01124086, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.04939628, + "balance_loss_mlp": 1.02327991, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.2781281745005124, + "language_loss": 0.78875113, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.81039417, + "num_input_tokens_seen": 88413205, + "step": 4104, + "time_per_iteration": 2.5066943168640137 + }, + { + "auxiliary_loss_clip": 0.01125264, + "auxiliary_loss_mlp": 0.01041177, + "balance_loss_clip": 1.04759538, + "balance_loss_mlp": 1.02279544, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.75821864123404, + "language_loss": 0.83185595, + "learning_rate": 3.526846877170133e-06, + "loss": 0.85352039, + "num_input_tokens_seen": 88431525, + "step": 4105, + "time_per_iteration": 2.517305850982666 + }, + { + "auxiliary_loss_clip": 0.01138692, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.04915869, + "balance_loss_mlp": 1.02396274, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.7821254374915958, + "language_loss": 0.76362121, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78541195, + "num_input_tokens_seen": 88451210, + "step": 4106, + "time_per_iteration": 2.50044322013855 + }, + { + "auxiliary_loss_clip": 0.01101183, + "auxiliary_loss_mlp": 0.01057149, + "balance_loss_clip": 1.04281211, + "balance_loss_mlp": 1.03598893, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 3.061023790765575, + "language_loss": 0.72957838, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.75116169, + "num_input_tokens_seen": 88467790, + "step": 4107, + "time_per_iteration": 2.509249687194824 + }, + { + "auxiliary_loss_clip": 0.01139325, + "auxiliary_loss_mlp": 0.01048073, + "balance_loss_clip": 1.04902554, + "balance_loss_mlp": 1.03102636, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 2.0717804190004134, + "language_loss": 0.65264738, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67452133, + "num_input_tokens_seen": 88490330, + "step": 4108, + "time_per_iteration": 2.565877914428711 + }, + { + "auxiliary_loss_clip": 0.01088916, + "auxiliary_loss_mlp": 0.01046132, + "balance_loss_clip": 1.04593635, + "balance_loss_mlp": 1.02897763, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 1.769755366651145, + "language_loss": 0.72741055, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.74876094, + "num_input_tokens_seen": 88512435, + "step": 4109, + "time_per_iteration": 2.755033493041992 + }, + { + "auxiliary_loss_clip": 0.011054, + "auxiliary_loss_mlp": 0.01043577, + "balance_loss_clip": 1.04649365, + "balance_loss_mlp": 1.0265305, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 2.411221865263363, + "language_loss": 0.79137492, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81286472, + "num_input_tokens_seen": 88529780, + "step": 4110, + "time_per_iteration": 2.585092782974243 + }, + { + "auxiliary_loss_clip": 0.01113201, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.05533838, + "balance_loss_mlp": 1.02260327, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.4366560444076923, + "language_loss": 0.80910999, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.83064568, + "num_input_tokens_seen": 88547200, + "step": 4111, + "time_per_iteration": 2.575685739517212 + }, + { + "auxiliary_loss_clip": 0.0113651, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.04611444, + "balance_loss_mlp": 1.02690661, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 1.8703946250913648, + "language_loss": 0.75027156, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77206522, + "num_input_tokens_seen": 88566415, + "step": 4112, + "time_per_iteration": 2.494840383529663 + }, + { + "auxiliary_loss_clip": 0.01106586, + "auxiliary_loss_mlp": 0.00796106, + "balance_loss_clip": 1.0466553, + "balance_loss_mlp": 1.01570094, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 2.1896344182442604, + "language_loss": 0.82396328, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.84299016, + "num_input_tokens_seen": 88585225, + "step": 4113, + "time_per_iteration": 2.614182710647583 + }, + { + "auxiliary_loss_clip": 0.01136864, + "auxiliary_loss_mlp": 0.0104037, + "balance_loss_clip": 1.0464623, + "balance_loss_mlp": 1.02291799, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.375625929213814, + "language_loss": 0.87100542, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.8927778, + "num_input_tokens_seen": 88603280, + "step": 4114, + "time_per_iteration": 2.4783382415771484 + }, + { + "auxiliary_loss_clip": 0.01090178, + "auxiliary_loss_mlp": 0.01038062, + "balance_loss_clip": 1.04879808, + "balance_loss_mlp": 1.02192092, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 1.8014719145348934, + "language_loss": 0.75647044, + "learning_rate": 3.524328457352734e-06, + "loss": 0.77775288, + "num_input_tokens_seen": 88624925, + "step": 4115, + "time_per_iteration": 2.6645374298095703 + }, + { + "auxiliary_loss_clip": 0.0100418, + "auxiliary_loss_mlp": 0.01004414, + "balance_loss_clip": 1.02191377, + "balance_loss_mlp": 1.00146914, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6884371776921112, + "language_loss": 0.58225501, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60234094, + "num_input_tokens_seen": 88691475, + "step": 4116, + "time_per_iteration": 3.2892348766326904 + }, + { + "auxiliary_loss_clip": 0.01117571, + "auxiliary_loss_mlp": 0.01036068, + "balance_loss_clip": 1.0489924, + "balance_loss_mlp": 1.01941454, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.5755206340169294, + "language_loss": 0.83955437, + "learning_rate": 3.523824079451235e-06, + "loss": 0.86109078, + "num_input_tokens_seen": 88713425, + "step": 4117, + "time_per_iteration": 2.6707966327667236 + }, + { + "auxiliary_loss_clip": 0.010429, + "auxiliary_loss_mlp": 0.01120672, + "balance_loss_clip": 1.0318023, + "balance_loss_mlp": 1.60832429, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.9183164676287245, + "language_loss": 0.63480014, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65643585, + "num_input_tokens_seen": 88769995, + "step": 4118, + "time_per_iteration": 2.9928536415100098 + }, + { + "auxiliary_loss_clip": 0.01124542, + "auxiliary_loss_mlp": 0.01043981, + "balance_loss_clip": 1.04752135, + "balance_loss_mlp": 1.02767289, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 2.751520021795543, + "language_loss": 0.7927981, + "learning_rate": 3.523319470415491e-06, + "loss": 0.81448328, + "num_input_tokens_seen": 88789970, + "step": 4119, + "time_per_iteration": 2.5344390869140625 + }, + { + "auxiliary_loss_clip": 0.01125539, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_clip": 1.0487349, + "balance_loss_mlp": 1.027578, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 2.5427382362908157, + "language_loss": 0.74193549, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76363766, + "num_input_tokens_seen": 88810000, + "step": 4120, + "time_per_iteration": 2.522059679031372 + }, + { + "auxiliary_loss_clip": 0.01135088, + "auxiliary_loss_mlp": 0.01046808, + "balance_loss_clip": 1.05156136, + "balance_loss_mlp": 1.02946293, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 2.4358877409098048, + "language_loss": 0.88883543, + "learning_rate": 3.522814630322041e-06, + "loss": 0.91065437, + "num_input_tokens_seen": 88827515, + "step": 4121, + "time_per_iteration": 2.5243165493011475 + }, + { + "auxiliary_loss_clip": 0.01140053, + "auxiliary_loss_mlp": 0.01042954, + "balance_loss_clip": 1.04703283, + "balance_loss_mlp": 1.02527499, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 1.9237126277521899, + "language_loss": 0.69383675, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71566677, + "num_input_tokens_seen": 88845025, + "step": 4122, + "time_per_iteration": 2.475464344024658 + }, + { + "auxiliary_loss_clip": 0.01140506, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.04741681, + "balance_loss_mlp": 1.02492595, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.2434594038346307, + "language_loss": 0.80372918, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82557535, + "num_input_tokens_seen": 88861740, + "step": 4123, + "time_per_iteration": 2.4736177921295166 + }, + { + "auxiliary_loss_clip": 0.01081597, + "auxiliary_loss_mlp": 0.01048962, + "balance_loss_clip": 1.05137062, + "balance_loss_mlp": 1.03181958, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 1.847149334970972, + "language_loss": 0.74529076, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.76659632, + "num_input_tokens_seen": 88879740, + "step": 4124, + "time_per_iteration": 2.618082046508789 + }, + { + "auxiliary_loss_clip": 0.01126016, + "auxiliary_loss_mlp": 0.01043572, + "balance_loss_clip": 1.04680741, + "balance_loss_mlp": 1.02749085, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.4149046635443423, + "language_loss": 0.73537505, + "learning_rate": 3.521804257268357e-06, + "loss": 0.7570709, + "num_input_tokens_seen": 88904095, + "step": 4125, + "time_per_iteration": 2.7050702571868896 + }, + { + "auxiliary_loss_clip": 0.01110037, + "auxiliary_loss_mlp": 0.00891478, + "balance_loss_clip": 1.05011868, + "balance_loss_mlp": 1.19141185, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 1.683684435761037, + "language_loss": 0.69425464, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71426976, + "num_input_tokens_seen": 88920740, + "step": 4126, + "time_per_iteration": 2.562877655029297 + }, + { + "auxiliary_loss_clip": 0.01127104, + "auxiliary_loss_mlp": 0.01046934, + "balance_loss_clip": 1.04586124, + "balance_loss_mlp": 1.02923131, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.153427807835069, + "language_loss": 0.80948591, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83122635, + "num_input_tokens_seen": 88938510, + "step": 4127, + "time_per_iteration": 3.8914225101470947 + }, + { + "auxiliary_loss_clip": 0.01136646, + "auxiliary_loss_mlp": 0.00880402, + "balance_loss_clip": 1.05215406, + "balance_loss_mlp": 1.17223668, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 2.108074144190643, + "language_loss": 0.84281337, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86298382, + "num_input_tokens_seen": 88955235, + "step": 4128, + "time_per_iteration": 3.9490010738372803 + }, + { + "auxiliary_loss_clip": 0.01111183, + "auxiliary_loss_mlp": 0.01060146, + "balance_loss_clip": 1.04542506, + "balance_loss_mlp": 1.04185939, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 2.207465673456794, + "language_loss": 0.65040553, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67211884, + "num_input_tokens_seen": 88975210, + "step": 4129, + "time_per_iteration": 2.572672128677368 + }, + { + "auxiliary_loss_clip": 0.01100023, + "auxiliary_loss_mlp": 0.01041027, + "balance_loss_clip": 1.05255604, + "balance_loss_mlp": 1.02289522, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 1.6752917617675114, + "language_loss": 0.76018584, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.78159642, + "num_input_tokens_seen": 88996120, + "step": 4130, + "time_per_iteration": 2.6761093139648438 + }, + { + "auxiliary_loss_clip": 0.0108235, + "auxiliary_loss_mlp": 0.01048149, + "balance_loss_clip": 1.05399072, + "balance_loss_mlp": 1.02996993, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.1154183824550983, + "language_loss": 0.76766288, + "learning_rate": 3.520286966670535e-06, + "loss": 0.78896797, + "num_input_tokens_seen": 89008685, + "step": 4131, + "time_per_iteration": 2.6322720050811768 + }, + { + "auxiliary_loss_clip": 0.01128727, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.04962575, + "balance_loss_mlp": 1.02225518, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.6462772087836217, + "language_loss": 0.83701503, + "learning_rate": 3.520033883075255e-06, + "loss": 0.85868406, + "num_input_tokens_seen": 89031160, + "step": 4132, + "time_per_iteration": 4.002025604248047 + }, + { + "auxiliary_loss_clip": 0.0111566, + "auxiliary_loss_mlp": 0.01040046, + "balance_loss_clip": 1.04667783, + "balance_loss_mlp": 1.02224755, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 1.6530696289927114, + "language_loss": 0.71028817, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73184526, + "num_input_tokens_seen": 89047235, + "step": 4133, + "time_per_iteration": 3.8995234966278076 + }, + { + "auxiliary_loss_clip": 0.01144755, + "auxiliary_loss_mlp": 0.01041711, + "balance_loss_clip": 1.04837191, + "balance_loss_mlp": 1.02170789, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.852315598683667, + "language_loss": 0.61901885, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.64088356, + "num_input_tokens_seen": 89064790, + "step": 4134, + "time_per_iteration": 2.4875385761260986 + }, + { + "auxiliary_loss_clip": 0.01133761, + "auxiliary_loss_mlp": 0.01038938, + "balance_loss_clip": 1.04852426, + "balance_loss_mlp": 1.02216494, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 2.2937288186725135, + "language_loss": 0.78303152, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.80475849, + "num_input_tokens_seen": 89083250, + "step": 4135, + "time_per_iteration": 2.4985318183898926 + }, + { + "auxiliary_loss_clip": 0.01121386, + "auxiliary_loss_mlp": 0.01033319, + "balance_loss_clip": 1.05619216, + "balance_loss_mlp": 1.01707041, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.3500161314053094, + "language_loss": 0.82600498, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.847552, + "num_input_tokens_seen": 89100905, + "step": 4136, + "time_per_iteration": 2.5383379459381104 + }, + { + "auxiliary_loss_clip": 0.01112691, + "auxiliary_loss_mlp": 0.01039828, + "balance_loss_clip": 1.04932618, + "balance_loss_mlp": 1.02349591, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.7399471198848842, + "language_loss": 0.70878381, + "learning_rate": 3.518767600693314e-06, + "loss": 0.73030901, + "num_input_tokens_seen": 89122630, + "step": 4137, + "time_per_iteration": 2.681018590927124 + }, + { + "auxiliary_loss_clip": 0.01129554, + "auxiliary_loss_mlp": 0.00836255, + "balance_loss_clip": 1.04443419, + "balance_loss_mlp": 1.08970308, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 2.173309315743969, + "language_loss": 0.67251742, + "learning_rate": 3.518514171403042e-06, + "loss": 0.69217551, + "num_input_tokens_seen": 89141050, + "step": 4138, + "time_per_iteration": 2.5236244201660156 + }, + { + "auxiliary_loss_clip": 0.01102859, + "auxiliary_loss_mlp": 0.01036789, + "balance_loss_clip": 1.05030572, + "balance_loss_mlp": 1.02089214, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 2.0091220993289305, + "language_loss": 0.83943886, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86083537, + "num_input_tokens_seen": 89160810, + "step": 4139, + "time_per_iteration": 2.616950750350952 + }, + { + "auxiliary_loss_clip": 0.01114385, + "auxiliary_loss_mlp": 0.01044829, + "balance_loss_clip": 1.04837298, + "balance_loss_mlp": 1.02660227, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.6088566022062087, + "language_loss": 0.78956091, + "learning_rate": 3.518007140085481e-06, + "loss": 0.81115305, + "num_input_tokens_seen": 89180610, + "step": 4140, + "time_per_iteration": 2.631852626800537 + }, + { + "auxiliary_loss_clip": 0.01042773, + "auxiliary_loss_mlp": 0.01027477, + "balance_loss_clip": 1.02544045, + "balance_loss_mlp": 1.02479517, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.835416219623163, + "language_loss": 0.61016411, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63086665, + "num_input_tokens_seen": 89241880, + "step": 4141, + "time_per_iteration": 3.165205955505371 + }, + { + "auxiliary_loss_clip": 0.01145305, + "auxiliary_loss_mlp": 0.01047435, + "balance_loss_clip": 1.05094647, + "balance_loss_mlp": 1.02936292, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 1.7772766714269916, + "language_loss": 0.72956634, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75149363, + "num_input_tokens_seen": 89263340, + "step": 4142, + "time_per_iteration": 2.6350324153900146 + }, + { + "auxiliary_loss_clip": 0.01129919, + "auxiliary_loss_mlp": 0.01043132, + "balance_loss_clip": 1.04777157, + "balance_loss_mlp": 1.02578723, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 1.921181892071923, + "language_loss": 0.81097233, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83270288, + "num_input_tokens_seen": 89282870, + "step": 4143, + "time_per_iteration": 2.4944663047790527 + }, + { + "auxiliary_loss_clip": 0.01115008, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.04672313, + "balance_loss_mlp": 1.01620126, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 1.7852390662741324, + "language_loss": 0.58778453, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.60926288, + "num_input_tokens_seen": 89303830, + "step": 4144, + "time_per_iteration": 2.601059675216675 + }, + { + "auxiliary_loss_clip": 0.01125648, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_clip": 1.04463744, + "balance_loss_mlp": 1.02737117, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 1.9815387158588953, + "language_loss": 0.78678077, + "learning_rate": 3.516738554607708e-06, + "loss": 0.80848086, + "num_input_tokens_seen": 89324350, + "step": 4145, + "time_per_iteration": 2.5409960746765137 + }, + { + "auxiliary_loss_clip": 0.011365, + "auxiliary_loss_mlp": 0.00824089, + "balance_loss_clip": 1.04778695, + "balance_loss_mlp": 1.06460738, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.4471496142617344, + "language_loss": 0.64958751, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.66919339, + "num_input_tokens_seen": 89342875, + "step": 4146, + "time_per_iteration": 2.541562080383301 + }, + { + "auxiliary_loss_clip": 0.01044138, + "auxiliary_loss_mlp": 0.01007488, + "balance_loss_clip": 1.03918529, + "balance_loss_mlp": 1.00480556, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 0.9581606018026156, + "language_loss": 0.67360258, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69411874, + "num_input_tokens_seen": 89404925, + "step": 4147, + "time_per_iteration": 3.278383255004883 + }, + { + "auxiliary_loss_clip": 0.01121499, + "auxiliary_loss_mlp": 0.01050557, + "balance_loss_clip": 1.05102086, + "balance_loss_mlp": 1.03204358, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 5.8099437677705765, + "language_loss": 0.8870182, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.90873873, + "num_input_tokens_seen": 89425090, + "step": 4148, + "time_per_iteration": 2.6115939617156982 + }, + { + "auxiliary_loss_clip": 0.01098408, + "auxiliary_loss_mlp": 0.01045386, + "balance_loss_clip": 1.05248904, + "balance_loss_mlp": 1.02484572, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 2.1358873357954065, + "language_loss": 0.6780448, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.6994828, + "num_input_tokens_seen": 89442615, + "step": 4149, + "time_per_iteration": 2.6600887775421143 + }, + { + "auxiliary_loss_clip": 0.01132804, + "auxiliary_loss_mlp": 0.01043699, + "balance_loss_clip": 1.05065155, + "balance_loss_mlp": 1.02632987, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 1.8246636469997897, + "language_loss": 0.71436256, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73612756, + "num_input_tokens_seen": 89463025, + "step": 4150, + "time_per_iteration": 2.586015462875366 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01052415, + "balance_loss_clip": 1.05100632, + "balance_loss_mlp": 1.03419948, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 1.8003030563349758, + "language_loss": 0.7278831, + "learning_rate": 3.515214354149478e-06, + "loss": 0.74940717, + "num_input_tokens_seen": 89480225, + "step": 4151, + "time_per_iteration": 2.6327946186065674 + }, + { + "auxiliary_loss_clip": 0.01139202, + "auxiliary_loss_mlp": 0.01051128, + "balance_loss_clip": 1.04965281, + "balance_loss_mlp": 1.0335331, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 5.654057858241154, + "language_loss": 0.63348258, + "learning_rate": 3.514960119583781e-06, + "loss": 0.65538591, + "num_input_tokens_seen": 89496985, + "step": 4152, + "time_per_iteration": 2.5183281898498535 + }, + { + "auxiliary_loss_clip": 0.01125002, + "auxiliary_loss_mlp": 0.01039395, + "balance_loss_clip": 1.05073178, + "balance_loss_mlp": 1.02300406, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 2.0962964827423685, + "language_loss": 0.77338862, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79503262, + "num_input_tokens_seen": 89514420, + "step": 4153, + "time_per_iteration": 2.5620808601379395 + }, + { + "auxiliary_loss_clip": 0.01132082, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.05042517, + "balance_loss_mlp": 1.02422905, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 2.0786116446902683, + "language_loss": 0.76931506, + "learning_rate": 3.514451478119711e-06, + "loss": 0.79105759, + "num_input_tokens_seen": 89532925, + "step": 4154, + "time_per_iteration": 2.5017600059509277 + }, + { + "auxiliary_loss_clip": 0.01132313, + "auxiliary_loss_mlp": 0.01046606, + "balance_loss_clip": 1.05098653, + "balance_loss_mlp": 1.02740121, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 1.6686039247396394, + "language_loss": 0.70890975, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.73069894, + "num_input_tokens_seen": 89552855, + "step": 4155, + "time_per_iteration": 2.573125123977661 + }, + { + "auxiliary_loss_clip": 0.0112671, + "auxiliary_loss_mlp": 0.01053332, + "balance_loss_clip": 1.05419278, + "balance_loss_mlp": 1.03529525, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.6602888993873777, + "language_loss": 0.74720329, + "learning_rate": 3.513942606943036e-06, + "loss": 0.76900375, + "num_input_tokens_seen": 89572830, + "step": 4156, + "time_per_iteration": 2.5292437076568604 + }, + { + "auxiliary_loss_clip": 0.01124922, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.0495472, + "balance_loss_mlp": 1.02584982, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 2.0597851096726227, + "language_loss": 0.76852906, + "learning_rate": 3.513688085236591e-06, + "loss": 0.79019797, + "num_input_tokens_seen": 89590345, + "step": 4157, + "time_per_iteration": 2.5438830852508545 + }, + { + "auxiliary_loss_clip": 0.010848, + "auxiliary_loss_mlp": 0.01048115, + "balance_loss_clip": 1.0501945, + "balance_loss_mlp": 1.03086579, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.7501006942913941, + "language_loss": 0.81622154, + "learning_rate": 3.513433506130942e-06, + "loss": 0.83755064, + "num_input_tokens_seen": 89610295, + "step": 4158, + "time_per_iteration": 2.631075382232666 + }, + { + "auxiliary_loss_clip": 0.01113003, + "auxiliary_loss_mlp": 0.01041548, + "balance_loss_clip": 1.04756236, + "balance_loss_mlp": 1.0242269, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 1.7915506440459277, + "language_loss": 0.75868756, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.78023303, + "num_input_tokens_seen": 89627795, + "step": 4159, + "time_per_iteration": 2.5396616458892822 + }, + { + "auxiliary_loss_clip": 0.01137981, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.04924023, + "balance_loss_mlp": 1.02199435, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 1.7617842334061926, + "language_loss": 0.71579945, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73758292, + "num_input_tokens_seen": 89648090, + "step": 4160, + "time_per_iteration": 2.5165696144104004 + }, + { + "auxiliary_loss_clip": 0.01053899, + "auxiliary_loss_mlp": 0.01000773, + "balance_loss_clip": 1.02166438, + "balance_loss_mlp": 0.99829334, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7522567263675518, + "language_loss": 0.56782573, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58837247, + "num_input_tokens_seen": 89710345, + "step": 4161, + "time_per_iteration": 3.170154571533203 + }, + { + "auxiliary_loss_clip": 0.01136388, + "auxiliary_loss_mlp": 0.0104707, + "balance_loss_clip": 1.05113387, + "balance_loss_mlp": 1.02952278, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 2.670184363257036, + "language_loss": 0.80400395, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.82583857, + "num_input_tokens_seen": 89729390, + "step": 4162, + "time_per_iteration": 2.4993739128112793 + }, + { + "auxiliary_loss_clip": 0.01123698, + "auxiliary_loss_mlp": 0.00809538, + "balance_loss_clip": 1.04711008, + "balance_loss_mlp": 1.03849638, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.8625025997031637, + "language_loss": 0.8788147, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89814705, + "num_input_tokens_seen": 89742805, + "step": 4163, + "time_per_iteration": 2.560518264770508 + }, + { + "auxiliary_loss_clip": 0.01123111, + "auxiliary_loss_mlp": 0.01040794, + "balance_loss_clip": 1.04742765, + "balance_loss_mlp": 1.02399731, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.6169879629797883, + "language_loss": 0.83554333, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85718238, + "num_input_tokens_seen": 89761145, + "step": 4164, + "time_per_iteration": 2.5147740840911865 + }, + { + "auxiliary_loss_clip": 0.01119927, + "auxiliary_loss_mlp": 0.0104443, + "balance_loss_clip": 1.04894054, + "balance_loss_mlp": 1.02888536, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.739495066376407, + "language_loss": 0.74490547, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76654899, + "num_input_tokens_seen": 89780905, + "step": 4165, + "time_per_iteration": 3.994152069091797 + }, + { + "auxiliary_loss_clip": 0.01109691, + "auxiliary_loss_mlp": 0.01040661, + "balance_loss_clip": 1.05149627, + "balance_loss_mlp": 1.02345955, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 2.58045330400646, + "language_loss": 0.74187362, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76337713, + "num_input_tokens_seen": 89799230, + "step": 4166, + "time_per_iteration": 2.593127727508545 + }, + { + "auxiliary_loss_clip": 0.01108854, + "auxiliary_loss_mlp": 0.01046142, + "balance_loss_clip": 1.05197155, + "balance_loss_mlp": 1.02971506, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.8106054794924564, + "language_loss": 0.81629688, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.83784682, + "num_input_tokens_seen": 89818240, + "step": 4167, + "time_per_iteration": 4.06223726272583 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01040943, + "balance_loss_clip": 1.04999757, + "balance_loss_mlp": 1.02440846, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 2.4213369307678496, + "language_loss": 0.80078799, + "learning_rate": 3.51088456024312e-06, + "loss": 0.8224467, + "num_input_tokens_seen": 89834485, + "step": 4168, + "time_per_iteration": 2.5175251960754395 + }, + { + "auxiliary_loss_clip": 0.01133528, + "auxiliary_loss_mlp": 0.010419, + "balance_loss_clip": 1.05065882, + "balance_loss_mlp": 1.0231719, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.5635899941483857, + "language_loss": 0.69463646, + "learning_rate": 3.510629350383849e-06, + "loss": 0.71639079, + "num_input_tokens_seen": 89855645, + "step": 4169, + "time_per_iteration": 2.712191581726074 + }, + { + "auxiliary_loss_clip": 0.01114254, + "auxiliary_loss_mlp": 0.01045792, + "balance_loss_clip": 1.05119789, + "balance_loss_mlp": 1.02924621, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.8757010133403997, + "language_loss": 0.77773982, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79934031, + "num_input_tokens_seen": 89874895, + "step": 4170, + "time_per_iteration": 2.5920393466949463 + }, + { + "auxiliary_loss_clip": 0.01124444, + "auxiliary_loss_mlp": 0.01042644, + "balance_loss_clip": 1.05373669, + "balance_loss_mlp": 1.0256207, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.656063587996473, + "language_loss": 0.76917732, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.79084826, + "num_input_tokens_seen": 89891700, + "step": 4171, + "time_per_iteration": 3.9274473190307617 + }, + { + "auxiliary_loss_clip": 0.01054623, + "auxiliary_loss_mlp": 0.01002679, + "balance_loss_clip": 1.02209496, + "balance_loss_mlp": 1.0004853, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8337183342791503, + "language_loss": 0.60019076, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62076378, + "num_input_tokens_seen": 89955775, + "step": 4172, + "time_per_iteration": 4.542768955230713 + }, + { + "auxiliary_loss_clip": 0.01119046, + "auxiliary_loss_mlp": 0.01047396, + "balance_loss_clip": 1.04860425, + "balance_loss_mlp": 1.03008687, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 2.0876871392154817, + "language_loss": 0.78822267, + "learning_rate": 3.509607938211409e-06, + "loss": 0.80988717, + "num_input_tokens_seen": 89977150, + "step": 4173, + "time_per_iteration": 2.556041955947876 + }, + { + "auxiliary_loss_clip": 0.01143433, + "auxiliary_loss_mlp": 0.01046027, + "balance_loss_clip": 1.05179024, + "balance_loss_mlp": 1.02901638, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.4155455099056615, + "language_loss": 0.83314586, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85504043, + "num_input_tokens_seen": 89994925, + "step": 4174, + "time_per_iteration": 2.4972116947174072 + }, + { + "auxiliary_loss_clip": 0.01093314, + "auxiliary_loss_mlp": 0.01044753, + "balance_loss_clip": 1.0524497, + "balance_loss_mlp": 1.02695549, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 1.9115259935718238, + "language_loss": 0.7098121, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73119277, + "num_input_tokens_seen": 90013235, + "step": 4175, + "time_per_iteration": 2.59843111038208 + }, + { + "auxiliary_loss_clip": 0.01108247, + "auxiliary_loss_mlp": 0.01039456, + "balance_loss_clip": 1.0450871, + "balance_loss_mlp": 1.0218488, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 2.3950360202614425, + "language_loss": 0.81089795, + "learning_rate": 3.50884127798111e-06, + "loss": 0.83237493, + "num_input_tokens_seen": 90032150, + "step": 4176, + "time_per_iteration": 2.635676860809326 + }, + { + "auxiliary_loss_clip": 0.01123898, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_clip": 1.05265474, + "balance_loss_mlp": 1.02623737, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 1.9689323626036777, + "language_loss": 0.82736754, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.84905171, + "num_input_tokens_seen": 90049085, + "step": 4177, + "time_per_iteration": 2.5302979946136475 + }, + { + "auxiliary_loss_clip": 0.0110562, + "auxiliary_loss_mlp": 0.01046526, + "balance_loss_clip": 1.05345774, + "balance_loss_mlp": 1.02896667, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.873094117994669, + "language_loss": 0.83124399, + "learning_rate": 3.508329885067698e-06, + "loss": 0.85276556, + "num_input_tokens_seen": 90067695, + "step": 4178, + "time_per_iteration": 2.5873606204986572 + }, + { + "auxiliary_loss_clip": 0.01136461, + "auxiliary_loss_mlp": 0.00813418, + "balance_loss_clip": 1.04790998, + "balance_loss_mlp": 1.04619706, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.177709512125536, + "language_loss": 0.75745499, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77695382, + "num_input_tokens_seen": 90083890, + "step": 4179, + "time_per_iteration": 2.4612603187561035 + }, + { + "auxiliary_loss_clip": 0.01110062, + "auxiliary_loss_mlp": 0.01049512, + "balance_loss_clip": 1.05662632, + "balance_loss_mlp": 1.03147602, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 2.0407728053029084, + "language_loss": 0.70052803, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72212374, + "num_input_tokens_seen": 90100995, + "step": 4180, + "time_per_iteration": 2.5649971961975098 + }, + { + "auxiliary_loss_clip": 0.01142537, + "auxiliary_loss_mlp": 0.01044082, + "balance_loss_clip": 1.05112052, + "balance_loss_mlp": 1.02691579, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 1.8549944600581727, + "language_loss": 0.86271638, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.88458258, + "num_input_tokens_seen": 90120365, + "step": 4181, + "time_per_iteration": 2.4703288078308105 + }, + { + "auxiliary_loss_clip": 0.01142255, + "auxiliary_loss_mlp": 0.01041752, + "balance_loss_clip": 1.05076313, + "balance_loss_mlp": 1.02537239, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 1.9556720494717799, + "language_loss": 0.68327421, + "learning_rate": 3.507306412966238e-06, + "loss": 0.70511431, + "num_input_tokens_seen": 90142610, + "step": 4182, + "time_per_iteration": 2.6250925064086914 + }, + { + "auxiliary_loss_clip": 0.01036658, + "auxiliary_loss_mlp": 0.01019785, + "balance_loss_clip": 1.02276802, + "balance_loss_mlp": 1.01755607, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.856689785132992, + "language_loss": 0.70153159, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72209603, + "num_input_tokens_seen": 90200555, + "step": 4183, + "time_per_iteration": 3.1446096897125244 + }, + { + "auxiliary_loss_clip": 0.01119631, + "auxiliary_loss_mlp": 0.01037726, + "balance_loss_clip": 1.04457963, + "balance_loss_mlp": 1.0196898, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.871453909510983, + "language_loss": 0.74295533, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76452893, + "num_input_tokens_seen": 90218120, + "step": 4184, + "time_per_iteration": 2.5237274169921875 + }, + { + "auxiliary_loss_clip": 0.01133871, + "auxiliary_loss_mlp": 0.01047591, + "balance_loss_clip": 1.0536691, + "balance_loss_mlp": 1.03055596, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.73834114088923, + "language_loss": 0.83287394, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85468853, + "num_input_tokens_seen": 90236790, + "step": 4185, + "time_per_iteration": 2.5087890625 + }, + { + "auxiliary_loss_clip": 0.01018739, + "auxiliary_loss_mlp": 0.01007103, + "balance_loss_clip": 1.0406152, + "balance_loss_mlp": 1.00473106, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7876503935608078, + "language_loss": 0.61452711, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63478553, + "num_input_tokens_seen": 90297070, + "step": 4186, + "time_per_iteration": 3.173625946044922 + }, + { + "auxiliary_loss_clip": 0.01105394, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.0530982, + "balance_loss_mlp": 1.02234066, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 2.152180640305493, + "language_loss": 0.7882688, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.80971968, + "num_input_tokens_seen": 90315255, + "step": 4187, + "time_per_iteration": 2.7467048168182373 + }, + { + "auxiliary_loss_clip": 0.0108929, + "auxiliary_loss_mlp": 0.01047022, + "balance_loss_clip": 1.05331969, + "balance_loss_mlp": 1.02997482, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.5305206492882368, + "language_loss": 0.80367285, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82503593, + "num_input_tokens_seen": 90334990, + "step": 4188, + "time_per_iteration": 2.6302270889282227 + }, + { + "auxiliary_loss_clip": 0.01131087, + "auxiliary_loss_mlp": 0.01041819, + "balance_loss_clip": 1.05176997, + "balance_loss_mlp": 1.0250107, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.8245589903071278, + "language_loss": 0.74277329, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76450241, + "num_input_tokens_seen": 90351825, + "step": 4189, + "time_per_iteration": 2.5701334476470947 + }, + { + "auxiliary_loss_clip": 0.01116179, + "auxiliary_loss_mlp": 0.01038152, + "balance_loss_clip": 1.050457, + "balance_loss_mlp": 1.02193975, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.9439584487519277, + "language_loss": 0.84726453, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86880785, + "num_input_tokens_seen": 90369860, + "step": 4190, + "time_per_iteration": 2.5372204780578613 + }, + { + "auxiliary_loss_clip": 0.01114883, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_clip": 1.04696929, + "balance_loss_mlp": 1.02240014, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 2.047462002790528, + "language_loss": 0.75713044, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77869117, + "num_input_tokens_seen": 90389245, + "step": 4191, + "time_per_iteration": 2.556689977645874 + }, + { + "auxiliary_loss_clip": 0.01040592, + "auxiliary_loss_mlp": 0.01014081, + "balance_loss_clip": 1.01856768, + "balance_loss_mlp": 1.01176834, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7225712115793018, + "language_loss": 0.57186699, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59241378, + "num_input_tokens_seen": 90456735, + "step": 4192, + "time_per_iteration": 3.1918184757232666 + }, + { + "auxiliary_loss_clip": 0.01113354, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.05171919, + "balance_loss_mlp": 1.02052915, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 2.1431237444523386, + "language_loss": 0.76147521, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78298676, + "num_input_tokens_seen": 90474165, + "step": 4193, + "time_per_iteration": 2.5616514682769775 + }, + { + "auxiliary_loss_clip": 0.01133836, + "auxiliary_loss_mlp": 0.01042961, + "balance_loss_clip": 1.0520308, + "balance_loss_mlp": 1.02625978, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.411381985503307, + "language_loss": 0.84560609, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86737406, + "num_input_tokens_seen": 90491660, + "step": 4194, + "time_per_iteration": 2.499225616455078 + }, + { + "auxiliary_loss_clip": 0.0114583, + "auxiliary_loss_mlp": 0.01049245, + "balance_loss_clip": 1.05131829, + "balance_loss_mlp": 1.03294885, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.6507998675592224, + "language_loss": 0.88385773, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90580851, + "num_input_tokens_seen": 90514025, + "step": 4195, + "time_per_iteration": 2.550368070602417 + }, + { + "auxiliary_loss_clip": 0.01142552, + "auxiliary_loss_mlp": 0.01043084, + "balance_loss_clip": 1.04944646, + "balance_loss_mlp": 1.02345037, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 1.9692881368749846, + "language_loss": 0.85777748, + "learning_rate": 3.503717062883053e-06, + "loss": 0.8796339, + "num_input_tokens_seen": 90533530, + "step": 4196, + "time_per_iteration": 2.465139865875244 + }, + { + "auxiliary_loss_clip": 0.01134465, + "auxiliary_loss_mlp": 0.01045389, + "balance_loss_clip": 1.05269337, + "balance_loss_mlp": 1.02849722, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.8424119494160511, + "language_loss": 0.82944691, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.8512454, + "num_input_tokens_seen": 90554025, + "step": 4197, + "time_per_iteration": 2.5739097595214844 + }, + { + "auxiliary_loss_clip": 0.01134259, + "auxiliary_loss_mlp": 0.01053187, + "balance_loss_clip": 1.05059266, + "balance_loss_mlp": 1.03356552, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 1.9307504459478506, + "language_loss": 0.7284075, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.75028193, + "num_input_tokens_seen": 90576930, + "step": 4198, + "time_per_iteration": 2.6197543144226074 + }, + { + "auxiliary_loss_clip": 0.01144958, + "auxiliary_loss_mlp": 0.01047975, + "balance_loss_clip": 1.04897666, + "balance_loss_mlp": 1.03010535, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 3.850166073254288, + "language_loss": 0.77104497, + "learning_rate": 3.50294646148888e-06, + "loss": 0.79297435, + "num_input_tokens_seen": 90595710, + "step": 4199, + "time_per_iteration": 2.481710910797119 + }, + { + "auxiliary_loss_clip": 0.01119214, + "auxiliary_loss_mlp": 0.00803683, + "balance_loss_clip": 1.05081201, + "balance_loss_mlp": 1.02567899, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 1.9109894482105225, + "language_loss": 0.73717773, + "learning_rate": 3.502689480360739e-06, + "loss": 0.75640666, + "num_input_tokens_seen": 90617945, + "step": 4200, + "time_per_iteration": 2.6304562091827393 + }, + { + "auxiliary_loss_clip": 0.01130258, + "auxiliary_loss_mlp": 0.01049186, + "balance_loss_clip": 1.04807377, + "balance_loss_mlp": 1.0331161, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.6124490132601998, + "language_loss": 0.82179862, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84359306, + "num_input_tokens_seen": 90640855, + "step": 4201, + "time_per_iteration": 2.713648796081543 + }, + { + "auxiliary_loss_clip": 0.01088373, + "auxiliary_loss_mlp": 0.01048496, + "balance_loss_clip": 1.04990649, + "balance_loss_mlp": 1.0314486, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.8377893790834416, + "language_loss": 0.75092006, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77228868, + "num_input_tokens_seen": 90661350, + "step": 4202, + "time_per_iteration": 2.6195433139801025 + }, + { + "auxiliary_loss_clip": 0.01127139, + "auxiliary_loss_mlp": 0.01039127, + "balance_loss_clip": 1.0486691, + "balance_loss_mlp": 1.02292621, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 8.051313289538987, + "language_loss": 0.73033226, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75199485, + "num_input_tokens_seen": 90680540, + "step": 4203, + "time_per_iteration": 2.4859166145324707 + }, + { + "auxiliary_loss_clip": 0.01120245, + "auxiliary_loss_mlp": 0.01042484, + "balance_loss_clip": 1.04730225, + "balance_loss_mlp": 1.02513337, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.5576353205722957, + "language_loss": 0.77772391, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79935122, + "num_input_tokens_seen": 90703460, + "step": 4204, + "time_per_iteration": 4.1854846477508545 + }, + { + "auxiliary_loss_clip": 0.01108976, + "auxiliary_loss_mlp": 0.01051655, + "balance_loss_clip": 1.0509088, + "balance_loss_mlp": 1.03436995, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 2.002012198550442, + "language_loss": 0.72438425, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74599057, + "num_input_tokens_seen": 90718815, + "step": 4205, + "time_per_iteration": 4.00794529914856 + }, + { + "auxiliary_loss_clip": 0.01120223, + "auxiliary_loss_mlp": 0.01047128, + "balance_loss_clip": 1.04895735, + "balance_loss_mlp": 1.03080821, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.4987666619521824, + "language_loss": 0.75560892, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77728236, + "num_input_tokens_seen": 90742125, + "step": 4206, + "time_per_iteration": 2.7286767959594727 + }, + { + "auxiliary_loss_clip": 0.01103882, + "auxiliary_loss_mlp": 0.01042445, + "balance_loss_clip": 1.05054867, + "balance_loss_mlp": 1.02552915, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.704649105745411, + "language_loss": 0.79357755, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81504083, + "num_input_tokens_seen": 90760785, + "step": 4207, + "time_per_iteration": 2.5845463275909424 + }, + { + "auxiliary_loss_clip": 0.01123209, + "auxiliary_loss_mlp": 0.01045852, + "balance_loss_clip": 1.04920232, + "balance_loss_mlp": 1.02942514, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.4914187430649963, + "language_loss": 0.76483417, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78652477, + "num_input_tokens_seen": 90780045, + "step": 4208, + "time_per_iteration": 2.5067555904388428 + }, + { + "auxiliary_loss_clip": 0.01123771, + "auxiliary_loss_mlp": 0.01036535, + "balance_loss_clip": 1.0482868, + "balance_loss_mlp": 1.02033424, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 2.0227314915176713, + "language_loss": 0.69580591, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.71740901, + "num_input_tokens_seen": 90797980, + "step": 4209, + "time_per_iteration": 2.5619609355926514 + }, + { + "auxiliary_loss_clip": 0.01045357, + "auxiliary_loss_mlp": 0.01008836, + "balance_loss_clip": 1.02289116, + "balance_loss_mlp": 1.00672579, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7535851087009179, + "language_loss": 0.55069059, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57123256, + "num_input_tokens_seen": 90864865, + "step": 4210, + "time_per_iteration": 4.6422340869903564 + }, + { + "auxiliary_loss_clip": 0.01108315, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_clip": 1.04945707, + "balance_loss_mlp": 1.01791084, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 1.9143855402434136, + "language_loss": 0.80569184, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.8271144, + "num_input_tokens_seen": 90882885, + "step": 4211, + "time_per_iteration": 2.5754833221435547 + }, + { + "auxiliary_loss_clip": 0.01092567, + "auxiliary_loss_mlp": 0.01040711, + "balance_loss_clip": 1.04849422, + "balance_loss_mlp": 1.02459407, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.6880604248252724, + "language_loss": 0.78215683, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80348957, + "num_input_tokens_seen": 90902985, + "step": 4212, + "time_per_iteration": 2.6975791454315186 + }, + { + "auxiliary_loss_clip": 0.01129892, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.04700506, + "balance_loss_mlp": 1.02102077, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 1.73914296717674, + "language_loss": 0.53461695, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55630022, + "num_input_tokens_seen": 90923550, + "step": 4213, + "time_per_iteration": 2.534336566925049 + }, + { + "auxiliary_loss_clip": 0.01115557, + "auxiliary_loss_mlp": 0.010442, + "balance_loss_clip": 1.04790354, + "balance_loss_mlp": 1.02574682, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.161212914563186, + "language_loss": 0.65408087, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67567849, + "num_input_tokens_seen": 90943260, + "step": 4214, + "time_per_iteration": 2.535834312438965 + }, + { + "auxiliary_loss_clip": 0.01044738, + "auxiliary_loss_mlp": 0.01003893, + "balance_loss_clip": 1.02401614, + "balance_loss_mlp": 1.0018425, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8548362707507434, + "language_loss": 0.58043808, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60092437, + "num_input_tokens_seen": 90996295, + "step": 4215, + "time_per_iteration": 2.8520216941833496 + }, + { + "auxiliary_loss_clip": 0.01118991, + "auxiliary_loss_mlp": 0.0104383, + "balance_loss_clip": 1.05027294, + "balance_loss_mlp": 1.02648449, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.6420361991509769, + "language_loss": 0.83592808, + "learning_rate": 3.498570039373066e-06, + "loss": 0.85755634, + "num_input_tokens_seen": 91017545, + "step": 4216, + "time_per_iteration": 2.7138257026672363 + }, + { + "auxiliary_loss_clip": 0.01125245, + "auxiliary_loss_mlp": 0.01039168, + "balance_loss_clip": 1.04997885, + "balance_loss_mlp": 1.02191842, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 1.7531630824152382, + "language_loss": 0.80319643, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82484061, + "num_input_tokens_seen": 91037715, + "step": 4217, + "time_per_iteration": 2.584118366241455 + }, + { + "auxiliary_loss_clip": 0.01113155, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_clip": 1.04432559, + "balance_loss_mlp": 1.02760041, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 1.9958812293465191, + "language_loss": 0.74772942, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.76931334, + "num_input_tokens_seen": 91055295, + "step": 4218, + "time_per_iteration": 2.521731376647949 + }, + { + "auxiliary_loss_clip": 0.01132137, + "auxiliary_loss_mlp": 0.01041681, + "balance_loss_clip": 1.04777539, + "balance_loss_mlp": 1.02412152, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.7903530623944606, + "language_loss": 0.74492151, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76665974, + "num_input_tokens_seen": 91075485, + "step": 4219, + "time_per_iteration": 2.515061140060425 + }, + { + "auxiliary_loss_clip": 0.01139753, + "auxiliary_loss_mlp": 0.01047329, + "balance_loss_clip": 1.05136347, + "balance_loss_mlp": 1.02923298, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 1.7236782934895805, + "language_loss": 0.81369126, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83556211, + "num_input_tokens_seen": 91093620, + "step": 4220, + "time_per_iteration": 2.503474235534668 + }, + { + "auxiliary_loss_clip": 0.0109648, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_clip": 1.04748201, + "balance_loss_mlp": 1.02880335, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.3592018529084378, + "language_loss": 0.70802414, + "learning_rate": 3.497279728822468e-06, + "loss": 0.72947222, + "num_input_tokens_seen": 91114110, + "step": 4221, + "time_per_iteration": 2.603095054626465 + }, + { + "auxiliary_loss_clip": 0.01141229, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.04879808, + "balance_loss_mlp": 1.02061832, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 1.5466148007371248, + "language_loss": 0.61762375, + "learning_rate": 3.497021496342202e-06, + "loss": 0.63941485, + "num_input_tokens_seen": 91133135, + "step": 4222, + "time_per_iteration": 2.4434754848480225 + }, + { + "auxiliary_loss_clip": 0.01133249, + "auxiliary_loss_mlp": 0.01045784, + "balance_loss_clip": 1.05050397, + "balance_loss_mlp": 1.02783108, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.669617481222977, + "language_loss": 0.74588066, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76767099, + "num_input_tokens_seen": 91151805, + "step": 4223, + "time_per_iteration": 2.48734712600708 + }, + { + "auxiliary_loss_clip": 0.01093433, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.05494618, + "balance_loss_mlp": 1.01915169, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.8854226523158253, + "language_loss": 0.79919016, + "learning_rate": 3.49650486108985e-06, + "loss": 0.82048976, + "num_input_tokens_seen": 91172270, + "step": 4224, + "time_per_iteration": 2.6407740116119385 + }, + { + "auxiliary_loss_clip": 0.0112612, + "auxiliary_loss_mlp": 0.00799344, + "balance_loss_clip": 1.04775882, + "balance_loss_mlp": 1.02086532, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.3934992558251114, + "language_loss": 0.77413338, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79338801, + "num_input_tokens_seen": 91192080, + "step": 4225, + "time_per_iteration": 2.5812244415283203 + }, + { + "auxiliary_loss_clip": 0.0112889, + "auxiliary_loss_mlp": 0.01055226, + "balance_loss_clip": 1.05014372, + "balance_loss_mlp": 1.03689146, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 1.6561668322737146, + "language_loss": 0.84560812, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86744922, + "num_input_tokens_seen": 91211450, + "step": 4226, + "time_per_iteration": 2.524010181427002 + }, + { + "auxiliary_loss_clip": 0.01139176, + "auxiliary_loss_mlp": 0.01043821, + "balance_loss_clip": 1.04824162, + "balance_loss_mlp": 1.02574909, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 1.6127510479819556, + "language_loss": 0.71263063, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.73446059, + "num_input_tokens_seen": 91231835, + "step": 4227, + "time_per_iteration": 2.5184755325317383 + }, + { + "auxiliary_loss_clip": 0.01056297, + "auxiliary_loss_mlp": 0.01002949, + "balance_loss_clip": 1.02424264, + "balance_loss_mlp": 1.00091004, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 1.0028746730980762, + "language_loss": 0.61823136, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.63882381, + "num_input_tokens_seen": 91288755, + "step": 4228, + "time_per_iteration": 2.93076491355896 + }, + { + "auxiliary_loss_clip": 0.01123852, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.04733372, + "balance_loss_mlp": 1.0201062, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 3.488721138519077, + "language_loss": 0.86304027, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88467205, + "num_input_tokens_seen": 91302485, + "step": 4229, + "time_per_iteration": 2.4584848880767822 + }, + { + "auxiliary_loss_clip": 0.01101028, + "auxiliary_loss_mlp": 0.0104724, + "balance_loss_clip": 1.05259359, + "balance_loss_mlp": 1.0292635, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 2.1801011401635413, + "language_loss": 0.77314049, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79462314, + "num_input_tokens_seen": 91321120, + "step": 4230, + "time_per_iteration": 2.571901559829712 + }, + { + "auxiliary_loss_clip": 0.01129366, + "auxiliary_loss_mlp": 0.01045416, + "balance_loss_clip": 1.04849374, + "balance_loss_mlp": 1.0274272, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 3.5951308631028076, + "language_loss": 0.75364429, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77539212, + "num_input_tokens_seen": 91338575, + "step": 4231, + "time_per_iteration": 2.4861953258514404 + }, + { + "auxiliary_loss_clip": 0.01133144, + "auxiliary_loss_mlp": 0.01038405, + "balance_loss_clip": 1.04841709, + "balance_loss_mlp": 1.02114367, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 1.7876726972299433, + "language_loss": 0.74301231, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76472783, + "num_input_tokens_seen": 91357355, + "step": 4232, + "time_per_iteration": 2.4565417766571045 + }, + { + "auxiliary_loss_clip": 0.01143802, + "auxiliary_loss_mlp": 0.01041737, + "balance_loss_clip": 1.05154347, + "balance_loss_mlp": 1.02328348, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 1.9866010124374502, + "language_loss": 0.86342567, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88528109, + "num_input_tokens_seen": 91376515, + "step": 4233, + "time_per_iteration": 2.5158069133758545 + }, + { + "auxiliary_loss_clip": 0.01070143, + "auxiliary_loss_mlp": 0.01039907, + "balance_loss_clip": 1.04045379, + "balance_loss_mlp": 1.0234561, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.4879481264817707, + "language_loss": 0.75046974, + "learning_rate": 3.493918281539737e-06, + "loss": 0.77157027, + "num_input_tokens_seen": 91397595, + "step": 4234, + "time_per_iteration": 2.638753890991211 + }, + { + "auxiliary_loss_clip": 0.01112207, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_clip": 1.05060732, + "balance_loss_mlp": 1.02861845, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.5828786804226291, + "language_loss": 0.75011247, + "learning_rate": 3.493659311850379e-06, + "loss": 0.771685, + "num_input_tokens_seen": 91417775, + "step": 4235, + "time_per_iteration": 2.563563823699951 + }, + { + "auxiliary_loss_clip": 0.01120774, + "auxiliary_loss_mlp": 0.00802774, + "balance_loss_clip": 1.05089688, + "balance_loss_mlp": 1.02229548, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 1.867689288782314, + "language_loss": 0.65187043, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.67110592, + "num_input_tokens_seen": 91437665, + "step": 4236, + "time_per_iteration": 2.590193271636963 + }, + { + "auxiliary_loss_clip": 0.01140032, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.04862189, + "balance_loss_mlp": 1.01972079, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.8405734796122653, + "language_loss": 0.66651976, + "learning_rate": 3.493141202562354e-06, + "loss": 0.68827635, + "num_input_tokens_seen": 91456705, + "step": 4237, + "time_per_iteration": 2.466362476348877 + }, + { + "auxiliary_loss_clip": 0.01141966, + "auxiliary_loss_mlp": 0.01045092, + "balance_loss_clip": 1.0498637, + "balance_loss_mlp": 1.02744937, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 2.1893778627937293, + "language_loss": 0.74857002, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77044058, + "num_input_tokens_seen": 91475535, + "step": 4238, + "time_per_iteration": 2.478989839553833 + }, + { + "auxiliary_loss_clip": 0.01134811, + "auxiliary_loss_mlp": 0.01044884, + "balance_loss_clip": 1.05262041, + "balance_loss_mlp": 1.02628744, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 1.7547210609300186, + "language_loss": 0.80364847, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82544541, + "num_input_tokens_seen": 91499140, + "step": 4239, + "time_per_iteration": 2.567063093185425 + }, + { + "auxiliary_loss_clip": 0.01122458, + "auxiliary_loss_mlp": 0.01043735, + "balance_loss_clip": 1.04824591, + "balance_loss_mlp": 1.02604461, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.8338299195480898, + "language_loss": 0.77298915, + "learning_rate": 3.492363614004407e-06, + "loss": 0.79465115, + "num_input_tokens_seen": 91518335, + "step": 4240, + "time_per_iteration": 2.4843051433563232 + }, + { + "auxiliary_loss_clip": 0.01147008, + "auxiliary_loss_mlp": 0.010414, + "balance_loss_clip": 1.05006981, + "balance_loss_mlp": 1.0221591, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 1.7745837004290628, + "language_loss": 0.83331883, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85520285, + "num_input_tokens_seen": 91537655, + "step": 4241, + "time_per_iteration": 2.5271737575531006 + }, + { + "auxiliary_loss_clip": 0.01134377, + "auxiliary_loss_mlp": 0.01047659, + "balance_loss_clip": 1.05211878, + "balance_loss_mlp": 1.02976608, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.643603492614393, + "language_loss": 0.7349869, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75680727, + "num_input_tokens_seen": 91557545, + "step": 4242, + "time_per_iteration": 3.9310996532440186 + }, + { + "auxiliary_loss_clip": 0.01143143, + "auxiliary_loss_mlp": 0.008008, + "balance_loss_clip": 1.04972279, + "balance_loss_mlp": 1.02218652, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.50326989937717, + "language_loss": 0.72598886, + "learning_rate": 3.491585516131273e-06, + "loss": 0.74542832, + "num_input_tokens_seen": 91574405, + "step": 4243, + "time_per_iteration": 2.4698591232299805 + }, + { + "auxiliary_loss_clip": 0.01131941, + "auxiliary_loss_mlp": 0.01043677, + "balance_loss_clip": 1.05228317, + "balance_loss_mlp": 1.02566457, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 1.7138235286588777, + "language_loss": 0.81636965, + "learning_rate": 3.491326037038301e-06, + "loss": 0.83812582, + "num_input_tokens_seen": 91593755, + "step": 4244, + "time_per_iteration": 2.4953184127807617 + }, + { + "auxiliary_loss_clip": 0.01046613, + "auxiliary_loss_mlp": 0.00999958, + "balance_loss_clip": 1.03326035, + "balance_loss_mlp": 0.99769288, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.6836146490701756, + "language_loss": 0.57725215, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.59771788, + "num_input_tokens_seen": 91660335, + "step": 4245, + "time_per_iteration": 4.584337949752808 + }, + { + "auxiliary_loss_clip": 0.01141583, + "auxiliary_loss_mlp": 0.01048247, + "balance_loss_clip": 1.04727948, + "balance_loss_mlp": 1.03031814, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 8.04411389754988, + "language_loss": 0.6569916, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67888993, + "num_input_tokens_seen": 91678500, + "step": 4246, + "time_per_iteration": 2.5026135444641113 + }, + { + "auxiliary_loss_clip": 0.0112532, + "auxiliary_loss_mlp": 0.01042422, + "balance_loss_clip": 1.0478363, + "balance_loss_mlp": 1.02589989, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.7248728832006013, + "language_loss": 0.81413066, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.8358081, + "num_input_tokens_seen": 91696430, + "step": 4247, + "time_per_iteration": 2.5147736072540283 + }, + { + "auxiliary_loss_clip": 0.01138112, + "auxiliary_loss_mlp": 0.01047619, + "balance_loss_clip": 1.05128479, + "balance_loss_mlp": 1.02738893, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 12.174063335875717, + "language_loss": 0.83309078, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85494804, + "num_input_tokens_seen": 91713270, + "step": 4248, + "time_per_iteration": 2.496375322341919 + }, + { + "auxiliary_loss_clip": 0.0111429, + "auxiliary_loss_mlp": 0.01048862, + "balance_loss_clip": 1.05105305, + "balance_loss_mlp": 1.03027773, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 1.9263832984929832, + "language_loss": 0.84380645, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86543792, + "num_input_tokens_seen": 91728865, + "step": 4249, + "time_per_iteration": 3.9953527450561523 + }, + { + "auxiliary_loss_clip": 0.01016211, + "auxiliary_loss_mlp": 0.01006234, + "balance_loss_clip": 1.04986334, + "balance_loss_mlp": 1.00369477, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.8760033519730654, + "language_loss": 0.56300676, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58323121, + "num_input_tokens_seen": 91787470, + "step": 4250, + "time_per_iteration": 3.2334401607513428 + }, + { + "auxiliary_loss_clip": 0.01117793, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.0520196, + "balance_loss_mlp": 1.01876628, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.3934817870936693, + "language_loss": 0.80622435, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82777715, + "num_input_tokens_seen": 91805640, + "step": 4251, + "time_per_iteration": 2.7421164512634277 + }, + { + "auxiliary_loss_clip": 0.01030078, + "auxiliary_loss_mlp": 0.01004369, + "balance_loss_clip": 1.02314687, + "balance_loss_mlp": 1.00230694, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.7902388601062489, + "language_loss": 0.66075605, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.68110049, + "num_input_tokens_seen": 91869695, + "step": 4252, + "time_per_iteration": 3.1808204650878906 + }, + { + "auxiliary_loss_clip": 0.01128658, + "auxiliary_loss_mlp": 0.01037519, + "balance_loss_clip": 1.0511682, + "balance_loss_mlp": 1.0214262, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 1.9077071824480705, + "language_loss": 0.73546243, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75712419, + "num_input_tokens_seen": 91889920, + "step": 4253, + "time_per_iteration": 2.548811197280884 + }, + { + "auxiliary_loss_clip": 0.01099403, + "auxiliary_loss_mlp": 0.01049412, + "balance_loss_clip": 1.04519558, + "balance_loss_mlp": 1.03124452, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 1.9289298777358568, + "language_loss": 0.72460449, + "learning_rate": 3.488728137415357e-06, + "loss": 0.74609256, + "num_input_tokens_seen": 91908665, + "step": 4254, + "time_per_iteration": 2.5426926612854004 + }, + { + "auxiliary_loss_clip": 0.01092867, + "auxiliary_loss_mlp": 0.00799693, + "balance_loss_clip": 1.04223788, + "balance_loss_mlp": 1.02061284, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.95712882892548, + "language_loss": 0.81002963, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.82895517, + "num_input_tokens_seen": 91927855, + "step": 4255, + "time_per_iteration": 2.6083834171295166 + }, + { + "auxiliary_loss_clip": 0.0111504, + "auxiliary_loss_mlp": 0.01044827, + "balance_loss_clip": 1.04491186, + "balance_loss_mlp": 1.02766049, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.45556324342878, + "language_loss": 0.85644722, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87804586, + "num_input_tokens_seen": 91948500, + "step": 4256, + "time_per_iteration": 2.5286006927490234 + }, + { + "auxiliary_loss_clip": 0.01099012, + "auxiliary_loss_mlp": 0.01053253, + "balance_loss_clip": 1.04855323, + "balance_loss_mlp": 1.03416717, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 1.7247652117730614, + "language_loss": 0.7494669, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.77098948, + "num_input_tokens_seen": 91968375, + "step": 4257, + "time_per_iteration": 2.8045501708984375 + }, + { + "auxiliary_loss_clip": 0.01019017, + "auxiliary_loss_mlp": 0.01006111, + "balance_loss_clip": 1.03457105, + "balance_loss_mlp": 1.00381017, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.8043296708150767, + "language_loss": 0.65303993, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67329121, + "num_input_tokens_seen": 92028490, + "step": 4258, + "time_per_iteration": 3.1575746536254883 + }, + { + "auxiliary_loss_clip": 0.01090975, + "auxiliary_loss_mlp": 0.00807163, + "balance_loss_clip": 1.0481627, + "balance_loss_mlp": 1.03512788, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.6044212836112213, + "language_loss": 0.7658385, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78481984, + "num_input_tokens_seen": 92048060, + "step": 4259, + "time_per_iteration": 2.6379659175872803 + }, + { + "auxiliary_loss_clip": 0.01023861, + "auxiliary_loss_mlp": 0.0101671, + "balance_loss_clip": 1.02979696, + "balance_loss_mlp": 1.01424217, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7943348395857789, + "language_loss": 0.58453786, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60494357, + "num_input_tokens_seen": 92118180, + "step": 4260, + "time_per_iteration": 3.2660748958587646 + }, + { + "auxiliary_loss_clip": 0.01127002, + "auxiliary_loss_mlp": 0.01045124, + "balance_loss_clip": 1.04498827, + "balance_loss_mlp": 1.02708793, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 1.8419885962491196, + "language_loss": 0.76981723, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.79153848, + "num_input_tokens_seen": 92137570, + "step": 4261, + "time_per_iteration": 2.548661947250366 + }, + { + "auxiliary_loss_clip": 0.01138182, + "auxiliary_loss_mlp": 0.01036346, + "balance_loss_clip": 1.04749751, + "balance_loss_mlp": 1.02032471, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.755725320850985, + "language_loss": 0.83410358, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85584885, + "num_input_tokens_seen": 92157625, + "step": 4262, + "time_per_iteration": 2.5282111167907715 + }, + { + "auxiliary_loss_clip": 0.01124862, + "auxiliary_loss_mlp": 0.01046502, + "balance_loss_clip": 1.04495347, + "balance_loss_mlp": 1.02807212, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.956504339816012, + "language_loss": 0.73483956, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.75655317, + "num_input_tokens_seen": 92175350, + "step": 4263, + "time_per_iteration": 2.4598124027252197 + }, + { + "auxiliary_loss_clip": 0.01107571, + "auxiliary_loss_mlp": 0.00799964, + "balance_loss_clip": 1.05061722, + "balance_loss_mlp": 1.02069402, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 2.196645328202278, + "language_loss": 0.82722867, + "learning_rate": 3.486124592522163e-06, + "loss": 0.846304, + "num_input_tokens_seen": 92196070, + "step": 4264, + "time_per_iteration": 2.594109058380127 + }, + { + "auxiliary_loss_clip": 0.01122692, + "auxiliary_loss_mlp": 0.01046667, + "balance_loss_clip": 1.04783094, + "balance_loss_mlp": 1.02864218, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.716823220388212, + "language_loss": 0.74581778, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76751137, + "num_input_tokens_seen": 92216310, + "step": 4265, + "time_per_iteration": 2.5553011894226074 + }, + { + "auxiliary_loss_clip": 0.01103464, + "auxiliary_loss_mlp": 0.01038492, + "balance_loss_clip": 1.04165602, + "balance_loss_mlp": 1.02113521, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 1.659260842420772, + "language_loss": 0.81608248, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83750206, + "num_input_tokens_seen": 92234510, + "step": 4266, + "time_per_iteration": 2.5572922229766846 + }, + { + "auxiliary_loss_clip": 0.01076585, + "auxiliary_loss_mlp": 0.0104554, + "balance_loss_clip": 1.04594505, + "balance_loss_mlp": 1.02685964, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 1.5922476894083235, + "language_loss": 0.79321229, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81443357, + "num_input_tokens_seen": 92254070, + "step": 4267, + "time_per_iteration": 2.605055809020996 + }, + { + "auxiliary_loss_clip": 0.01089267, + "auxiliary_loss_mlp": 0.01052449, + "balance_loss_clip": 1.04506159, + "balance_loss_mlp": 1.03398395, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.6587867457352752, + "language_loss": 0.79010189, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.81151903, + "num_input_tokens_seen": 92275060, + "step": 4268, + "time_per_iteration": 2.5922610759735107 + }, + { + "auxiliary_loss_clip": 0.01100504, + "auxiliary_loss_mlp": 0.00801056, + "balance_loss_clip": 1.0475899, + "balance_loss_mlp": 1.01867104, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 1.6293652573550652, + "language_loss": 0.68179679, + "learning_rate": 3.484820706183595e-06, + "loss": 0.70081234, + "num_input_tokens_seen": 92293610, + "step": 4269, + "time_per_iteration": 2.5811843872070312 + }, + { + "auxiliary_loss_clip": 0.01118993, + "auxiliary_loss_mlp": 0.01040183, + "balance_loss_clip": 1.049788, + "balance_loss_mlp": 1.02230191, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 2.68568062447982, + "language_loss": 0.79041356, + "learning_rate": 3.484559759962666e-06, + "loss": 0.81200534, + "num_input_tokens_seen": 92308305, + "step": 4270, + "time_per_iteration": 2.4964094161987305 + }, + { + "auxiliary_loss_clip": 0.01091696, + "auxiliary_loss_mlp": 0.0104358, + "balance_loss_clip": 1.04449821, + "balance_loss_mlp": 1.02346981, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 2.250891638801971, + "language_loss": 0.68206286, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.70341569, + "num_input_tokens_seen": 92329875, + "step": 4271, + "time_per_iteration": 2.7033114433288574 + }, + { + "auxiliary_loss_clip": 0.0112868, + "auxiliary_loss_mlp": 0.00799538, + "balance_loss_clip": 1.04659271, + "balance_loss_mlp": 1.021626, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 1.3387401410314024, + "language_loss": 0.87680429, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.89608645, + "num_input_tokens_seen": 92348780, + "step": 4272, + "time_per_iteration": 2.5421268939971924 + }, + { + "auxiliary_loss_clip": 0.011139, + "auxiliary_loss_mlp": 0.0104264, + "balance_loss_clip": 1.04792595, + "balance_loss_mlp": 1.02469945, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 2.181205162885094, + "language_loss": 0.81756425, + "learning_rate": 3.483776583571541e-06, + "loss": 0.83912969, + "num_input_tokens_seen": 92368175, + "step": 4273, + "time_per_iteration": 2.5358664989471436 + }, + { + "auxiliary_loss_clip": 0.01093086, + "auxiliary_loss_mlp": 0.01043857, + "balance_loss_clip": 1.04305995, + "balance_loss_mlp": 1.02596366, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.5090656170182468, + "language_loss": 0.77242714, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79379654, + "num_input_tokens_seen": 92387755, + "step": 4274, + "time_per_iteration": 2.5790083408355713 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01036086, + "balance_loss_clip": 1.04319179, + "balance_loss_mlp": 1.01865816, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.6144211663080767, + "language_loss": 0.83899903, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86044335, + "num_input_tokens_seen": 92409850, + "step": 4275, + "time_per_iteration": 2.6044628620147705 + }, + { + "auxiliary_loss_clip": 0.01113526, + "auxiliary_loss_mlp": 0.01039132, + "balance_loss_clip": 1.04747784, + "balance_loss_mlp": 1.02103555, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 1.9785093636167983, + "language_loss": 0.78262246, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80414903, + "num_input_tokens_seen": 92431250, + "step": 4276, + "time_per_iteration": 2.5839807987213135 + }, + { + "auxiliary_loss_clip": 0.01130526, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.05004811, + "balance_loss_mlp": 1.02787101, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 1.5836912341271765, + "language_loss": 0.79331613, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81507158, + "num_input_tokens_seen": 92452065, + "step": 4277, + "time_per_iteration": 2.5692403316497803 + }, + { + "auxiliary_loss_clip": 0.01135525, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.0456754, + "balance_loss_mlp": 1.02232885, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.2702311326926, + "language_loss": 0.78513592, + "learning_rate": 3.482470164419295e-06, + "loss": 0.80687749, + "num_input_tokens_seen": 92470025, + "step": 4278, + "time_per_iteration": 2.467740774154663 + }, + { + "auxiliary_loss_clip": 0.01119023, + "auxiliary_loss_mlp": 0.0104335, + "balance_loss_clip": 1.04783249, + "balance_loss_mlp": 1.02610004, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 3.1081899400013464, + "language_loss": 0.74423659, + "learning_rate": 3.482208711902952e-06, + "loss": 0.76586032, + "num_input_tokens_seen": 92489825, + "step": 4279, + "time_per_iteration": 2.571587324142456 + }, + { + "auxiliary_loss_clip": 0.0112701, + "auxiliary_loss_mlp": 0.01051158, + "balance_loss_clip": 1.04582095, + "balance_loss_mlp": 1.03405118, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.7488843339333235, + "language_loss": 0.85104078, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.8728224, + "num_input_tokens_seen": 92507270, + "step": 4280, + "time_per_iteration": 2.4687283039093018 + }, + { + "auxiliary_loss_clip": 0.01127492, + "auxiliary_loss_mlp": 0.01040304, + "balance_loss_clip": 1.04679573, + "balance_loss_mlp": 1.02262545, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 2.4675384330036407, + "language_loss": 0.78568369, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.8073616, + "num_input_tokens_seen": 92526300, + "step": 4281, + "time_per_iteration": 3.886716842651367 + }, + { + "auxiliary_loss_clip": 0.01104253, + "auxiliary_loss_mlp": 0.01038146, + "balance_loss_clip": 1.04692113, + "balance_loss_mlp": 1.02089643, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 1.6965105252517332, + "language_loss": 0.87348092, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89490497, + "num_input_tokens_seen": 92546465, + "step": 4282, + "time_per_iteration": 2.5583720207214355 + }, + { + "auxiliary_loss_clip": 0.01140688, + "auxiliary_loss_mlp": 0.01047402, + "balance_loss_clip": 1.0486536, + "balance_loss_mlp": 1.03050983, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.4343370138194094, + "language_loss": 0.70238447, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72426534, + "num_input_tokens_seen": 92567260, + "step": 4283, + "time_per_iteration": 3.8846657276153564 + }, + { + "auxiliary_loss_clip": 0.0113632, + "auxiliary_loss_mlp": 0.00797711, + "balance_loss_clip": 1.05005956, + "balance_loss_mlp": 1.02026296, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 1.8586936704082038, + "language_loss": 0.80360806, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82294834, + "num_input_tokens_seen": 92585425, + "step": 4284, + "time_per_iteration": 2.485323190689087 + }, + { + "auxiliary_loss_clip": 0.01104155, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_clip": 1.06230116, + "balance_loss_mlp": 1.02959478, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 2.2164560290648585, + "language_loss": 0.69986564, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72135824, + "num_input_tokens_seen": 92604770, + "step": 4285, + "time_per_iteration": 2.7337234020233154 + }, + { + "auxiliary_loss_clip": 0.01120559, + "auxiliary_loss_mlp": 0.01041919, + "balance_loss_clip": 1.04921699, + "balance_loss_mlp": 1.02569485, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 2.131717672758683, + "language_loss": 0.58454341, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60616821, + "num_input_tokens_seen": 92622635, + "step": 4286, + "time_per_iteration": 2.5230324268341064 + }, + { + "auxiliary_loss_clip": 0.01137081, + "auxiliary_loss_mlp": 0.01050862, + "balance_loss_clip": 1.05513835, + "balance_loss_mlp": 1.03364861, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.9921226661956455, + "language_loss": 0.64447463, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66635406, + "num_input_tokens_seen": 92642960, + "step": 4287, + "time_per_iteration": 3.9076879024505615 + }, + { + "auxiliary_loss_clip": 0.01121383, + "auxiliary_loss_mlp": 0.01041021, + "balance_loss_clip": 1.0517416, + "balance_loss_mlp": 1.02318752, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 1.907799843331985, + "language_loss": 0.71493471, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.73655874, + "num_input_tokens_seen": 92662455, + "step": 4288, + "time_per_iteration": 2.5470919609069824 + }, + { + "auxiliary_loss_clip": 0.0110921, + "auxiliary_loss_mlp": 0.01040513, + "balance_loss_clip": 1.05219197, + "balance_loss_mlp": 1.02460492, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.582076261160591, + "language_loss": 0.77113163, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79262882, + "num_input_tokens_seen": 92683520, + "step": 4289, + "time_per_iteration": 2.6136324405670166 + }, + { + "auxiliary_loss_clip": 0.01138363, + "auxiliary_loss_mlp": 0.00797868, + "balance_loss_clip": 1.04776001, + "balance_loss_mlp": 1.01614797, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 2.169361696540674, + "language_loss": 0.85465837, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87402064, + "num_input_tokens_seen": 92701450, + "step": 4290, + "time_per_iteration": 2.4610414505004883 + }, + { + "auxiliary_loss_clip": 0.01106441, + "auxiliary_loss_mlp": 0.01056347, + "balance_loss_clip": 1.04692769, + "balance_loss_mlp": 1.03630793, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 2.240185939215405, + "language_loss": 0.72462511, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74625295, + "num_input_tokens_seen": 92720355, + "step": 4291, + "time_per_iteration": 2.52471661567688 + }, + { + "auxiliary_loss_clip": 0.01145325, + "auxiliary_loss_mlp": 0.01044476, + "balance_loss_clip": 1.0512569, + "balance_loss_mlp": 1.02605844, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.374984192610286, + "language_loss": 0.8021847, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.82408273, + "num_input_tokens_seen": 92736755, + "step": 4292, + "time_per_iteration": 2.434584617614746 + }, + { + "auxiliary_loss_clip": 0.01145265, + "auxiliary_loss_mlp": 0.01041032, + "balance_loss_clip": 1.05210721, + "balance_loss_mlp": 1.02332926, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 2.6186875607541067, + "language_loss": 0.67857945, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.70044249, + "num_input_tokens_seen": 92757655, + "step": 4293, + "time_per_iteration": 2.571488380432129 + }, + { + "auxiliary_loss_clip": 0.01100147, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.04961753, + "balance_loss_mlp": 1.02435827, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.8780374950730028, + "language_loss": 0.75602859, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77744138, + "num_input_tokens_seen": 92776100, + "step": 4294, + "time_per_iteration": 2.598740577697754 + }, + { + "auxiliary_loss_clip": 0.01094854, + "auxiliary_loss_mlp": 0.01051318, + "balance_loss_clip": 1.04619396, + "balance_loss_mlp": 1.03144598, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 1.9953637378804363, + "language_loss": 0.81120205, + "learning_rate": 3.478017834441318e-06, + "loss": 0.83266377, + "num_input_tokens_seen": 92798880, + "step": 4295, + "time_per_iteration": 2.673055648803711 + }, + { + "auxiliary_loss_clip": 0.01053476, + "auxiliary_loss_mlp": 0.01044454, + "balance_loss_clip": 1.05398846, + "balance_loss_mlp": 1.02577424, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 1.9118231748290029, + "language_loss": 0.72756565, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74854493, + "num_input_tokens_seen": 92817750, + "step": 4296, + "time_per_iteration": 2.9798176288604736 + }, + { + "auxiliary_loss_clip": 0.01087787, + "auxiliary_loss_mlp": 0.01045803, + "balance_loss_clip": 1.05587077, + "balance_loss_mlp": 1.02731407, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.6239038428685129, + "language_loss": 0.86971539, + "learning_rate": 3.477492965085067e-06, + "loss": 0.89105129, + "num_input_tokens_seen": 92837995, + "step": 4297, + "time_per_iteration": 2.830334186553955 + }, + { + "auxiliary_loss_clip": 0.0114407, + "auxiliary_loss_mlp": 0.01051685, + "balance_loss_clip": 1.05137229, + "balance_loss_mlp": 1.0345192, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 1.7817813381239145, + "language_loss": 0.84521413, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86717165, + "num_input_tokens_seen": 92857245, + "step": 4298, + "time_per_iteration": 2.4782094955444336 + }, + { + "auxiliary_loss_clip": 0.01131021, + "auxiliary_loss_mlp": 0.00797932, + "balance_loss_clip": 1.0501492, + "balance_loss_mlp": 1.01819611, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.264361981335166, + "language_loss": 0.83932018, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85860968, + "num_input_tokens_seen": 92873265, + "step": 4299, + "time_per_iteration": 2.491185188293457 + }, + { + "auxiliary_loss_clip": 0.01114915, + "auxiliary_loss_mlp": 0.01037871, + "balance_loss_clip": 1.05451441, + "balance_loss_mlp": 1.02119339, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 3.709325899304579, + "language_loss": 0.82388949, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.84541738, + "num_input_tokens_seen": 92890880, + "step": 4300, + "time_per_iteration": 2.5094621181488037 + }, + { + "auxiliary_loss_clip": 0.01131524, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.04924357, + "balance_loss_mlp": 1.02387381, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 1.874298841074203, + "language_loss": 0.67626262, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69799292, + "num_input_tokens_seen": 92910770, + "step": 4301, + "time_per_iteration": 2.6223793029785156 + }, + { + "auxiliary_loss_clip": 0.01135615, + "auxiliary_loss_mlp": 0.01042044, + "balance_loss_clip": 1.05178702, + "balance_loss_mlp": 1.02372169, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.4610688310578746, + "language_loss": 0.81512749, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83690405, + "num_input_tokens_seen": 92929520, + "step": 4302, + "time_per_iteration": 2.4644076824188232 + }, + { + "auxiliary_loss_clip": 0.01101069, + "auxiliary_loss_mlp": 0.01051676, + "balance_loss_clip": 1.05248427, + "balance_loss_mlp": 1.03374708, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 1.7571093141848448, + "language_loss": 0.92334336, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94487083, + "num_input_tokens_seen": 92947890, + "step": 4303, + "time_per_iteration": 2.5807883739471436 + }, + { + "auxiliary_loss_clip": 0.01137401, + "auxiliary_loss_mlp": 0.01040152, + "balance_loss_clip": 1.05324376, + "balance_loss_mlp": 1.02227068, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 2.099282250872492, + "language_loss": 0.67070353, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69247907, + "num_input_tokens_seen": 92967690, + "step": 4304, + "time_per_iteration": 2.567720413208008 + }, + { + "auxiliary_loss_clip": 0.0110925, + "auxiliary_loss_mlp": 0.01045098, + "balance_loss_clip": 1.04842925, + "balance_loss_mlp": 1.02817082, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.0658529677280133, + "language_loss": 0.72147042, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.74301392, + "num_input_tokens_seen": 92986830, + "step": 4305, + "time_per_iteration": 2.625807285308838 + }, + { + "auxiliary_loss_clip": 0.01112111, + "auxiliary_loss_mlp": 0.00797522, + "balance_loss_clip": 1.05853105, + "balance_loss_mlp": 1.01642895, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 1.9310047252969054, + "language_loss": 0.76028717, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77938348, + "num_input_tokens_seen": 93002740, + "step": 4306, + "time_per_iteration": 2.569707155227661 + }, + { + "auxiliary_loss_clip": 0.01038845, + "auxiliary_loss_mlp": 0.0102569, + "balance_loss_clip": 1.02955842, + "balance_loss_mlp": 1.02372324, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8684610995872517, + "language_loss": 0.57161283, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59225821, + "num_input_tokens_seen": 93058645, + "step": 4307, + "time_per_iteration": 3.0661473274230957 + }, + { + "auxiliary_loss_clip": 0.01120957, + "auxiliary_loss_mlp": 0.0103979, + "balance_loss_clip": 1.05059099, + "balance_loss_mlp": 1.02249253, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.5360734267448695, + "language_loss": 0.71434838, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73595583, + "num_input_tokens_seen": 93077140, + "step": 4308, + "time_per_iteration": 2.555570125579834 + }, + { + "auxiliary_loss_clip": 0.01144346, + "auxiliary_loss_mlp": 0.01048134, + "balance_loss_clip": 1.04951274, + "balance_loss_mlp": 1.03026462, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 1.9383156167940314, + "language_loss": 0.8453536, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.8672784, + "num_input_tokens_seen": 93093580, + "step": 4309, + "time_per_iteration": 2.469115734100342 + }, + { + "auxiliary_loss_clip": 0.01131582, + "auxiliary_loss_mlp": 0.01043641, + "balance_loss_clip": 1.05199981, + "balance_loss_mlp": 1.02739322, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.4364121391358387, + "language_loss": 0.84343874, + "learning_rate": 3.474075855228966e-06, + "loss": 0.86519098, + "num_input_tokens_seen": 93112345, + "step": 4310, + "time_per_iteration": 2.522749185562134 + }, + { + "auxiliary_loss_clip": 0.01135287, + "auxiliary_loss_mlp": 0.01050611, + "balance_loss_clip": 1.05208492, + "balance_loss_mlp": 1.03299189, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 1.8042846745674028, + "language_loss": 0.77197385, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79383278, + "num_input_tokens_seen": 93131545, + "step": 4311, + "time_per_iteration": 2.5374698638916016 + }, + { + "auxiliary_loss_clip": 0.01107237, + "auxiliary_loss_mlp": 0.01051686, + "balance_loss_clip": 1.04918838, + "balance_loss_mlp": 1.03307748, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 1.9830086721589142, + "language_loss": 0.72743535, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74902457, + "num_input_tokens_seen": 93150730, + "step": 4312, + "time_per_iteration": 2.6053314208984375 + }, + { + "auxiliary_loss_clip": 0.01140907, + "auxiliary_loss_mlp": 0.01047105, + "balance_loss_clip": 1.04838264, + "balance_loss_mlp": 1.02943873, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 1.7922671042561338, + "language_loss": 0.7009573, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72283745, + "num_input_tokens_seen": 93167895, + "step": 4313, + "time_per_iteration": 2.456695318222046 + }, + { + "auxiliary_loss_clip": 0.01142924, + "auxiliary_loss_mlp": 0.01054284, + "balance_loss_clip": 1.05155659, + "balance_loss_mlp": 1.03814328, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.625527915626716, + "language_loss": 0.80365914, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82563126, + "num_input_tokens_seen": 93187650, + "step": 4314, + "time_per_iteration": 2.4841270446777344 + }, + { + "auxiliary_loss_clip": 0.01111106, + "auxiliary_loss_mlp": 0.0105176, + "balance_loss_clip": 1.05653501, + "balance_loss_mlp": 1.03298509, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.1540535133644867, + "language_loss": 0.67564428, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69727296, + "num_input_tokens_seen": 93207370, + "step": 4315, + "time_per_iteration": 2.655148506164551 + }, + { + "auxiliary_loss_clip": 0.0109984, + "auxiliary_loss_mlp": 0.01050248, + "balance_loss_clip": 1.05578899, + "balance_loss_mlp": 1.03348696, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.536865622680474, + "language_loss": 0.79594028, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81744123, + "num_input_tokens_seen": 93227925, + "step": 4316, + "time_per_iteration": 2.5955095291137695 + }, + { + "auxiliary_loss_clip": 0.01099143, + "auxiliary_loss_mlp": 0.01047348, + "balance_loss_clip": 1.05579841, + "balance_loss_mlp": 1.03000307, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 1.7113342047685036, + "language_loss": 0.78233808, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.80380297, + "num_input_tokens_seen": 93250020, + "step": 4317, + "time_per_iteration": 2.6446585655212402 + }, + { + "auxiliary_loss_clip": 0.01146443, + "auxiliary_loss_mlp": 0.01055735, + "balance_loss_clip": 1.05391908, + "balance_loss_mlp": 1.03741264, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 1.905227505821882, + "language_loss": 0.77810329, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.80012506, + "num_input_tokens_seen": 93269070, + "step": 4318, + "time_per_iteration": 2.4877078533172607 + }, + { + "auxiliary_loss_clip": 0.0114319, + "auxiliary_loss_mlp": 0.01047727, + "balance_loss_clip": 1.05140805, + "balance_loss_mlp": 1.02848649, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.7594511300737965, + "language_loss": 0.76270759, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78461671, + "num_input_tokens_seen": 93290250, + "step": 4319, + "time_per_iteration": 2.504451274871826 + }, + { + "auxiliary_loss_clip": 0.01119863, + "auxiliary_loss_mlp": 0.01043001, + "balance_loss_clip": 1.05329406, + "balance_loss_mlp": 1.02589452, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.7414394983796198, + "language_loss": 0.76944822, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.7910769, + "num_input_tokens_seen": 93310090, + "step": 4320, + "time_per_iteration": 4.719043493270874 + }, + { + "auxiliary_loss_clip": 0.01113514, + "auxiliary_loss_mlp": 0.01043802, + "balance_loss_clip": 1.05157399, + "balance_loss_mlp": 1.02576518, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.4556809847023873, + "language_loss": 0.71682805, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73840123, + "num_input_tokens_seen": 93329570, + "step": 4321, + "time_per_iteration": 2.597766876220703 + }, + { + "auxiliary_loss_clip": 0.01121439, + "auxiliary_loss_mlp": 0.01054612, + "balance_loss_clip": 1.05148494, + "balance_loss_mlp": 1.03470397, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 2.5833491319651958, + "language_loss": 0.74848419, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.77024472, + "num_input_tokens_seen": 93347920, + "step": 4322, + "time_per_iteration": 3.9139814376831055 + }, + { + "auxiliary_loss_clip": 0.01113039, + "auxiliary_loss_mlp": 0.01045727, + "balance_loss_clip": 1.05149674, + "balance_loss_mlp": 1.02769089, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 2.0147062610199304, + "language_loss": 0.73401642, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75560403, + "num_input_tokens_seen": 93367145, + "step": 4323, + "time_per_iteration": 2.606518507003784 + }, + { + "auxiliary_loss_clip": 0.01139452, + "auxiliary_loss_mlp": 0.00794283, + "balance_loss_clip": 1.05182719, + "balance_loss_mlp": 1.01093149, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 2.3369812444527946, + "language_loss": 0.67120755, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.69054484, + "num_input_tokens_seen": 93386555, + "step": 4324, + "time_per_iteration": 2.526766538619995 + }, + { + "auxiliary_loss_clip": 0.01094633, + "auxiliary_loss_mlp": 0.01043809, + "balance_loss_clip": 1.0535562, + "balance_loss_mlp": 1.0274539, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 1.948711840672533, + "language_loss": 0.70369822, + "learning_rate": 3.470121299177082e-06, + "loss": 0.72508264, + "num_input_tokens_seen": 93405590, + "step": 4325, + "time_per_iteration": 2.689810037612915 + }, + { + "auxiliary_loss_clip": 0.01133219, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.05007637, + "balance_loss_mlp": 1.01938391, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 1.7884317276415618, + "language_loss": 0.73248255, + "learning_rate": 3.469857215756257e-06, + "loss": 0.7541883, + "num_input_tokens_seen": 93424750, + "step": 4326, + "time_per_iteration": 5.432117462158203 + }, + { + "auxiliary_loss_clip": 0.0111517, + "auxiliary_loss_mlp": 0.00794392, + "balance_loss_clip": 1.04967153, + "balance_loss_mlp": 1.01086843, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.9501609860226394, + "language_loss": 0.86770654, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88680214, + "num_input_tokens_seen": 93443465, + "step": 4327, + "time_per_iteration": 2.5746920108795166 + }, + { + "auxiliary_loss_clip": 0.01150133, + "auxiliary_loss_mlp": 0.00796744, + "balance_loss_clip": 1.05507922, + "balance_loss_mlp": 1.01390171, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 1.5786999783214695, + "language_loss": 0.80524671, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.8247155, + "num_input_tokens_seen": 93462580, + "step": 4328, + "time_per_iteration": 2.5092906951904297 + }, + { + "auxiliary_loss_clip": 0.01118202, + "auxiliary_loss_mlp": 0.00792321, + "balance_loss_clip": 1.04910231, + "balance_loss_mlp": 1.01003909, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.438560125490664, + "language_loss": 0.88107479, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.90017998, + "num_input_tokens_seen": 93482790, + "step": 4329, + "time_per_iteration": 2.5702037811279297 + }, + { + "auxiliary_loss_clip": 0.01143982, + "auxiliary_loss_mlp": 0.0104051, + "balance_loss_clip": 1.05399287, + "balance_loss_mlp": 1.02384424, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 2.0050833458655397, + "language_loss": 0.7769537, + "learning_rate": 3.468800324801802e-06, + "loss": 0.79879868, + "num_input_tokens_seen": 93498795, + "step": 4330, + "time_per_iteration": 2.524470329284668 + }, + { + "auxiliary_loss_clip": 0.01146425, + "auxiliary_loss_mlp": 0.01050687, + "balance_loss_clip": 1.0536449, + "balance_loss_mlp": 1.03328288, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 1.4580356957206388, + "language_loss": 0.75740635, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.7793774, + "num_input_tokens_seen": 93518335, + "step": 4331, + "time_per_iteration": 2.474468231201172 + }, + { + "auxiliary_loss_clip": 0.01130942, + "auxiliary_loss_mlp": 0.01043477, + "balance_loss_clip": 1.06179154, + "balance_loss_mlp": 1.02670467, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.3750957576397314, + "language_loss": 0.69333494, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71507913, + "num_input_tokens_seen": 93539170, + "step": 4332, + "time_per_iteration": 2.5712621212005615 + }, + { + "auxiliary_loss_clip": 0.0111626, + "auxiliary_loss_mlp": 0.01049058, + "balance_loss_clip": 1.0506351, + "balance_loss_mlp": 1.0312717, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 2.131085289059811, + "language_loss": 0.80100262, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.8226558, + "num_input_tokens_seen": 93558480, + "step": 4333, + "time_per_iteration": 2.6174464225769043 + }, + { + "auxiliary_loss_clip": 0.0114315, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.05362654, + "balance_loss_mlp": 1.02393973, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 2.2194680530795625, + "language_loss": 0.8105405, + "learning_rate": 3.467742542694501e-06, + "loss": 0.83237791, + "num_input_tokens_seen": 93575220, + "step": 4334, + "time_per_iteration": 2.425631284713745 + }, + { + "auxiliary_loss_clip": 0.01124042, + "auxiliary_loss_mlp": 0.0104175, + "balance_loss_clip": 1.05481255, + "balance_loss_mlp": 1.02352262, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.733119612950829, + "language_loss": 0.7937693, + "learning_rate": 3.46747795800024e-06, + "loss": 0.81542724, + "num_input_tokens_seen": 93597015, + "step": 4335, + "time_per_iteration": 2.5666167736053467 + }, + { + "auxiliary_loss_clip": 0.0104311, + "auxiliary_loss_mlp": 0.01021426, + "balance_loss_clip": 1.02145386, + "balance_loss_mlp": 1.0196023, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.8523053376293434, + "language_loss": 0.60719335, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62783873, + "num_input_tokens_seen": 93657775, + "step": 4336, + "time_per_iteration": 3.090118169784546 + }, + { + "auxiliary_loss_clip": 0.01112759, + "auxiliary_loss_mlp": 0.01049381, + "balance_loss_clip": 1.05626202, + "balance_loss_mlp": 1.03209531, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 1.8067748553068412, + "language_loss": 0.77059507, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79221648, + "num_input_tokens_seen": 93676145, + "step": 4337, + "time_per_iteration": 2.570322036743164 + }, + { + "auxiliary_loss_clip": 0.01124625, + "auxiliary_loss_mlp": 0.01046805, + "balance_loss_clip": 1.05298734, + "balance_loss_mlp": 1.02878118, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 2.0938754207217953, + "language_loss": 0.74802327, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.7697376, + "num_input_tokens_seen": 93692480, + "step": 4338, + "time_per_iteration": 2.553378105163574 + }, + { + "auxiliary_loss_clip": 0.01139183, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_clip": 1.05332136, + "balance_loss_mlp": 1.02632356, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.0903182747107873, + "language_loss": 0.80266958, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82450807, + "num_input_tokens_seen": 93710165, + "step": 4339, + "time_per_iteration": 2.565016031265259 + }, + { + "auxiliary_loss_clip": 0.01092035, + "auxiliary_loss_mlp": 0.01042612, + "balance_loss_clip": 1.05473995, + "balance_loss_mlp": 1.02606559, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.5283632930736504, + "language_loss": 0.76621288, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78755933, + "num_input_tokens_seen": 93730185, + "step": 4340, + "time_per_iteration": 2.6806702613830566 + }, + { + "auxiliary_loss_clip": 0.01079926, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.0510751, + "balance_loss_mlp": 1.02695775, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.4507369348382404, + "language_loss": 0.82588172, + "learning_rate": 3.465889281600845e-06, + "loss": 0.84713185, + "num_input_tokens_seen": 93747690, + "step": 4341, + "time_per_iteration": 2.7596187591552734 + }, + { + "auxiliary_loss_clip": 0.01145798, + "auxiliary_loss_mlp": 0.01044881, + "balance_loss_clip": 1.05402863, + "balance_loss_mlp": 1.02651131, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 1.8590792570963286, + "language_loss": 0.76490748, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.78681433, + "num_input_tokens_seen": 93767405, + "step": 4342, + "time_per_iteration": 2.628211259841919 + }, + { + "auxiliary_loss_clip": 0.01139623, + "auxiliary_loss_mlp": 0.01034317, + "balance_loss_clip": 1.05396891, + "balance_loss_mlp": 1.01586318, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 2.5263104630045206, + "language_loss": 0.65793741, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.67967677, + "num_input_tokens_seen": 93789950, + "step": 4343, + "time_per_iteration": 2.6799204349517822 + }, + { + "auxiliary_loss_clip": 0.01078756, + "auxiliary_loss_mlp": 0.01053407, + "balance_loss_clip": 1.05614293, + "balance_loss_mlp": 1.03372598, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 1.8845952236487313, + "language_loss": 0.73191047, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75323212, + "num_input_tokens_seen": 93807835, + "step": 4344, + "time_per_iteration": 2.5885727405548096 + }, + { + "auxiliary_loss_clip": 0.01146157, + "auxiliary_loss_mlp": 0.01043, + "balance_loss_clip": 1.0541023, + "balance_loss_mlp": 1.02473712, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 1.9991646639234057, + "language_loss": 0.86644703, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88833863, + "num_input_tokens_seen": 93825670, + "step": 4345, + "time_per_iteration": 2.4916841983795166 + }, + { + "auxiliary_loss_clip": 0.01120438, + "auxiliary_loss_mlp": 0.01041875, + "balance_loss_clip": 1.0535959, + "balance_loss_mlp": 1.02472043, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 1.84400567060217, + "language_loss": 0.76431578, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78593886, + "num_input_tokens_seen": 93844045, + "step": 4346, + "time_per_iteration": 2.5186007022857666 + }, + { + "auxiliary_loss_clip": 0.011354, + "auxiliary_loss_mlp": 0.01043701, + "balance_loss_clip": 1.05299759, + "balance_loss_mlp": 1.02614164, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.446564548211201, + "language_loss": 0.75851655, + "learning_rate": 3.464298604081606e-06, + "loss": 0.78030759, + "num_input_tokens_seen": 93864380, + "step": 4347, + "time_per_iteration": 2.592986583709717 + }, + { + "auxiliary_loss_clip": 0.01101102, + "auxiliary_loss_mlp": 0.01039296, + "balance_loss_clip": 1.04844522, + "balance_loss_mlp": 1.02100956, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.6886710965101979, + "language_loss": 0.73634154, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.7577455, + "num_input_tokens_seen": 93885475, + "step": 4348, + "time_per_iteration": 2.600609302520752 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01047118, + "balance_loss_clip": 1.05433822, + "balance_loss_mlp": 1.02903438, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 2.0576385305303595, + "language_loss": 0.90948963, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93103886, + "num_input_tokens_seen": 93905545, + "step": 4349, + "time_per_iteration": 2.585158109664917 + }, + { + "auxiliary_loss_clip": 0.01138384, + "auxiliary_loss_mlp": 0.0104717, + "balance_loss_clip": 1.05462372, + "balance_loss_mlp": 1.02981365, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.9299011807650872, + "language_loss": 0.80198544, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82384098, + "num_input_tokens_seen": 93924185, + "step": 4350, + "time_per_iteration": 2.4858100414276123 + }, + { + "auxiliary_loss_clip": 0.01128008, + "auxiliary_loss_mlp": 0.01046501, + "balance_loss_clip": 1.05072606, + "balance_loss_mlp": 1.02878666, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.9843122876684063, + "language_loss": 0.62930971, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.65105486, + "num_input_tokens_seen": 93942825, + "step": 4351, + "time_per_iteration": 2.487734317779541 + }, + { + "auxiliary_loss_clip": 0.01134059, + "auxiliary_loss_mlp": 0.01040894, + "balance_loss_clip": 1.05019701, + "balance_loss_mlp": 1.0232389, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 2.092744981865366, + "language_loss": 0.84091425, + "learning_rate": 3.462971512415555e-06, + "loss": 0.86266381, + "num_input_tokens_seen": 93962045, + "step": 4352, + "time_per_iteration": 2.4973349571228027 + }, + { + "auxiliary_loss_clip": 0.01047516, + "auxiliary_loss_mlp": 0.01003382, + "balance_loss_clip": 1.02565098, + "balance_loss_mlp": 1.0014981, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.8083682155822187, + "language_loss": 0.705567, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72607595, + "num_input_tokens_seen": 94021175, + "step": 4353, + "time_per_iteration": 3.005462884902954 + }, + { + "auxiliary_loss_clip": 0.01109624, + "auxiliary_loss_mlp": 0.01054865, + "balance_loss_clip": 1.04579782, + "balance_loss_mlp": 1.03511214, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 1.6301351895871359, + "language_loss": 0.77481627, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79646116, + "num_input_tokens_seen": 94043370, + "step": 4354, + "time_per_iteration": 2.561483860015869 + }, + { + "auxiliary_loss_clip": 0.01085974, + "auxiliary_loss_mlp": 0.01048217, + "balance_loss_clip": 1.0521884, + "balance_loss_mlp": 1.02998936, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 1.8338169443674612, + "language_loss": 0.68119442, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70253634, + "num_input_tokens_seen": 94063510, + "step": 4355, + "time_per_iteration": 2.611246347427368 + }, + { + "auxiliary_loss_clip": 0.0109416, + "auxiliary_loss_mlp": 0.01039403, + "balance_loss_clip": 1.05252719, + "balance_loss_mlp": 1.02012658, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.7108487844870557, + "language_loss": 0.67281187, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69414747, + "num_input_tokens_seen": 94083865, + "step": 4356, + "time_per_iteration": 2.6090705394744873 + }, + { + "auxiliary_loss_clip": 0.01042452, + "auxiliary_loss_mlp": 0.01003951, + "balance_loss_clip": 1.02172256, + "balance_loss_mlp": 1.00213933, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6824195649184003, + "language_loss": 0.53114581, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55160987, + "num_input_tokens_seen": 94144095, + "step": 4357, + "time_per_iteration": 2.9986743927001953 + }, + { + "auxiliary_loss_clip": 0.01135305, + "auxiliary_loss_mlp": 0.01049319, + "balance_loss_clip": 1.051319, + "balance_loss_mlp": 1.03131807, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.9618307246744673, + "language_loss": 0.83876789, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86061418, + "num_input_tokens_seen": 94163035, + "step": 4358, + "time_per_iteration": 3.982941150665283 + }, + { + "auxiliary_loss_clip": 0.01121418, + "auxiliary_loss_mlp": 0.01046412, + "balance_loss_clip": 1.04700947, + "balance_loss_mlp": 1.02569306, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 2.79078183242566, + "language_loss": 0.6763202, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69799852, + "num_input_tokens_seen": 94182520, + "step": 4359, + "time_per_iteration": 2.5809645652770996 + }, + { + "auxiliary_loss_clip": 0.01115076, + "auxiliary_loss_mlp": 0.01044168, + "balance_loss_clip": 1.04576993, + "balance_loss_mlp": 1.02685857, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 1.9302512057123988, + "language_loss": 0.78561997, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80721241, + "num_input_tokens_seen": 94201795, + "step": 4360, + "time_per_iteration": 2.5514204502105713 + }, + { + "auxiliary_loss_clip": 0.01116739, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_clip": 1.04817796, + "balance_loss_mlp": 1.02848291, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.8295380028832282, + "language_loss": 0.68313551, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70475858, + "num_input_tokens_seen": 94222390, + "step": 4361, + "time_per_iteration": 3.999147891998291 + }, + { + "auxiliary_loss_clip": 0.01135038, + "auxiliary_loss_mlp": 0.01062518, + "balance_loss_clip": 1.05126452, + "balance_loss_mlp": 1.04437387, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 2.424189474497625, + "language_loss": 0.84023184, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86220741, + "num_input_tokens_seen": 94239980, + "step": 4362, + "time_per_iteration": 2.479478120803833 + }, + { + "auxiliary_loss_clip": 0.0110158, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.04843307, + "balance_loss_mlp": 1.02820456, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 1.7016252825507183, + "language_loss": 0.65391695, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67539942, + "num_input_tokens_seen": 94260715, + "step": 4363, + "time_per_iteration": 2.6306166648864746 + }, + { + "auxiliary_loss_clip": 0.01036756, + "auxiliary_loss_mlp": 0.01005948, + "balance_loss_clip": 1.03407109, + "balance_loss_mlp": 1.00387383, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8955280228033217, + "language_loss": 0.61087322, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63130033, + "num_input_tokens_seen": 94321285, + "step": 4364, + "time_per_iteration": 4.630819082260132 + }, + { + "auxiliary_loss_clip": 0.01148628, + "auxiliary_loss_mlp": 0.01053271, + "balance_loss_clip": 1.05364084, + "balance_loss_mlp": 1.03369665, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.4539086292477594, + "language_loss": 0.71765798, + "learning_rate": 3.459514586533184e-06, + "loss": 0.73967701, + "num_input_tokens_seen": 94335420, + "step": 4365, + "time_per_iteration": 3.8484385013580322 + }, + { + "auxiliary_loss_clip": 0.0112043, + "auxiliary_loss_mlp": 0.00812219, + "balance_loss_clip": 1.052544, + "balance_loss_mlp": 1.04350734, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.8384531948395009, + "language_loss": 0.77336061, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79268706, + "num_input_tokens_seen": 94357440, + "step": 4366, + "time_per_iteration": 2.6529324054718018 + }, + { + "auxiliary_loss_clip": 0.01147554, + "auxiliary_loss_mlp": 0.0105079, + "balance_loss_clip": 1.05417681, + "balance_loss_mlp": 1.03383887, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 1.7739601521421118, + "language_loss": 0.76445055, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78643394, + "num_input_tokens_seen": 94375690, + "step": 4367, + "time_per_iteration": 2.5249221324920654 + }, + { + "auxiliary_loss_clip": 0.0113331, + "auxiliary_loss_mlp": 0.01051582, + "balance_loss_clip": 1.05135322, + "balance_loss_mlp": 1.03494, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 1.559255083678225, + "language_loss": 0.6986447, + "learning_rate": 3.458715505320736e-06, + "loss": 0.72049367, + "num_input_tokens_seen": 94393190, + "step": 4368, + "time_per_iteration": 2.5003228187561035 + }, + { + "auxiliary_loss_clip": 0.01121761, + "auxiliary_loss_mlp": 0.01048257, + "balance_loss_clip": 1.05187249, + "balance_loss_mlp": 1.02986312, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 2.0433897832995447, + "language_loss": 0.78875363, + "learning_rate": 3.458449034273841e-06, + "loss": 0.81045383, + "num_input_tokens_seen": 94410975, + "step": 4369, + "time_per_iteration": 2.539787530899048 + }, + { + "auxiliary_loss_clip": 0.01118733, + "auxiliary_loss_mlp": 0.01043924, + "balance_loss_clip": 1.05358374, + "balance_loss_mlp": 1.0267694, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 1.9060299312410807, + "language_loss": 0.83110225, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85272884, + "num_input_tokens_seen": 94429985, + "step": 4370, + "time_per_iteration": 2.545135021209717 + }, + { + "auxiliary_loss_clip": 0.01138646, + "auxiliary_loss_mlp": 0.01055747, + "balance_loss_clip": 1.05306959, + "balance_loss_mlp": 1.03512406, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 1.6123663783188398, + "language_loss": 0.7141428, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73608673, + "num_input_tokens_seen": 94448660, + "step": 4371, + "time_per_iteration": 2.572695255279541 + }, + { + "auxiliary_loss_clip": 0.01055633, + "auxiliary_loss_mlp": 0.01014799, + "balance_loss_clip": 1.02339172, + "balance_loss_mlp": 1.01285636, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.7007925315149038, + "language_loss": 0.56430948, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58501387, + "num_input_tokens_seen": 94515630, + "step": 4372, + "time_per_iteration": 3.2151167392730713 + }, + { + "auxiliary_loss_clip": 0.01120117, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.05193114, + "balance_loss_mlp": 1.01989365, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 2.109915434050956, + "language_loss": 0.77628684, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79784966, + "num_input_tokens_seen": 94535385, + "step": 4373, + "time_per_iteration": 2.6567587852478027 + }, + { + "auxiliary_loss_clip": 0.01101984, + "auxiliary_loss_mlp": 0.01043854, + "balance_loss_clip": 1.05317998, + "balance_loss_mlp": 1.02723026, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 2.463689670877103, + "language_loss": 0.70781839, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.72927678, + "num_input_tokens_seen": 94552650, + "step": 4374, + "time_per_iteration": 2.5929269790649414 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.0104455, + "balance_loss_clip": 1.0537827, + "balance_loss_mlp": 1.02587008, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 1.687008300326169, + "language_loss": 0.8089484, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83056897, + "num_input_tokens_seen": 94574075, + "step": 4375, + "time_per_iteration": 2.590388536453247 + }, + { + "auxiliary_loss_clip": 0.01118027, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.04994369, + "balance_loss_mlp": 1.03025043, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 2.1003347436398676, + "language_loss": 0.660061, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.68170989, + "num_input_tokens_seen": 94594255, + "step": 4376, + "time_per_iteration": 2.618356943130493 + }, + { + "auxiliary_loss_clip": 0.01102788, + "auxiliary_loss_mlp": 0.01055949, + "balance_loss_clip": 1.05451763, + "balance_loss_mlp": 1.03682756, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 2.1136069737120486, + "language_loss": 0.69312197, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71470928, + "num_input_tokens_seen": 94611410, + "step": 4377, + "time_per_iteration": 2.60772705078125 + }, + { + "auxiliary_loss_clip": 0.0113318, + "auxiliary_loss_mlp": 0.01043379, + "balance_loss_clip": 1.05162215, + "balance_loss_mlp": 1.02617693, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.7490173290015092, + "language_loss": 0.79582822, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.81759381, + "num_input_tokens_seen": 94636575, + "step": 4378, + "time_per_iteration": 2.760911464691162 + }, + { + "auxiliary_loss_clip": 0.01119675, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.05175352, + "balance_loss_mlp": 1.03301156, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 1.982443517710993, + "language_loss": 0.7683928, + "learning_rate": 3.455781283723846e-06, + "loss": 0.7900759, + "num_input_tokens_seen": 94654345, + "step": 4379, + "time_per_iteration": 2.543760061264038 + }, + { + "auxiliary_loss_clip": 0.01111095, + "auxiliary_loss_mlp": 0.01043194, + "balance_loss_clip": 1.05439043, + "balance_loss_mlp": 1.0242877, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 3.1073610659824027, + "language_loss": 0.77994424, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.80148715, + "num_input_tokens_seen": 94673985, + "step": 4380, + "time_per_iteration": 2.6417787075042725 + }, + { + "auxiliary_loss_clip": 0.01117362, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_clip": 1.04558921, + "balance_loss_mlp": 1.02461624, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 1.924105774372334, + "language_loss": 0.64101404, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.66260749, + "num_input_tokens_seen": 94693145, + "step": 4381, + "time_per_iteration": 2.577605724334717 + }, + { + "auxiliary_loss_clip": 0.01130085, + "auxiliary_loss_mlp": 0.01040754, + "balance_loss_clip": 1.04822731, + "balance_loss_mlp": 1.02404118, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.8722766535643167, + "language_loss": 0.82916325, + "learning_rate": 3.454979881632595e-06, + "loss": 0.85087168, + "num_input_tokens_seen": 94710185, + "step": 4382, + "time_per_iteration": 2.4710776805877686 + }, + { + "auxiliary_loss_clip": 0.01105921, + "auxiliary_loss_mlp": 0.01055454, + "balance_loss_clip": 1.05695319, + "balance_loss_mlp": 1.03619003, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 1.8627329063786284, + "language_loss": 0.69699407, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.71860778, + "num_input_tokens_seen": 94730280, + "step": 4383, + "time_per_iteration": 2.6879851818084717 + }, + { + "auxiliary_loss_clip": 0.011316, + "auxiliary_loss_mlp": 0.01047104, + "balance_loss_clip": 1.049927, + "balance_loss_mlp": 1.03077209, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 2.1131313905581206, + "language_loss": 0.69373494, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71552199, + "num_input_tokens_seen": 94748560, + "step": 4384, + "time_per_iteration": 2.501309394836426 + }, + { + "auxiliary_loss_clip": 0.0112889, + "auxiliary_loss_mlp": 0.01043778, + "balance_loss_clip": 1.04798007, + "balance_loss_mlp": 1.02658832, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 7.2107371007278775, + "language_loss": 0.70297003, + "learning_rate": 3.45417798298451e-06, + "loss": 0.7246967, + "num_input_tokens_seen": 94767570, + "step": 4385, + "time_per_iteration": 2.626553535461426 + }, + { + "auxiliary_loss_clip": 0.01110324, + "auxiliary_loss_mlp": 0.01051196, + "balance_loss_clip": 1.04985929, + "balance_loss_mlp": 1.03326643, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 1.8376093900720654, + "language_loss": 0.85618889, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87780404, + "num_input_tokens_seen": 94784985, + "step": 4386, + "time_per_iteration": 2.5771591663360596 + }, + { + "auxiliary_loss_clip": 0.01124331, + "auxiliary_loss_mlp": 0.0104394, + "balance_loss_clip": 1.05228853, + "balance_loss_mlp": 1.02696466, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 3.0277282317871252, + "language_loss": 0.7747761, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.79645884, + "num_input_tokens_seen": 94802545, + "step": 4387, + "time_per_iteration": 2.5180513858795166 + }, + { + "auxiliary_loss_clip": 0.01129837, + "auxiliary_loss_mlp": 0.01050696, + "balance_loss_clip": 1.05579758, + "balance_loss_mlp": 1.03404272, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 1.937737761135073, + "language_loss": 0.76291931, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78472459, + "num_input_tokens_seen": 94820730, + "step": 4388, + "time_per_iteration": 2.5022358894348145 + }, + { + "auxiliary_loss_clip": 0.01141735, + "auxiliary_loss_mlp": 0.01040995, + "balance_loss_clip": 1.05024266, + "balance_loss_mlp": 1.02437782, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 2.881053240785093, + "language_loss": 0.86369467, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88552201, + "num_input_tokens_seen": 94839175, + "step": 4389, + "time_per_iteration": 2.46939754486084 + }, + { + "auxiliary_loss_clip": 0.01044334, + "auxiliary_loss_mlp": 0.01010081, + "balance_loss_clip": 1.02608502, + "balance_loss_mlp": 1.00769639, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8098718678112861, + "language_loss": 0.60292989, + "learning_rate": 3.452840382521457e-06, + "loss": 0.623474, + "num_input_tokens_seen": 94898865, + "step": 4390, + "time_per_iteration": 3.1140854358673096 + }, + { + "auxiliary_loss_clip": 0.0112849, + "auxiliary_loss_mlp": 0.01042392, + "balance_loss_clip": 1.0505836, + "balance_loss_mlp": 1.02472544, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 2.0234689527115437, + "language_loss": 0.77745348, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79916233, + "num_input_tokens_seen": 94917490, + "step": 4391, + "time_per_iteration": 2.5452842712402344 + }, + { + "auxiliary_loss_clip": 0.01024267, + "auxiliary_loss_mlp": 0.01034943, + "balance_loss_clip": 1.02411127, + "balance_loss_mlp": 1.44962382, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.8980062902404639, + "language_loss": 0.58728611, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60787821, + "num_input_tokens_seen": 94969065, + "step": 4392, + "time_per_iteration": 3.136406183242798 + }, + { + "auxiliary_loss_clip": 0.01134592, + "auxiliary_loss_mlp": 0.0105264, + "balance_loss_clip": 1.05362105, + "balance_loss_mlp": 1.03535509, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 1.9192418706603087, + "language_loss": 0.68714941, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70902181, + "num_input_tokens_seen": 94988540, + "step": 4393, + "time_per_iteration": 2.5015244483947754 + }, + { + "auxiliary_loss_clip": 0.01136645, + "auxiliary_loss_mlp": 0.01043828, + "balance_loss_clip": 1.05249703, + "balance_loss_mlp": 1.02570808, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 1.7876586007322226, + "language_loss": 0.8408162, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.86262095, + "num_input_tokens_seen": 95004810, + "step": 4394, + "time_per_iteration": 2.482170343399048 + }, + { + "auxiliary_loss_clip": 0.01121145, + "auxiliary_loss_mlp": 0.01047799, + "balance_loss_clip": 1.05023146, + "balance_loss_mlp": 1.02851093, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 2.27239914833793, + "language_loss": 0.69740576, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.71909517, + "num_input_tokens_seen": 95024085, + "step": 4395, + "time_per_iteration": 2.5033183097839355 + }, + { + "auxiliary_loss_clip": 0.01110827, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.05107808, + "balance_loss_mlp": 1.02119207, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 1.8901670889805686, + "language_loss": 0.86793739, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.88943022, + "num_input_tokens_seen": 95042515, + "step": 4396, + "time_per_iteration": 2.5593998432159424 + }, + { + "auxiliary_loss_clip": 0.01006689, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.02342391, + "balance_loss_mlp": 1.02267098, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7910875031223654, + "language_loss": 0.55005282, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57037431, + "num_input_tokens_seen": 95094835, + "step": 4397, + "time_per_iteration": 4.526848316192627 + }, + { + "auxiliary_loss_clip": 0.0113141, + "auxiliary_loss_mlp": 0.01051345, + "balance_loss_clip": 1.050735, + "balance_loss_mlp": 1.03476334, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.0398499259582565, + "language_loss": 0.77977586, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80160338, + "num_input_tokens_seen": 95113480, + "step": 4398, + "time_per_iteration": 2.7783992290496826 + }, + { + "auxiliary_loss_clip": 0.01139459, + "auxiliary_loss_mlp": 0.01046485, + "balance_loss_clip": 1.05867553, + "balance_loss_mlp": 1.02874649, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 1.6584463220502164, + "language_loss": 0.67016208, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69202143, + "num_input_tokens_seen": 95132580, + "step": 4399, + "time_per_iteration": 3.8913605213165283 + }, + { + "auxiliary_loss_clip": 0.01096951, + "auxiliary_loss_mlp": 0.01049858, + "balance_loss_clip": 1.05236864, + "balance_loss_mlp": 1.03185773, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 1.6893011635552766, + "language_loss": 0.86926705, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.89073515, + "num_input_tokens_seen": 95152375, + "step": 4400, + "time_per_iteration": 2.605315923690796 + }, + { + "auxiliary_loss_clip": 0.01117111, + "auxiliary_loss_mlp": 0.01041909, + "balance_loss_clip": 1.04945254, + "balance_loss_mlp": 1.02403998, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 2.0091643639118835, + "language_loss": 0.76037884, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.78196907, + "num_input_tokens_seen": 95170265, + "step": 4401, + "time_per_iteration": 2.5109434127807617 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.01052503, + "balance_loss_clip": 1.05305493, + "balance_loss_mlp": 1.03376341, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.6961170679443283, + "language_loss": 0.87897581, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90053105, + "num_input_tokens_seen": 95188655, + "step": 4402, + "time_per_iteration": 3.967491626739502 + }, + { + "auxiliary_loss_clip": 0.01111241, + "auxiliary_loss_mlp": 0.01047223, + "balance_loss_clip": 1.05405986, + "balance_loss_mlp": 1.02938914, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.5282527709412101, + "language_loss": 0.78206861, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80365324, + "num_input_tokens_seen": 95209615, + "step": 4403, + "time_per_iteration": 2.6071062088012695 + }, + { + "auxiliary_loss_clip": 0.01130712, + "auxiliary_loss_mlp": 0.01040815, + "balance_loss_clip": 1.05544055, + "balance_loss_mlp": 1.02262378, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 2.0038718210069284, + "language_loss": 0.8797586, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.90147388, + "num_input_tokens_seen": 95227810, + "step": 4404, + "time_per_iteration": 3.8886563777923584 + }, + { + "auxiliary_loss_clip": 0.01122966, + "auxiliary_loss_mlp": 0.01041368, + "balance_loss_clip": 1.04914522, + "balance_loss_mlp": 1.02419043, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.7938719653886217, + "language_loss": 0.76728553, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78892881, + "num_input_tokens_seen": 95245890, + "step": 4405, + "time_per_iteration": 2.520015239715576 + }, + { + "auxiliary_loss_clip": 0.01146226, + "auxiliary_loss_mlp": 0.01040581, + "balance_loss_clip": 1.0532428, + "balance_loss_mlp": 1.02218664, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 1.9651212866756151, + "language_loss": 0.70241761, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72428572, + "num_input_tokens_seen": 95264955, + "step": 4406, + "time_per_iteration": 2.469975233078003 + }, + { + "auxiliary_loss_clip": 0.01120805, + "auxiliary_loss_mlp": 0.01049514, + "balance_loss_clip": 1.05607772, + "balance_loss_mlp": 1.03309894, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.7170309932055356, + "language_loss": 0.83698618, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85868937, + "num_input_tokens_seen": 95284245, + "step": 4407, + "time_per_iteration": 2.578608989715576 + }, + { + "auxiliary_loss_clip": 0.01107093, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.04968333, + "balance_loss_mlp": 1.01645565, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 1.8913692257173007, + "language_loss": 0.76059258, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78200579, + "num_input_tokens_seen": 95307125, + "step": 4408, + "time_per_iteration": 2.683703660964966 + }, + { + "auxiliary_loss_clip": 0.01095089, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.05363941, + "balance_loss_mlp": 1.02447772, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.7176394478630326, + "language_loss": 0.70982969, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73120612, + "num_input_tokens_seen": 95329150, + "step": 4409, + "time_per_iteration": 2.725533962249756 + }, + { + "auxiliary_loss_clip": 0.01131844, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.05357516, + "balance_loss_mlp": 1.02268934, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 1.6030140374413975, + "language_loss": 0.72768289, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.7494126, + "num_input_tokens_seen": 95349880, + "step": 4410, + "time_per_iteration": 2.538592576980591 + }, + { + "auxiliary_loss_clip": 0.01138845, + "auxiliary_loss_mlp": 0.01052984, + "balance_loss_clip": 1.05246949, + "balance_loss_mlp": 1.0360086, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.7430946471764588, + "language_loss": 0.7333802, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.7552985, + "num_input_tokens_seen": 95368570, + "step": 4411, + "time_per_iteration": 2.529890537261963 + }, + { + "auxiliary_loss_clip": 0.01107955, + "auxiliary_loss_mlp": 0.01043679, + "balance_loss_clip": 1.05844247, + "balance_loss_mlp": 1.02677476, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 2.853722657739487, + "language_loss": 0.82264185, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84415817, + "num_input_tokens_seen": 95387065, + "step": 4412, + "time_per_iteration": 2.607990026473999 + }, + { + "auxiliary_loss_clip": 0.01133394, + "auxiliary_loss_mlp": 0.01050864, + "balance_loss_clip": 1.05392146, + "balance_loss_mlp": 1.03444862, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 2.090971702600164, + "language_loss": 0.74601042, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76785302, + "num_input_tokens_seen": 95406345, + "step": 4413, + "time_per_iteration": 2.5071966648101807 + }, + { + "auxiliary_loss_clip": 0.01044692, + "auxiliary_loss_mlp": 0.01003874, + "balance_loss_clip": 1.02066255, + "balance_loss_mlp": 1.00160933, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8721646797899274, + "language_loss": 0.56900698, + "learning_rate": 3.446400750732793e-06, + "loss": 0.58949262, + "num_input_tokens_seen": 95463595, + "step": 4414, + "time_per_iteration": 3.023125171661377 + }, + { + "auxiliary_loss_clip": 0.01111193, + "auxiliary_loss_mlp": 0.0104508, + "balance_loss_clip": 1.05856359, + "balance_loss_mlp": 1.02932024, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.730570216326672, + "language_loss": 0.74342585, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.7649886, + "num_input_tokens_seen": 95484115, + "step": 4415, + "time_per_iteration": 2.6364481449127197 + }, + { + "auxiliary_loss_clip": 0.01093386, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_clip": 1.04633212, + "balance_loss_mlp": 1.02414107, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.1555280244232438, + "language_loss": 0.86553347, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.88690841, + "num_input_tokens_seen": 95501435, + "step": 4416, + "time_per_iteration": 2.568256378173828 + }, + { + "auxiliary_loss_clip": 0.01140352, + "auxiliary_loss_mlp": 0.01044602, + "balance_loss_clip": 1.05521917, + "balance_loss_mlp": 1.02589798, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.647706326575447, + "language_loss": 0.76408511, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78593463, + "num_input_tokens_seen": 95520135, + "step": 4417, + "time_per_iteration": 2.536073684692383 + }, + { + "auxiliary_loss_clip": 0.01116712, + "auxiliary_loss_mlp": 0.01041302, + "balance_loss_clip": 1.0505116, + "balance_loss_mlp": 1.02239561, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.4614490570466478, + "language_loss": 0.79934263, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82092279, + "num_input_tokens_seen": 95541705, + "step": 4418, + "time_per_iteration": 2.585609197616577 + }, + { + "auxiliary_loss_clip": 0.01134012, + "auxiliary_loss_mlp": 0.01051816, + "balance_loss_clip": 1.05209827, + "balance_loss_mlp": 1.03411365, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.047124782234769, + "language_loss": 0.66981399, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69167227, + "num_input_tokens_seen": 95560300, + "step": 4419, + "time_per_iteration": 2.4991655349731445 + }, + { + "auxiliary_loss_clip": 0.01144906, + "auxiliary_loss_mlp": 0.01051613, + "balance_loss_clip": 1.05178988, + "balance_loss_mlp": 1.03281355, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 2.1283391962267633, + "language_loss": 0.79438108, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81634629, + "num_input_tokens_seen": 95580150, + "step": 4420, + "time_per_iteration": 2.539354085922241 + }, + { + "auxiliary_loss_clip": 0.01122686, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_clip": 1.05308604, + "balance_loss_mlp": 1.03416729, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 2.032274391641846, + "language_loss": 0.81626356, + "learning_rate": 3.444516567560673e-06, + "loss": 0.83803487, + "num_input_tokens_seen": 95597570, + "step": 4421, + "time_per_iteration": 2.5403130054473877 + }, + { + "auxiliary_loss_clip": 0.01129056, + "auxiliary_loss_mlp": 0.01041749, + "balance_loss_clip": 1.05126178, + "balance_loss_mlp": 1.02451086, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 1.5914586562586155, + "language_loss": 0.65694618, + "learning_rate": 3.444247179349548e-06, + "loss": 0.67865425, + "num_input_tokens_seen": 95619415, + "step": 4422, + "time_per_iteration": 2.6843690872192383 + }, + { + "auxiliary_loss_clip": 0.01128207, + "auxiliary_loss_mlp": 0.01052475, + "balance_loss_clip": 1.05145574, + "balance_loss_mlp": 1.03567886, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.0949678428956133, + "language_loss": 0.74429774, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76610452, + "num_input_tokens_seen": 95639155, + "step": 4423, + "time_per_iteration": 2.580152988433838 + }, + { + "auxiliary_loss_clip": 0.01126843, + "auxiliary_loss_mlp": 0.01064823, + "balance_loss_clip": 1.04784322, + "balance_loss_mlp": 1.04547548, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.7126677816458091, + "language_loss": 0.77900314, + "learning_rate": 3.443708238639522e-06, + "loss": 0.80091971, + "num_input_tokens_seen": 95663320, + "step": 4424, + "time_per_iteration": 2.7113468647003174 + }, + { + "auxiliary_loss_clip": 0.01130611, + "auxiliary_loss_mlp": 0.01053883, + "balance_loss_clip": 1.05677748, + "balance_loss_mlp": 1.0362637, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.0503481042831684, + "language_loss": 0.79425108, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81609607, + "num_input_tokens_seen": 95680260, + "step": 4425, + "time_per_iteration": 2.464238166809082 + }, + { + "auxiliary_loss_clip": 0.01118916, + "auxiliary_loss_mlp": 0.01050925, + "balance_loss_clip": 1.05207205, + "balance_loss_mlp": 1.03504646, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.503152849994541, + "language_loss": 0.80416524, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.8258636, + "num_input_tokens_seen": 95701140, + "step": 4426, + "time_per_iteration": 2.579782724380493 + }, + { + "auxiliary_loss_clip": 0.01149112, + "auxiliary_loss_mlp": 0.01054087, + "balance_loss_clip": 1.05695677, + "balance_loss_mlp": 1.03723061, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.537056116638259, + "language_loss": 0.77172869, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79376066, + "num_input_tokens_seen": 95722060, + "step": 4427, + "time_per_iteration": 2.520817279815674 + }, + { + "auxiliary_loss_clip": 0.01107953, + "auxiliary_loss_mlp": 0.01037954, + "balance_loss_clip": 1.05090678, + "balance_loss_mlp": 1.02155089, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.5792828291709775, + "language_loss": 0.76619202, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.78765112, + "num_input_tokens_seen": 95742495, + "step": 4428, + "time_per_iteration": 2.627612352371216 + }, + { + "auxiliary_loss_clip": 0.01107725, + "auxiliary_loss_mlp": 0.01002234, + "balance_loss_clip": 1.050843, + "balance_loss_mlp": 1.40964615, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 2.236605847511358, + "language_loss": 0.82639253, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.8474921, + "num_input_tokens_seen": 95761510, + "step": 4429, + "time_per_iteration": 2.525341510772705 + }, + { + "auxiliary_loss_clip": 0.01099685, + "auxiliary_loss_mlp": 0.01046622, + "balance_loss_clip": 1.04895079, + "balance_loss_mlp": 1.02978969, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 1.544496983474864, + "language_loss": 0.72031105, + "learning_rate": 3.442090102943143e-06, + "loss": 0.7417742, + "num_input_tokens_seen": 95782385, + "step": 4430, + "time_per_iteration": 2.5926504135131836 + }, + { + "auxiliary_loss_clip": 0.01144431, + "auxiliary_loss_mlp": 0.0104679, + "balance_loss_clip": 1.05136895, + "balance_loss_mlp": 1.0277524, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 1.9216805620612056, + "language_loss": 0.82211339, + "learning_rate": 3.441820222206035e-06, + "loss": 0.84402561, + "num_input_tokens_seen": 95800595, + "step": 4431, + "time_per_iteration": 2.4560351371765137 + }, + { + "auxiliary_loss_clip": 0.0113755, + "auxiliary_loss_mlp": 0.01054675, + "balance_loss_clip": 1.05209875, + "balance_loss_mlp": 1.03663838, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.145849961411832, + "language_loss": 0.76728427, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78920656, + "num_input_tokens_seen": 95818480, + "step": 4432, + "time_per_iteration": 2.5234768390655518 + }, + { + "auxiliary_loss_clip": 0.01086313, + "auxiliary_loss_mlp": 0.01050933, + "balance_loss_clip": 1.05426133, + "balance_loss_mlp": 1.03097689, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 1.7952334988097736, + "language_loss": 0.82375765, + "learning_rate": 3.441280296720154e-06, + "loss": 0.84513009, + "num_input_tokens_seen": 95837205, + "step": 4433, + "time_per_iteration": 2.6081361770629883 + }, + { + "auxiliary_loss_clip": 0.01133106, + "auxiliary_loss_mlp": 0.01045396, + "balance_loss_clip": 1.05154896, + "balance_loss_mlp": 1.02727675, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 1.91425509807709, + "language_loss": 0.76696533, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78875041, + "num_input_tokens_seen": 95858395, + "step": 4434, + "time_per_iteration": 2.553297519683838 + }, + { + "auxiliary_loss_clip": 0.01138661, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.04942489, + "balance_loss_mlp": 1.02570164, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 1.6939232937855473, + "language_loss": 0.82330251, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84510779, + "num_input_tokens_seen": 95877875, + "step": 4435, + "time_per_iteration": 3.9191555976867676 + }, + { + "auxiliary_loss_clip": 0.01101275, + "auxiliary_loss_mlp": 0.01054708, + "balance_loss_clip": 1.04687381, + "balance_loss_mlp": 1.03688645, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.2715279163038695, + "language_loss": 0.87753206, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.8990919, + "num_input_tokens_seen": 95895820, + "step": 4436, + "time_per_iteration": 2.6219725608825684 + }, + { + "auxiliary_loss_clip": 0.0111722, + "auxiliary_loss_mlp": 0.01047436, + "balance_loss_clip": 1.04947853, + "balance_loss_mlp": 1.03062761, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.4205169612174395, + "language_loss": 0.78481591, + "learning_rate": 3.440199789988407e-06, + "loss": 0.80646253, + "num_input_tokens_seen": 95918025, + "step": 4437, + "time_per_iteration": 2.6007931232452393 + }, + { + "auxiliary_loss_clip": 0.01082908, + "auxiliary_loss_mlp": 0.01044531, + "balance_loss_clip": 1.04704571, + "balance_loss_mlp": 1.02796054, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 2.121767228001481, + "language_loss": 0.6386252, + "learning_rate": 3.439929526748556e-06, + "loss": 0.65989959, + "num_input_tokens_seen": 95937725, + "step": 4438, + "time_per_iteration": 4.1463892459869385 + }, + { + "auxiliary_loss_clip": 0.01076665, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.04625285, + "balance_loss_mlp": 1.02503181, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 1.7623683861652961, + "language_loss": 0.7558741, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.77705866, + "num_input_tokens_seen": 95956335, + "step": 4439, + "time_per_iteration": 2.661316156387329 + }, + { + "auxiliary_loss_clip": 0.01082716, + "auxiliary_loss_mlp": 0.01039724, + "balance_loss_clip": 1.05045295, + "balance_loss_mlp": 1.02135396, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.8028190082303404, + "language_loss": 0.71727204, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.73849642, + "num_input_tokens_seen": 95977135, + "step": 4440, + "time_per_iteration": 2.8228225708007812 + }, + { + "auxiliary_loss_clip": 0.01117812, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_clip": 1.04706824, + "balance_loss_mlp": 1.0243907, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 2.0135333594781226, + "language_loss": 0.66859025, + "learning_rate": 3.439118409456376e-06, + "loss": 0.69019961, + "num_input_tokens_seen": 95995435, + "step": 4441, + "time_per_iteration": 4.0875084400177 + }, + { + "auxiliary_loss_clip": 0.01132056, + "auxiliary_loss_mlp": 0.01044278, + "balance_loss_clip": 1.05036354, + "balance_loss_mlp": 1.0262413, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.4930384976745803, + "language_loss": 0.76444209, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78620547, + "num_input_tokens_seen": 96016340, + "step": 4442, + "time_per_iteration": 2.5793521404266357 + }, + { + "auxiliary_loss_clip": 0.01015655, + "auxiliary_loss_mlp": 0.01006796, + "balance_loss_clip": 1.04582334, + "balance_loss_mlp": 1.00414968, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.9390549358690226, + "language_loss": 0.61215156, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63237613, + "num_input_tokens_seen": 96071205, + "step": 4443, + "time_per_iteration": 4.521975040435791 + }, + { + "auxiliary_loss_clip": 0.01110311, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.04921758, + "balance_loss_mlp": 1.02234268, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.539144949862099, + "language_loss": 0.76472688, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78621757, + "num_input_tokens_seen": 96094240, + "step": 4444, + "time_per_iteration": 2.9049065113067627 + }, + { + "auxiliary_loss_clip": 0.01131106, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.04940248, + "balance_loss_mlp": 1.02310228, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 1.7877005171646547, + "language_loss": 0.80601805, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82774419, + "num_input_tokens_seen": 96114105, + "step": 4445, + "time_per_iteration": 2.544567346572876 + }, + { + "auxiliary_loss_clip": 0.01117477, + "auxiliary_loss_mlp": 0.01041654, + "balance_loss_clip": 1.05175412, + "balance_loss_mlp": 1.02356958, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 1.6900831222016517, + "language_loss": 0.88785595, + "learning_rate": 3.43776545600926e-06, + "loss": 0.90944731, + "num_input_tokens_seen": 96132140, + "step": 4446, + "time_per_iteration": 2.506457567214966 + }, + { + "auxiliary_loss_clip": 0.01137587, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_clip": 1.05390275, + "balance_loss_mlp": 1.02822983, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 1.712450017647839, + "language_loss": 0.68322051, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70504946, + "num_input_tokens_seen": 96152090, + "step": 4447, + "time_per_iteration": 2.5620434284210205 + }, + { + "auxiliary_loss_clip": 0.01131958, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.04934859, + "balance_loss_mlp": 1.01625395, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 1.84329767421907, + "language_loss": 0.83499849, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85665476, + "num_input_tokens_seen": 96170015, + "step": 4448, + "time_per_iteration": 2.5200748443603516 + }, + { + "auxiliary_loss_clip": 0.01106577, + "auxiliary_loss_mlp": 0.01047458, + "balance_loss_clip": 1.05090284, + "balance_loss_mlp": 1.0299108, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 1.5881662232920335, + "language_loss": 0.84498179, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86652213, + "num_input_tokens_seen": 96188065, + "step": 4449, + "time_per_iteration": 2.5832161903381348 + }, + { + "auxiliary_loss_clip": 0.01125331, + "auxiliary_loss_mlp": 0.01050869, + "balance_loss_clip": 1.0520699, + "balance_loss_mlp": 1.03134286, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.702608161601835, + "language_loss": 0.84063292, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86239493, + "num_input_tokens_seen": 96205780, + "step": 4450, + "time_per_iteration": 2.556450605392456 + }, + { + "auxiliary_loss_clip": 0.01099728, + "auxiliary_loss_mlp": 0.01046402, + "balance_loss_clip": 1.04524529, + "balance_loss_mlp": 1.03057146, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.8332219727447805, + "language_loss": 0.80917144, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83063275, + "num_input_tokens_seen": 96224990, + "step": 4451, + "time_per_iteration": 2.5719053745269775 + }, + { + "auxiliary_loss_clip": 0.01131679, + "auxiliary_loss_mlp": 0.0104383, + "balance_loss_clip": 1.05191159, + "balance_loss_mlp": 1.02753377, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.747725974215543, + "language_loss": 0.86410314, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88585824, + "num_input_tokens_seen": 96245345, + "step": 4452, + "time_per_iteration": 2.5630733966827393 + }, + { + "auxiliary_loss_clip": 0.01128667, + "auxiliary_loss_mlp": 0.01041628, + "balance_loss_clip": 1.05152178, + "balance_loss_mlp": 1.02424717, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 1.9421305910331945, + "language_loss": 0.83588606, + "learning_rate": 3.435869031622194e-06, + "loss": 0.85758901, + "num_input_tokens_seen": 96259000, + "step": 4453, + "time_per_iteration": 2.5198562145233154 + }, + { + "auxiliary_loss_clip": 0.01129044, + "auxiliary_loss_mlp": 0.01053853, + "balance_loss_clip": 1.0491432, + "balance_loss_mlp": 1.0359242, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.5747280391345864, + "language_loss": 0.7953862, + "learning_rate": 3.435597895977208e-06, + "loss": 0.8172152, + "num_input_tokens_seen": 96277000, + "step": 4454, + "time_per_iteration": 2.506873369216919 + }, + { + "auxiliary_loss_clip": 0.01121252, + "auxiliary_loss_mlp": 0.01045283, + "balance_loss_clip": 1.04929137, + "balance_loss_mlp": 1.02861774, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.5975597461759021, + "language_loss": 0.72825682, + "learning_rate": 3.435326705894206e-06, + "loss": 0.74992222, + "num_input_tokens_seen": 96297010, + "step": 4455, + "time_per_iteration": 2.5931975841522217 + }, + { + "auxiliary_loss_clip": 0.01103411, + "auxiliary_loss_mlp": 0.0103854, + "balance_loss_clip": 1.04691625, + "balance_loss_mlp": 1.02189851, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.7569992112633872, + "language_loss": 0.73731369, + "learning_rate": 3.435055461383471e-06, + "loss": 0.75873321, + "num_input_tokens_seen": 96315780, + "step": 4456, + "time_per_iteration": 2.5601377487182617 + }, + { + "auxiliary_loss_clip": 0.01132101, + "auxiliary_loss_mlp": 0.01043958, + "balance_loss_clip": 1.05058622, + "balance_loss_mlp": 1.02677965, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.3166750412910373, + "language_loss": 0.71135765, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73311824, + "num_input_tokens_seen": 96333465, + "step": 4457, + "time_per_iteration": 2.496516704559326 + }, + { + "auxiliary_loss_clip": 0.01104468, + "auxiliary_loss_mlp": 0.01050712, + "balance_loss_clip": 1.05158019, + "balance_loss_mlp": 1.03297377, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 1.7088232607677534, + "language_loss": 0.7907716, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81232339, + "num_input_tokens_seen": 96352005, + "step": 4458, + "time_per_iteration": 2.549220323562622 + }, + { + "auxiliary_loss_clip": 0.01025929, + "auxiliary_loss_mlp": 0.01010182, + "balance_loss_clip": 1.03887081, + "balance_loss_mlp": 1.00791728, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8638735331751527, + "language_loss": 0.58753645, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60789752, + "num_input_tokens_seen": 96406265, + "step": 4459, + "time_per_iteration": 3.134660005569458 + }, + { + "auxiliary_loss_clip": 0.01087013, + "auxiliary_loss_mlp": 0.01048586, + "balance_loss_clip": 1.04771137, + "balance_loss_mlp": 1.03096747, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 2.1167551641864364, + "language_loss": 0.8488428, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87019873, + "num_input_tokens_seen": 96425225, + "step": 4460, + "time_per_iteration": 2.594700813293457 + }, + { + "auxiliary_loss_clip": 0.01123927, + "auxiliary_loss_mlp": 0.0105005, + "balance_loss_clip": 1.04747891, + "balance_loss_mlp": 1.03127444, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 1.6824790048684086, + "language_loss": 0.69000703, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.71174681, + "num_input_tokens_seen": 96443780, + "step": 4461, + "time_per_iteration": 2.496055841445923 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01051103, + "balance_loss_clip": 1.04753959, + "balance_loss_mlp": 1.03328133, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.5007190295104833, + "language_loss": 0.67323077, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69484597, + "num_input_tokens_seen": 96464530, + "step": 4462, + "time_per_iteration": 2.574934244155884 + }, + { + "auxiliary_loss_clip": 0.01108192, + "auxiliary_loss_mlp": 0.01043412, + "balance_loss_clip": 1.04988587, + "balance_loss_mlp": 1.02579343, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.68020691687763, + "language_loss": 0.69699466, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71851069, + "num_input_tokens_seen": 96483345, + "step": 4463, + "time_per_iteration": 2.5978140830993652 + }, + { + "auxiliary_loss_clip": 0.01111255, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.05031061, + "balance_loss_mlp": 1.02381396, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 4.196534262942418, + "language_loss": 0.77649295, + "learning_rate": 3.432883547133931e-06, + "loss": 0.79803383, + "num_input_tokens_seen": 96498305, + "step": 4464, + "time_per_iteration": 2.511720895767212 + }, + { + "auxiliary_loss_clip": 0.01123149, + "auxiliary_loss_mlp": 0.01046319, + "balance_loss_clip": 1.04771745, + "balance_loss_mlp": 1.02816391, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 1.84370009639088, + "language_loss": 0.71120918, + "learning_rate": 3.432611813236704e-06, + "loss": 0.73290384, + "num_input_tokens_seen": 96519740, + "step": 4465, + "time_per_iteration": 2.537893056869507 + }, + { + "auxiliary_loss_clip": 0.0103385, + "auxiliary_loss_mlp": 0.01008438, + "balance_loss_clip": 1.02243781, + "balance_loss_mlp": 1.00634027, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.7024333089637484, + "language_loss": 0.53084028, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55126315, + "num_input_tokens_seen": 96588870, + "step": 4466, + "time_per_iteration": 3.266301155090332 + }, + { + "auxiliary_loss_clip": 0.01112109, + "auxiliary_loss_mlp": 0.01055079, + "balance_loss_clip": 1.04691613, + "balance_loss_mlp": 1.03520691, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.0183857375358683, + "language_loss": 0.7368573, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.75852919, + "num_input_tokens_seen": 96605100, + "step": 4467, + "time_per_iteration": 2.496220827102661 + }, + { + "auxiliary_loss_clip": 0.01116474, + "auxiliary_loss_mlp": 0.00817638, + "balance_loss_clip": 1.04721522, + "balance_loss_mlp": 1.05253947, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 2.09820191206727, + "language_loss": 0.8037039, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82304496, + "num_input_tokens_seen": 96621410, + "step": 4468, + "time_per_iteration": 2.5150015354156494 + }, + { + "auxiliary_loss_clip": 0.01047537, + "auxiliary_loss_mlp": 0.01002468, + "balance_loss_clip": 1.01688409, + "balance_loss_mlp": 1.00052536, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8433721372102897, + "language_loss": 0.5952903, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61579037, + "num_input_tokens_seen": 96684810, + "step": 4469, + "time_per_iteration": 3.1412925720214844 + }, + { + "auxiliary_loss_clip": 0.01144386, + "auxiliary_loss_mlp": 0.01051764, + "balance_loss_clip": 1.05051684, + "balance_loss_mlp": 1.0324403, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 1.8171657653953754, + "language_loss": 0.81470454, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83666605, + "num_input_tokens_seen": 96701920, + "step": 4470, + "time_per_iteration": 2.4732558727264404 + }, + { + "auxiliary_loss_clip": 0.011076, + "auxiliary_loss_mlp": 0.01038979, + "balance_loss_clip": 1.04698515, + "balance_loss_mlp": 1.0210855, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.6020260168754987, + "language_loss": 0.82371807, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.84518391, + "num_input_tokens_seen": 96721260, + "step": 4471, + "time_per_iteration": 2.5486056804656982 + }, + { + "auxiliary_loss_clip": 0.01123472, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.04969656, + "balance_loss_mlp": 1.01808643, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 2.000035490320257, + "language_loss": 0.69287491, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71445715, + "num_input_tokens_seen": 96740385, + "step": 4472, + "time_per_iteration": 2.5673751831054688 + }, + { + "auxiliary_loss_clip": 0.01141421, + "auxiliary_loss_mlp": 0.01046354, + "balance_loss_clip": 1.05152702, + "balance_loss_mlp": 1.02948558, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.6047456296616176, + "language_loss": 0.6793195, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70119727, + "num_input_tokens_seen": 96761860, + "step": 4473, + "time_per_iteration": 2.5278029441833496 + }, + { + "auxiliary_loss_clip": 0.01110286, + "auxiliary_loss_mlp": 0.01044422, + "balance_loss_clip": 1.05326653, + "balance_loss_mlp": 1.02789927, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 2.5591362537211557, + "language_loss": 0.83369005, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85523713, + "num_input_tokens_seen": 96781890, + "step": 4474, + "time_per_iteration": 3.988219738006592 + }, + { + "auxiliary_loss_clip": 0.01133581, + "auxiliary_loss_mlp": 0.01048487, + "balance_loss_clip": 1.05033731, + "balance_loss_mlp": 1.03160667, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 1.7354524440223338, + "language_loss": 0.70562327, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.72744393, + "num_input_tokens_seen": 96800390, + "step": 4475, + "time_per_iteration": 2.515655994415283 + }, + { + "auxiliary_loss_clip": 0.01106248, + "auxiliary_loss_mlp": 0.00799418, + "balance_loss_clip": 1.05015504, + "balance_loss_mlp": 1.01873732, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.688227134106577, + "language_loss": 0.73619485, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75525153, + "num_input_tokens_seen": 96816685, + "step": 4476, + "time_per_iteration": 2.5385799407958984 + }, + { + "auxiliary_loss_clip": 0.01110408, + "auxiliary_loss_mlp": 0.01041315, + "balance_loss_clip": 1.0488379, + "balance_loss_mlp": 1.02450597, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.598061615280744, + "language_loss": 0.80687028, + "learning_rate": 3.429346772085922e-06, + "loss": 0.8283875, + "num_input_tokens_seen": 96836285, + "step": 4477, + "time_per_iteration": 3.918288230895996 + }, + { + "auxiliary_loss_clip": 0.01090335, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.05206299, + "balance_loss_mlp": 1.02730441, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.609309327645889, + "language_loss": 0.65358973, + "learning_rate": 3.429074332770984e-06, + "loss": 0.67493588, + "num_input_tokens_seen": 96857745, + "step": 4478, + "time_per_iteration": 2.7518298625946045 + }, + { + "auxiliary_loss_clip": 0.01125696, + "auxiliary_loss_mlp": 0.0104342, + "balance_loss_clip": 1.0526123, + "balance_loss_mlp": 1.02607477, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 2.5295545498049075, + "language_loss": 0.80842757, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.83011866, + "num_input_tokens_seen": 96877295, + "step": 4479, + "time_per_iteration": 2.532438278198242 + }, + { + "auxiliary_loss_clip": 0.0111564, + "auxiliary_loss_mlp": 0.00803326, + "balance_loss_clip": 1.04490662, + "balance_loss_mlp": 1.02934241, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 1.932924500821838, + "language_loss": 0.80649167, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.82568139, + "num_input_tokens_seen": 96896160, + "step": 4480, + "time_per_iteration": 3.9151158332824707 + }, + { + "auxiliary_loss_clip": 0.01090093, + "auxiliary_loss_mlp": 0.01043445, + "balance_loss_clip": 1.04528522, + "balance_loss_mlp": 1.02624357, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.5497070024897408, + "language_loss": 0.78247821, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.80381358, + "num_input_tokens_seen": 96915410, + "step": 4481, + "time_per_iteration": 3.9661073684692383 + }, + { + "auxiliary_loss_clip": 0.01128721, + "auxiliary_loss_mlp": 0.01045143, + "balance_loss_clip": 1.04769182, + "balance_loss_mlp": 1.02771509, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 1.6268377659048607, + "language_loss": 0.74103439, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76277304, + "num_input_tokens_seen": 96937865, + "step": 4482, + "time_per_iteration": 2.554187297821045 + }, + { + "auxiliary_loss_clip": 0.01114032, + "auxiliary_loss_mlp": 0.0104006, + "balance_loss_clip": 1.05001283, + "balance_loss_mlp": 1.02277493, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 1.845516773710618, + "language_loss": 0.72103763, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74257851, + "num_input_tokens_seen": 96957710, + "step": 4483, + "time_per_iteration": 2.551866292953491 + }, + { + "auxiliary_loss_clip": 0.01127827, + "auxiliary_loss_mlp": 0.01043592, + "balance_loss_clip": 1.04464364, + "balance_loss_mlp": 1.02548409, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.243586414976352, + "language_loss": 0.86571479, + "learning_rate": 3.427438559239605e-06, + "loss": 0.887429, + "num_input_tokens_seen": 96975890, + "step": 4484, + "time_per_iteration": 2.4865710735321045 + }, + { + "auxiliary_loss_clip": 0.01131473, + "auxiliary_loss_mlp": 0.01041479, + "balance_loss_clip": 1.04900467, + "balance_loss_mlp": 1.02479017, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.4793412766113272, + "language_loss": 0.66303736, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68476689, + "num_input_tokens_seen": 96998595, + "step": 4485, + "time_per_iteration": 2.6156177520751953 + }, + { + "auxiliary_loss_clip": 0.01106626, + "auxiliary_loss_mlp": 0.01043923, + "balance_loss_clip": 1.04639256, + "balance_loss_mlp": 1.02619624, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 1.9959194863007592, + "language_loss": 0.72878277, + "learning_rate": 3.426892868256604e-06, + "loss": 0.75028825, + "num_input_tokens_seen": 97013715, + "step": 4486, + "time_per_iteration": 2.494731903076172 + }, + { + "auxiliary_loss_clip": 0.01143223, + "auxiliary_loss_mlp": 0.01042378, + "balance_loss_clip": 1.04994547, + "balance_loss_mlp": 1.02579641, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 2.2394008333884927, + "language_loss": 0.84021556, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.86207151, + "num_input_tokens_seen": 97031570, + "step": 4487, + "time_per_iteration": 2.476351499557495 + }, + { + "auxiliary_loss_clip": 0.01112031, + "auxiliary_loss_mlp": 0.01048766, + "balance_loss_clip": 1.04656935, + "balance_loss_mlp": 1.03122997, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 1.9792413235406863, + "language_loss": 0.71202135, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73362935, + "num_input_tokens_seen": 97049815, + "step": 4488, + "time_per_iteration": 2.5568771362304688 + }, + { + "auxiliary_loss_clip": 0.01064416, + "auxiliary_loss_mlp": 0.01054286, + "balance_loss_clip": 1.04693699, + "balance_loss_mlp": 1.03470039, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.6920952147906192, + "language_loss": 0.83591771, + "learning_rate": 3.426073925998578e-06, + "loss": 0.85710466, + "num_input_tokens_seen": 97067570, + "step": 4489, + "time_per_iteration": 2.7415268421173096 + }, + { + "auxiliary_loss_clip": 0.01117084, + "auxiliary_loss_mlp": 0.01053478, + "balance_loss_clip": 1.05060363, + "balance_loss_mlp": 1.03547716, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.645649836290693, + "language_loss": 0.89948237, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.92118794, + "num_input_tokens_seen": 97082180, + "step": 4490, + "time_per_iteration": 2.7907402515411377 + }, + { + "auxiliary_loss_clip": 0.01067402, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_clip": 1.04283535, + "balance_loss_mlp": 1.02870131, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 2.1400600397898684, + "language_loss": 0.73436946, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75551939, + "num_input_tokens_seen": 97103470, + "step": 4491, + "time_per_iteration": 2.7784411907196045 + }, + { + "auxiliary_loss_clip": 0.01143226, + "auxiliary_loss_mlp": 0.01043648, + "balance_loss_clip": 1.05035329, + "balance_loss_mlp": 1.02587366, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 2.6454558244736437, + "language_loss": 0.74234641, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.76421517, + "num_input_tokens_seen": 97118100, + "step": 4492, + "time_per_iteration": 2.4473721981048584 + }, + { + "auxiliary_loss_clip": 0.01119635, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.04784238, + "balance_loss_mlp": 1.02107668, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 1.829117691457779, + "language_loss": 0.89527071, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.91685879, + "num_input_tokens_seen": 97136765, + "step": 4493, + "time_per_iteration": 2.5625925064086914 + }, + { + "auxiliary_loss_clip": 0.01131222, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_clip": 1.04900217, + "balance_loss_mlp": 1.02482677, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.3998316055978226, + "language_loss": 0.71245718, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73417759, + "num_input_tokens_seen": 97157470, + "step": 4494, + "time_per_iteration": 2.5274531841278076 + }, + { + "auxiliary_loss_clip": 0.01109175, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.04669833, + "balance_loss_mlp": 1.01779544, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 1.9982972689288083, + "language_loss": 0.86037445, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88180137, + "num_input_tokens_seen": 97176905, + "step": 4495, + "time_per_iteration": 2.573192834854126 + }, + { + "auxiliary_loss_clip": 0.01141665, + "auxiliary_loss_mlp": 0.0104837, + "balance_loss_clip": 1.05089402, + "balance_loss_mlp": 1.03108478, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.6995144344735065, + "language_loss": 0.76506186, + "learning_rate": 3.424161168522959e-06, + "loss": 0.78696221, + "num_input_tokens_seen": 97196380, + "step": 4496, + "time_per_iteration": 2.4609932899475098 + }, + { + "auxiliary_loss_clip": 0.01048548, + "auxiliary_loss_mlp": 0.01021007, + "balance_loss_clip": 1.01847565, + "balance_loss_mlp": 1.01903963, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.740758459414504, + "language_loss": 0.50290072, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52359623, + "num_input_tokens_seen": 97260100, + "step": 4497, + "time_per_iteration": 3.1563968658447266 + }, + { + "auxiliary_loss_clip": 0.01105101, + "auxiliary_loss_mlp": 0.01048364, + "balance_loss_clip": 1.05334675, + "balance_loss_mlp": 1.03175807, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.8983785311463663, + "language_loss": 0.72455627, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74609089, + "num_input_tokens_seen": 97277935, + "step": 4498, + "time_per_iteration": 2.6308045387268066 + }, + { + "auxiliary_loss_clip": 0.01027131, + "auxiliary_loss_mlp": 0.01007791, + "balance_loss_clip": 1.02563977, + "balance_loss_mlp": 1.00577629, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7508698867175054, + "language_loss": 0.59205633, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61240554, + "num_input_tokens_seen": 97338845, + "step": 4499, + "time_per_iteration": 3.1805806159973145 + }, + { + "auxiliary_loss_clip": 0.0111615, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.04770088, + "balance_loss_mlp": 1.0210619, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 1.8887649400601636, + "language_loss": 0.7335906, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75513434, + "num_input_tokens_seen": 97356640, + "step": 4500, + "time_per_iteration": 2.548063278198242 + }, + { + "auxiliary_loss_clip": 0.0110765, + "auxiliary_loss_mlp": 0.01039438, + "balance_loss_clip": 1.04454708, + "balance_loss_mlp": 1.02140117, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 2.7793610667265596, + "language_loss": 0.82141984, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.84289068, + "num_input_tokens_seen": 97372585, + "step": 4501, + "time_per_iteration": 2.483837842941284 + }, + { + "auxiliary_loss_clip": 0.01096542, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.04841316, + "balance_loss_mlp": 1.03074312, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.6585083974985475, + "language_loss": 0.72276747, + "learning_rate": 3.422519555811735e-06, + "loss": 0.74423254, + "num_input_tokens_seen": 97393315, + "step": 4502, + "time_per_iteration": 2.5873076915740967 + }, + { + "auxiliary_loss_clip": 0.01119053, + "auxiliary_loss_mlp": 0.01040381, + "balance_loss_clip": 1.04458654, + "balance_loss_mlp": 1.02185643, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 2.672846836716759, + "language_loss": 0.6845091, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70610344, + "num_input_tokens_seen": 97417860, + "step": 4503, + "time_per_iteration": 2.7004613876342773 + }, + { + "auxiliary_loss_clip": 0.01093346, + "auxiliary_loss_mlp": 0.0104818, + "balance_loss_clip": 1.0456419, + "balance_loss_mlp": 1.02846241, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 1.9340405421063862, + "language_loss": 0.68538749, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.70680273, + "num_input_tokens_seen": 97436780, + "step": 4504, + "time_per_iteration": 2.5856242179870605 + }, + { + "auxiliary_loss_clip": 0.01134226, + "auxiliary_loss_mlp": 0.01045619, + "balance_loss_clip": 1.053473, + "balance_loss_mlp": 1.02932334, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 2.2360337874702956, + "language_loss": 0.75650901, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77830744, + "num_input_tokens_seen": 97456190, + "step": 4505, + "time_per_iteration": 2.5162622928619385 + }, + { + "auxiliary_loss_clip": 0.01142935, + "auxiliary_loss_mlp": 0.01049197, + "balance_loss_clip": 1.04793525, + "balance_loss_mlp": 1.03076684, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 2.5215357577653514, + "language_loss": 0.74191433, + "learning_rate": 3.42142406835758e-06, + "loss": 0.76383567, + "num_input_tokens_seen": 97474545, + "step": 4506, + "time_per_iteration": 2.454254388809204 + }, + { + "auxiliary_loss_clip": 0.0112089, + "auxiliary_loss_mlp": 0.01045417, + "balance_loss_clip": 1.04666853, + "balance_loss_mlp": 1.02709484, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 1.6943622793621906, + "language_loss": 0.80773634, + "learning_rate": 3.421150061716715e-06, + "loss": 0.82939935, + "num_input_tokens_seen": 97494520, + "step": 4507, + "time_per_iteration": 2.5464677810668945 + }, + { + "auxiliary_loss_clip": 0.01029837, + "auxiliary_loss_mlp": 0.01005558, + "balance_loss_clip": 1.01846051, + "balance_loss_mlp": 1.00353193, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7353174210334501, + "language_loss": 0.50890005, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52925396, + "num_input_tokens_seen": 97552455, + "step": 4508, + "time_per_iteration": 3.0610549449920654 + }, + { + "auxiliary_loss_clip": 0.01073424, + "auxiliary_loss_mlp": 0.01043065, + "balance_loss_clip": 1.04831767, + "balance_loss_mlp": 1.02598262, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 1.8757852316739143, + "language_loss": 0.74867398, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76983887, + "num_input_tokens_seen": 97572650, + "step": 4509, + "time_per_iteration": 2.6566059589385986 + }, + { + "auxiliary_loss_clip": 0.01122941, + "auxiliary_loss_mlp": 0.01042544, + "balance_loss_clip": 1.04699922, + "balance_loss_mlp": 1.026546, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 1.8402324854289138, + "language_loss": 0.72025633, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.74191129, + "num_input_tokens_seen": 97591150, + "step": 4510, + "time_per_iteration": 2.505190849304199 + }, + { + "auxiliary_loss_clip": 0.01127741, + "auxiliary_loss_mlp": 0.01036116, + "balance_loss_clip": 1.05075705, + "balance_loss_mlp": 1.01954556, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 2.6187146391217326, + "language_loss": 0.70010448, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72174305, + "num_input_tokens_seen": 97607410, + "step": 4511, + "time_per_iteration": 2.4943504333496094 + }, + { + "auxiliary_loss_clip": 0.01120763, + "auxiliary_loss_mlp": 0.01047372, + "balance_loss_clip": 1.05080712, + "balance_loss_mlp": 1.02955031, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.313272511447112, + "language_loss": 0.81138587, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83306718, + "num_input_tokens_seen": 97626870, + "step": 4512, + "time_per_iteration": 2.5854361057281494 + }, + { + "auxiliary_loss_clip": 0.01138507, + "auxiliary_loss_mlp": 0.01036222, + "balance_loss_clip": 1.0501523, + "balance_loss_mlp": 1.02109456, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.469728648635988, + "language_loss": 0.80464095, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82638824, + "num_input_tokens_seen": 97646595, + "step": 4513, + "time_per_iteration": 3.8704352378845215 + }, + { + "auxiliary_loss_clip": 0.01114196, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.04599738, + "balance_loss_mlp": 1.02607524, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.7141189597958044, + "language_loss": 0.88035959, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90192503, + "num_input_tokens_seen": 97665485, + "step": 4514, + "time_per_iteration": 2.501976490020752 + }, + { + "auxiliary_loss_clip": 0.01121125, + "auxiliary_loss_mlp": 0.01045801, + "balance_loss_clip": 1.05205917, + "balance_loss_mlp": 1.02797914, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 1.6189721178620045, + "language_loss": 0.92009163, + "learning_rate": 3.418956069417517e-06, + "loss": 0.9417609, + "num_input_tokens_seen": 97683800, + "step": 4515, + "time_per_iteration": 2.5290305614471436 + }, + { + "auxiliary_loss_clip": 0.01094956, + "auxiliary_loss_mlp": 0.01054977, + "balance_loss_clip": 1.04787481, + "balance_loss_mlp": 1.03503323, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.0764666251076145, + "language_loss": 0.73953557, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.76103497, + "num_input_tokens_seen": 97700505, + "step": 4516, + "time_per_iteration": 3.9668712615966797 + }, + { + "auxiliary_loss_clip": 0.01130694, + "auxiliary_loss_mlp": 0.01045234, + "balance_loss_clip": 1.0497849, + "balance_loss_mlp": 1.02756715, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 1.794788833674979, + "language_loss": 0.7601161, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78187537, + "num_input_tokens_seen": 97717410, + "step": 4517, + "time_per_iteration": 2.5013344287872314 + }, + { + "auxiliary_loss_clip": 0.01095685, + "auxiliary_loss_mlp": 0.01050651, + "balance_loss_clip": 1.05052805, + "balance_loss_mlp": 1.03206658, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.4518279809822907, + "language_loss": 0.76961941, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.79108274, + "num_input_tokens_seen": 97734545, + "step": 4518, + "time_per_iteration": 3.9212284088134766 + }, + { + "auxiliary_loss_clip": 0.01108027, + "auxiliary_loss_mlp": 0.01042431, + "balance_loss_clip": 1.04692054, + "balance_loss_mlp": 1.02638555, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.570556671116304, + "language_loss": 0.68091941, + "learning_rate": 3.41785778156811e-06, + "loss": 0.70242393, + "num_input_tokens_seen": 97754000, + "step": 4519, + "time_per_iteration": 2.5338449478149414 + }, + { + "auxiliary_loss_clip": 0.01128838, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.04917455, + "balance_loss_mlp": 1.02079773, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 1.9654778561120358, + "language_loss": 0.7564159, + "learning_rate": 3.417583075166451e-06, + "loss": 0.77807665, + "num_input_tokens_seen": 97772080, + "step": 4520, + "time_per_iteration": 3.893636703491211 + }, + { + "auxiliary_loss_clip": 0.01130925, + "auxiliary_loss_mlp": 0.01050621, + "balance_loss_clip": 1.0494076, + "balance_loss_mlp": 1.03163075, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.2897556694385237, + "language_loss": 0.76804525, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78986073, + "num_input_tokens_seen": 97789370, + "step": 4521, + "time_per_iteration": 2.485511541366577 + }, + { + "auxiliary_loss_clip": 0.01114827, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_clip": 1.04888737, + "balance_loss_mlp": 1.03774524, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.1644352252469066, + "language_loss": 0.7536062, + "learning_rate": 3.417033501108875e-06, + "loss": 0.7753129, + "num_input_tokens_seen": 97807385, + "step": 4522, + "time_per_iteration": 2.5214123725891113 + }, + { + "auxiliary_loss_clip": 0.01148137, + "auxiliary_loss_mlp": 0.01042453, + "balance_loss_clip": 1.0544132, + "balance_loss_mlp": 1.02498937, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 1.7594104124888108, + "language_loss": 0.73183894, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75374484, + "num_input_tokens_seen": 97827930, + "step": 4523, + "time_per_iteration": 2.4845175743103027 + }, + { + "auxiliary_loss_clip": 0.01115382, + "auxiliary_loss_mlp": 0.01042952, + "balance_loss_clip": 1.05120468, + "balance_loss_mlp": 1.02508235, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.5732697287025224, + "language_loss": 0.74388039, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76546371, + "num_input_tokens_seen": 97847440, + "step": 4524, + "time_per_iteration": 2.533616065979004 + }, + { + "auxiliary_loss_clip": 0.01144623, + "auxiliary_loss_mlp": 0.01043077, + "balance_loss_clip": 1.05292606, + "balance_loss_mlp": 1.02628016, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 1.5974851896144944, + "language_loss": 0.76450539, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78638244, + "num_input_tokens_seen": 97867620, + "step": 4525, + "time_per_iteration": 2.4912586212158203 + }, + { + "auxiliary_loss_clip": 0.01128959, + "auxiliary_loss_mlp": 0.01056791, + "balance_loss_clip": 1.04988551, + "balance_loss_mlp": 1.03914773, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.8085598090627317, + "language_loss": 0.81979275, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.84165019, + "num_input_tokens_seen": 97884345, + "step": 4526, + "time_per_iteration": 2.5021679401397705 + }, + { + "auxiliary_loss_clip": 0.01145838, + "auxiliary_loss_mlp": 0.01044938, + "balance_loss_clip": 1.04952478, + "balance_loss_mlp": 1.02648473, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 1.9456468450755564, + "language_loss": 0.77229297, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79420072, + "num_input_tokens_seen": 97901500, + "step": 4527, + "time_per_iteration": 2.4227097034454346 + }, + { + "auxiliary_loss_clip": 0.01103454, + "auxiliary_loss_mlp": 0.00808735, + "balance_loss_clip": 1.04762781, + "balance_loss_mlp": 1.03469324, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.091462288367161, + "language_loss": 0.81780398, + "learning_rate": 3.415383489652503e-06, + "loss": 0.83692586, + "num_input_tokens_seen": 97917800, + "step": 4528, + "time_per_iteration": 2.5451056957244873 + }, + { + "auxiliary_loss_clip": 0.01110182, + "auxiliary_loss_mlp": 0.0104906, + "balance_loss_clip": 1.04880047, + "balance_loss_mlp": 1.0315119, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 1.6882648698834963, + "language_loss": 0.77496833, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79656076, + "num_input_tokens_seen": 97937225, + "step": 4529, + "time_per_iteration": 2.5883820056915283 + }, + { + "auxiliary_loss_clip": 0.01116165, + "auxiliary_loss_mlp": 0.01049023, + "balance_loss_clip": 1.05065322, + "balance_loss_mlp": 1.03229761, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.5601133053175107, + "language_loss": 0.82266498, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84431684, + "num_input_tokens_seen": 97956845, + "step": 4530, + "time_per_iteration": 2.5457839965820312 + }, + { + "auxiliary_loss_clip": 0.01130935, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_clip": 1.05029082, + "balance_loss_mlp": 1.02558541, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.1270422739159294, + "language_loss": 0.9153102, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.93705189, + "num_input_tokens_seen": 97972465, + "step": 4531, + "time_per_iteration": 2.4806768894195557 + }, + { + "auxiliary_loss_clip": 0.01135778, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.05317128, + "balance_loss_mlp": 1.03550553, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 2.013553690636943, + "language_loss": 0.76224649, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78413534, + "num_input_tokens_seen": 97990770, + "step": 4532, + "time_per_iteration": 2.524092197418213 + }, + { + "auxiliary_loss_clip": 0.01105198, + "auxiliary_loss_mlp": 0.01038354, + "balance_loss_clip": 1.04765201, + "balance_loss_mlp": 1.02158141, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 2.811896406683421, + "language_loss": 0.88966048, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91109598, + "num_input_tokens_seen": 98005775, + "step": 4533, + "time_per_iteration": 2.579923391342163 + }, + { + "auxiliary_loss_clip": 0.01115748, + "auxiliary_loss_mlp": 0.01037686, + "balance_loss_clip": 1.04822183, + "balance_loss_mlp": 1.02084208, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 2.2947188796784252, + "language_loss": 0.71111238, + "learning_rate": 3.413731546022929e-06, + "loss": 0.7326467, + "num_input_tokens_seen": 98025750, + "step": 4534, + "time_per_iteration": 2.5526938438415527 + }, + { + "auxiliary_loss_clip": 0.01114558, + "auxiliary_loss_mlp": 0.01042451, + "balance_loss_clip": 1.04923487, + "balance_loss_mlp": 1.02386642, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 2.304384910111777, + "language_loss": 0.91507041, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.9366405, + "num_input_tokens_seen": 98044955, + "step": 4535, + "time_per_iteration": 2.564807176589966 + }, + { + "auxiliary_loss_clip": 0.01124425, + "auxiliary_loss_mlp": 0.01044843, + "balance_loss_clip": 1.0526731, + "balance_loss_mlp": 1.0268538, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.8528563976854204, + "language_loss": 0.72931945, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.75101215, + "num_input_tokens_seen": 98065860, + "step": 4536, + "time_per_iteration": 2.5818119049072266 + }, + { + "auxiliary_loss_clip": 0.01130097, + "auxiliary_loss_mlp": 0.01042362, + "balance_loss_clip": 1.04721761, + "balance_loss_mlp": 1.02488577, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.9944316080066942, + "language_loss": 0.71762562, + "learning_rate": 3.41290485034781e-06, + "loss": 0.7393502, + "num_input_tokens_seen": 98085450, + "step": 4537, + "time_per_iteration": 2.632749080657959 + }, + { + "auxiliary_loss_clip": 0.01116822, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.05132604, + "balance_loss_mlp": 1.02549505, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.1545889174517243, + "language_loss": 0.78586113, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80746704, + "num_input_tokens_seen": 98099115, + "step": 4538, + "time_per_iteration": 2.499117851257324 + }, + { + "auxiliary_loss_clip": 0.01128334, + "auxiliary_loss_mlp": 0.01052535, + "balance_loss_clip": 1.05050778, + "balance_loss_mlp": 1.03545225, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.5179070909798678, + "language_loss": 0.90235782, + "learning_rate": 3.412353451992847e-06, + "loss": 0.9241665, + "num_input_tokens_seen": 98118415, + "step": 4539, + "time_per_iteration": 2.5019781589508057 + }, + { + "auxiliary_loss_clip": 0.01120072, + "auxiliary_loss_mlp": 0.01045068, + "balance_loss_clip": 1.04940486, + "balance_loss_mlp": 1.02647185, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 2.156731876388365, + "language_loss": 0.8804332, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90208459, + "num_input_tokens_seen": 98136300, + "step": 4540, + "time_per_iteration": 2.5047340393066406 + }, + { + "auxiliary_loss_clip": 0.01128716, + "auxiliary_loss_mlp": 0.00801765, + "balance_loss_clip": 1.04611623, + "balance_loss_mlp": 1.02379584, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 1.9747411159870862, + "language_loss": 0.82196605, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84127086, + "num_input_tokens_seen": 98154580, + "step": 4541, + "time_per_iteration": 2.499253034591675 + }, + { + "auxiliary_loss_clip": 0.01119741, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.05043793, + "balance_loss_mlp": 1.02998614, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.7268216415905464, + "language_loss": 0.80162841, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.82329428, + "num_input_tokens_seen": 98173115, + "step": 4542, + "time_per_iteration": 2.520578145980835 + }, + { + "auxiliary_loss_clip": 0.01124253, + "auxiliary_loss_mlp": 0.01044221, + "balance_loss_clip": 1.05272257, + "balance_loss_mlp": 1.02685237, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.752868616583742, + "language_loss": 0.89792156, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91960633, + "num_input_tokens_seen": 98190260, + "step": 4543, + "time_per_iteration": 2.5313565731048584 + }, + { + "auxiliary_loss_clip": 0.01112592, + "auxiliary_loss_mlp": 0.0080349, + "balance_loss_clip": 1.04763103, + "balance_loss_mlp": 1.02365911, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 1.9448178398491165, + "language_loss": 0.63466549, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65382636, + "num_input_tokens_seen": 98207115, + "step": 4544, + "time_per_iteration": 2.500349998474121 + }, + { + "auxiliary_loss_clip": 0.01114489, + "auxiliary_loss_mlp": 0.01050742, + "balance_loss_clip": 1.05041945, + "balance_loss_mlp": 1.03249109, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 2.664600784336412, + "language_loss": 0.69951761, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72116995, + "num_input_tokens_seen": 98230610, + "step": 4545, + "time_per_iteration": 2.6435632705688477 + }, + { + "auxiliary_loss_clip": 0.01037535, + "auxiliary_loss_mlp": 0.0102222, + "balance_loss_clip": 1.02492678, + "balance_loss_mlp": 1.01981187, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7178554000585645, + "language_loss": 0.61609846, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63669598, + "num_input_tokens_seen": 98293585, + "step": 4546, + "time_per_iteration": 3.147480010986328 + }, + { + "auxiliary_loss_clip": 0.01059046, + "auxiliary_loss_mlp": 0.01049629, + "balance_loss_clip": 1.0512315, + "balance_loss_mlp": 1.03149772, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 1.878604642393384, + "language_loss": 0.65212297, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67320967, + "num_input_tokens_seen": 98311680, + "step": 4547, + "time_per_iteration": 2.6773836612701416 + }, + { + "auxiliary_loss_clip": 0.01114092, + "auxiliary_loss_mlp": 0.00798582, + "balance_loss_clip": 1.04781163, + "balance_loss_mlp": 1.02120054, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 1.9953491448887073, + "language_loss": 0.77718008, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.79630679, + "num_input_tokens_seen": 98330770, + "step": 4548, + "time_per_iteration": 2.5650501251220703 + }, + { + "auxiliary_loss_clip": 0.01118547, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.05588746, + "balance_loss_mlp": 1.02886498, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 1.9571312809506023, + "language_loss": 0.82959986, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.8512243, + "num_input_tokens_seen": 98349860, + "step": 4549, + "time_per_iteration": 2.5703437328338623 + }, + { + "auxiliary_loss_clip": 0.0113131, + "auxiliary_loss_mlp": 0.01046858, + "balance_loss_clip": 1.04935944, + "balance_loss_mlp": 1.02826118, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 1.9110335880786171, + "language_loss": 0.71209204, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73387367, + "num_input_tokens_seen": 98367040, + "step": 4550, + "time_per_iteration": 2.47241473197937 + }, + { + "auxiliary_loss_clip": 0.01101262, + "auxiliary_loss_mlp": 0.01040311, + "balance_loss_clip": 1.05056798, + "balance_loss_mlp": 1.02452779, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 3.5496042573401483, + "language_loss": 0.7877121, + "learning_rate": 3.409040566039563e-06, + "loss": 0.80912781, + "num_input_tokens_seen": 98384010, + "step": 4551, + "time_per_iteration": 3.9322588443756104 + }, + { + "auxiliary_loss_clip": 0.0110168, + "auxiliary_loss_mlp": 0.01061336, + "balance_loss_clip": 1.04672778, + "balance_loss_mlp": 1.04309726, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.498622408224486, + "language_loss": 0.70441055, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.72604072, + "num_input_tokens_seen": 98399625, + "step": 4552, + "time_per_iteration": 2.5511727333068848 + }, + { + "auxiliary_loss_clip": 0.01118855, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.05241859, + "balance_loss_mlp": 1.0224154, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 1.9068315580108788, + "language_loss": 0.71525764, + "learning_rate": 3.408487669858431e-06, + "loss": 0.73683786, + "num_input_tokens_seen": 98417310, + "step": 4553, + "time_per_iteration": 2.5350074768066406 + }, + { + "auxiliary_loss_clip": 0.01130031, + "auxiliary_loss_mlp": 0.01040073, + "balance_loss_clip": 1.05045605, + "balance_loss_mlp": 1.02257335, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.5614163550254527, + "language_loss": 0.59273994, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61444098, + "num_input_tokens_seen": 98438670, + "step": 4554, + "time_per_iteration": 3.9525089263916016 + }, + { + "auxiliary_loss_clip": 0.01124878, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.05314243, + "balance_loss_mlp": 1.02425921, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.6774875614747509, + "language_loss": 0.74197125, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76363432, + "num_input_tokens_seen": 98456060, + "step": 4555, + "time_per_iteration": 2.516784906387329 + }, + { + "auxiliary_loss_clip": 0.01135253, + "auxiliary_loss_mlp": 0.0104142, + "balance_loss_clip": 1.05415702, + "balance_loss_mlp": 1.02432489, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 1.810006062912045, + "language_loss": 0.78252065, + "learning_rate": 3.407657925038002e-06, + "loss": 0.80428737, + "num_input_tokens_seen": 98473765, + "step": 4556, + "time_per_iteration": 2.525484085083008 + }, + { + "auxiliary_loss_clip": 0.01141809, + "auxiliary_loss_mlp": 0.01050109, + "balance_loss_clip": 1.0516057, + "balance_loss_mlp": 1.03157234, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 1.8441261448157684, + "language_loss": 0.81953609, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84145528, + "num_input_tokens_seen": 98490590, + "step": 4557, + "time_per_iteration": 3.840811252593994 + }, + { + "auxiliary_loss_clip": 0.01089759, + "auxiliary_loss_mlp": 0.01047343, + "balance_loss_clip": 1.04805732, + "balance_loss_mlp": 1.03030801, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.804654663251744, + "language_loss": 0.72688043, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.74825156, + "num_input_tokens_seen": 98510590, + "step": 4558, + "time_per_iteration": 2.583707332611084 + }, + { + "auxiliary_loss_clip": 0.01119497, + "auxiliary_loss_mlp": 0.01050064, + "balance_loss_clip": 1.05051875, + "balance_loss_mlp": 1.03308892, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.2034832489796012, + "language_loss": 0.67164522, + "learning_rate": 3.406827699810819e-06, + "loss": 0.69334078, + "num_input_tokens_seen": 98527875, + "step": 4559, + "time_per_iteration": 3.9095847606658936 + }, + { + "auxiliary_loss_clip": 0.01119445, + "auxiliary_loss_mlp": 0.0105611, + "balance_loss_clip": 1.05226851, + "balance_loss_mlp": 1.03836012, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 1.6938784733853083, + "language_loss": 0.72141016, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74316573, + "num_input_tokens_seen": 98547575, + "step": 4560, + "time_per_iteration": 2.5518782138824463 + }, + { + "auxiliary_loss_clip": 0.01120571, + "auxiliary_loss_mlp": 0.01047266, + "balance_loss_clip": 1.05052269, + "balance_loss_mlp": 1.03018284, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.963117285069061, + "language_loss": 0.8123908, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83406925, + "num_input_tokens_seen": 98566290, + "step": 4561, + "time_per_iteration": 2.5771610736846924 + }, + { + "auxiliary_loss_clip": 0.01146238, + "auxiliary_loss_mlp": 0.01040169, + "balance_loss_clip": 1.05289841, + "balance_loss_mlp": 1.02364707, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.5346026029762971, + "language_loss": 0.75317484, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77503896, + "num_input_tokens_seen": 98586255, + "step": 4562, + "time_per_iteration": 2.490492582321167 + }, + { + "auxiliary_loss_clip": 0.01144916, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.05402851, + "balance_loss_mlp": 1.01917028, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.6627032782671975, + "language_loss": 0.74494636, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76674938, + "num_input_tokens_seen": 98606030, + "step": 4563, + "time_per_iteration": 2.4725401401519775 + }, + { + "auxiliary_loss_clip": 0.01114398, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.0489639, + "balance_loss_mlp": 1.02767324, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.7547496909436608, + "language_loss": 0.62307405, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64467549, + "num_input_tokens_seen": 98625225, + "step": 4564, + "time_per_iteration": 2.568338394165039 + }, + { + "auxiliary_loss_clip": 0.01119935, + "auxiliary_loss_mlp": 0.0103954, + "balance_loss_clip": 1.0504005, + "balance_loss_mlp": 1.02190936, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 1.8354037759692108, + "language_loss": 0.78316224, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.804757, + "num_input_tokens_seen": 98649470, + "step": 4565, + "time_per_iteration": 2.7361390590667725 + }, + { + "auxiliary_loss_clip": 0.01088838, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_clip": 1.04926825, + "balance_loss_mlp": 1.02648675, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 1.8532940803934566, + "language_loss": 0.69197786, + "learning_rate": 3.404888640957477e-06, + "loss": 0.71330082, + "num_input_tokens_seen": 98666915, + "step": 4566, + "time_per_iteration": 2.5790462493896484 + }, + { + "auxiliary_loss_clip": 0.01134066, + "auxiliary_loss_mlp": 0.01048642, + "balance_loss_clip": 1.05532503, + "balance_loss_mlp": 1.03276348, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 1.6155001708637973, + "language_loss": 0.61191463, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63374174, + "num_input_tokens_seen": 98688240, + "step": 4567, + "time_per_iteration": 2.569058895111084 + }, + { + "auxiliary_loss_clip": 0.01128143, + "auxiliary_loss_mlp": 0.01051064, + "balance_loss_clip": 1.05105937, + "balance_loss_mlp": 1.03238344, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 1.840314437980144, + "language_loss": 0.82663429, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.8484264, + "num_input_tokens_seen": 98708245, + "step": 4568, + "time_per_iteration": 2.522486686706543 + }, + { + "auxiliary_loss_clip": 0.01133863, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.05766308, + "balance_loss_mlp": 1.02188957, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 1.909086822001717, + "language_loss": 0.68361425, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.70534259, + "num_input_tokens_seen": 98724575, + "step": 4569, + "time_per_iteration": 2.4911868572235107 + }, + { + "auxiliary_loss_clip": 0.01110755, + "auxiliary_loss_mlp": 0.0104203, + "balance_loss_clip": 1.04618526, + "balance_loss_mlp": 1.02466154, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.0712356786557726, + "language_loss": 0.70915002, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.7306779, + "num_input_tokens_seen": 98740700, + "step": 4570, + "time_per_iteration": 2.5152506828308105 + }, + { + "auxiliary_loss_clip": 0.0102923, + "auxiliary_loss_mlp": 0.01012619, + "balance_loss_clip": 1.02956676, + "balance_loss_mlp": 1.01053262, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.729525008789145, + "language_loss": 0.55786008, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57827854, + "num_input_tokens_seen": 98803030, + "step": 4571, + "time_per_iteration": 3.283519983291626 + }, + { + "auxiliary_loss_clip": 0.01089561, + "auxiliary_loss_mlp": 0.01050093, + "balance_loss_clip": 1.04630232, + "balance_loss_mlp": 1.03330863, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 2.2123016574130903, + "language_loss": 0.78179109, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.80318761, + "num_input_tokens_seen": 98820505, + "step": 4572, + "time_per_iteration": 2.5635368824005127 + }, + { + "auxiliary_loss_clip": 0.01138101, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.05242419, + "balance_loss_mlp": 1.02102923, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.5534112849435615, + "language_loss": 0.8169744, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83871275, + "num_input_tokens_seen": 98842150, + "step": 4573, + "time_per_iteration": 2.495182514190674 + }, + { + "auxiliary_loss_clip": 0.01128378, + "auxiliary_loss_mlp": 0.01036518, + "balance_loss_clip": 1.05076098, + "balance_loss_mlp": 1.02046061, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.7352847900596808, + "language_loss": 0.79025495, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81190389, + "num_input_tokens_seen": 98861050, + "step": 4574, + "time_per_iteration": 2.526484727859497 + }, + { + "auxiliary_loss_clip": 0.01096269, + "auxiliary_loss_mlp": 0.01046291, + "balance_loss_clip": 1.05016243, + "balance_loss_mlp": 1.03067446, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 1.9179857475411626, + "language_loss": 0.7433297, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76475531, + "num_input_tokens_seen": 98879695, + "step": 4575, + "time_per_iteration": 2.6148297786712646 + }, + { + "auxiliary_loss_clip": 0.01116613, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.04837322, + "balance_loss_mlp": 1.02459693, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.9037872313967157, + "language_loss": 0.71472859, + "learning_rate": 3.402114029526814e-06, + "loss": 0.73629296, + "num_input_tokens_seen": 98902035, + "step": 4576, + "time_per_iteration": 2.668282985687256 + }, + { + "auxiliary_loss_clip": 0.01093512, + "auxiliary_loss_mlp": 0.0079766, + "balance_loss_clip": 1.04820681, + "balance_loss_mlp": 1.01608515, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 2.3717910777771403, + "language_loss": 0.7332589, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75217062, + "num_input_tokens_seen": 98921835, + "step": 4577, + "time_per_iteration": 2.609741687774658 + }, + { + "auxiliary_loss_clip": 0.01119014, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.04985094, + "balance_loss_mlp": 1.01841295, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 1.7459002502087733, + "language_loss": 0.75670123, + "learning_rate": 3.401558468884188e-06, + "loss": 0.77824152, + "num_input_tokens_seen": 98939610, + "step": 4578, + "time_per_iteration": 2.6070759296417236 + }, + { + "auxiliary_loss_clip": 0.01112081, + "auxiliary_loss_mlp": 0.01049131, + "balance_loss_clip": 1.05056262, + "balance_loss_mlp": 1.03012896, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 1.4042292350543357, + "language_loss": 0.66100872, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68262082, + "num_input_tokens_seen": 98962250, + "step": 4579, + "time_per_iteration": 2.605952262878418 + }, + { + "auxiliary_loss_clip": 0.01108784, + "auxiliary_loss_mlp": 0.01053297, + "balance_loss_clip": 1.04921377, + "balance_loss_mlp": 1.03489113, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 1.7835397570030327, + "language_loss": 0.79870218, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82032299, + "num_input_tokens_seen": 98981845, + "step": 4580, + "time_per_iteration": 2.5854430198669434 + }, + { + "auxiliary_loss_clip": 0.01129091, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.05021822, + "balance_loss_mlp": 1.02716339, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.4597107140874, + "language_loss": 0.67508006, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.69682312, + "num_input_tokens_seen": 99001855, + "step": 4581, + "time_per_iteration": 2.51517391204834 + }, + { + "auxiliary_loss_clip": 0.01137224, + "auxiliary_loss_mlp": 0.01043672, + "balance_loss_clip": 1.07608891, + "balance_loss_mlp": 1.02787638, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.6345952899159173, + "language_loss": 0.78349888, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80530781, + "num_input_tokens_seen": 99019880, + "step": 4582, + "time_per_iteration": 2.5103378295898438 + }, + { + "auxiliary_loss_clip": 0.0109258, + "auxiliary_loss_mlp": 0.01041821, + "balance_loss_clip": 1.04995096, + "balance_loss_mlp": 1.02639556, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 1.689995703652388, + "language_loss": 0.84217536, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86351943, + "num_input_tokens_seen": 99037570, + "step": 4583, + "time_per_iteration": 2.5703258514404297 + }, + { + "auxiliary_loss_clip": 0.01129991, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.04892838, + "balance_loss_mlp": 1.02340567, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 1.6670971472417908, + "language_loss": 0.67117977, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69287276, + "num_input_tokens_seen": 99056875, + "step": 4584, + "time_per_iteration": 2.5056724548339844 + }, + { + "auxiliary_loss_clip": 0.01075734, + "auxiliary_loss_mlp": 0.01058966, + "balance_loss_clip": 1.04437709, + "balance_loss_mlp": 1.04119158, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.7901573065808485, + "language_loss": 0.76945627, + "learning_rate": 3.399612333050327e-06, + "loss": 0.79080331, + "num_input_tokens_seen": 99074685, + "step": 4585, + "time_per_iteration": 2.571514844894409 + }, + { + "auxiliary_loss_clip": 0.01132056, + "auxiliary_loss_mlp": 0.00795758, + "balance_loss_clip": 1.05190325, + "balance_loss_mlp": 1.01412296, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 2.502688797944457, + "language_loss": 0.71783203, + "learning_rate": 3.399334101267362e-06, + "loss": 0.73711014, + "num_input_tokens_seen": 99095300, + "step": 4586, + "time_per_iteration": 2.55117130279541 + }, + { + "auxiliary_loss_clip": 0.01116307, + "auxiliary_loss_mlp": 0.0103393, + "balance_loss_clip": 1.04923749, + "balance_loss_mlp": 1.01833105, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.5021314304396445, + "language_loss": 0.80737627, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82887852, + "num_input_tokens_seen": 99115965, + "step": 4587, + "time_per_iteration": 2.553982973098755 + }, + { + "auxiliary_loss_clip": 0.01124902, + "auxiliary_loss_mlp": 0.01037289, + "balance_loss_clip": 1.04641604, + "balance_loss_mlp": 1.0215714, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 1.8384515501312855, + "language_loss": 0.82731664, + "learning_rate": 3.398777478523316e-06, + "loss": 0.84893858, + "num_input_tokens_seen": 99134265, + "step": 4588, + "time_per_iteration": 2.4975743293762207 + }, + { + "auxiliary_loss_clip": 0.0110073, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.04469192, + "balance_loss_mlp": 1.0222528, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.4391082804463773, + "language_loss": 0.75537062, + "learning_rate": 3.398499087583342e-06, + "loss": 0.77676433, + "num_input_tokens_seen": 99156185, + "step": 4589, + "time_per_iteration": 2.6969096660614014 + }, + { + "auxiliary_loss_clip": 0.0113205, + "auxiliary_loss_mlp": 0.01042943, + "balance_loss_clip": 1.05382586, + "balance_loss_mlp": 1.02686214, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.7344916579577159, + "language_loss": 0.88667816, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90842807, + "num_input_tokens_seen": 99176735, + "step": 4590, + "time_per_iteration": 4.020005702972412 + }, + { + "auxiliary_loss_clip": 0.01120574, + "auxiliary_loss_mlp": 0.01046791, + "balance_loss_clip": 1.04740644, + "balance_loss_mlp": 1.02985179, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.5917515285143642, + "language_loss": 0.71352559, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73519927, + "num_input_tokens_seen": 99199765, + "step": 4591, + "time_per_iteration": 2.6162924766540527 + }, + { + "auxiliary_loss_clip": 0.01103177, + "auxiliary_loss_mlp": 0.01042908, + "balance_loss_clip": 1.04735827, + "balance_loss_mlp": 1.02623081, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 1.7693090981193267, + "language_loss": 0.80589068, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82735151, + "num_input_tokens_seen": 99218435, + "step": 4592, + "time_per_iteration": 2.5975236892700195 + }, + { + "auxiliary_loss_clip": 0.01042429, + "auxiliary_loss_mlp": 0.01064954, + "balance_loss_clip": 1.02105987, + "balance_loss_mlp": 1.51519334, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7262632233473194, + "language_loss": 0.61610973, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63718367, + "num_input_tokens_seen": 99276200, + "step": 4593, + "time_per_iteration": 3.071500778198242 + }, + { + "auxiliary_loss_clip": 0.01122033, + "auxiliary_loss_mlp": 0.01044588, + "balance_loss_clip": 1.04893804, + "balance_loss_mlp": 1.0290314, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.9477870890362081, + "language_loss": 0.77191192, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79357809, + "num_input_tokens_seen": 99297625, + "step": 4594, + "time_per_iteration": 3.9729862213134766 + }, + { + "auxiliary_loss_clip": 0.01126328, + "auxiliary_loss_mlp": 0.01035069, + "balance_loss_clip": 1.04918838, + "balance_loss_mlp": 1.01904774, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.5727802020094417, + "language_loss": 0.91514361, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93675756, + "num_input_tokens_seen": 99315790, + "step": 4595, + "time_per_iteration": 3.844413995742798 + }, + { + "auxiliary_loss_clip": 0.01125853, + "auxiliary_loss_mlp": 0.01047307, + "balance_loss_clip": 1.05224323, + "balance_loss_mlp": 1.03037965, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 1.794384131044761, + "language_loss": 0.69423991, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71597153, + "num_input_tokens_seen": 99334615, + "step": 4596, + "time_per_iteration": 2.482381820678711 + }, + { + "auxiliary_loss_clip": 0.01114096, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.04718077, + "balance_loss_mlp": 1.02651477, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 2.098328341184232, + "language_loss": 0.6364525, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.65802419, + "num_input_tokens_seen": 99356685, + "step": 4597, + "time_per_iteration": 4.0288896560668945 + }, + { + "auxiliary_loss_clip": 0.01136141, + "auxiliary_loss_mlp": 0.01040724, + "balance_loss_clip": 1.04940391, + "balance_loss_mlp": 1.02554893, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 2.0482349351030797, + "language_loss": 0.86713678, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88890541, + "num_input_tokens_seen": 99374810, + "step": 4598, + "time_per_iteration": 2.435037136077881 + }, + { + "auxiliary_loss_clip": 0.01137243, + "auxiliary_loss_mlp": 0.01045467, + "balance_loss_clip": 1.04756272, + "balance_loss_mlp": 1.0284555, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 2.282877417620533, + "language_loss": 0.79980391, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82163101, + "num_input_tokens_seen": 99391290, + "step": 4599, + "time_per_iteration": 2.4718916416168213 + }, + { + "auxiliary_loss_clip": 0.01117317, + "auxiliary_loss_mlp": 0.01045297, + "balance_loss_clip": 1.04641652, + "balance_loss_mlp": 1.02933502, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.9949490391617812, + "language_loss": 0.79224294, + "learning_rate": 3.395433289506639e-06, + "loss": 0.81386912, + "num_input_tokens_seen": 99409120, + "step": 4600, + "time_per_iteration": 2.5263750553131104 + }, + { + "auxiliary_loss_clip": 0.01107023, + "auxiliary_loss_mlp": 0.01045421, + "balance_loss_clip": 1.04740596, + "balance_loss_mlp": 1.02910113, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 1.745096568033291, + "language_loss": 0.72896874, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75049317, + "num_input_tokens_seen": 99426180, + "step": 4601, + "time_per_iteration": 2.5388035774230957 + }, + { + "auxiliary_loss_clip": 0.01124964, + "auxiliary_loss_mlp": 0.01044537, + "balance_loss_clip": 1.04663622, + "balance_loss_mlp": 1.02746677, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.6226448699763845, + "language_loss": 0.80026096, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.82195592, + "num_input_tokens_seen": 99447720, + "step": 4602, + "time_per_iteration": 2.541095495223999 + }, + { + "auxiliary_loss_clip": 0.01120022, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_clip": 1.04730737, + "balance_loss_mlp": 1.03398371, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.0494422499463547, + "language_loss": 0.77083993, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79256094, + "num_input_tokens_seen": 99464720, + "step": 4603, + "time_per_iteration": 2.503892660140991 + }, + { + "auxiliary_loss_clip": 0.0110519, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_clip": 1.04653883, + "balance_loss_mlp": 1.0270164, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.545044163277421, + "language_loss": 0.81708497, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83855796, + "num_input_tokens_seen": 99482310, + "step": 4604, + "time_per_iteration": 2.505063772201538 + }, + { + "auxiliary_loss_clip": 0.01090317, + "auxiliary_loss_mlp": 0.01034344, + "balance_loss_clip": 1.04821622, + "balance_loss_mlp": 1.01755977, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.879089817930719, + "language_loss": 0.70047253, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72171915, + "num_input_tokens_seen": 99501255, + "step": 4605, + "time_per_iteration": 2.6266467571258545 + }, + { + "auxiliary_loss_clip": 0.01037719, + "auxiliary_loss_mlp": 0.01014949, + "balance_loss_clip": 1.01658928, + "balance_loss_mlp": 1.01294613, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.6991732294776043, + "language_loss": 0.57192135, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.592448, + "num_input_tokens_seen": 99568925, + "step": 4606, + "time_per_iteration": 3.1990973949432373 + }, + { + "auxiliary_loss_clip": 0.01118391, + "auxiliary_loss_mlp": 0.01049217, + "balance_loss_clip": 1.05202675, + "balance_loss_mlp": 1.03230166, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 2.007095824110389, + "language_loss": 0.69247019, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71414638, + "num_input_tokens_seen": 99588455, + "step": 4607, + "time_per_iteration": 2.656494379043579 + }, + { + "auxiliary_loss_clip": 0.01120114, + "auxiliary_loss_mlp": 0.01038531, + "balance_loss_clip": 1.04959011, + "balance_loss_mlp": 1.02318907, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 1.8633162070389817, + "language_loss": 0.69553816, + "learning_rate": 3.393199595837555e-06, + "loss": 0.71712458, + "num_input_tokens_seen": 99609355, + "step": 4608, + "time_per_iteration": 2.5660226345062256 + }, + { + "auxiliary_loss_clip": 0.0108775, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.04684854, + "balance_loss_mlp": 1.01942933, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 1.7415987207604806, + "language_loss": 0.72589314, + "learning_rate": 3.392920146281499e-06, + "loss": 0.74712217, + "num_input_tokens_seen": 99628780, + "step": 4609, + "time_per_iteration": 2.6270017623901367 + }, + { + "auxiliary_loss_clip": 0.01098854, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_clip": 1.04445148, + "balance_loss_mlp": 1.0321641, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.4332284724049185, + "language_loss": 0.84144986, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86293459, + "num_input_tokens_seen": 99644545, + "step": 4610, + "time_per_iteration": 2.5473031997680664 + }, + { + "auxiliary_loss_clip": 0.01070351, + "auxiliary_loss_mlp": 0.00970023, + "balance_loss_clip": 1.04251122, + "balance_loss_mlp": 1.32770705, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 1.9969084022953179, + "language_loss": 0.68949115, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.7098949, + "num_input_tokens_seen": 99663125, + "step": 4611, + "time_per_iteration": 2.6056246757507324 + }, + { + "auxiliary_loss_clip": 0.01132708, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.04773974, + "balance_loss_mlp": 1.02491426, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.2208598116921334, + "language_loss": 0.73927277, + "learning_rate": 3.392081480737698e-06, + "loss": 0.7610054, + "num_input_tokens_seen": 99682645, + "step": 4612, + "time_per_iteration": 2.4833269119262695 + }, + { + "auxiliary_loss_clip": 0.01126487, + "auxiliary_loss_mlp": 0.00926401, + "balance_loss_clip": 1.04549122, + "balance_loss_mlp": 1.2558229, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.1654356304366074, + "language_loss": 0.66371137, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68424022, + "num_input_tokens_seen": 99700520, + "step": 4613, + "time_per_iteration": 2.489562749862671 + }, + { + "auxiliary_loss_clip": 0.01089532, + "auxiliary_loss_mlp": 0.01048419, + "balance_loss_clip": 1.04482877, + "balance_loss_mlp": 1.03044188, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.5999895911122677, + "language_loss": 0.79339874, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81477821, + "num_input_tokens_seen": 99720355, + "step": 4614, + "time_per_iteration": 2.5995330810546875 + }, + { + "auxiliary_loss_clip": 0.01122576, + "auxiliary_loss_mlp": 0.01048232, + "balance_loss_clip": 1.0471983, + "balance_loss_mlp": 1.0308274, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.4625342958150083, + "language_loss": 0.80370331, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.82541144, + "num_input_tokens_seen": 99736090, + "step": 4615, + "time_per_iteration": 2.4763236045837402 + }, + { + "auxiliary_loss_clip": 0.01105885, + "auxiliary_loss_mlp": 0.0104815, + "balance_loss_clip": 1.04328609, + "balance_loss_mlp": 1.03128231, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.5065406414970672, + "language_loss": 0.64132905, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.66286939, + "num_input_tokens_seen": 99751805, + "step": 4616, + "time_per_iteration": 2.526428461074829 + }, + { + "auxiliary_loss_clip": 0.01119088, + "auxiliary_loss_mlp": 0.01041888, + "balance_loss_clip": 1.04597723, + "balance_loss_mlp": 1.02572346, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 1.9628791553878375, + "language_loss": 0.82244915, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84405893, + "num_input_tokens_seen": 99770610, + "step": 4617, + "time_per_iteration": 2.463698387145996 + }, + { + "auxiliary_loss_clip": 0.0113526, + "auxiliary_loss_mlp": 0.01046815, + "balance_loss_clip": 1.04471505, + "balance_loss_mlp": 1.0306505, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 2.1035555178318246, + "language_loss": 0.76906061, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.7908814, + "num_input_tokens_seen": 99787305, + "step": 4618, + "time_per_iteration": 2.4476096630096436 + }, + { + "auxiliary_loss_clip": 0.01140291, + "auxiliary_loss_mlp": 0.01043357, + "balance_loss_clip": 1.04953671, + "balance_loss_mlp": 1.02762127, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.7186766072974553, + "language_loss": 0.84697956, + "learning_rate": 3.390122747388459e-06, + "loss": 0.86881602, + "num_input_tokens_seen": 99808940, + "step": 4619, + "time_per_iteration": 2.4974989891052246 + }, + { + "auxiliary_loss_clip": 0.01115397, + "auxiliary_loss_mlp": 0.01044371, + "balance_loss_clip": 1.04970312, + "balance_loss_mlp": 1.02936208, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.4532002320895863, + "language_loss": 0.77073061, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.79232824, + "num_input_tokens_seen": 99829575, + "step": 4620, + "time_per_iteration": 2.553058385848999 + }, + { + "auxiliary_loss_clip": 0.01083806, + "auxiliary_loss_mlp": 0.01045214, + "balance_loss_clip": 1.04345798, + "balance_loss_mlp": 1.02858424, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 1.9414833436103471, + "language_loss": 0.78531361, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80660379, + "num_input_tokens_seen": 99847575, + "step": 4621, + "time_per_iteration": 2.5575671195983887 + }, + { + "auxiliary_loss_clip": 0.01105024, + "auxiliary_loss_mlp": 0.01048569, + "balance_loss_clip": 1.04612386, + "balance_loss_mlp": 1.03178477, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 2.054327389108337, + "language_loss": 0.88047147, + "learning_rate": 3.389282499322611e-06, + "loss": 0.90200746, + "num_input_tokens_seen": 99864995, + "step": 4622, + "time_per_iteration": 2.5794317722320557 + }, + { + "auxiliary_loss_clip": 0.01092365, + "auxiliary_loss_mlp": 0.01046548, + "balance_loss_clip": 1.05089056, + "balance_loss_mlp": 1.02983463, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 1.832885730134156, + "language_loss": 0.81256199, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83395118, + "num_input_tokens_seen": 99881540, + "step": 4623, + "time_per_iteration": 2.566901683807373 + }, + { + "auxiliary_loss_clip": 0.01112504, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.05366421, + "balance_loss_mlp": 1.02533627, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 1.9271216548617651, + "language_loss": 0.81662834, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83816481, + "num_input_tokens_seen": 99899595, + "step": 4624, + "time_per_iteration": 2.5724620819091797 + }, + { + "auxiliary_loss_clip": 0.01103994, + "auxiliary_loss_mlp": 0.0086115, + "balance_loss_clip": 1.04751551, + "balance_loss_mlp": 1.13005877, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 2.333408868810702, + "language_loss": 0.76850295, + "learning_rate": 3.388441777121191e-06, + "loss": 0.78815436, + "num_input_tokens_seen": 99913020, + "step": 4625, + "time_per_iteration": 2.4856669902801514 + }, + { + "auxiliary_loss_clip": 0.01101894, + "auxiliary_loss_mlp": 0.01045432, + "balance_loss_clip": 1.04383314, + "balance_loss_mlp": 1.028409, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 1.9236987067820803, + "language_loss": 0.70091903, + "learning_rate": 3.388161431073511e-06, + "loss": 0.72239232, + "num_input_tokens_seen": 99931405, + "step": 4626, + "time_per_iteration": 2.5033881664276123 + }, + { + "auxiliary_loss_clip": 0.01100071, + "auxiliary_loss_mlp": 0.01037942, + "balance_loss_clip": 1.05040431, + "balance_loss_mlp": 1.02050173, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.220480798840469, + "language_loss": 0.92374498, + "learning_rate": 3.38788103238661e-06, + "loss": 0.9451251, + "num_input_tokens_seen": 99948100, + "step": 4627, + "time_per_iteration": 2.590651512145996 + }, + { + "auxiliary_loss_clip": 0.01138132, + "auxiliary_loss_mlp": 0.01036755, + "balance_loss_clip": 1.04818082, + "balance_loss_mlp": 1.02204502, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 1.8443621619033856, + "language_loss": 0.8555727, + "learning_rate": 3.387600581071121e-06, + "loss": 0.8773216, + "num_input_tokens_seen": 99966470, + "step": 4628, + "time_per_iteration": 3.9970452785491943 + }, + { + "auxiliary_loss_clip": 0.01104961, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.04529834, + "balance_loss_mlp": 1.02195776, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.6773352140184754, + "language_loss": 0.79432112, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81574756, + "num_input_tokens_seen": 99985930, + "step": 4629, + "time_per_iteration": 2.60141921043396 + }, + { + "auxiliary_loss_clip": 0.01096186, + "auxiliary_loss_mlp": 0.01035997, + "balance_loss_clip": 1.04897237, + "balance_loss_mlp": 1.02100015, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.5225963087161418, + "language_loss": 0.84674692, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.8680687, + "num_input_tokens_seen": 100006235, + "step": 4630, + "time_per_iteration": 2.5954160690307617 + }, + { + "auxiliary_loss_clip": 0.01116265, + "auxiliary_loss_mlp": 0.01036389, + "balance_loss_clip": 1.04775929, + "balance_loss_mlp": 1.02040255, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.056054936080906, + "language_loss": 0.80716789, + "learning_rate": 3.386758911459485e-06, + "loss": 0.8286944, + "num_input_tokens_seen": 100023655, + "step": 4631, + "time_per_iteration": 2.5210793018341064 + }, + { + "auxiliary_loss_clip": 0.01140955, + "auxiliary_loss_mlp": 0.01044766, + "balance_loss_clip": 1.05197287, + "balance_loss_mlp": 1.0289588, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 2.0131091829862107, + "language_loss": 0.71281576, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.73467302, + "num_input_tokens_seen": 100043280, + "step": 4632, + "time_per_iteration": 3.8958778381347656 + }, + { + "auxiliary_loss_clip": 0.01124893, + "auxiliary_loss_mlp": 0.0103816, + "balance_loss_clip": 1.05348122, + "balance_loss_mlp": 1.02353859, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 2.8729806000309006, + "language_loss": 0.82598174, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84761232, + "num_input_tokens_seen": 100057690, + "step": 4633, + "time_per_iteration": 2.4715912342071533 + }, + { + "auxiliary_loss_clip": 0.01114567, + "auxiliary_loss_mlp": 0.01036745, + "balance_loss_clip": 1.04612207, + "balance_loss_mlp": 1.01996088, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.7099276805900336, + "language_loss": 0.88070834, + "learning_rate": 3.385916768573529e-06, + "loss": 0.9022215, + "num_input_tokens_seen": 100075875, + "step": 4634, + "time_per_iteration": 3.918361186981201 + }, + { + "auxiliary_loss_clip": 0.01111288, + "auxiliary_loss_mlp": 0.01038052, + "balance_loss_clip": 1.05041873, + "balance_loss_mlp": 1.02167225, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.5728230436888795, + "language_loss": 0.77092016, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79241359, + "num_input_tokens_seen": 100092930, + "step": 4635, + "time_per_iteration": 2.525675058364868 + }, + { + "auxiliary_loss_clip": 0.01137117, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_clip": 1.04662895, + "balance_loss_mlp": 1.02541304, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.8168896741910923, + "language_loss": 0.65567625, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67746478, + "num_input_tokens_seen": 100110790, + "step": 4636, + "time_per_iteration": 3.8618345260620117 + }, + { + "auxiliary_loss_clip": 0.01123143, + "auxiliary_loss_mlp": 0.01041048, + "balance_loss_clip": 1.04482627, + "balance_loss_mlp": 1.02439427, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.338718946525157, + "language_loss": 0.8375017, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.85914361, + "num_input_tokens_seen": 100126970, + "step": 4637, + "time_per_iteration": 2.501603126525879 + }, + { + "auxiliary_loss_clip": 0.0110914, + "auxiliary_loss_mlp": 0.01038811, + "balance_loss_clip": 1.0451746, + "balance_loss_mlp": 1.0236001, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.560462396490986, + "language_loss": 0.75961751, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78109705, + "num_input_tokens_seen": 100146720, + "step": 4638, + "time_per_iteration": 2.5474679470062256 + }, + { + "auxiliary_loss_clip": 0.0112359, + "auxiliary_loss_mlp": 0.01046815, + "balance_loss_clip": 1.04570699, + "balance_loss_mlp": 1.03063846, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.4802483772938428, + "language_loss": 0.71746266, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73916662, + "num_input_tokens_seen": 100165920, + "step": 4639, + "time_per_iteration": 2.56996750831604 + }, + { + "auxiliary_loss_clip": 0.01125007, + "auxiliary_loss_mlp": 0.01031937, + "balance_loss_clip": 1.0462656, + "balance_loss_mlp": 1.01592779, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 1.9886332186873583, + "language_loss": 0.65409517, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67566466, + "num_input_tokens_seen": 100185525, + "step": 4640, + "time_per_iteration": 2.621410846710205 + }, + { + "auxiliary_loss_clip": 0.01127101, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.04912806, + "balance_loss_mlp": 1.02020025, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 2.132173514709288, + "language_loss": 0.72568685, + "learning_rate": 3.383949929609804e-06, + "loss": 0.74731255, + "num_input_tokens_seen": 100204850, + "step": 4641, + "time_per_iteration": 2.515049695968628 + }, + { + "auxiliary_loss_clip": 0.01104436, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.05019653, + "balance_loss_mlp": 1.02145994, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.8084714821666483, + "language_loss": 0.75005323, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77148622, + "num_input_tokens_seen": 100224520, + "step": 4642, + "time_per_iteration": 2.5699260234832764 + }, + { + "auxiliary_loss_clip": 0.01099257, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.04186738, + "balance_loss_mlp": 1.02415264, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 1.7883942494773621, + "language_loss": 0.86328977, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.88469505, + "num_input_tokens_seen": 100243935, + "step": 4643, + "time_per_iteration": 2.5711557865142822 + }, + { + "auxiliary_loss_clip": 0.01097974, + "auxiliary_loss_mlp": 0.01044078, + "balance_loss_clip": 1.04903412, + "balance_loss_mlp": 1.02775788, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 2.0920994559622472, + "language_loss": 0.83128965, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85271019, + "num_input_tokens_seen": 100262290, + "step": 4644, + "time_per_iteration": 2.5910048484802246 + }, + { + "auxiliary_loss_clip": 0.01126353, + "auxiliary_loss_mlp": 0.01036884, + "balance_loss_clip": 1.04743123, + "balance_loss_mlp": 1.02117229, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 2.141806993340925, + "language_loss": 0.78821397, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.80984634, + "num_input_tokens_seen": 100280015, + "step": 4645, + "time_per_iteration": 2.4989938735961914 + }, + { + "auxiliary_loss_clip": 0.01038937, + "auxiliary_loss_mlp": 0.01002614, + "balance_loss_clip": 1.02718139, + "balance_loss_mlp": 1.00020587, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.77199471251545, + "language_loss": 0.62278366, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64319915, + "num_input_tokens_seen": 100338935, + "step": 4646, + "time_per_iteration": 3.1178700923919678 + }, + { + "auxiliary_loss_clip": 0.01106299, + "auxiliary_loss_mlp": 0.01035882, + "balance_loss_clip": 1.04774046, + "balance_loss_mlp": 1.02127862, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 2.140834403548063, + "language_loss": 0.89443445, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91585624, + "num_input_tokens_seen": 100359905, + "step": 4647, + "time_per_iteration": 2.6019418239593506 + }, + { + "auxiliary_loss_clip": 0.01128825, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.04668164, + "balance_loss_mlp": 1.02680373, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.9868112668555429, + "language_loss": 0.87038803, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89210391, + "num_input_tokens_seen": 100376955, + "step": 4648, + "time_per_iteration": 2.5379035472869873 + }, + { + "auxiliary_loss_clip": 0.01127397, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.04859483, + "balance_loss_mlp": 1.02043843, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 2.647708062291757, + "language_loss": 0.72879887, + "learning_rate": 3.38169896509385e-06, + "loss": 0.75043207, + "num_input_tokens_seen": 100397545, + "step": 4649, + "time_per_iteration": 2.6446566581726074 + }, + { + "auxiliary_loss_clip": 0.01104258, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.04621673, + "balance_loss_mlp": 1.02039361, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.016975520561866, + "language_loss": 0.80387753, + "learning_rate": 3.381417358643549e-06, + "loss": 0.82529902, + "num_input_tokens_seen": 100415080, + "step": 4650, + "time_per_iteration": 2.559195041656494 + }, + { + "auxiliary_loss_clip": 0.01041749, + "auxiliary_loss_mlp": 0.01101311, + "balance_loss_clip": 1.03725791, + "balance_loss_mlp": 1.57935834, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 0.833010792876427, + "language_loss": 0.58868992, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.61012053, + "num_input_tokens_seen": 100471105, + "step": 4651, + "time_per_iteration": 3.195953845977783 + }, + { + "auxiliary_loss_clip": 0.01124402, + "auxiliary_loss_mlp": 0.01040942, + "balance_loss_clip": 1.04370177, + "balance_loss_mlp": 1.02368093, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 2.2462954633140018, + "language_loss": 0.73860872, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76026213, + "num_input_tokens_seen": 100492520, + "step": 4652, + "time_per_iteration": 2.527059555053711 + }, + { + "auxiliary_loss_clip": 0.01139726, + "auxiliary_loss_mlp": 0.01042486, + "balance_loss_clip": 1.04931331, + "balance_loss_mlp": 1.02638078, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.2423148305832967, + "language_loss": 0.79883558, + "learning_rate": 3.380572225034461e-06, + "loss": 0.82065773, + "num_input_tokens_seen": 100512870, + "step": 4653, + "time_per_iteration": 2.610903263092041 + }, + { + "auxiliary_loss_clip": 0.01113825, + "auxiliary_loss_mlp": 0.01042194, + "balance_loss_clip": 1.04847276, + "balance_loss_mlp": 1.02607083, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.361965493090918, + "language_loss": 0.7899189, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81147909, + "num_input_tokens_seen": 100531655, + "step": 4654, + "time_per_iteration": 2.5414981842041016 + }, + { + "auxiliary_loss_clip": 0.01088722, + "auxiliary_loss_mlp": 0.01042359, + "balance_loss_clip": 1.04437375, + "balance_loss_mlp": 1.02504945, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 1.7510041641377763, + "language_loss": 0.80720276, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.8285135, + "num_input_tokens_seen": 100548005, + "step": 4655, + "time_per_iteration": 2.612304925918579 + }, + { + "auxiliary_loss_clip": 0.0110336, + "auxiliary_loss_mlp": 0.00819539, + "balance_loss_clip": 1.04477298, + "balance_loss_mlp": 1.05953538, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.8097650784293082, + "language_loss": 0.818385, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.83761394, + "num_input_tokens_seen": 100567980, + "step": 4656, + "time_per_iteration": 2.6301631927490234 + }, + { + "auxiliary_loss_clip": 0.01107626, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.04694164, + "balance_loss_mlp": 1.01987422, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.6411319824669215, + "language_loss": 0.83330786, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85474384, + "num_input_tokens_seen": 100588630, + "step": 4657, + "time_per_iteration": 2.578326940536499 + }, + { + "auxiliary_loss_clip": 0.01101136, + "auxiliary_loss_mlp": 0.01044654, + "balance_loss_clip": 1.04536581, + "balance_loss_mlp": 1.02764308, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.982101006700611, + "language_loss": 0.63934344, + "learning_rate": 3.379162622133105e-06, + "loss": 0.66080135, + "num_input_tokens_seen": 100608775, + "step": 4658, + "time_per_iteration": 2.6612305641174316 + }, + { + "auxiliary_loss_clip": 0.01124837, + "auxiliary_loss_mlp": 0.01047489, + "balance_loss_clip": 1.04482651, + "balance_loss_mlp": 1.03096616, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 1.7052387776381874, + "language_loss": 0.78747398, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80919725, + "num_input_tokens_seen": 100627975, + "step": 4659, + "time_per_iteration": 2.5252163410186768 + }, + { + "auxiliary_loss_clip": 0.01099112, + "auxiliary_loss_mlp": 0.01052073, + "balance_loss_clip": 1.04606164, + "balance_loss_mlp": 1.03557467, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.669471323442172, + "language_loss": 0.7909928, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81250465, + "num_input_tokens_seen": 100645430, + "step": 4660, + "time_per_iteration": 2.582948684692383 + }, + { + "auxiliary_loss_clip": 0.01095089, + "auxiliary_loss_mlp": 0.01040743, + "balance_loss_clip": 1.04538989, + "balance_loss_mlp": 1.02549028, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.7014261992680957, + "language_loss": 0.80440485, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82576323, + "num_input_tokens_seen": 100663775, + "step": 4661, + "time_per_iteration": 2.550361394882202 + }, + { + "auxiliary_loss_clip": 0.01117089, + "auxiliary_loss_mlp": 0.01051152, + "balance_loss_clip": 1.04930079, + "balance_loss_mlp": 1.03491592, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 2.084969429345593, + "language_loss": 0.78964412, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.8113265, + "num_input_tokens_seen": 100686085, + "step": 4662, + "time_per_iteration": 2.670490026473999 + }, + { + "auxiliary_loss_clip": 0.01118154, + "auxiliary_loss_mlp": 0.010417, + "balance_loss_clip": 1.0430541, + "balance_loss_mlp": 1.02335382, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 1.6029331798194923, + "language_loss": 0.69727182, + "learning_rate": 3.377751711782227e-06, + "loss": 0.71887034, + "num_input_tokens_seen": 100705135, + "step": 4663, + "time_per_iteration": 2.5277862548828125 + }, + { + "auxiliary_loss_clip": 0.01129237, + "auxiliary_loss_mlp": 0.01046153, + "balance_loss_clip": 1.06351531, + "balance_loss_mlp": 1.02902293, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.6553843075939263, + "language_loss": 0.77396965, + "learning_rate": 3.377469372935791e-06, + "loss": 0.79572356, + "num_input_tokens_seen": 100724960, + "step": 4664, + "time_per_iteration": 2.5541462898254395 + }, + { + "auxiliary_loss_clip": 0.01107859, + "auxiliary_loss_mlp": 0.01044576, + "balance_loss_clip": 1.05315757, + "balance_loss_mlp": 1.0288403, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.69870994289954, + "language_loss": 0.79033214, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81185651, + "num_input_tokens_seen": 100741995, + "step": 4665, + "time_per_iteration": 2.520967483520508 + }, + { + "auxiliary_loss_clip": 0.01120048, + "auxiliary_loss_mlp": 0.01040802, + "balance_loss_clip": 1.04307604, + "balance_loss_mlp": 1.02476859, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 1.749609877681062, + "language_loss": 0.81003022, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.83163869, + "num_input_tokens_seen": 100758985, + "step": 4666, + "time_per_iteration": 3.872018337249756 + }, + { + "auxiliary_loss_clip": 0.01097885, + "auxiliary_loss_mlp": 0.01054708, + "balance_loss_clip": 1.04372501, + "balance_loss_mlp": 1.03642142, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 1.8043685598694792, + "language_loss": 0.85038841, + "learning_rate": 3.376622043036658e-06, + "loss": 0.87191433, + "num_input_tokens_seen": 100777820, + "step": 4667, + "time_per_iteration": 2.5516834259033203 + }, + { + "auxiliary_loss_clip": 0.01109251, + "auxiliary_loss_mlp": 0.00856152, + "balance_loss_clip": 1.04683554, + "balance_loss_mlp": 1.12841833, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.5976279354310383, + "language_loss": 0.79502612, + "learning_rate": 3.376339495319373e-06, + "loss": 0.8146801, + "num_input_tokens_seen": 100798205, + "step": 4668, + "time_per_iteration": 2.6213877201080322 + }, + { + "auxiliary_loss_clip": 0.01090446, + "auxiliary_loss_mlp": 0.01042703, + "balance_loss_clip": 1.05952621, + "balance_loss_mlp": 1.02604938, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 1.3921098550598912, + "language_loss": 0.76412952, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78546101, + "num_input_tokens_seen": 100819800, + "step": 4669, + "time_per_iteration": 2.6618430614471436 + }, + { + "auxiliary_loss_clip": 0.01124576, + "auxiliary_loss_mlp": 0.01042096, + "balance_loss_clip": 1.04607916, + "balance_loss_mlp": 1.02585995, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 1.9127563800949565, + "language_loss": 0.78800786, + "learning_rate": 3.375774243322725e-06, + "loss": 0.8096745, + "num_input_tokens_seen": 100837880, + "step": 4670, + "time_per_iteration": 3.892761707305908 + }, + { + "auxiliary_loss_clip": 0.01096477, + "auxiliary_loss_mlp": 0.010456, + "balance_loss_clip": 1.04462171, + "balance_loss_mlp": 1.02771902, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 1.9736450835321329, + "language_loss": 0.78751451, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.80893528, + "num_input_tokens_seen": 100856350, + "step": 4671, + "time_per_iteration": 2.594900131225586 + }, + { + "auxiliary_loss_clip": 0.0111549, + "auxiliary_loss_mlp": 0.01041309, + "balance_loss_clip": 1.04449296, + "balance_loss_mlp": 1.02495337, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.6749655398289727, + "language_loss": 0.75135559, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77292353, + "num_input_tokens_seen": 100876135, + "step": 4672, + "time_per_iteration": 2.542782783508301 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01041074, + "balance_loss_clip": 1.04347706, + "balance_loss_mlp": 1.02238178, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 7.019177576679457, + "language_loss": 0.75139004, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77295291, + "num_input_tokens_seen": 100894790, + "step": 4673, + "time_per_iteration": 3.936271905899048 + }, + { + "auxiliary_loss_clip": 0.01123439, + "auxiliary_loss_mlp": 0.01037698, + "balance_loss_clip": 1.04712594, + "balance_loss_mlp": 1.02110422, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 1.7859653993196654, + "language_loss": 0.727992, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74960339, + "num_input_tokens_seen": 100915100, + "step": 4674, + "time_per_iteration": 3.9172489643096924 + }, + { + "auxiliary_loss_clip": 0.01129035, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.04793084, + "balance_loss_mlp": 1.02123761, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.946414113035512, + "language_loss": 0.77284235, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79451954, + "num_input_tokens_seen": 100932795, + "step": 4675, + "time_per_iteration": 2.5007379055023193 + }, + { + "auxiliary_loss_clip": 0.01136569, + "auxiliary_loss_mlp": 0.01040781, + "balance_loss_clip": 1.04534388, + "balance_loss_mlp": 1.02379346, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 1.998049523469888, + "language_loss": 0.70266777, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72444129, + "num_input_tokens_seen": 100950505, + "step": 4676, + "time_per_iteration": 2.4650843143463135 + }, + { + "auxiliary_loss_clip": 0.01134299, + "auxiliary_loss_mlp": 0.01036167, + "balance_loss_clip": 1.04904854, + "balance_loss_mlp": 1.02105153, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.5541214466072006, + "language_loss": 0.69771254, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.71941721, + "num_input_tokens_seen": 100968790, + "step": 4677, + "time_per_iteration": 2.454850912094116 + }, + { + "auxiliary_loss_clip": 0.01118687, + "auxiliary_loss_mlp": 0.01044677, + "balance_loss_clip": 1.04783058, + "balance_loss_mlp": 1.02660513, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.5466186628135945, + "language_loss": 0.63499784, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65663147, + "num_input_tokens_seen": 100990205, + "step": 4678, + "time_per_iteration": 2.5450096130371094 + }, + { + "auxiliary_loss_clip": 0.01124337, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.04573441, + "balance_loss_mlp": 1.0253818, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.5374594470617675, + "language_loss": 0.70330453, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7249583, + "num_input_tokens_seen": 101009815, + "step": 4679, + "time_per_iteration": 2.5043954849243164 + }, + { + "auxiliary_loss_clip": 0.01124017, + "auxiliary_loss_mlp": 0.01040753, + "balance_loss_clip": 1.04533398, + "balance_loss_mlp": 1.0235157, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 1.875750232638296, + "language_loss": 0.74803919, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.76968694, + "num_input_tokens_seen": 101026780, + "step": 4680, + "time_per_iteration": 2.4857938289642334 + }, + { + "auxiliary_loss_clip": 0.01136772, + "auxiliary_loss_mlp": 0.01040502, + "balance_loss_clip": 1.04715753, + "balance_loss_mlp": 1.02531505, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.6016860899376817, + "language_loss": 0.77811068, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79988343, + "num_input_tokens_seen": 101046215, + "step": 4681, + "time_per_iteration": 2.485274076461792 + }, + { + "auxiliary_loss_clip": 0.01125733, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.04688406, + "balance_loss_mlp": 1.01751649, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 2.31702337665741, + "language_loss": 0.73562539, + "learning_rate": 3.372378352108146e-06, + "loss": 0.75722301, + "num_input_tokens_seen": 101063365, + "step": 4682, + "time_per_iteration": 2.4952099323272705 + }, + { + "auxiliary_loss_clip": 0.01132415, + "auxiliary_loss_mlp": 0.01036588, + "balance_loss_clip": 1.04593492, + "balance_loss_mlp": 1.02149618, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.4676256436812118, + "language_loss": 0.80541968, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.82710969, + "num_input_tokens_seen": 101083835, + "step": 4683, + "time_per_iteration": 2.4969868659973145 + }, + { + "auxiliary_loss_clip": 0.01083239, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_clip": 1.04940498, + "balance_loss_mlp": 1.02556038, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 3.488422867779134, + "language_loss": 0.76347148, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78473198, + "num_input_tokens_seen": 101101740, + "step": 4684, + "time_per_iteration": 2.5843777656555176 + }, + { + "auxiliary_loss_clip": 0.01082383, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.04645956, + "balance_loss_mlp": 1.02334714, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.7720763514361482, + "language_loss": 0.76182789, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.78304297, + "num_input_tokens_seen": 101120480, + "step": 4685, + "time_per_iteration": 2.558091640472412 + }, + { + "auxiliary_loss_clip": 0.01108652, + "auxiliary_loss_mlp": 0.01040358, + "balance_loss_clip": 1.04886866, + "balance_loss_mlp": 1.02524209, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 1.41240106607788, + "language_loss": 0.75407463, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.77556467, + "num_input_tokens_seen": 101142910, + "step": 4686, + "time_per_iteration": 2.582836389541626 + }, + { + "auxiliary_loss_clip": 0.01107955, + "auxiliary_loss_mlp": 0.01047543, + "balance_loss_clip": 1.045753, + "balance_loss_mlp": 1.03038836, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 2.383655571595673, + "language_loss": 0.62683159, + "learning_rate": 3.370961184640025e-06, + "loss": 0.6483866, + "num_input_tokens_seen": 101160030, + "step": 4687, + "time_per_iteration": 2.4957659244537354 + }, + { + "auxiliary_loss_clip": 0.01114553, + "auxiliary_loss_mlp": 0.01046096, + "balance_loss_clip": 1.04716611, + "balance_loss_mlp": 1.03037262, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 2.118160622745849, + "language_loss": 0.76500183, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.78660834, + "num_input_tokens_seen": 101177675, + "step": 4688, + "time_per_iteration": 2.542945146560669 + }, + { + "auxiliary_loss_clip": 0.01107846, + "auxiliary_loss_mlp": 0.01038181, + "balance_loss_clip": 1.05424023, + "balance_loss_mlp": 1.02246368, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 1.9221042875729457, + "language_loss": 0.78427291, + "learning_rate": 3.37039395366863e-06, + "loss": 0.8057332, + "num_input_tokens_seen": 101192225, + "step": 4689, + "time_per_iteration": 2.527785301208496 + }, + { + "auxiliary_loss_clip": 0.01097337, + "auxiliary_loss_mlp": 0.0104165, + "balance_loss_clip": 1.04753399, + "balance_loss_mlp": 1.02481723, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 1.8845949920405425, + "language_loss": 0.78016448, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80155432, + "num_input_tokens_seen": 101210870, + "step": 4690, + "time_per_iteration": 2.578328847885132 + }, + { + "auxiliary_loss_clip": 0.01135913, + "auxiliary_loss_mlp": 0.00820802, + "balance_loss_clip": 1.04669869, + "balance_loss_mlp": 1.06502342, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.773554879048875, + "language_loss": 0.87545443, + "learning_rate": 3.369826514835332e-06, + "loss": 0.89502156, + "num_input_tokens_seen": 101229965, + "step": 4691, + "time_per_iteration": 2.4609627723693848 + }, + { + "auxiliary_loss_clip": 0.01114087, + "auxiliary_loss_mlp": 0.01046591, + "balance_loss_clip": 1.04700041, + "balance_loss_mlp": 1.02894855, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 3.5106192650424615, + "language_loss": 0.81812793, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.83973467, + "num_input_tokens_seen": 101250980, + "step": 4692, + "time_per_iteration": 2.558096170425415 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.05386162, + "balance_loss_mlp": 1.01550519, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.533300618251183, + "language_loss": 0.74624848, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.76763773, + "num_input_tokens_seen": 101273335, + "step": 4693, + "time_per_iteration": 2.6105990409851074 + }, + { + "auxiliary_loss_clip": 0.01102975, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.04758358, + "balance_loss_mlp": 1.0176065, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.6477652061037509, + "language_loss": 0.77372909, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.79509968, + "num_input_tokens_seen": 101292110, + "step": 4694, + "time_per_iteration": 2.5589561462402344 + }, + { + "auxiliary_loss_clip": 0.01124383, + "auxiliary_loss_mlp": 0.0104021, + "balance_loss_clip": 1.04910028, + "balance_loss_mlp": 1.02416396, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 1.9784256864313345, + "language_loss": 0.67364204, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.69528806, + "num_input_tokens_seen": 101312815, + "step": 4695, + "time_per_iteration": 2.5303614139556885 + }, + { + "auxiliary_loss_clip": 0.01120439, + "auxiliary_loss_mlp": 0.0105031, + "balance_loss_clip": 1.04867887, + "balance_loss_mlp": 1.03201151, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.5065640111625216, + "language_loss": 0.75814497, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.77985251, + "num_input_tokens_seen": 101329045, + "step": 4696, + "time_per_iteration": 2.51064133644104 + }, + { + "auxiliary_loss_clip": 0.01107956, + "auxiliary_loss_mlp": 0.01042798, + "balance_loss_clip": 1.05199051, + "balance_loss_mlp": 1.02649617, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 1.485280318069343, + "language_loss": 0.6243881, + "learning_rate": 3.368122952024877e-06, + "loss": 0.6458956, + "num_input_tokens_seen": 101352715, + "step": 4697, + "time_per_iteration": 2.7115960121154785 + }, + { + "auxiliary_loss_clip": 0.01094036, + "auxiliary_loss_mlp": 0.01033836, + "balance_loss_clip": 1.04951119, + "balance_loss_mlp": 1.01824379, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 2.0105385650668994, + "language_loss": 0.73338199, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75466067, + "num_input_tokens_seen": 101374640, + "step": 4698, + "time_per_iteration": 2.623849391937256 + }, + { + "auxiliary_loss_clip": 0.01133746, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.04752636, + "balance_loss_mlp": 1.02603436, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 1.7661338011154286, + "language_loss": 0.75224173, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77398801, + "num_input_tokens_seen": 101393595, + "step": 4699, + "time_per_iteration": 2.487856864929199 + }, + { + "auxiliary_loss_clip": 0.01126496, + "auxiliary_loss_mlp": 0.01036838, + "balance_loss_clip": 1.04680598, + "balance_loss_mlp": 1.01949322, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 2.5361200078159456, + "language_loss": 0.79775268, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.81938601, + "num_input_tokens_seen": 101409265, + "step": 4700, + "time_per_iteration": 2.475954055786133 + }, + { + "auxiliary_loss_clip": 0.01113311, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.05102527, + "balance_loss_mlp": 1.02766001, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.7845390147533655, + "language_loss": 0.81643629, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.83798397, + "num_input_tokens_seen": 101428365, + "step": 4701, + "time_per_iteration": 2.5704147815704346 + }, + { + "auxiliary_loss_clip": 0.01068657, + "auxiliary_loss_mlp": 0.01039108, + "balance_loss_clip": 1.04364395, + "balance_loss_mlp": 1.02336049, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 1.9722216417776959, + "language_loss": 0.73069978, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75177741, + "num_input_tokens_seen": 101447280, + "step": 4702, + "time_per_iteration": 2.706874370574951 + }, + { + "auxiliary_loss_clip": 0.0113728, + "auxiliary_loss_mlp": 0.01040513, + "balance_loss_clip": 1.0504545, + "balance_loss_mlp": 1.02455115, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 1.7354510846689797, + "language_loss": 0.7829591, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.80473703, + "num_input_tokens_seen": 101465435, + "step": 4703, + "time_per_iteration": 2.4871652126312256 + }, + { + "auxiliary_loss_clip": 0.01106803, + "auxiliary_loss_mlp": 0.01043875, + "balance_loss_clip": 1.04510415, + "balance_loss_mlp": 1.02731669, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.522973924737269, + "language_loss": 0.69374758, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71525437, + "num_input_tokens_seen": 101486355, + "step": 4704, + "time_per_iteration": 2.6305463314056396 + }, + { + "auxiliary_loss_clip": 0.0110109, + "auxiliary_loss_mlp": 0.01038708, + "balance_loss_clip": 1.04865193, + "balance_loss_mlp": 1.02173281, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 1.915696614425445, + "language_loss": 0.70383757, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.72523558, + "num_input_tokens_seen": 101505875, + "step": 4705, + "time_per_iteration": 4.070115089416504 + }, + { + "auxiliary_loss_clip": 0.01046845, + "auxiliary_loss_mlp": 0.01001626, + "balance_loss_clip": 1.02562094, + "balance_loss_mlp": 0.99970675, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7242747442982526, + "language_loss": 0.59198987, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61247456, + "num_input_tokens_seen": 101565045, + "step": 4706, + "time_per_iteration": 3.142594814300537 + }, + { + "auxiliary_loss_clip": 0.01108345, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.04421616, + "balance_loss_mlp": 1.02442372, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.4224633930716837, + "language_loss": 0.82220376, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84367311, + "num_input_tokens_seen": 101585825, + "step": 4707, + "time_per_iteration": 2.5846662521362305 + }, + { + "auxiliary_loss_clip": 0.01115389, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.04521322, + "balance_loss_mlp": 1.01896417, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.7313707110935654, + "language_loss": 0.8031528, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82466096, + "num_input_tokens_seen": 101606105, + "step": 4708, + "time_per_iteration": 2.608454704284668 + }, + { + "auxiliary_loss_clip": 0.01037216, + "auxiliary_loss_mlp": 0.01002154, + "balance_loss_clip": 1.02806938, + "balance_loss_mlp": 0.99988872, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.8857249519734655, + "language_loss": 0.62818146, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64857513, + "num_input_tokens_seen": 101656875, + "step": 4709, + "time_per_iteration": 4.357207536697388 + }, + { + "auxiliary_loss_clip": 0.01108642, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.04767466, + "balance_loss_mlp": 1.03005326, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.356831537212952, + "language_loss": 0.74126446, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76282704, + "num_input_tokens_seen": 101676225, + "step": 4710, + "time_per_iteration": 2.5286450386047363 + }, + { + "auxiliary_loss_clip": 0.01108941, + "auxiliary_loss_mlp": 0.01060804, + "balance_loss_clip": 1.04743147, + "balance_loss_mlp": 1.04306543, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.7897364385047985, + "language_loss": 0.79235715, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81405461, + "num_input_tokens_seen": 101693710, + "step": 4711, + "time_per_iteration": 2.5403249263763428 + }, + { + "auxiliary_loss_clip": 0.01129004, + "auxiliary_loss_mlp": 0.00802997, + "balance_loss_clip": 1.0504998, + "balance_loss_mlp": 1.0295794, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 2.015063467950678, + "language_loss": 0.71361828, + "learning_rate": 3.363855879093996e-06, + "loss": 0.73293829, + "num_input_tokens_seen": 101714010, + "step": 4712, + "time_per_iteration": 3.9620487689971924 + }, + { + "auxiliary_loss_clip": 0.01137962, + "auxiliary_loss_mlp": 0.01052891, + "balance_loss_clip": 1.0485605, + "balance_loss_mlp": 1.03589201, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 1.9333888296346071, + "language_loss": 0.81510329, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.83701181, + "num_input_tokens_seen": 101732995, + "step": 4713, + "time_per_iteration": 3.8527326583862305 + }, + { + "auxiliary_loss_clip": 0.01115652, + "auxiliary_loss_mlp": 0.01041132, + "balance_loss_clip": 1.04942465, + "balance_loss_mlp": 1.02419233, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.7018411304733572, + "language_loss": 0.75099349, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77256131, + "num_input_tokens_seen": 101751385, + "step": 4714, + "time_per_iteration": 2.524751901626587 + }, + { + "auxiliary_loss_clip": 0.01122886, + "auxiliary_loss_mlp": 0.01047364, + "balance_loss_clip": 1.0474565, + "balance_loss_mlp": 1.03168809, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.3818759164232401, + "language_loss": 0.78353739, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80523992, + "num_input_tokens_seen": 101773825, + "step": 4715, + "time_per_iteration": 2.5830488204956055 + }, + { + "auxiliary_loss_clip": 0.01113898, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.04743803, + "balance_loss_mlp": 1.02382493, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 2.1178468371219052, + "language_loss": 0.73809367, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.75963157, + "num_input_tokens_seen": 101791920, + "step": 4716, + "time_per_iteration": 2.5343379974365234 + }, + { + "auxiliary_loss_clip": 0.01110655, + "auxiliary_loss_mlp": 0.01051172, + "balance_loss_clip": 1.04374957, + "balance_loss_mlp": 1.03195524, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.2364903333020583, + "language_loss": 0.7421577, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76377606, + "num_input_tokens_seen": 101809515, + "step": 4717, + "time_per_iteration": 2.515613317489624 + }, + { + "auxiliary_loss_clip": 0.01106143, + "auxiliary_loss_mlp": 0.01043526, + "balance_loss_clip": 1.04614985, + "balance_loss_mlp": 1.02755249, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.4271762280587414, + "language_loss": 0.67124969, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69274634, + "num_input_tokens_seen": 101827735, + "step": 4718, + "time_per_iteration": 2.5307796001434326 + }, + { + "auxiliary_loss_clip": 0.01115475, + "auxiliary_loss_mlp": 0.01043642, + "balance_loss_clip": 1.0456841, + "balance_loss_mlp": 1.02697623, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.787453020258257, + "language_loss": 0.7235682, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74515939, + "num_input_tokens_seen": 101845970, + "step": 4719, + "time_per_iteration": 2.562986373901367 + }, + { + "auxiliary_loss_clip": 0.01124225, + "auxiliary_loss_mlp": 0.01045924, + "balance_loss_clip": 1.04766309, + "balance_loss_mlp": 1.03020036, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.4876309566993318, + "language_loss": 0.80460107, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82630259, + "num_input_tokens_seen": 101865040, + "step": 4720, + "time_per_iteration": 2.515974521636963 + }, + { + "auxiliary_loss_clip": 0.01124894, + "auxiliary_loss_mlp": 0.01042739, + "balance_loss_clip": 1.04734015, + "balance_loss_mlp": 1.02576399, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 1.891617944669613, + "language_loss": 0.794743, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.8164193, + "num_input_tokens_seen": 101883735, + "step": 4721, + "time_per_iteration": 2.4790003299713135 + }, + { + "auxiliary_loss_clip": 0.01085203, + "auxiliary_loss_mlp": 0.00797088, + "balance_loss_clip": 1.04421127, + "balance_loss_mlp": 1.01730049, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 1.8005827840091124, + "language_loss": 0.82781452, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84663749, + "num_input_tokens_seen": 101903025, + "step": 4722, + "time_per_iteration": 2.6383211612701416 + }, + { + "auxiliary_loss_clip": 0.0113804, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.04901862, + "balance_loss_mlp": 1.02479529, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 2.261307870988022, + "language_loss": 0.70257539, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72436243, + "num_input_tokens_seen": 101922255, + "step": 4723, + "time_per_iteration": 2.4621448516845703 + }, + { + "auxiliary_loss_clip": 0.01107496, + "auxiliary_loss_mlp": 0.01040536, + "balance_loss_clip": 1.04707062, + "balance_loss_mlp": 1.02292848, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.3865925725096342, + "language_loss": 0.78444624, + "learning_rate": 3.360433840760998e-06, + "loss": 0.8059265, + "num_input_tokens_seen": 101943100, + "step": 4724, + "time_per_iteration": 2.589341640472412 + }, + { + "auxiliary_loss_clip": 0.01108982, + "auxiliary_loss_mlp": 0.01051436, + "balance_loss_clip": 1.0454309, + "balance_loss_mlp": 1.03399611, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.5028282835718851, + "language_loss": 0.92422891, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94583309, + "num_input_tokens_seen": 101963160, + "step": 4725, + "time_per_iteration": 2.5751779079437256 + }, + { + "auxiliary_loss_clip": 0.01130655, + "auxiliary_loss_mlp": 0.01042074, + "balance_loss_clip": 1.04940116, + "balance_loss_mlp": 1.02546811, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 1.4967415413497736, + "language_loss": 0.88623804, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90796536, + "num_input_tokens_seen": 101984300, + "step": 4726, + "time_per_iteration": 2.578953266143799 + }, + { + "auxiliary_loss_clip": 0.01127487, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.05270338, + "balance_loss_mlp": 1.02503181, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 1.9979955827616438, + "language_loss": 0.78705853, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80874979, + "num_input_tokens_seen": 102005765, + "step": 4727, + "time_per_iteration": 2.732815742492676 + }, + { + "auxiliary_loss_clip": 0.01125111, + "auxiliary_loss_mlp": 0.01038933, + "balance_loss_clip": 1.04780173, + "balance_loss_mlp": 1.02376997, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 2.2446077694076565, + "language_loss": 0.66971147, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.69135189, + "num_input_tokens_seen": 102022755, + "step": 4728, + "time_per_iteration": 2.526355266571045 + }, + { + "auxiliary_loss_clip": 0.01100015, + "auxiliary_loss_mlp": 0.01046007, + "balance_loss_clip": 1.04468679, + "balance_loss_mlp": 1.02922189, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.6615122135184746, + "language_loss": 0.76435685, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78581703, + "num_input_tokens_seen": 102041850, + "step": 4729, + "time_per_iteration": 2.5324559211730957 + }, + { + "auxiliary_loss_clip": 0.01117082, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_clip": 1.05305576, + "balance_loss_mlp": 1.03093743, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.6377745214442347, + "language_loss": 0.66508847, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68673652, + "num_input_tokens_seen": 102059500, + "step": 4730, + "time_per_iteration": 2.5561037063598633 + }, + { + "auxiliary_loss_clip": 0.0111587, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.04831815, + "balance_loss_mlp": 1.01863527, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.8494322909263117, + "language_loss": 0.74553567, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76705027, + "num_input_tokens_seen": 102080460, + "step": 4731, + "time_per_iteration": 2.569830894470215 + }, + { + "auxiliary_loss_clip": 0.01098977, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.05490744, + "balance_loss_mlp": 1.02212214, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.502885318910816, + "language_loss": 0.83617097, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85754251, + "num_input_tokens_seen": 102100950, + "step": 4732, + "time_per_iteration": 2.6337873935699463 + }, + { + "auxiliary_loss_clip": 0.01130291, + "auxiliary_loss_mlp": 0.01050253, + "balance_loss_clip": 1.05007625, + "balance_loss_mlp": 1.03226411, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.570658103313605, + "language_loss": 0.79042763, + "learning_rate": 3.357862435944109e-06, + "loss": 0.81223309, + "num_input_tokens_seen": 102119345, + "step": 4733, + "time_per_iteration": 2.491194248199463 + }, + { + "auxiliary_loss_clip": 0.01146298, + "auxiliary_loss_mlp": 0.01053606, + "balance_loss_clip": 1.05172348, + "balance_loss_mlp": 1.03618932, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 2.765358066672143, + "language_loss": 0.71167308, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73367214, + "num_input_tokens_seen": 102139050, + "step": 4734, + "time_per_iteration": 2.5084898471832275 + }, + { + "auxiliary_loss_clip": 0.01117554, + "auxiliary_loss_mlp": 0.01037935, + "balance_loss_clip": 1.0494473, + "balance_loss_mlp": 1.02160299, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.7565704609117334, + "language_loss": 0.74026042, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.76181537, + "num_input_tokens_seen": 102157935, + "step": 4735, + "time_per_iteration": 2.516871690750122 + }, + { + "auxiliary_loss_clip": 0.01119281, + "auxiliary_loss_mlp": 0.01043626, + "balance_loss_clip": 1.05183434, + "balance_loss_mlp": 1.02800918, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.8301074254338343, + "language_loss": 0.79881096, + "learning_rate": 3.357004373789946e-06, + "loss": 0.82044005, + "num_input_tokens_seen": 102175325, + "step": 4736, + "time_per_iteration": 2.522127866744995 + }, + { + "auxiliary_loss_clip": 0.01142373, + "auxiliary_loss_mlp": 0.0104719, + "balance_loss_clip": 1.05098569, + "balance_loss_mlp": 1.02941585, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 2.586107812262861, + "language_loss": 0.59716618, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.61906189, + "num_input_tokens_seen": 102196625, + "step": 4737, + "time_per_iteration": 2.5250797271728516 + }, + { + "auxiliary_loss_clip": 0.01124903, + "auxiliary_loss_mlp": 0.01041208, + "balance_loss_clip": 1.04805982, + "balance_loss_mlp": 1.02462602, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.8761504956160868, + "language_loss": 0.86821061, + "learning_rate": 3.356432075047052e-06, + "loss": 0.88987172, + "num_input_tokens_seen": 102214975, + "step": 4738, + "time_per_iteration": 2.5215678215026855 + }, + { + "auxiliary_loss_clip": 0.01116746, + "auxiliary_loss_mlp": 0.01044634, + "balance_loss_clip": 1.05256605, + "balance_loss_mlp": 1.02564359, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 1.8981088897526033, + "language_loss": 0.90005398, + "learning_rate": 3.356145848516118e-06, + "loss": 0.92166775, + "num_input_tokens_seen": 102231885, + "step": 4739, + "time_per_iteration": 2.5362894535064697 + }, + { + "auxiliary_loss_clip": 0.01133041, + "auxiliary_loss_mlp": 0.01044793, + "balance_loss_clip": 1.05736172, + "balance_loss_mlp": 1.02806759, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.378518139615513, + "language_loss": 0.72308254, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74486089, + "num_input_tokens_seen": 102252725, + "step": 4740, + "time_per_iteration": 2.5610649585723877 + }, + { + "auxiliary_loss_clip": 0.0111576, + "auxiliary_loss_mlp": 0.01038, + "balance_loss_clip": 1.04853237, + "balance_loss_mlp": 1.02159643, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.4968429676091495, + "language_loss": 0.77567494, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.79721248, + "num_input_tokens_seen": 102271730, + "step": 4741, + "time_per_iteration": 2.5290775299072266 + }, + { + "auxiliary_loss_clip": 0.01105816, + "auxiliary_loss_mlp": 0.01044735, + "balance_loss_clip": 1.04954851, + "balance_loss_mlp": 1.02693701, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.794811285141452, + "language_loss": 0.76328015, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78478563, + "num_input_tokens_seen": 102291325, + "step": 4742, + "time_per_iteration": 2.573063373565674 + }, + { + "auxiliary_loss_clip": 0.01147761, + "auxiliary_loss_mlp": 0.0105458, + "balance_loss_clip": 1.05183744, + "balance_loss_mlp": 1.03474402, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 1.8140698017504322, + "language_loss": 0.57834566, + "learning_rate": 3.355000428249086e-06, + "loss": 0.60036904, + "num_input_tokens_seen": 102309000, + "step": 4743, + "time_per_iteration": 3.9263756275177 + }, + { + "auxiliary_loss_clip": 0.01111528, + "auxiliary_loss_mlp": 0.01053832, + "balance_loss_clip": 1.05152857, + "balance_loss_mlp": 1.03597403, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 1.6190569928254324, + "language_loss": 0.74562025, + "learning_rate": 3.354713944700797e-06, + "loss": 0.76727378, + "num_input_tokens_seen": 102329240, + "step": 4744, + "time_per_iteration": 2.591512441635132 + }, + { + "auxiliary_loss_clip": 0.01127608, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.05403781, + "balance_loss_mlp": 1.02658033, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.1556536735844958, + "language_loss": 0.77555263, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79725748, + "num_input_tokens_seen": 102344440, + "step": 4745, + "time_per_iteration": 2.4967527389526367 + }, + { + "auxiliary_loss_clip": 0.01119628, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.05126548, + "balance_loss_mlp": 1.0254035, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.7574636648395323, + "language_loss": 0.82555014, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.84715867, + "num_input_tokens_seen": 102360985, + "step": 4746, + "time_per_iteration": 2.455691337585449 + }, + { + "auxiliary_loss_clip": 0.01096621, + "auxiliary_loss_mlp": 0.01040639, + "balance_loss_clip": 1.04853284, + "balance_loss_mlp": 1.02260303, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.7242825725443152, + "language_loss": 0.79685938, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81823194, + "num_input_tokens_seen": 102380320, + "step": 4747, + "time_per_iteration": 3.998424530029297 + }, + { + "auxiliary_loss_clip": 0.0105427, + "auxiliary_loss_mlp": 0.00998739, + "balance_loss_clip": 1.04021525, + "balance_loss_mlp": 0.99655706, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7803066287920077, + "language_loss": 0.60478145, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62531155, + "num_input_tokens_seen": 102439140, + "step": 4748, + "time_per_iteration": 3.059692859649658 + }, + { + "auxiliary_loss_clip": 0.01141742, + "auxiliary_loss_mlp": 0.01045095, + "balance_loss_clip": 1.0509851, + "balance_loss_mlp": 1.02791691, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.1477024829352995, + "language_loss": 0.80526704, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82713538, + "num_input_tokens_seen": 102450990, + "step": 4749, + "time_per_iteration": 2.4262030124664307 + }, + { + "auxiliary_loss_clip": 0.01128148, + "auxiliary_loss_mlp": 0.01038797, + "balance_loss_clip": 1.04833663, + "balance_loss_mlp": 1.02245343, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 2.0494126843266978, + "language_loss": 0.70746493, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72913432, + "num_input_tokens_seen": 102471820, + "step": 4750, + "time_per_iteration": 2.546436309814453 + }, + { + "auxiliary_loss_clip": 0.01125916, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.05005288, + "balance_loss_mlp": 1.02033651, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.5201751613753034, + "language_loss": 0.82011378, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.84174865, + "num_input_tokens_seen": 102492625, + "step": 4751, + "time_per_iteration": 3.978419542312622 + }, + { + "auxiliary_loss_clip": 0.01137419, + "auxiliary_loss_mlp": 0.01043643, + "balance_loss_clip": 1.04902041, + "balance_loss_mlp": 1.02720404, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 1.8596793578360746, + "language_loss": 0.79753631, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.81934696, + "num_input_tokens_seen": 102514145, + "step": 4752, + "time_per_iteration": 3.9630253314971924 + }, + { + "auxiliary_loss_clip": 0.01123437, + "auxiliary_loss_mlp": 0.01041, + "balance_loss_clip": 1.04618716, + "balance_loss_mlp": 1.02333331, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 3.4673473915228237, + "language_loss": 0.78727859, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.80892301, + "num_input_tokens_seen": 102532365, + "step": 4753, + "time_per_iteration": 2.495924711227417 + }, + { + "auxiliary_loss_clip": 0.0114203, + "auxiliary_loss_mlp": 0.01046793, + "balance_loss_clip": 1.04894924, + "balance_loss_mlp": 1.02838707, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.2123475876088414, + "language_loss": 0.89530432, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91719252, + "num_input_tokens_seen": 102548425, + "step": 4754, + "time_per_iteration": 2.442079782485962 + }, + { + "auxiliary_loss_clip": 0.01122922, + "auxiliary_loss_mlp": 0.01044951, + "balance_loss_clip": 1.04663658, + "balance_loss_mlp": 1.02917945, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 1.6517408929846047, + "language_loss": 0.82434464, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84602338, + "num_input_tokens_seen": 102566370, + "step": 4755, + "time_per_iteration": 2.4942989349365234 + }, + { + "auxiliary_loss_clip": 0.01091851, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.04487181, + "balance_loss_mlp": 1.02874875, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.5061533913788643, + "language_loss": 0.83860207, + "learning_rate": 3.351272138300922e-06, + "loss": 0.85997677, + "num_input_tokens_seen": 102588715, + "step": 4756, + "time_per_iteration": 2.6073732376098633 + }, + { + "auxiliary_loss_clip": 0.01025286, + "auxiliary_loss_mlp": 0.01045071, + "balance_loss_clip": 1.02326345, + "balance_loss_mlp": 1.04293716, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8717787889320826, + "language_loss": 0.61005175, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63075531, + "num_input_tokens_seen": 102656715, + "step": 4757, + "time_per_iteration": 3.294732093811035 + }, + { + "auxiliary_loss_clip": 0.01138982, + "auxiliary_loss_mlp": 0.01039281, + "balance_loss_clip": 1.05072808, + "balance_loss_mlp": 1.02317548, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 2.3340868643179875, + "language_loss": 0.65656912, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.67835176, + "num_input_tokens_seen": 102676545, + "step": 4758, + "time_per_iteration": 2.456272602081299 + }, + { + "auxiliary_loss_clip": 0.01127335, + "auxiliary_loss_mlp": 0.0104159, + "balance_loss_clip": 1.04701948, + "balance_loss_mlp": 1.02526999, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.7138502004116387, + "language_loss": 0.63186759, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65355682, + "num_input_tokens_seen": 102702875, + "step": 4759, + "time_per_iteration": 2.651405096054077 + }, + { + "auxiliary_loss_clip": 0.01121616, + "auxiliary_loss_mlp": 0.00808024, + "balance_loss_clip": 1.04575109, + "balance_loss_mlp": 1.03912568, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.7105191328054878, + "language_loss": 0.74347973, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76277614, + "num_input_tokens_seen": 102723160, + "step": 4760, + "time_per_iteration": 2.503000020980835 + }, + { + "auxiliary_loss_clip": 0.01111423, + "auxiliary_loss_mlp": 0.01037704, + "balance_loss_clip": 1.04649949, + "balance_loss_mlp": 1.02224302, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 1.8335614924683528, + "language_loss": 0.72312033, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74461162, + "num_input_tokens_seen": 102743855, + "step": 4761, + "time_per_iteration": 2.5940070152282715 + }, + { + "auxiliary_loss_clip": 0.01071955, + "auxiliary_loss_mlp": 0.01051225, + "balance_loss_clip": 1.04343772, + "balance_loss_mlp": 1.03408265, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 1.9268016941546922, + "language_loss": 0.74752426, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76875603, + "num_input_tokens_seen": 102761370, + "step": 4762, + "time_per_iteration": 2.64593505859375 + }, + { + "auxiliary_loss_clip": 0.01104164, + "auxiliary_loss_mlp": 0.01047436, + "balance_loss_clip": 1.04861796, + "balance_loss_mlp": 1.03081822, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.4706477794068227, + "language_loss": 0.76228422, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78380024, + "num_input_tokens_seen": 102780885, + "step": 4763, + "time_per_iteration": 2.5804033279418945 + }, + { + "auxiliary_loss_clip": 0.01098643, + "auxiliary_loss_mlp": 0.01040866, + "balance_loss_clip": 1.04492235, + "balance_loss_mlp": 1.0231154, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 1.6809529045282812, + "language_loss": 0.77258694, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79398209, + "num_input_tokens_seen": 102801000, + "step": 4764, + "time_per_iteration": 2.59322190284729 + }, + { + "auxiliary_loss_clip": 0.0111101, + "auxiliary_loss_mlp": 0.0104964, + "balance_loss_clip": 1.04797757, + "balance_loss_mlp": 1.03056645, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 2.145650389309795, + "language_loss": 0.71602571, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73763216, + "num_input_tokens_seen": 102820230, + "step": 4765, + "time_per_iteration": 2.5628535747528076 + }, + { + "auxiliary_loss_clip": 0.0112123, + "auxiliary_loss_mlp": 0.01037512, + "balance_loss_clip": 1.04530001, + "balance_loss_mlp": 1.02103722, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.4741809827318884, + "language_loss": 0.76299542, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.78458285, + "num_input_tokens_seen": 102842670, + "step": 4766, + "time_per_iteration": 2.587432384490967 + }, + { + "auxiliary_loss_clip": 0.01126043, + "auxiliary_loss_mlp": 0.01037963, + "balance_loss_clip": 1.04860449, + "balance_loss_mlp": 1.02090383, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.6391728387880309, + "language_loss": 0.77360928, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79524934, + "num_input_tokens_seen": 102864480, + "step": 4767, + "time_per_iteration": 2.546358346939087 + }, + { + "auxiliary_loss_clip": 0.01138345, + "auxiliary_loss_mlp": 0.0104506, + "balance_loss_clip": 1.04898131, + "balance_loss_mlp": 1.02763152, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 7.046884905568304, + "language_loss": 0.65011466, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67194867, + "num_input_tokens_seen": 102883740, + "step": 4768, + "time_per_iteration": 2.4628591537475586 + }, + { + "auxiliary_loss_clip": 0.01122376, + "auxiliary_loss_mlp": 0.01043338, + "balance_loss_clip": 1.04617262, + "balance_loss_mlp": 1.0260886, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.4817828808367253, + "language_loss": 0.70423031, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72588742, + "num_input_tokens_seen": 102902945, + "step": 4769, + "time_per_iteration": 2.5374674797058105 + }, + { + "auxiliary_loss_clip": 0.01079071, + "auxiliary_loss_mlp": 0.01036974, + "balance_loss_clip": 1.04922342, + "balance_loss_mlp": 1.02066636, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.8495768809499047, + "language_loss": 0.74623311, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.76739353, + "num_input_tokens_seen": 102922405, + "step": 4770, + "time_per_iteration": 2.572683334350586 + }, + { + "auxiliary_loss_clip": 0.01095659, + "auxiliary_loss_mlp": 0.01039514, + "balance_loss_clip": 1.04554141, + "balance_loss_mlp": 1.02244353, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 2.1929149219131046, + "language_loss": 0.67686105, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69821274, + "num_input_tokens_seen": 102938980, + "step": 4771, + "time_per_iteration": 2.606570243835449 + }, + { + "auxiliary_loss_clip": 0.0104232, + "auxiliary_loss_mlp": 0.01013457, + "balance_loss_clip": 1.02050531, + "balance_loss_mlp": 1.01083457, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7771358482650962, + "language_loss": 0.56848186, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58903962, + "num_input_tokens_seen": 103000405, + "step": 4772, + "time_per_iteration": 3.0561890602111816 + }, + { + "auxiliary_loss_clip": 0.01068197, + "auxiliary_loss_mlp": 0.00802564, + "balance_loss_clip": 1.04791284, + "balance_loss_mlp": 1.02087307, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 3.514771486332438, + "language_loss": 0.83102655, + "learning_rate": 3.346383619630856e-06, + "loss": 0.84973413, + "num_input_tokens_seen": 103017970, + "step": 4773, + "time_per_iteration": 2.589796543121338 + }, + { + "auxiliary_loss_clip": 0.01135772, + "auxiliary_loss_mlp": 0.01044142, + "balance_loss_clip": 1.04373336, + "balance_loss_mlp": 1.02660584, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.236332573757313, + "language_loss": 0.78125864, + "learning_rate": 3.34609559969027e-06, + "loss": 0.80305773, + "num_input_tokens_seen": 103036385, + "step": 4774, + "time_per_iteration": 2.4978747367858887 + }, + { + "auxiliary_loss_clip": 0.01120781, + "auxiliary_loss_mlp": 0.01040058, + "balance_loss_clip": 1.04889321, + "balance_loss_mlp": 1.0228802, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 3.2303872857382094, + "language_loss": 0.73551333, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75712168, + "num_input_tokens_seen": 103052170, + "step": 4775, + "time_per_iteration": 2.487281322479248 + }, + { + "auxiliary_loss_clip": 0.01128898, + "auxiliary_loss_mlp": 0.01043169, + "balance_loss_clip": 1.04778576, + "balance_loss_mlp": 1.02615762, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 1.6660118843038847, + "language_loss": 0.88346696, + "learning_rate": 3.34551940668778e-06, + "loss": 0.90518761, + "num_input_tokens_seen": 103070510, + "step": 4776, + "time_per_iteration": 2.49529767036438 + }, + { + "auxiliary_loss_clip": 0.01130966, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.04798794, + "balance_loss_mlp": 1.02599788, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 1.7433575012401514, + "language_loss": 0.74455446, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76628864, + "num_input_tokens_seen": 103089590, + "step": 4777, + "time_per_iteration": 2.4783732891082764 + }, + { + "auxiliary_loss_clip": 0.01121369, + "auxiliary_loss_mlp": 0.01045536, + "balance_loss_clip": 1.04871154, + "balance_loss_mlp": 1.02751124, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 1.9333975098312768, + "language_loss": 0.80073142, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82240045, + "num_input_tokens_seen": 103109080, + "step": 4778, + "time_per_iteration": 2.5239548683166504 + }, + { + "auxiliary_loss_clip": 0.01115648, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.05035591, + "balance_loss_mlp": 1.02875447, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 2.3268660464597133, + "language_loss": 0.73681831, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.75843465, + "num_input_tokens_seen": 103127755, + "step": 4779, + "time_per_iteration": 2.5321717262268066 + }, + { + "auxiliary_loss_clip": 0.01122971, + "auxiliary_loss_mlp": 0.01047601, + "balance_loss_clip": 1.04734063, + "balance_loss_mlp": 1.02939737, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.519506602262648, + "language_loss": 0.7599743, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78167999, + "num_input_tokens_seen": 103147035, + "step": 4780, + "time_per_iteration": 2.519141912460327 + }, + { + "auxiliary_loss_clip": 0.01098947, + "auxiliary_loss_mlp": 0.01044288, + "balance_loss_clip": 1.04321027, + "balance_loss_mlp": 1.02838564, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.788816116988456, + "language_loss": 0.81210744, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83353978, + "num_input_tokens_seen": 103165410, + "step": 4781, + "time_per_iteration": 2.5388567447662354 + }, + { + "auxiliary_loss_clip": 0.0110773, + "auxiliary_loss_mlp": 0.01045687, + "balance_loss_clip": 1.04744744, + "balance_loss_mlp": 1.0269351, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 2.0634908323296646, + "language_loss": 0.86759174, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88912594, + "num_input_tokens_seen": 103183710, + "step": 4782, + "time_per_iteration": 3.931670665740967 + }, + { + "auxiliary_loss_clip": 0.01105331, + "auxiliary_loss_mlp": 0.01042782, + "balance_loss_clip": 1.04790056, + "balance_loss_mlp": 1.02553844, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.431989918082156, + "language_loss": 0.71197486, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73345602, + "num_input_tokens_seen": 103203790, + "step": 4783, + "time_per_iteration": 2.562919855117798 + }, + { + "auxiliary_loss_clip": 0.01116754, + "auxiliary_loss_mlp": 0.01043158, + "balance_loss_clip": 1.0515151, + "balance_loss_mlp": 1.02634978, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 1.6348801775613293, + "language_loss": 0.76708436, + "learning_rate": 3.343212594663047e-06, + "loss": 0.78868347, + "num_input_tokens_seen": 103223925, + "step": 4784, + "time_per_iteration": 2.553053140640259 + }, + { + "auxiliary_loss_clip": 0.01093478, + "auxiliary_loss_mlp": 0.01050314, + "balance_loss_clip": 1.04737186, + "balance_loss_mlp": 1.03230166, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.4839660650445712, + "language_loss": 0.76311821, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.78455609, + "num_input_tokens_seen": 103244760, + "step": 4785, + "time_per_iteration": 2.5848641395568848 + }, + { + "auxiliary_loss_clip": 0.01139317, + "auxiliary_loss_mlp": 0.01048424, + "balance_loss_clip": 1.04904962, + "balance_loss_mlp": 1.0314486, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 2.347020711878431, + "language_loss": 0.83030087, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85217828, + "num_input_tokens_seen": 103261995, + "step": 4786, + "time_per_iteration": 3.9360263347625732 + }, + { + "auxiliary_loss_clip": 0.01104715, + "auxiliary_loss_mlp": 0.00800125, + "balance_loss_clip": 1.04583335, + "balance_loss_mlp": 1.02445149, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.8640251828811247, + "language_loss": 0.79684925, + "learning_rate": 3.342346699429516e-06, + "loss": 0.8158977, + "num_input_tokens_seen": 103279780, + "step": 4787, + "time_per_iteration": 2.5587966442108154 + }, + { + "auxiliary_loss_clip": 0.01116317, + "auxiliary_loss_mlp": 0.01042927, + "balance_loss_clip": 1.04686737, + "balance_loss_mlp": 1.02608252, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 5.033182495816941, + "language_loss": 0.83516729, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85675967, + "num_input_tokens_seen": 103300580, + "step": 4788, + "time_per_iteration": 2.561777114868164 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.04956782, + "balance_loss_mlp": 1.03402257, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 1.6856530365115436, + "language_loss": 0.73589593, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75743747, + "num_input_tokens_seen": 103320430, + "step": 4789, + "time_per_iteration": 4.003628730773926 + }, + { + "auxiliary_loss_clip": 0.01119623, + "auxiliary_loss_mlp": 0.01048891, + "balance_loss_clip": 1.04887676, + "balance_loss_mlp": 1.03243947, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.5889162720658165, + "language_loss": 0.83605951, + "learning_rate": 3.341480346078704e-06, + "loss": 0.85774469, + "num_input_tokens_seen": 103337695, + "step": 4790, + "time_per_iteration": 2.5266106128692627 + }, + { + "auxiliary_loss_clip": 0.01130859, + "auxiliary_loss_mlp": 0.01042825, + "balance_loss_clip": 1.04926395, + "balance_loss_mlp": 1.02594507, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 2.1992852955532403, + "language_loss": 0.7786116, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.8003484, + "num_input_tokens_seen": 103357010, + "step": 4791, + "time_per_iteration": 3.864971876144409 + }, + { + "auxiliary_loss_clip": 0.01118893, + "auxiliary_loss_mlp": 0.01036569, + "balance_loss_clip": 1.05363584, + "balance_loss_mlp": 1.01983249, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.69083438806317, + "language_loss": 0.70408237, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72563696, + "num_input_tokens_seen": 103375600, + "step": 4792, + "time_per_iteration": 2.5106186866760254 + }, + { + "auxiliary_loss_clip": 0.01096941, + "auxiliary_loss_mlp": 0.01036645, + "balance_loss_clip": 1.04831147, + "balance_loss_mlp": 1.02021778, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 1.852421046620153, + "language_loss": 0.79528248, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81661832, + "num_input_tokens_seen": 103395225, + "step": 4793, + "time_per_iteration": 2.597184658050537 + }, + { + "auxiliary_loss_clip": 0.01114566, + "auxiliary_loss_mlp": 0.01042947, + "balance_loss_clip": 1.04845262, + "balance_loss_mlp": 1.02672255, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.5850170885615402, + "language_loss": 0.77554369, + "learning_rate": 3.340324496161797e-06, + "loss": 0.79711878, + "num_input_tokens_seen": 103417245, + "step": 4794, + "time_per_iteration": 2.7076923847198486 + }, + { + "auxiliary_loss_clip": 0.01130233, + "auxiliary_loss_mlp": 0.0104739, + "balance_loss_clip": 1.05089021, + "balance_loss_mlp": 1.03017652, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.2091357316789466, + "language_loss": 0.8344444, + "learning_rate": 3.340035406592074e-06, + "loss": 0.8562206, + "num_input_tokens_seen": 103435500, + "step": 4795, + "time_per_iteration": 2.4757022857666016 + }, + { + "auxiliary_loss_clip": 0.0112449, + "auxiliary_loss_mlp": 0.01041411, + "balance_loss_clip": 1.04961681, + "balance_loss_mlp": 1.02561593, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 2.7237936312301705, + "language_loss": 0.74602813, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76768708, + "num_input_tokens_seen": 103451040, + "step": 4796, + "time_per_iteration": 2.5243303775787354 + }, + { + "auxiliary_loss_clip": 0.01134043, + "auxiliary_loss_mlp": 0.01042604, + "balance_loss_clip": 1.0512476, + "balance_loss_mlp": 1.02428222, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 2.1808058552661307, + "language_loss": 0.72897542, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.7507419, + "num_input_tokens_seen": 103471330, + "step": 4797, + "time_per_iteration": 2.5189762115478516 + }, + { + "auxiliary_loss_clip": 0.01102617, + "auxiliary_loss_mlp": 0.0079913, + "balance_loss_clip": 1.04585314, + "balance_loss_mlp": 1.01926136, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 1.888846599443049, + "language_loss": 0.74155521, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76057273, + "num_input_tokens_seen": 103488060, + "step": 4798, + "time_per_iteration": 2.5525143146514893 + }, + { + "auxiliary_loss_clip": 0.01129831, + "auxiliary_loss_mlp": 0.01041643, + "balance_loss_clip": 1.0475533, + "balance_loss_mlp": 1.0235467, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 2.7295345992437485, + "language_loss": 0.64997059, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67168534, + "num_input_tokens_seen": 103503600, + "step": 4799, + "time_per_iteration": 2.5174527168273926 + }, + { + "auxiliary_loss_clip": 0.01142333, + "auxiliary_loss_mlp": 0.01045959, + "balance_loss_clip": 1.05084682, + "balance_loss_mlp": 1.02903104, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 1.7006293787646176, + "language_loss": 0.82455748, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84644043, + "num_input_tokens_seen": 103524195, + "step": 4800, + "time_per_iteration": 2.545185089111328 + }, + { + "auxiliary_loss_clip": 0.01100575, + "auxiliary_loss_mlp": 0.01045044, + "balance_loss_clip": 1.04718399, + "balance_loss_mlp": 1.02800894, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 2.5224837067113692, + "language_loss": 0.91113305, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93258929, + "num_input_tokens_seen": 103545235, + "step": 4801, + "time_per_iteration": 2.606602907180786 + }, + { + "auxiliary_loss_clip": 0.01121485, + "auxiliary_loss_mlp": 0.0079829, + "balance_loss_clip": 1.052791, + "balance_loss_mlp": 1.01896858, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 1.948281667334705, + "language_loss": 0.73326087, + "learning_rate": 3.33801035741839e-06, + "loss": 0.75245863, + "num_input_tokens_seen": 103563305, + "step": 4802, + "time_per_iteration": 2.572216749191284 + }, + { + "auxiliary_loss_clip": 0.01023632, + "auxiliary_loss_mlp": 0.01007671, + "balance_loss_clip": 1.02162278, + "balance_loss_mlp": 1.00567997, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7828172742007847, + "language_loss": 0.62973672, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65004975, + "num_input_tokens_seen": 103625025, + "step": 4803, + "time_per_iteration": 3.1200687885284424 + }, + { + "auxiliary_loss_clip": 0.01083662, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.04621577, + "balance_loss_mlp": 1.02947867, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 1.7674035801860448, + "language_loss": 0.70588827, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72718203, + "num_input_tokens_seen": 103644235, + "step": 4804, + "time_per_iteration": 2.5733144283294678 + }, + { + "auxiliary_loss_clip": 0.01130126, + "auxiliary_loss_mlp": 0.0104538, + "balance_loss_clip": 1.04806495, + "balance_loss_mlp": 1.02731931, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 2.0708928721694986, + "language_loss": 0.68365002, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70540506, + "num_input_tokens_seen": 103664700, + "step": 4805, + "time_per_iteration": 2.5465078353881836 + }, + { + "auxiliary_loss_clip": 0.01129149, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_clip": 1.04949367, + "balance_loss_mlp": 1.026196, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.5028380745726795, + "language_loss": 0.69150245, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71322244, + "num_input_tokens_seen": 103686595, + "step": 4806, + "time_per_iteration": 2.585876226425171 + }, + { + "auxiliary_loss_clip": 0.01116833, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.04845011, + "balance_loss_mlp": 1.02753949, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.4605569665929177, + "language_loss": 0.71404994, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.7356596, + "num_input_tokens_seen": 103707525, + "step": 4807, + "time_per_iteration": 2.6029093265533447 + }, + { + "auxiliary_loss_clip": 0.01103077, + "auxiliary_loss_mlp": 0.01042189, + "balance_loss_clip": 1.05245352, + "balance_loss_mlp": 1.02510667, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.7863235905482282, + "language_loss": 0.81668806, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83814073, + "num_input_tokens_seen": 103727905, + "step": 4808, + "time_per_iteration": 2.592531204223633 + }, + { + "auxiliary_loss_clip": 0.01096128, + "auxiliary_loss_mlp": 0.01046816, + "balance_loss_clip": 1.05126381, + "balance_loss_mlp": 1.02999592, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.6796839893116917, + "language_loss": 0.78443199, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80586141, + "num_input_tokens_seen": 103748335, + "step": 4809, + "time_per_iteration": 2.589555501937866 + }, + { + "auxiliary_loss_clip": 0.01086526, + "auxiliary_loss_mlp": 0.01047285, + "balance_loss_clip": 1.04476142, + "balance_loss_mlp": 1.02911699, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.7242362731346326, + "language_loss": 0.787103, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80844104, + "num_input_tokens_seen": 103767020, + "step": 4810, + "time_per_iteration": 2.592916250228882 + }, + { + "auxiliary_loss_clip": 0.01087993, + "auxiliary_loss_mlp": 0.01038983, + "balance_loss_clip": 1.04615712, + "balance_loss_mlp": 1.02251434, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 1.7067608554091012, + "language_loss": 0.7702812, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79155087, + "num_input_tokens_seen": 103786355, + "step": 4811, + "time_per_iteration": 2.6138651371002197 + }, + { + "auxiliary_loss_clip": 0.01123123, + "auxiliary_loss_mlp": 0.01040523, + "balance_loss_clip": 1.04570723, + "balance_loss_mlp": 1.02288032, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.5918208177061774, + "language_loss": 0.77299571, + "learning_rate": 3.335113118275117e-06, + "loss": 0.7946322, + "num_input_tokens_seen": 103809345, + "step": 4812, + "time_per_iteration": 2.5525853633880615 + }, + { + "auxiliary_loss_clip": 0.0103715, + "auxiliary_loss_mlp": 0.01005657, + "balance_loss_clip": 1.04362643, + "balance_loss_mlp": 1.00334454, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8203812397574088, + "language_loss": 0.60211718, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62254524, + "num_input_tokens_seen": 103871180, + "step": 4813, + "time_per_iteration": 3.350177049636841 + }, + { + "auxiliary_loss_clip": 0.01093125, + "auxiliary_loss_mlp": 0.01042086, + "balance_loss_clip": 1.04585767, + "balance_loss_mlp": 1.02412081, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 3.590983818089054, + "language_loss": 0.82785547, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.84920758, + "num_input_tokens_seen": 103889040, + "step": 4814, + "time_per_iteration": 2.5532994270324707 + }, + { + "auxiliary_loss_clip": 0.01093454, + "auxiliary_loss_mlp": 0.01043973, + "balance_loss_clip": 1.04635572, + "balance_loss_mlp": 1.02631855, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.5385135063197175, + "language_loss": 0.72702378, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74839807, + "num_input_tokens_seen": 103910380, + "step": 4815, + "time_per_iteration": 2.646898031234741 + }, + { + "auxiliary_loss_clip": 0.01124896, + "auxiliary_loss_mlp": 0.01049842, + "balance_loss_clip": 1.04897428, + "balance_loss_mlp": 1.03420162, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.633085913960417, + "language_loss": 0.71300352, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.73475087, + "num_input_tokens_seen": 103929955, + "step": 4816, + "time_per_iteration": 2.5361380577087402 + }, + { + "auxiliary_loss_clip": 0.01113518, + "auxiliary_loss_mlp": 0.01048274, + "balance_loss_clip": 1.04718804, + "balance_loss_mlp": 1.03040504, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 2.0464038254033277, + "language_loss": 0.74567235, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76729023, + "num_input_tokens_seen": 103948020, + "step": 4817, + "time_per_iteration": 2.54116153717041 + }, + { + "auxiliary_loss_clip": 0.01105473, + "auxiliary_loss_mlp": 0.01050379, + "balance_loss_clip": 1.04829526, + "balance_loss_mlp": 1.03217554, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.7654509799217897, + "language_loss": 0.76455295, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78611147, + "num_input_tokens_seen": 103968740, + "step": 4818, + "time_per_iteration": 2.641082763671875 + }, + { + "auxiliary_loss_clip": 0.01078386, + "auxiliary_loss_mlp": 0.01041792, + "balance_loss_clip": 1.05289197, + "balance_loss_mlp": 1.02443552, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 1.8055708611924384, + "language_loss": 0.79841411, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.81961584, + "num_input_tokens_seen": 103986005, + "step": 4819, + "time_per_iteration": 2.600611925125122 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01045481, + "balance_loss_clip": 1.05002475, + "balance_loss_mlp": 1.0266335, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 2.19313585076955, + "language_loss": 0.78800124, + "learning_rate": 3.332791681244776e-06, + "loss": 0.80953377, + "num_input_tokens_seen": 104005070, + "step": 4820, + "time_per_iteration": 3.9668190479278564 + }, + { + "auxiliary_loss_clip": 0.01098833, + "auxiliary_loss_mlp": 0.01039183, + "balance_loss_clip": 1.04956484, + "balance_loss_mlp": 1.0221957, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 2.0852321246101893, + "language_loss": 0.72399557, + "learning_rate": 3.332501274072231e-06, + "loss": 0.74537575, + "num_input_tokens_seen": 104022945, + "step": 4821, + "time_per_iteration": 2.6255605220794678 + }, + { + "auxiliary_loss_clip": 0.01123572, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.04462767, + "balance_loss_mlp": 1.02573073, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.6345772508501442, + "language_loss": 0.72151625, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74318576, + "num_input_tokens_seen": 104042080, + "step": 4822, + "time_per_iteration": 2.518812417984009 + }, + { + "auxiliary_loss_clip": 0.01127346, + "auxiliary_loss_mlp": 0.01052542, + "balance_loss_clip": 1.05389559, + "balance_loss_mlp": 1.03572178, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.9413932778065264, + "language_loss": 0.66026402, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68206298, + "num_input_tokens_seen": 104060975, + "step": 4823, + "time_per_iteration": 2.480194330215454 + }, + { + "auxiliary_loss_clip": 0.01104906, + "auxiliary_loss_mlp": 0.01050483, + "balance_loss_clip": 1.04491401, + "balance_loss_mlp": 1.03332925, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 1.9682731472528185, + "language_loss": 0.80678505, + "learning_rate": 3.331629749427164e-06, + "loss": 0.82833898, + "num_input_tokens_seen": 104081395, + "step": 4824, + "time_per_iteration": 2.5336921215057373 + }, + { + "auxiliary_loss_clip": 0.01136256, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_clip": 1.04539847, + "balance_loss_mlp": 1.02679217, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 2.432648501077168, + "language_loss": 0.72968924, + "learning_rate": 3.331339140206385e-06, + "loss": 0.75150186, + "num_input_tokens_seen": 104099995, + "step": 4825, + "time_per_iteration": 3.908147096633911 + }, + { + "auxiliary_loss_clip": 0.01140351, + "auxiliary_loss_mlp": 0.01042519, + "balance_loss_clip": 1.04924107, + "balance_loss_mlp": 1.02509022, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 2.0342936156330054, + "language_loss": 0.73755825, + "learning_rate": 3.331048480501092e-06, + "loss": 0.7593869, + "num_input_tokens_seen": 104118930, + "step": 4826, + "time_per_iteration": 2.454428195953369 + }, + { + "auxiliary_loss_clip": 0.01122694, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_clip": 1.04481852, + "balance_loss_mlp": 1.0286268, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 1.9919390217428057, + "language_loss": 0.68618894, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.7078619, + "num_input_tokens_seen": 104136940, + "step": 4827, + "time_per_iteration": 2.491593360900879 + }, + { + "auxiliary_loss_clip": 0.01120912, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_clip": 1.04821396, + "balance_loss_mlp": 1.02971363, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 1.7051283393804744, + "language_loss": 0.80255914, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82424569, + "num_input_tokens_seen": 104154280, + "step": 4828, + "time_per_iteration": 3.8448123931884766 + }, + { + "auxiliary_loss_clip": 0.01135672, + "auxiliary_loss_mlp": 0.01052731, + "balance_loss_clip": 1.04773808, + "balance_loss_mlp": 1.03608894, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 2.398962496707361, + "language_loss": 0.8048256, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82670963, + "num_input_tokens_seen": 104172605, + "step": 4829, + "time_per_iteration": 2.45902419090271 + }, + { + "auxiliary_loss_clip": 0.01108453, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_clip": 1.04495692, + "balance_loss_mlp": 1.02246451, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.5005968325389218, + "language_loss": 0.82667208, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84814626, + "num_input_tokens_seen": 104194120, + "step": 4830, + "time_per_iteration": 3.9271233081817627 + }, + { + "auxiliary_loss_clip": 0.01129852, + "auxiliary_loss_mlp": 0.01050254, + "balance_loss_clip": 1.04896581, + "balance_loss_mlp": 1.03273034, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.238050671510261, + "language_loss": 0.78937465, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.8111757, + "num_input_tokens_seen": 104210875, + "step": 4831, + "time_per_iteration": 2.4494216442108154 + }, + { + "auxiliary_loss_clip": 0.01132521, + "auxiliary_loss_mlp": 0.01044349, + "balance_loss_clip": 1.04658771, + "balance_loss_mlp": 1.02891135, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.785915045387044, + "language_loss": 0.74273837, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76450706, + "num_input_tokens_seen": 104229875, + "step": 4832, + "time_per_iteration": 2.505241870880127 + }, + { + "auxiliary_loss_clip": 0.01109624, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.04497075, + "balance_loss_mlp": 1.02061009, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.6257621918238705, + "language_loss": 0.76184607, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78329372, + "num_input_tokens_seen": 104250405, + "step": 4833, + "time_per_iteration": 2.5236973762512207 + }, + { + "auxiliary_loss_clip": 0.01102174, + "auxiliary_loss_mlp": 0.01043579, + "balance_loss_clip": 1.04461467, + "balance_loss_mlp": 1.02711582, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 1.638458583624717, + "language_loss": 0.64486355, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.6663211, + "num_input_tokens_seen": 104269185, + "step": 4834, + "time_per_iteration": 2.4966297149658203 + }, + { + "auxiliary_loss_clip": 0.01114374, + "auxiliary_loss_mlp": 0.01031664, + "balance_loss_clip": 1.04618764, + "balance_loss_mlp": 1.0166018, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.6645229963984465, + "language_loss": 0.71389157, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.73535192, + "num_input_tokens_seen": 104289400, + "step": 4835, + "time_per_iteration": 2.536180257797241 + }, + { + "auxiliary_loss_clip": 0.01108478, + "auxiliary_loss_mlp": 0.01038209, + "balance_loss_clip": 1.04437828, + "balance_loss_mlp": 1.02318835, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 1.7098391207988692, + "language_loss": 0.79219645, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81366336, + "num_input_tokens_seen": 104310485, + "step": 4836, + "time_per_iteration": 2.5574657917022705 + }, + { + "auxiliary_loss_clip": 0.01095918, + "auxiliary_loss_mlp": 0.01048228, + "balance_loss_clip": 1.04634571, + "balance_loss_mlp": 1.03079975, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.7853821310553506, + "language_loss": 0.8084169, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.8298583, + "num_input_tokens_seen": 104327330, + "step": 4837, + "time_per_iteration": 2.526125192642212 + }, + { + "auxiliary_loss_clip": 0.01106894, + "auxiliary_loss_mlp": 0.0103912, + "balance_loss_clip": 1.04448175, + "balance_loss_mlp": 1.02309239, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 1.8969058082127794, + "language_loss": 0.67205465, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69351482, + "num_input_tokens_seen": 104350350, + "step": 4838, + "time_per_iteration": 2.6529359817504883 + }, + { + "auxiliary_loss_clip": 0.01140035, + "auxiliary_loss_mlp": 0.00796586, + "balance_loss_clip": 1.04971874, + "balance_loss_mlp": 1.01541865, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.6240014906662916, + "language_loss": 0.71291816, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73228437, + "num_input_tokens_seen": 104369995, + "step": 4839, + "time_per_iteration": 2.475241184234619 + }, + { + "auxiliary_loss_clip": 0.01135377, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.04671979, + "balance_loss_mlp": 1.01887584, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 2.104541484353523, + "language_loss": 0.76117986, + "learning_rate": 3.326973949928776e-06, + "loss": 0.78287673, + "num_input_tokens_seen": 104392285, + "step": 4840, + "time_per_iteration": 2.6014418601989746 + }, + { + "auxiliary_loss_clip": 0.01092254, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.04911077, + "balance_loss_mlp": 1.02846766, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 1.891739371992281, + "language_loss": 0.60350764, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62487274, + "num_input_tokens_seen": 104412640, + "step": 4841, + "time_per_iteration": 2.6110615730285645 + }, + { + "auxiliary_loss_clip": 0.01113767, + "auxiliary_loss_mlp": 0.01034628, + "balance_loss_clip": 1.04790914, + "balance_loss_mlp": 1.01787901, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.2835636525615306, + "language_loss": 0.71131843, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73280239, + "num_input_tokens_seen": 104435245, + "step": 4842, + "time_per_iteration": 2.589752435684204 + }, + { + "auxiliary_loss_clip": 0.01120649, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.04453254, + "balance_loss_mlp": 1.02083421, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.6210651385791541, + "language_loss": 0.73533225, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.7568953, + "num_input_tokens_seen": 104455395, + "step": 4843, + "time_per_iteration": 2.4801993370056152 + }, + { + "auxiliary_loss_clip": 0.01089906, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.05082464, + "balance_loss_mlp": 1.01997781, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.4135522728095133, + "language_loss": 0.58341634, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60466671, + "num_input_tokens_seen": 104473350, + "step": 4844, + "time_per_iteration": 2.5971016883850098 + }, + { + "auxiliary_loss_clip": 0.01130059, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_clip": 1.05085087, + "balance_loss_mlp": 1.02022982, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 2.2220433398646957, + "language_loss": 0.86151212, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88318884, + "num_input_tokens_seen": 104492265, + "step": 4845, + "time_per_iteration": 2.4858081340789795 + }, + { + "auxiliary_loss_clip": 0.01113229, + "auxiliary_loss_mlp": 0.01048283, + "balance_loss_clip": 1.05085874, + "balance_loss_mlp": 1.03112876, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.8785829465028014, + "language_loss": 0.66771609, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.68933117, + "num_input_tokens_seen": 104510755, + "step": 4846, + "time_per_iteration": 2.5500428676605225 + }, + { + "auxiliary_loss_clip": 0.01120072, + "auxiliary_loss_mlp": 0.01037171, + "balance_loss_clip": 1.04981589, + "balance_loss_mlp": 1.0222466, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.935802944504872, + "language_loss": 0.70473295, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72630537, + "num_input_tokens_seen": 104530830, + "step": 4847, + "time_per_iteration": 2.521238327026367 + }, + { + "auxiliary_loss_clip": 0.0112111, + "auxiliary_loss_mlp": 0.01030208, + "balance_loss_clip": 1.04434049, + "balance_loss_mlp": 1.01468706, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 1.699794864283418, + "language_loss": 0.7387647, + "learning_rate": 3.324641216731237e-06, + "loss": 0.76027787, + "num_input_tokens_seen": 104550115, + "step": 4848, + "time_per_iteration": 2.535477876663208 + }, + { + "auxiliary_loss_clip": 0.01117412, + "auxiliary_loss_mlp": 0.01042526, + "balance_loss_clip": 1.04371822, + "balance_loss_mlp": 1.02561033, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 2.185202372781586, + "language_loss": 0.76977265, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.79137206, + "num_input_tokens_seen": 104566255, + "step": 4849, + "time_per_iteration": 2.47027587890625 + }, + { + "auxiliary_loss_clip": 0.0111774, + "auxiliary_loss_mlp": 0.01039357, + "balance_loss_clip": 1.04584193, + "balance_loss_mlp": 1.02374077, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 1.618833453631688, + "language_loss": 0.78362465, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80519569, + "num_input_tokens_seen": 104585235, + "step": 4850, + "time_per_iteration": 2.505894184112549 + }, + { + "auxiliary_loss_clip": 0.01107214, + "auxiliary_loss_mlp": 0.01038631, + "balance_loss_clip": 1.05103838, + "balance_loss_mlp": 1.02253222, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.6513055039882796, + "language_loss": 0.75652337, + "learning_rate": 3.323765612674296e-06, + "loss": 0.77798176, + "num_input_tokens_seen": 104605315, + "step": 4851, + "time_per_iteration": 2.5365350246429443 + }, + { + "auxiliary_loss_clip": 0.01120916, + "auxiliary_loss_mlp": 0.0104321, + "balance_loss_clip": 1.04728937, + "balance_loss_mlp": 1.02900028, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.4122880165415193, + "language_loss": 0.77309263, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79473388, + "num_input_tokens_seen": 104626055, + "step": 4852, + "time_per_iteration": 2.5700159072875977 + }, + { + "auxiliary_loss_clip": 0.01111467, + "auxiliary_loss_mlp": 0.0104185, + "balance_loss_clip": 1.04714632, + "balance_loss_mlp": 1.02643657, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.576679825996922, + "language_loss": 0.78142083, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80295396, + "num_input_tokens_seen": 104646005, + "step": 4853, + "time_per_iteration": 2.558647871017456 + }, + { + "auxiliary_loss_clip": 0.01100322, + "auxiliary_loss_mlp": 0.01036277, + "balance_loss_clip": 1.0474267, + "balance_loss_mlp": 1.02079189, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.7192614644424142, + "language_loss": 0.88506591, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90643191, + "num_input_tokens_seen": 104661620, + "step": 4854, + "time_per_iteration": 2.6070668697357178 + }, + { + "auxiliary_loss_clip": 0.0111831, + "auxiliary_loss_mlp": 0.01056007, + "balance_loss_clip": 1.0480063, + "balance_loss_mlp": 1.03791153, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 2.2972828860732806, + "language_loss": 0.86293477, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88467795, + "num_input_tokens_seen": 104681445, + "step": 4855, + "time_per_iteration": 2.527416467666626 + }, + { + "auxiliary_loss_clip": 0.01050791, + "auxiliary_loss_mlp": 0.01041844, + "balance_loss_clip": 1.02596068, + "balance_loss_mlp": 1.03998458, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.807103294701614, + "language_loss": 0.60243952, + "learning_rate": 3.322305268780566e-06, + "loss": 0.62336588, + "num_input_tokens_seen": 104747945, + "step": 4856, + "time_per_iteration": 3.242067813873291 + }, + { + "auxiliary_loss_clip": 0.01109, + "auxiliary_loss_mlp": 0.00793662, + "balance_loss_clip": 1.04584336, + "balance_loss_mlp": 1.0141933, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 1.9073947903075221, + "language_loss": 0.6844548, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70348144, + "num_input_tokens_seen": 104766225, + "step": 4857, + "time_per_iteration": 2.5312864780426025 + }, + { + "auxiliary_loss_clip": 0.01120108, + "auxiliary_loss_mlp": 0.0079238, + "balance_loss_clip": 1.04651284, + "balance_loss_mlp": 1.01410019, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 2.4218959479493045, + "language_loss": 0.84190947, + "learning_rate": 3.321720780151895e-06, + "loss": 0.86103427, + "num_input_tokens_seen": 104785345, + "step": 4858, + "time_per_iteration": 2.6087169647216797 + }, + { + "auxiliary_loss_clip": 0.01134357, + "auxiliary_loss_mlp": 0.01036867, + "balance_loss_clip": 1.04824066, + "balance_loss_mlp": 1.02223992, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 2.0761692899655078, + "language_loss": 0.77788353, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79959583, + "num_input_tokens_seen": 104804560, + "step": 4859, + "time_per_iteration": 3.855374336242676 + }, + { + "auxiliary_loss_clip": 0.010975, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.04785335, + "balance_loss_mlp": 1.0231142, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 2.9049728651643685, + "language_loss": 0.68475497, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.7061249, + "num_input_tokens_seen": 104821105, + "step": 4860, + "time_per_iteration": 2.620515823364258 + }, + { + "auxiliary_loss_clip": 0.01111516, + "auxiliary_loss_mlp": 0.01042436, + "balance_loss_clip": 1.04934347, + "balance_loss_mlp": 1.028054, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.0956799027611943, + "language_loss": 0.75221592, + "learning_rate": 3.320843671338222e-06, + "loss": 0.77375549, + "num_input_tokens_seen": 104841440, + "step": 4861, + "time_per_iteration": 2.6450259685516357 + }, + { + "auxiliary_loss_clip": 0.01120963, + "auxiliary_loss_mlp": 0.01045138, + "balance_loss_clip": 1.04619575, + "balance_loss_mlp": 1.03071976, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.7534340313707848, + "language_loss": 0.91651678, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93817782, + "num_input_tokens_seen": 104858210, + "step": 4862, + "time_per_iteration": 2.5155978202819824 + }, + { + "auxiliary_loss_clip": 0.01122067, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.04535699, + "balance_loss_mlp": 1.0236311, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.736432102024755, + "language_loss": 0.73195994, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75356501, + "num_input_tokens_seen": 104875620, + "step": 4863, + "time_per_iteration": 3.9440033435821533 + }, + { + "auxiliary_loss_clip": 0.010617, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.04624104, + "balance_loss_mlp": 1.02029502, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.8359665266469698, + "language_loss": 0.7779513, + "learning_rate": 3.319966111745842e-06, + "loss": 0.79892075, + "num_input_tokens_seen": 104894600, + "step": 4864, + "time_per_iteration": 2.6766703128814697 + }, + { + "auxiliary_loss_clip": 0.01094848, + "auxiliary_loss_mlp": 0.01042284, + "balance_loss_clip": 1.04344225, + "balance_loss_mlp": 1.02491522, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 2.2018536745064647, + "language_loss": 0.81772017, + "learning_rate": 3.319673491760429e-06, + "loss": 0.83909142, + "num_input_tokens_seen": 104914530, + "step": 4865, + "time_per_iteration": 2.5476834774017334 + }, + { + "auxiliary_loss_clip": 0.01077071, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.04262638, + "balance_loss_mlp": 1.02388656, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 2.162105808028683, + "language_loss": 0.85104614, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87221777, + "num_input_tokens_seen": 104933460, + "step": 4866, + "time_per_iteration": 2.6124978065490723 + }, + { + "auxiliary_loss_clip": 0.0110476, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.04799366, + "balance_loss_mlp": 1.01959538, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.6778864828391444, + "language_loss": 0.75531071, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77670127, + "num_input_tokens_seen": 104954495, + "step": 4867, + "time_per_iteration": 3.9887735843658447 + }, + { + "auxiliary_loss_clip": 0.01073313, + "auxiliary_loss_mlp": 0.01045394, + "balance_loss_clip": 1.04441845, + "balance_loss_mlp": 1.02963471, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 1.77119559111657, + "language_loss": 0.73429871, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75548583, + "num_input_tokens_seen": 104971915, + "step": 4868, + "time_per_iteration": 3.9865503311157227 + }, + { + "auxiliary_loss_clip": 0.01084403, + "auxiliary_loss_mlp": 0.01040676, + "balance_loss_clip": 1.04842043, + "balance_loss_mlp": 1.02386785, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.3192807070804216, + "language_loss": 0.74628031, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.7675311, + "num_input_tokens_seen": 104991335, + "step": 4869, + "time_per_iteration": 2.599173069000244 + }, + { + "auxiliary_loss_clip": 0.01107454, + "auxiliary_loss_mlp": 0.01035869, + "balance_loss_clip": 1.04796124, + "balance_loss_mlp": 1.02022839, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.4750428906256239, + "language_loss": 0.76594031, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78737354, + "num_input_tokens_seen": 105012015, + "step": 4870, + "time_per_iteration": 2.5542151927948 + }, + { + "auxiliary_loss_clip": 0.01122903, + "auxiliary_loss_mlp": 0.01044719, + "balance_loss_clip": 1.04645526, + "balance_loss_mlp": 1.02818465, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.032644808503494, + "language_loss": 0.67701113, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.69868743, + "num_input_tokens_seen": 105031460, + "step": 4871, + "time_per_iteration": 2.5021960735321045 + }, + { + "auxiliary_loss_clip": 0.01102426, + "auxiliary_loss_mlp": 0.01047874, + "balance_loss_clip": 1.04175055, + "balance_loss_mlp": 1.03131557, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 1.7439313058417694, + "language_loss": 0.77657032, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79807335, + "num_input_tokens_seen": 105052965, + "step": 4872, + "time_per_iteration": 2.5804905891418457 + }, + { + "auxiliary_loss_clip": 0.01074211, + "auxiliary_loss_mlp": 0.01041091, + "balance_loss_clip": 1.05210352, + "balance_loss_mlp": 1.02417541, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 1.9746360033515076, + "language_loss": 0.72462058, + "learning_rate": 3.317330731292164e-06, + "loss": 0.74577367, + "num_input_tokens_seen": 105071840, + "step": 4873, + "time_per_iteration": 2.6133391857147217 + }, + { + "auxiliary_loss_clip": 0.01123673, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.04591727, + "balance_loss_mlp": 1.0232836, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.8474491602066785, + "language_loss": 0.78506398, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80669451, + "num_input_tokens_seen": 105089445, + "step": 4874, + "time_per_iteration": 2.470649480819702 + }, + { + "auxiliary_loss_clip": 0.01084308, + "auxiliary_loss_mlp": 0.01045147, + "balance_loss_clip": 1.04921818, + "balance_loss_mlp": 1.02864826, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 2.431306686061909, + "language_loss": 0.77197063, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79326522, + "num_input_tokens_seen": 105106210, + "step": 4875, + "time_per_iteration": 2.5743253231048584 + }, + { + "auxiliary_loss_clip": 0.01129272, + "auxiliary_loss_mlp": 0.01033662, + "balance_loss_clip": 1.05002117, + "balance_loss_mlp": 1.01819503, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.6622948697815456, + "language_loss": 0.69379979, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71542919, + "num_input_tokens_seen": 105124200, + "step": 4876, + "time_per_iteration": 2.453495502471924 + }, + { + "auxiliary_loss_clip": 0.0111459, + "auxiliary_loss_mlp": 0.01046379, + "balance_loss_clip": 1.04559231, + "balance_loss_mlp": 1.03116751, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 2.015632254337879, + "language_loss": 0.81755853, + "learning_rate": 3.316158151823096e-06, + "loss": 0.83916819, + "num_input_tokens_seen": 105140400, + "step": 4877, + "time_per_iteration": 2.470668077468872 + }, + { + "auxiliary_loss_clip": 0.01128089, + "auxiliary_loss_mlp": 0.01040921, + "balance_loss_clip": 1.04780734, + "balance_loss_mlp": 1.02525711, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 2.688030710122657, + "language_loss": 0.67930096, + "learning_rate": 3.315864882155911e-06, + "loss": 0.70099103, + "num_input_tokens_seen": 105157535, + "step": 4878, + "time_per_iteration": 2.4566025733947754 + }, + { + "auxiliary_loss_clip": 0.01102792, + "auxiliary_loss_mlp": 0.01046594, + "balance_loss_clip": 1.04958808, + "balance_loss_mlp": 1.03110838, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.7738488176787937, + "language_loss": 0.73789984, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.75939369, + "num_input_tokens_seen": 105175185, + "step": 4879, + "time_per_iteration": 2.5683116912841797 + }, + { + "auxiliary_loss_clip": 0.01094622, + "auxiliary_loss_mlp": 0.00796211, + "balance_loss_clip": 1.04959798, + "balance_loss_mlp": 1.01395142, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 4.170465141648543, + "language_loss": 0.66318065, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68208897, + "num_input_tokens_seen": 105194540, + "step": 4880, + "time_per_iteration": 2.629755973815918 + }, + { + "auxiliary_loss_clip": 0.01122982, + "auxiliary_loss_mlp": 0.01050818, + "balance_loss_clip": 1.04577231, + "balance_loss_mlp": 1.03466535, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 2.2275853353298736, + "language_loss": 0.70025432, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72199231, + "num_input_tokens_seen": 105213215, + "step": 4881, + "time_per_iteration": 2.5017483234405518 + }, + { + "auxiliary_loss_clip": 0.01105357, + "auxiliary_loss_mlp": 0.00793235, + "balance_loss_clip": 1.04586196, + "balance_loss_mlp": 1.01241755, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 1.6120395084743337, + "language_loss": 0.83769453, + "learning_rate": 3.314691304621127e-06, + "loss": 0.85668045, + "num_input_tokens_seen": 105231585, + "step": 4882, + "time_per_iteration": 2.520005226135254 + }, + { + "auxiliary_loss_clip": 0.01139105, + "auxiliary_loss_mlp": 0.01041903, + "balance_loss_clip": 1.04858851, + "balance_loss_mlp": 1.02592885, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.456426184981771, + "language_loss": 0.71473742, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73654747, + "num_input_tokens_seen": 105250120, + "step": 4883, + "time_per_iteration": 2.4620766639709473 + }, + { + "auxiliary_loss_clip": 0.01111876, + "auxiliary_loss_mlp": 0.01039114, + "balance_loss_clip": 1.04647648, + "balance_loss_mlp": 1.02358127, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 2.0907261016749437, + "language_loss": 0.92236882, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94387877, + "num_input_tokens_seen": 105266065, + "step": 4884, + "time_per_iteration": 2.5337040424346924 + }, + { + "auxiliary_loss_clip": 0.01128784, + "auxiliary_loss_mlp": 0.01038558, + "balance_loss_clip": 1.05069876, + "balance_loss_mlp": 1.02288151, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 6.138772849755271, + "language_loss": 0.73307711, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75475055, + "num_input_tokens_seen": 105282155, + "step": 4885, + "time_per_iteration": 2.4846506118774414 + }, + { + "auxiliary_loss_clip": 0.01117812, + "auxiliary_loss_mlp": 0.01050858, + "balance_loss_clip": 1.04821086, + "balance_loss_mlp": 1.03486025, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 1.9289476504170726, + "language_loss": 0.8539021, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87558872, + "num_input_tokens_seen": 105299225, + "step": 4886, + "time_per_iteration": 2.495817184448242 + }, + { + "auxiliary_loss_clip": 0.01103479, + "auxiliary_loss_mlp": 0.01040423, + "balance_loss_clip": 1.04361868, + "balance_loss_mlp": 1.02521205, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.501634940459168, + "language_loss": 0.77050245, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79194146, + "num_input_tokens_seen": 105315710, + "step": 4887, + "time_per_iteration": 2.522519826889038 + }, + { + "auxiliary_loss_clip": 0.01114905, + "auxiliary_loss_mlp": 0.01045588, + "balance_loss_clip": 1.05093074, + "balance_loss_mlp": 1.03001952, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.7373356755599896, + "language_loss": 0.79280293, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.81440789, + "num_input_tokens_seen": 105333505, + "step": 4888, + "time_per_iteration": 2.4880967140197754 + }, + { + "auxiliary_loss_clip": 0.0110834, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.04861164, + "balance_loss_mlp": 1.01996112, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.4438986865716363, + "language_loss": 0.55357742, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57501233, + "num_input_tokens_seen": 105355605, + "step": 4889, + "time_per_iteration": 2.6691501140594482 + }, + { + "auxiliary_loss_clip": 0.01125455, + "auxiliary_loss_mlp": 0.01039283, + "balance_loss_clip": 1.04619408, + "balance_loss_mlp": 1.02261806, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.8285535452983845, + "language_loss": 0.84365165, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86529905, + "num_input_tokens_seen": 105374225, + "step": 4890, + "time_per_iteration": 2.4761905670166016 + }, + { + "auxiliary_loss_clip": 0.01129133, + "auxiliary_loss_mlp": 0.01047197, + "balance_loss_clip": 1.04979587, + "balance_loss_mlp": 1.03152084, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.8088010657157847, + "language_loss": 0.72714233, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.7489056, + "num_input_tokens_seen": 105391565, + "step": 4891, + "time_per_iteration": 2.46941876411438 + }, + { + "auxiliary_loss_clip": 0.0113805, + "auxiliary_loss_mlp": 0.01044374, + "balance_loss_clip": 1.04875684, + "balance_loss_mlp": 1.02786374, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.727525484627151, + "language_loss": 0.7727828, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79460704, + "num_input_tokens_seen": 105409840, + "step": 4892, + "time_per_iteration": 2.4586985111236572 + }, + { + "auxiliary_loss_clip": 0.01131748, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.0444119, + "balance_loss_mlp": 1.01788926, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 3.1238776553728007, + "language_loss": 0.78405607, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80571359, + "num_input_tokens_seen": 105428645, + "step": 4893, + "time_per_iteration": 2.4751572608947754 + }, + { + "auxiliary_loss_clip": 0.01099542, + "auxiliary_loss_mlp": 0.01042052, + "balance_loss_clip": 1.04928422, + "balance_loss_mlp": 1.02630448, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 1.931257378100444, + "language_loss": 0.85214335, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87355924, + "num_input_tokens_seen": 105447480, + "step": 4894, + "time_per_iteration": 2.6131014823913574 + }, + { + "auxiliary_loss_clip": 0.01121813, + "auxiliary_loss_mlp": 0.01039262, + "balance_loss_clip": 1.0467608, + "balance_loss_mlp": 1.02369308, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.5654195386417964, + "language_loss": 0.90317953, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92479026, + "num_input_tokens_seen": 105464600, + "step": 4895, + "time_per_iteration": 2.4986560344696045 + }, + { + "auxiliary_loss_clip": 0.0112748, + "auxiliary_loss_mlp": 0.01042008, + "balance_loss_clip": 1.04756224, + "balance_loss_mlp": 1.02504468, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 1.8595131465716532, + "language_loss": 0.86807722, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88977206, + "num_input_tokens_seen": 105481510, + "step": 4896, + "time_per_iteration": 2.4746692180633545 + }, + { + "auxiliary_loss_clip": 0.01126894, + "auxiliary_loss_mlp": 0.01044427, + "balance_loss_clip": 1.04753733, + "balance_loss_mlp": 1.02714169, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 1.781062642290452, + "language_loss": 0.73368102, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75539422, + "num_input_tokens_seen": 105501390, + "step": 4897, + "time_per_iteration": 3.878309488296509 + }, + { + "auxiliary_loss_clip": 0.01122727, + "auxiliary_loss_mlp": 0.01044693, + "balance_loss_clip": 1.0461216, + "balance_loss_mlp": 1.02657342, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 1.969961281753076, + "language_loss": 0.73796952, + "learning_rate": 3.309989025093813e-06, + "loss": 0.75964367, + "num_input_tokens_seen": 105519600, + "step": 4898, + "time_per_iteration": 2.4916224479675293 + }, + { + "auxiliary_loss_clip": 0.01129737, + "auxiliary_loss_mlp": 0.01040618, + "balance_loss_clip": 1.05199862, + "balance_loss_mlp": 1.02171135, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.4246476949459534, + "language_loss": 0.69957542, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72127903, + "num_input_tokens_seen": 105535970, + "step": 4899, + "time_per_iteration": 2.4598538875579834 + }, + { + "auxiliary_loss_clip": 0.01112202, + "auxiliary_loss_mlp": 0.00799019, + "balance_loss_clip": 1.04465938, + "balance_loss_mlp": 1.02145696, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 1.9578903959633245, + "language_loss": 0.79285479, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.81196702, + "num_input_tokens_seen": 105556735, + "step": 4900, + "time_per_iteration": 2.5395562648773193 + }, + { + "auxiliary_loss_clip": 0.01096787, + "auxiliary_loss_mlp": 0.01055943, + "balance_loss_clip": 1.04089987, + "balance_loss_mlp": 1.03783512, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.6438298621403855, + "language_loss": 0.80785561, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.82938284, + "num_input_tokens_seen": 105574875, + "step": 4901, + "time_per_iteration": 2.474019765853882 + }, + { + "auxiliary_loss_clip": 0.01104026, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.04525256, + "balance_loss_mlp": 1.01814222, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 2.4398297654725662, + "language_loss": 0.58460808, + "learning_rate": 3.308811466431157e-06, + "loss": 0.6059767, + "num_input_tokens_seen": 105594225, + "step": 4902, + "time_per_iteration": 3.9335074424743652 + }, + { + "auxiliary_loss_clip": 0.01113038, + "auxiliary_loss_mlp": 0.01039568, + "balance_loss_clip": 1.04633045, + "balance_loss_mlp": 1.02439249, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.7611037090354738, + "language_loss": 0.7548719, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77639794, + "num_input_tokens_seen": 105614000, + "step": 4903, + "time_per_iteration": 2.494096517562866 + }, + { + "auxiliary_loss_clip": 0.01108475, + "auxiliary_loss_mlp": 0.01055923, + "balance_loss_clip": 1.04559398, + "balance_loss_mlp": 1.0368849, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 2.1661939074750842, + "language_loss": 0.62255424, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64419824, + "num_input_tokens_seen": 105634575, + "step": 4904, + "time_per_iteration": 2.56199312210083 + }, + { + "auxiliary_loss_clip": 0.0112356, + "auxiliary_loss_mlp": 0.01044683, + "balance_loss_clip": 1.04610598, + "balance_loss_mlp": 1.02823246, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.44627143917231, + "language_loss": 0.73085648, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75253892, + "num_input_tokens_seen": 105654385, + "step": 4905, + "time_per_iteration": 3.8679378032684326 + }, + { + "auxiliary_loss_clip": 0.01100874, + "auxiliary_loss_mlp": 0.01042058, + "balance_loss_clip": 1.04384816, + "balance_loss_mlp": 1.02495158, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.6266736324131077, + "language_loss": 0.81753802, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.83896732, + "num_input_tokens_seen": 105673570, + "step": 4906, + "time_per_iteration": 3.9810681343078613 + }, + { + "auxiliary_loss_clip": 0.01089943, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.04213428, + "balance_loss_mlp": 1.02686024, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 1.8527376765501666, + "language_loss": 0.8753252, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89665186, + "num_input_tokens_seen": 105691940, + "step": 4907, + "time_per_iteration": 2.584670305252075 + }, + { + "auxiliary_loss_clip": 0.01138108, + "auxiliary_loss_mlp": 0.01044809, + "balance_loss_clip": 1.04699647, + "balance_loss_mlp": 1.0272733, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 2.246177546917773, + "language_loss": 0.81753218, + "learning_rate": 3.307043639752782e-06, + "loss": 0.83936131, + "num_input_tokens_seen": 105709825, + "step": 4908, + "time_per_iteration": 2.4579389095306396 + }, + { + "auxiliary_loss_clip": 0.01050203, + "auxiliary_loss_mlp": 0.01013942, + "balance_loss_clip": 1.01877642, + "balance_loss_mlp": 1.01147389, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.8739148813883363, + "language_loss": 0.57220006, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59284151, + "num_input_tokens_seen": 105766880, + "step": 4909, + "time_per_iteration": 2.937335968017578 + }, + { + "auxiliary_loss_clip": 0.01123155, + "auxiliary_loss_mlp": 0.00794712, + "balance_loss_clip": 1.04816902, + "balance_loss_mlp": 1.01533914, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.4832857933438257, + "language_loss": 0.86776012, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88693881, + "num_input_tokens_seen": 105786875, + "step": 4910, + "time_per_iteration": 2.5236947536468506 + }, + { + "auxiliary_loss_clip": 0.01119667, + "auxiliary_loss_mlp": 0.01042815, + "balance_loss_clip": 1.04634047, + "balance_loss_mlp": 1.02769291, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.9037110659509064, + "language_loss": 0.72981197, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.75143677, + "num_input_tokens_seen": 105805315, + "step": 4911, + "time_per_iteration": 2.493351697921753 + }, + { + "auxiliary_loss_clip": 0.01122406, + "auxiliary_loss_mlp": 0.01038456, + "balance_loss_clip": 1.04755485, + "balance_loss_mlp": 1.02301884, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.6861268417955038, + "language_loss": 0.89695966, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.91856831, + "num_input_tokens_seen": 105825125, + "step": 4912, + "time_per_iteration": 2.5040273666381836 + }, + { + "auxiliary_loss_clip": 0.01104328, + "auxiliary_loss_mlp": 0.01052079, + "balance_loss_clip": 1.04452348, + "balance_loss_mlp": 1.03589022, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.3662345777095246, + "language_loss": 0.83456069, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85612476, + "num_input_tokens_seen": 105846085, + "step": 4913, + "time_per_iteration": 2.5472917556762695 + }, + { + "auxiliary_loss_clip": 0.01133839, + "auxiliary_loss_mlp": 0.01044669, + "balance_loss_clip": 1.04531503, + "balance_loss_mlp": 1.02942204, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.6615543958123002, + "language_loss": 0.77000606, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79179108, + "num_input_tokens_seen": 105865400, + "step": 4914, + "time_per_iteration": 2.482278347015381 + }, + { + "auxiliary_loss_clip": 0.0111123, + "auxiliary_loss_mlp": 0.01042056, + "balance_loss_clip": 1.04507804, + "balance_loss_mlp": 1.02577162, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 1.735280153274108, + "language_loss": 0.81486237, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83639526, + "num_input_tokens_seen": 105887920, + "step": 4915, + "time_per_iteration": 2.6914052963256836 + }, + { + "auxiliary_loss_clip": 0.01063129, + "auxiliary_loss_mlp": 0.01040463, + "balance_loss_clip": 1.04851794, + "balance_loss_mlp": 1.0246557, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 1.8573261972168285, + "language_loss": 0.84808469, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.8691206, + "num_input_tokens_seen": 105904035, + "step": 4916, + "time_per_iteration": 2.663374423980713 + }, + { + "auxiliary_loss_clip": 0.01117781, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.04346943, + "balance_loss_mlp": 1.02290249, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 1.918522518270424, + "language_loss": 0.69340903, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.71497345, + "num_input_tokens_seen": 105922685, + "step": 4917, + "time_per_iteration": 2.5277392864227295 + }, + { + "auxiliary_loss_clip": 0.01116632, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.0482322, + "balance_loss_mlp": 1.0271759, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 2.1431860523004915, + "language_loss": 0.91212702, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93372834, + "num_input_tokens_seen": 105940425, + "step": 4918, + "time_per_iteration": 2.5113093852996826 + }, + { + "auxiliary_loss_clip": 0.01136974, + "auxiliary_loss_mlp": 0.01040716, + "balance_loss_clip": 1.04891634, + "balance_loss_mlp": 1.02447987, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 1.9192377883004035, + "language_loss": 0.72685957, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74863648, + "num_input_tokens_seen": 105960550, + "step": 4919, + "time_per_iteration": 2.501753091812134 + }, + { + "auxiliary_loss_clip": 0.01111154, + "auxiliary_loss_mlp": 0.01045876, + "balance_loss_clip": 1.04485106, + "balance_loss_mlp": 1.02973485, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.663954480290254, + "language_loss": 0.75656331, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.77813363, + "num_input_tokens_seen": 105978820, + "step": 4920, + "time_per_iteration": 2.5170164108276367 + }, + { + "auxiliary_loss_clip": 0.01114554, + "auxiliary_loss_mlp": 0.01048129, + "balance_loss_clip": 1.05274725, + "balance_loss_mlp": 1.03098667, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 1.9891320726188144, + "language_loss": 0.68606687, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.7076937, + "num_input_tokens_seen": 105997545, + "step": 4921, + "time_per_iteration": 2.547312021255493 + }, + { + "auxiliary_loss_clip": 0.01113835, + "auxiliary_loss_mlp": 0.0104726, + "balance_loss_clip": 1.04876328, + "balance_loss_mlp": 1.02924728, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 1.8585661098005761, + "language_loss": 0.74796164, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76957256, + "num_input_tokens_seen": 106015320, + "step": 4922, + "time_per_iteration": 2.5280961990356445 + }, + { + "auxiliary_loss_clip": 0.01140753, + "auxiliary_loss_mlp": 0.00796917, + "balance_loss_clip": 1.04812646, + "balance_loss_mlp": 1.01910353, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 1.8369715669648106, + "language_loss": 0.7659955, + "learning_rate": 3.302616272134737e-06, + "loss": 0.7853722, + "num_input_tokens_seen": 106034555, + "step": 4923, + "time_per_iteration": 2.5392491817474365 + }, + { + "auxiliary_loss_clip": 0.01109356, + "auxiliary_loss_mlp": 0.01042321, + "balance_loss_clip": 1.04817212, + "balance_loss_mlp": 1.02591789, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.5358678528472514, + "language_loss": 0.86043108, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88194788, + "num_input_tokens_seen": 106054200, + "step": 4924, + "time_per_iteration": 2.5724589824676514 + }, + { + "auxiliary_loss_clip": 0.01124706, + "auxiliary_loss_mlp": 0.01036357, + "balance_loss_clip": 1.04624474, + "balance_loss_mlp": 1.01987004, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.5506582979546162, + "language_loss": 0.82045966, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84207028, + "num_input_tokens_seen": 106074700, + "step": 4925, + "time_per_iteration": 2.493276596069336 + }, + { + "auxiliary_loss_clip": 0.01073246, + "auxiliary_loss_mlp": 0.0105467, + "balance_loss_clip": 1.04146099, + "balance_loss_mlp": 1.0360496, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 2.765801889007856, + "language_loss": 0.85679293, + "learning_rate": 3.301729463727452e-06, + "loss": 0.87807202, + "num_input_tokens_seen": 106091415, + "step": 4926, + "time_per_iteration": 2.5829362869262695 + }, + { + "auxiliary_loss_clip": 0.01102715, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.04951811, + "balance_loss_mlp": 1.0181334, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 1.8693649389657263, + "language_loss": 0.85876894, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88014197, + "num_input_tokens_seen": 106109135, + "step": 4927, + "time_per_iteration": 2.5275156497955322 + }, + { + "auxiliary_loss_clip": 0.01124483, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.04768372, + "balance_loss_mlp": 1.02021694, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.9016280820736657, + "language_loss": 0.80796528, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.82956898, + "num_input_tokens_seen": 106125750, + "step": 4928, + "time_per_iteration": 2.4751455783843994 + }, + { + "auxiliary_loss_clip": 0.01118862, + "auxiliary_loss_mlp": 0.01045489, + "balance_loss_clip": 1.04860318, + "balance_loss_mlp": 1.025617, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 2.727473683610809, + "language_loss": 0.73132175, + "learning_rate": 3.300842211064773e-06, + "loss": 0.75296527, + "num_input_tokens_seen": 106142835, + "step": 4929, + "time_per_iteration": 2.5643842220306396 + }, + { + "auxiliary_loss_clip": 0.01112046, + "auxiliary_loss_mlp": 0.01051851, + "balance_loss_clip": 1.04593229, + "balance_loss_mlp": 1.03267026, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.5975991010604322, + "language_loss": 0.72156721, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74320614, + "num_input_tokens_seen": 106160680, + "step": 4930, + "time_per_iteration": 2.5074079036712646 + }, + { + "auxiliary_loss_clip": 0.01024303, + "auxiliary_loss_mlp": 0.01012113, + "balance_loss_clip": 1.03160453, + "balance_loss_mlp": 1.00978816, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.839756314478916, + "language_loss": 0.60666978, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62703395, + "num_input_tokens_seen": 106224415, + "step": 4931, + "time_per_iteration": 3.101501941680908 + }, + { + "auxiliary_loss_clip": 0.01016519, + "auxiliary_loss_mlp": 0.0100333, + "balance_loss_clip": 1.03191793, + "balance_loss_mlp": 1.00133944, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7358175170088151, + "language_loss": 0.52359688, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54379535, + "num_input_tokens_seen": 106279140, + "step": 4932, + "time_per_iteration": 3.108490467071533 + }, + { + "auxiliary_loss_clip": 0.01120713, + "auxiliary_loss_mlp": 0.01040023, + "balance_loss_clip": 1.04415369, + "balance_loss_mlp": 1.02402496, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 1.8313847156801686, + "language_loss": 0.81668144, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83828884, + "num_input_tokens_seen": 106298190, + "step": 4933, + "time_per_iteration": 2.508192777633667 + }, + { + "auxiliary_loss_clip": 0.0108748, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.04446614, + "balance_loss_mlp": 1.0176537, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.6980827793485707, + "language_loss": 0.75080633, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77202052, + "num_input_tokens_seen": 106319065, + "step": 4934, + "time_per_iteration": 2.5856339931488037 + }, + { + "auxiliary_loss_clip": 0.01117362, + "auxiliary_loss_mlp": 0.010438, + "balance_loss_clip": 1.04535282, + "balance_loss_mlp": 1.02688408, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 2.561484325445196, + "language_loss": 0.62242049, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64403206, + "num_input_tokens_seen": 106338040, + "step": 4935, + "time_per_iteration": 2.5256917476654053 + }, + { + "auxiliary_loss_clip": 0.01120715, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_clip": 1.04529619, + "balance_loss_mlp": 1.02814782, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.4627948897126581, + "language_loss": 0.79835713, + "learning_rate": 3.2987702288932e-06, + "loss": 0.82001352, + "num_input_tokens_seen": 106358900, + "step": 4936, + "time_per_iteration": 3.982489585876465 + }, + { + "auxiliary_loss_clip": 0.01092786, + "auxiliary_loss_mlp": 0.01043847, + "balance_loss_clip": 1.04642642, + "balance_loss_mlp": 1.02703881, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.5205678510468719, + "language_loss": 0.74239337, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76375973, + "num_input_tokens_seen": 106381805, + "step": 4937, + "time_per_iteration": 2.709940195083618 + }, + { + "auxiliary_loss_clip": 0.01085801, + "auxiliary_loss_mlp": 0.01038702, + "balance_loss_clip": 1.04770231, + "balance_loss_mlp": 1.02221584, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.749637238471787, + "language_loss": 0.78230458, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80354965, + "num_input_tokens_seen": 106402365, + "step": 4938, + "time_per_iteration": 2.591019630432129 + }, + { + "auxiliary_loss_clip": 0.01112082, + "auxiliary_loss_mlp": 0.01053028, + "balance_loss_clip": 1.04882312, + "balance_loss_mlp": 1.03456235, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 2.5581979510543817, + "language_loss": 0.76968861, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79133964, + "num_input_tokens_seen": 106419800, + "step": 4939, + "time_per_iteration": 2.501011848449707 + }, + { + "auxiliary_loss_clip": 0.01103574, + "auxiliary_loss_mlp": 0.01042692, + "balance_loss_clip": 1.04527116, + "balance_loss_mlp": 1.02581215, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.5268105538887367, + "language_loss": 0.77793717, + "learning_rate": 3.297585155344979e-06, + "loss": 0.79939985, + "num_input_tokens_seen": 106440300, + "step": 4940, + "time_per_iteration": 3.985199451446533 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01042539, + "balance_loss_clip": 1.04663587, + "balance_loss_mlp": 1.02454996, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 1.5910322643189212, + "language_loss": 0.75560927, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77714175, + "num_input_tokens_seen": 106460035, + "step": 4941, + "time_per_iteration": 2.5265209674835205 + }, + { + "auxiliary_loss_clip": 0.01129836, + "auxiliary_loss_mlp": 0.01052822, + "balance_loss_clip": 1.04919887, + "balance_loss_mlp": 1.03523827, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.2224279724879494, + "language_loss": 0.736848, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.75867462, + "num_input_tokens_seen": 106481095, + "step": 4942, + "time_per_iteration": 2.6539859771728516 + }, + { + "auxiliary_loss_clip": 0.01103854, + "auxiliary_loss_mlp": 0.01049522, + "balance_loss_clip": 1.04729104, + "balance_loss_mlp": 1.03149712, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 1.8462232639750258, + "language_loss": 0.70156109, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72309488, + "num_input_tokens_seen": 106501590, + "step": 4943, + "time_per_iteration": 2.584354877471924 + }, + { + "auxiliary_loss_clip": 0.0111667, + "auxiliary_loss_mlp": 0.01038744, + "balance_loss_clip": 1.04889107, + "balance_loss_mlp": 1.02176857, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 5.43307363628931, + "language_loss": 0.80035192, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82190609, + "num_input_tokens_seen": 106519430, + "step": 4944, + "time_per_iteration": 3.8918614387512207 + }, + { + "auxiliary_loss_clip": 0.01109374, + "auxiliary_loss_mlp": 0.01046265, + "balance_loss_clip": 1.04611421, + "balance_loss_mlp": 1.03054166, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.074660948512871, + "language_loss": 0.83114612, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.8527025, + "num_input_tokens_seen": 106535870, + "step": 4945, + "time_per_iteration": 3.9237940311431885 + }, + { + "auxiliary_loss_clip": 0.01084488, + "auxiliary_loss_mlp": 0.01042055, + "balance_loss_clip": 1.0454663, + "balance_loss_mlp": 1.02603281, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 1.7190780776521672, + "language_loss": 0.6724177, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.69368315, + "num_input_tokens_seen": 106553560, + "step": 4946, + "time_per_iteration": 2.5634496212005615 + }, + { + "auxiliary_loss_clip": 0.01120952, + "auxiliary_loss_mlp": 0.00796396, + "balance_loss_clip": 1.04690695, + "balance_loss_mlp": 1.01498115, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 2.2294357679976953, + "language_loss": 0.73810446, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75727797, + "num_input_tokens_seen": 106574115, + "step": 4947, + "time_per_iteration": 2.542558431625366 + }, + { + "auxiliary_loss_clip": 0.01107005, + "auxiliary_loss_mlp": 0.0104565, + "balance_loss_clip": 1.05087936, + "balance_loss_mlp": 1.02879381, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 1.9620140727914286, + "language_loss": 0.73488063, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75640714, + "num_input_tokens_seen": 106593070, + "step": 4948, + "time_per_iteration": 2.574901819229126 + }, + { + "auxiliary_loss_clip": 0.01134555, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.04622447, + "balance_loss_mlp": 1.01902342, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 2.059281226990047, + "language_loss": 0.84081614, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.8625083, + "num_input_tokens_seen": 106610695, + "step": 4949, + "time_per_iteration": 2.434049129486084 + }, + { + "auxiliary_loss_clip": 0.01120207, + "auxiliary_loss_mlp": 0.01040125, + "balance_loss_clip": 1.04426491, + "balance_loss_mlp": 1.02424049, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 1.7927366131895477, + "language_loss": 0.71439672, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73600006, + "num_input_tokens_seen": 106631300, + "step": 4950, + "time_per_iteration": 2.5180699825286865 + }, + { + "auxiliary_loss_clip": 0.0108866, + "auxiliary_loss_mlp": 0.01047115, + "balance_loss_clip": 1.047786, + "balance_loss_mlp": 1.03222561, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 3.600264950959555, + "language_loss": 0.82365501, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84501278, + "num_input_tokens_seen": 106650065, + "step": 4951, + "time_per_iteration": 2.613265037536621 + }, + { + "auxiliary_loss_clip": 0.01107971, + "auxiliary_loss_mlp": 0.01037204, + "balance_loss_clip": 1.04129207, + "balance_loss_mlp": 1.02049077, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.4577561368441456, + "language_loss": 0.74009323, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.76154494, + "num_input_tokens_seen": 106668230, + "step": 4952, + "time_per_iteration": 2.5388505458831787 + }, + { + "auxiliary_loss_clip": 0.01057955, + "auxiliary_loss_mlp": 0.01040764, + "balance_loss_clip": 1.04443014, + "balance_loss_mlp": 1.02270341, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.6479531658806863, + "language_loss": 0.8396914, + "learning_rate": 3.293728232937228e-06, + "loss": 0.86067855, + "num_input_tokens_seen": 106687785, + "step": 4953, + "time_per_iteration": 2.661498546600342 + }, + { + "auxiliary_loss_clip": 0.01112075, + "auxiliary_loss_mlp": 0.01045485, + "balance_loss_clip": 1.0459528, + "balance_loss_mlp": 1.0298326, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.0291355196395164, + "language_loss": 0.74258542, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.76416099, + "num_input_tokens_seen": 106706875, + "step": 4954, + "time_per_iteration": 2.5045459270477295 + }, + { + "auxiliary_loss_clip": 0.01134979, + "auxiliary_loss_mlp": 0.01037066, + "balance_loss_clip": 1.04831278, + "balance_loss_mlp": 1.02199793, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.872131220493492, + "language_loss": 0.7530753, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77479577, + "num_input_tokens_seen": 106725105, + "step": 4955, + "time_per_iteration": 2.4464757442474365 + }, + { + "auxiliary_loss_clip": 0.01090437, + "auxiliary_loss_mlp": 0.01039127, + "balance_loss_clip": 1.04950976, + "balance_loss_mlp": 1.02297449, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 1.7485968961253566, + "language_loss": 0.72980022, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.75109583, + "num_input_tokens_seen": 106744780, + "step": 4956, + "time_per_iteration": 2.5750458240509033 + }, + { + "auxiliary_loss_clip": 0.01127841, + "auxiliary_loss_mlp": 0.01043474, + "balance_loss_clip": 1.04751933, + "balance_loss_mlp": 1.02626014, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 1.9824948840039616, + "language_loss": 0.78920531, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81091845, + "num_input_tokens_seen": 106764670, + "step": 4957, + "time_per_iteration": 2.50779390335083 + }, + { + "auxiliary_loss_clip": 0.01129331, + "auxiliary_loss_mlp": 0.01039878, + "balance_loss_clip": 1.04948533, + "balance_loss_mlp": 1.02233028, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.4954502273126786, + "language_loss": 0.70709717, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.72878921, + "num_input_tokens_seen": 106783695, + "step": 4958, + "time_per_iteration": 2.4895408153533936 + }, + { + "auxiliary_loss_clip": 0.0110244, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_clip": 1.05049324, + "balance_loss_mlp": 1.03514886, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.535492812525439, + "language_loss": 0.79017234, + "learning_rate": 3.291945317082743e-06, + "loss": 0.81171107, + "num_input_tokens_seen": 106803150, + "step": 4959, + "time_per_iteration": 2.5596766471862793 + }, + { + "auxiliary_loss_clip": 0.01123787, + "auxiliary_loss_mlp": 0.01053742, + "balance_loss_clip": 1.04743814, + "balance_loss_mlp": 1.03761268, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.9875378260786563, + "language_loss": 0.79875195, + "learning_rate": 3.291647992907147e-06, + "loss": 0.8205272, + "num_input_tokens_seen": 106820705, + "step": 4960, + "time_per_iteration": 2.4760794639587402 + }, + { + "auxiliary_loss_clip": 0.01113633, + "auxiliary_loss_mlp": 0.01046445, + "balance_loss_clip": 1.05124331, + "balance_loss_mlp": 1.0291357, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.2486105599802104, + "language_loss": 0.73531622, + "learning_rate": 3.291350619752129e-06, + "loss": 0.756917, + "num_input_tokens_seen": 106837335, + "step": 4961, + "time_per_iteration": 2.531867742538452 + }, + { + "auxiliary_loss_clip": 0.01126822, + "auxiliary_loss_mlp": 0.01039491, + "balance_loss_clip": 1.0498147, + "balance_loss_mlp": 1.02360058, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 1.8868443884366033, + "language_loss": 0.62142742, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64309055, + "num_input_tokens_seen": 106856250, + "step": 4962, + "time_per_iteration": 2.495713472366333 + }, + { + "auxiliary_loss_clip": 0.01126938, + "auxiliary_loss_mlp": 0.01045824, + "balance_loss_clip": 1.05467582, + "balance_loss_mlp": 1.02828836, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 1.7620275376838912, + "language_loss": 0.83093548, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85266304, + "num_input_tokens_seen": 106873370, + "step": 4963, + "time_per_iteration": 2.493417501449585 + }, + { + "auxiliary_loss_clip": 0.01106039, + "auxiliary_loss_mlp": 0.01039411, + "balance_loss_clip": 1.05155969, + "balance_loss_mlp": 1.02332926, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.2934563308067455, + "language_loss": 0.65996009, + "learning_rate": 3.290458206523322e-06, + "loss": 0.6814146, + "num_input_tokens_seen": 106890330, + "step": 4964, + "time_per_iteration": 2.5364348888397217 + }, + { + "auxiliary_loss_clip": 0.01124171, + "auxiliary_loss_mlp": 0.01037257, + "balance_loss_clip": 1.04908979, + "balance_loss_mlp": 1.022928, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.7134842790634002, + "language_loss": 0.71102113, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.73263538, + "num_input_tokens_seen": 106909190, + "step": 4965, + "time_per_iteration": 2.507201910018921 + }, + { + "auxiliary_loss_clip": 0.01145099, + "auxiliary_loss_mlp": 0.01049926, + "balance_loss_clip": 1.05561697, + "balance_loss_mlp": 1.03374922, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 1.7465980719507772, + "language_loss": 0.6666913, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68864149, + "num_input_tokens_seen": 106927825, + "step": 4966, + "time_per_iteration": 2.4612295627593994 + }, + { + "auxiliary_loss_clip": 0.01143995, + "auxiliary_loss_mlp": 0.01042389, + "balance_loss_clip": 1.05476546, + "balance_loss_mlp": 1.02617693, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 2.496527951257142, + "language_loss": 0.73763889, + "learning_rate": 3.289565352885785e-06, + "loss": 0.75950271, + "num_input_tokens_seen": 106943155, + "step": 4967, + "time_per_iteration": 2.4381401538848877 + }, + { + "auxiliary_loss_clip": 0.0110763, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.04713082, + "balance_loss_mlp": 1.01889229, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 1.9622410450331167, + "language_loss": 0.71063399, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73204845, + "num_input_tokens_seen": 106960295, + "step": 4968, + "time_per_iteration": 2.5260169506073 + }, + { + "auxiliary_loss_clip": 0.01121578, + "auxiliary_loss_mlp": 0.01039779, + "balance_loss_clip": 1.0515306, + "balance_loss_mlp": 1.0234828, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 1.5880547015350621, + "language_loss": 0.76861179, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.79022533, + "num_input_tokens_seen": 106982870, + "step": 4969, + "time_per_iteration": 2.583770275115967 + }, + { + "auxiliary_loss_clip": 0.01138014, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.05114555, + "balance_loss_mlp": 1.01953244, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.803237038759876, + "language_loss": 0.69812667, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.71984923, + "num_input_tokens_seen": 107002405, + "step": 4970, + "time_per_iteration": 2.448671340942383 + }, + { + "auxiliary_loss_clip": 0.01130592, + "auxiliary_loss_mlp": 0.01042155, + "balance_loss_clip": 1.05102003, + "balance_loss_mlp": 1.024786, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 1.9330718555226658, + "language_loss": 0.85060537, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.87233281, + "num_input_tokens_seen": 107017310, + "step": 4971, + "time_per_iteration": 2.475008726119995 + }, + { + "auxiliary_loss_clip": 0.01108282, + "auxiliary_loss_mlp": 0.01050545, + "balance_loss_clip": 1.04856646, + "balance_loss_mlp": 1.032938, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 2.326055663764313, + "language_loss": 0.7943964, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81598467, + "num_input_tokens_seen": 107034645, + "step": 4972, + "time_per_iteration": 2.536632776260376 + }, + { + "auxiliary_loss_clip": 0.01139338, + "auxiliary_loss_mlp": 0.01045256, + "balance_loss_clip": 1.05158985, + "balance_loss_mlp": 1.02937722, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 1.788750696460857, + "language_loss": 0.85357928, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87542522, + "num_input_tokens_seen": 107051125, + "step": 4973, + "time_per_iteration": 2.452819347381592 + }, + { + "auxiliary_loss_clip": 0.01106943, + "auxiliary_loss_mlp": 0.01035399, + "balance_loss_clip": 1.0482924, + "balance_loss_mlp": 1.01921034, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.9519118679618412, + "language_loss": 0.77768046, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79910386, + "num_input_tokens_seen": 107068815, + "step": 4974, + "time_per_iteration": 2.5142340660095215 + }, + { + "auxiliary_loss_clip": 0.01111197, + "auxiliary_loss_mlp": 0.00804885, + "balance_loss_clip": 1.0485791, + "balance_loss_mlp": 1.02873158, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 1.6282605991294536, + "language_loss": 0.72612631, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74528718, + "num_input_tokens_seen": 107090420, + "step": 4975, + "time_per_iteration": 4.041979074478149 + }, + { + "auxiliary_loss_clip": 0.01130138, + "auxiliary_loss_mlp": 0.01039758, + "balance_loss_clip": 1.05351257, + "balance_loss_mlp": 1.02342653, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 2.0560499047342864, + "language_loss": 0.76485431, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78655332, + "num_input_tokens_seen": 107107255, + "step": 4976, + "time_per_iteration": 2.4753096103668213 + }, + { + "auxiliary_loss_clip": 0.0112518, + "auxiliary_loss_mlp": 0.01039798, + "balance_loss_clip": 1.050264, + "balance_loss_mlp": 1.02437198, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 1.9670025423908344, + "language_loss": 0.86251616, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.884166, + "num_input_tokens_seen": 107123840, + "step": 4977, + "time_per_iteration": 2.464825391769409 + }, + { + "auxiliary_loss_clip": 0.01116371, + "auxiliary_loss_mlp": 0.010414, + "balance_loss_clip": 1.05296421, + "balance_loss_mlp": 1.02494907, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.5526784782249545, + "language_loss": 0.68555915, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.70713687, + "num_input_tokens_seen": 107143475, + "step": 4978, + "time_per_iteration": 2.5562891960144043 + }, + { + "auxiliary_loss_clip": 0.01120986, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.05344117, + "balance_loss_mlp": 1.02008998, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 2.443722641167502, + "language_loss": 0.76693004, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78850245, + "num_input_tokens_seen": 107161725, + "step": 4979, + "time_per_iteration": 3.898388385772705 + }, + { + "auxiliary_loss_clip": 0.01090218, + "auxiliary_loss_mlp": 0.010413, + "balance_loss_clip": 1.04398394, + "balance_loss_mlp": 1.02354956, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.5650390116049888, + "language_loss": 0.68047053, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70178568, + "num_input_tokens_seen": 107183935, + "step": 4980, + "time_per_iteration": 2.6674156188964844 + }, + { + "auxiliary_loss_clip": 0.01125998, + "auxiliary_loss_mlp": 0.00793846, + "balance_loss_clip": 1.05301797, + "balance_loss_mlp": 1.01659429, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 1.7367779102422212, + "language_loss": 0.73491484, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75411332, + "num_input_tokens_seen": 107204285, + "step": 4981, + "time_per_iteration": 2.565366268157959 + }, + { + "auxiliary_loss_clip": 0.01128904, + "auxiliary_loss_mlp": 0.01040841, + "balance_loss_clip": 1.04977846, + "balance_loss_mlp": 1.02514136, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.5152469998882911, + "language_loss": 0.86179805, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.88349557, + "num_input_tokens_seen": 107225265, + "step": 4982, + "time_per_iteration": 3.9573800563812256 + }, + { + "auxiliary_loss_clip": 0.01124265, + "auxiliary_loss_mlp": 0.01039309, + "balance_loss_clip": 1.05501938, + "balance_loss_mlp": 1.02183306, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.178267434296674, + "language_loss": 0.86945653, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.8910923, + "num_input_tokens_seen": 107241335, + "step": 4983, + "time_per_iteration": 3.9267542362213135 + }, + { + "auxiliary_loss_clip": 0.01124063, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_clip": 1.05308414, + "balance_loss_mlp": 1.02948678, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 2.00736407207815, + "language_loss": 0.78666437, + "learning_rate": 3.284497544825668e-06, + "loss": 0.8083486, + "num_input_tokens_seen": 107259375, + "step": 4984, + "time_per_iteration": 2.511950969696045 + }, + { + "auxiliary_loss_clip": 0.01108388, + "auxiliary_loss_mlp": 0.01045447, + "balance_loss_clip": 1.05400062, + "balance_loss_mlp": 1.0286746, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.5653538001627576, + "language_loss": 0.78252494, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80406332, + "num_input_tokens_seen": 107279890, + "step": 4985, + "time_per_iteration": 2.588338851928711 + }, + { + "auxiliary_loss_clip": 0.01083459, + "auxiliary_loss_mlp": 0.01055652, + "balance_loss_clip": 1.04852283, + "balance_loss_mlp": 1.03530347, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 2.261501183930516, + "language_loss": 0.71658659, + "learning_rate": 3.283900405580837e-06, + "loss": 0.73797762, + "num_input_tokens_seen": 107303430, + "step": 4986, + "time_per_iteration": 2.897321939468384 + }, + { + "auxiliary_loss_clip": 0.01117725, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.0510875, + "balance_loss_mlp": 1.02681816, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.6516946314641412, + "language_loss": 0.73302639, + "learning_rate": 3.283601762924312e-06, + "loss": 0.7546401, + "num_input_tokens_seen": 107323700, + "step": 4987, + "time_per_iteration": 2.531484365463257 + }, + { + "auxiliary_loss_clip": 0.01108881, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.05021739, + "balance_loss_mlp": 1.02366257, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 1.6716986159150196, + "language_loss": 0.79939735, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.8208859, + "num_input_tokens_seen": 107341965, + "step": 4988, + "time_per_iteration": 2.53928804397583 + }, + { + "auxiliary_loss_clip": 0.01115922, + "auxiliary_loss_mlp": 0.00795748, + "balance_loss_clip": 1.0486393, + "balance_loss_mlp": 1.01360512, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.6730632093885551, + "language_loss": 0.70436561, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72348231, + "num_input_tokens_seen": 107362615, + "step": 4989, + "time_per_iteration": 2.56370210647583 + }, + { + "auxiliary_loss_clip": 0.01104126, + "auxiliary_loss_mlp": 0.01045497, + "balance_loss_clip": 1.05030847, + "balance_loss_mlp": 1.02840257, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 2.0264694424956295, + "language_loss": 0.85662687, + "learning_rate": 3.282705542954199e-06, + "loss": 0.8781231, + "num_input_tokens_seen": 107378980, + "step": 4990, + "time_per_iteration": 2.5663490295410156 + }, + { + "auxiliary_loss_clip": 0.01128654, + "auxiliary_loss_mlp": 0.01040321, + "balance_loss_clip": 1.05016279, + "balance_loss_mlp": 1.02319098, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.6170242131856498, + "language_loss": 0.66896319, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69065297, + "num_input_tokens_seen": 107397640, + "step": 4991, + "time_per_iteration": 2.5176143646240234 + }, + { + "auxiliary_loss_clip": 0.01114466, + "auxiliary_loss_mlp": 0.01037011, + "balance_loss_clip": 1.04525542, + "balance_loss_mlp": 1.01949954, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.8438565291232796, + "language_loss": 0.78738689, + "learning_rate": 3.28210781975363e-06, + "loss": 0.80890167, + "num_input_tokens_seen": 107416020, + "step": 4992, + "time_per_iteration": 2.520667314529419 + }, + { + "auxiliary_loss_clip": 0.01138573, + "auxiliary_loss_mlp": 0.01037363, + "balance_loss_clip": 1.05084538, + "balance_loss_mlp": 1.02092373, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 2.417657915636467, + "language_loss": 0.8233614, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84512079, + "num_input_tokens_seen": 107436340, + "step": 4993, + "time_per_iteration": 2.4865798950195312 + }, + { + "auxiliary_loss_clip": 0.01093742, + "auxiliary_loss_mlp": 0.01051245, + "balance_loss_clip": 1.04945588, + "balance_loss_mlp": 1.03339922, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.2166145976427645, + "language_loss": 0.86553121, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88698113, + "num_input_tokens_seen": 107454585, + "step": 4994, + "time_per_iteration": 2.581536054611206 + }, + { + "auxiliary_loss_clip": 0.01114301, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.05031097, + "balance_loss_mlp": 1.02043438, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.6246550986235235, + "language_loss": 0.80989408, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83139765, + "num_input_tokens_seen": 107477180, + "step": 4995, + "time_per_iteration": 2.6027274131774902 + }, + { + "auxiliary_loss_clip": 0.01119483, + "auxiliary_loss_mlp": 0.01038831, + "balance_loss_clip": 1.04974723, + "balance_loss_mlp": 1.02235603, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.6735142631515005, + "language_loss": 0.67354631, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69512945, + "num_input_tokens_seen": 107500250, + "step": 4996, + "time_per_iteration": 2.6975860595703125 + }, + { + "auxiliary_loss_clip": 0.0111322, + "auxiliary_loss_mlp": 0.01043185, + "balance_loss_clip": 1.05123568, + "balance_loss_mlp": 1.02649558, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 1.6793152613844229, + "language_loss": 0.75373858, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77530265, + "num_input_tokens_seen": 107520070, + "step": 4997, + "time_per_iteration": 2.5222723484039307 + }, + { + "auxiliary_loss_clip": 0.01127707, + "auxiliary_loss_mlp": 0.01046694, + "balance_loss_clip": 1.0495832, + "balance_loss_mlp": 1.03103006, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 2.5231291562388822, + "language_loss": 0.7796188, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.80136281, + "num_input_tokens_seen": 107539285, + "step": 4998, + "time_per_iteration": 2.4837443828582764 + }, + { + "auxiliary_loss_clip": 0.01135992, + "auxiliary_loss_mlp": 0.01042176, + "balance_loss_clip": 1.05157721, + "balance_loss_mlp": 1.02689362, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 1.7507611900988742, + "language_loss": 0.73029566, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75207734, + "num_input_tokens_seen": 107560260, + "step": 4999, + "time_per_iteration": 2.482285737991333 + }, + { + "auxiliary_loss_clip": 0.01127361, + "auxiliary_loss_mlp": 0.0104265, + "balance_loss_clip": 1.04840231, + "balance_loss_mlp": 1.02654457, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.7252286233426317, + "language_loss": 0.75548553, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.77718562, + "num_input_tokens_seen": 107579260, + "step": 5000, + "time_per_iteration": 2.472318649291992 + }, + { + "auxiliary_loss_clip": 0.01134743, + "auxiliary_loss_mlp": 0.01041564, + "balance_loss_clip": 1.04956388, + "balance_loss_mlp": 1.0274024, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.8025304270448066, + "language_loss": 0.82174647, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.84350955, + "num_input_tokens_seen": 107595245, + "step": 5001, + "time_per_iteration": 2.4110982418060303 + }, + { + "auxiliary_loss_clip": 0.01126196, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.05058146, + "balance_loss_mlp": 1.02982819, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.6976251400224927, + "language_loss": 0.80829048, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.83001256, + "num_input_tokens_seen": 107613985, + "step": 5002, + "time_per_iteration": 2.4938101768493652 + }, + { + "auxiliary_loss_clip": 0.01089451, + "auxiliary_loss_mlp": 0.01041605, + "balance_loss_clip": 1.05266523, + "balance_loss_mlp": 1.02323461, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 1.8980960854875124, + "language_loss": 0.70817143, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.729482, + "num_input_tokens_seen": 107631435, + "step": 5003, + "time_per_iteration": 2.572446584701538 + }, + { + "auxiliary_loss_clip": 0.01104864, + "auxiliary_loss_mlp": 0.01044812, + "balance_loss_clip": 1.04621911, + "balance_loss_mlp": 1.02867079, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 2.3351197717296683, + "language_loss": 0.70021564, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72171235, + "num_input_tokens_seen": 107650530, + "step": 5004, + "time_per_iteration": 2.597621440887451 + }, + { + "auxiliary_loss_clip": 0.01116068, + "auxiliary_loss_mlp": 0.01042147, + "balance_loss_clip": 1.04942274, + "balance_loss_mlp": 1.0262326, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 2.671529055966598, + "language_loss": 0.82086205, + "learning_rate": 3.278217882782715e-06, + "loss": 0.84244418, + "num_input_tokens_seen": 107662240, + "step": 5005, + "time_per_iteration": 2.4591000080108643 + }, + { + "auxiliary_loss_clip": 0.0112328, + "auxiliary_loss_mlp": 0.0103599, + "balance_loss_clip": 1.04755902, + "balance_loss_mlp": 1.02119589, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.5033797960099102, + "language_loss": 0.7476567, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.76924944, + "num_input_tokens_seen": 107680330, + "step": 5006, + "time_per_iteration": 2.5223889350891113 + }, + { + "auxiliary_loss_clip": 0.01095068, + "auxiliary_loss_mlp": 0.00791997, + "balance_loss_clip": 1.04681659, + "balance_loss_mlp": 1.00850153, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 1.8442660899411236, + "language_loss": 0.70984352, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.72871411, + "num_input_tokens_seen": 107700020, + "step": 5007, + "time_per_iteration": 2.5825209617614746 + }, + { + "auxiliary_loss_clip": 0.01123255, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.0475539, + "balance_loss_mlp": 1.02313316, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.435830217006068, + "language_loss": 0.7587499, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.7803818, + "num_input_tokens_seen": 107718575, + "step": 5008, + "time_per_iteration": 2.5135397911071777 + }, + { + "auxiliary_loss_clip": 0.01122428, + "auxiliary_loss_mlp": 0.01037852, + "balance_loss_clip": 1.05083084, + "balance_loss_mlp": 1.02185404, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.8134224701260682, + "language_loss": 0.84954768, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.87115049, + "num_input_tokens_seen": 107738635, + "step": 5009, + "time_per_iteration": 2.5063834190368652 + }, + { + "auxiliary_loss_clip": 0.01127798, + "auxiliary_loss_mlp": 0.01039477, + "balance_loss_clip": 1.04699087, + "balance_loss_mlp": 1.0223701, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 1.9780798275027232, + "language_loss": 0.83691001, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85858279, + "num_input_tokens_seen": 107753415, + "step": 5010, + "time_per_iteration": 2.49542236328125 + }, + { + "auxiliary_loss_clip": 0.01101936, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.0454936, + "balance_loss_mlp": 1.01976514, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 3.9079167958940237, + "language_loss": 0.84785831, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.86922294, + "num_input_tokens_seen": 107773840, + "step": 5011, + "time_per_iteration": 2.5702877044677734 + }, + { + "auxiliary_loss_clip": 0.01114302, + "auxiliary_loss_mlp": 0.01038658, + "balance_loss_clip": 1.04278541, + "balance_loss_mlp": 1.02237439, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 1.895589823278429, + "language_loss": 0.71950364, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74103326, + "num_input_tokens_seen": 107792020, + "step": 5012, + "time_per_iteration": 2.524195432662964 + }, + { + "auxiliary_loss_clip": 0.01124124, + "auxiliary_loss_mlp": 0.01039351, + "balance_loss_clip": 1.04910159, + "balance_loss_mlp": 1.02430725, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 2.1164283540766493, + "language_loss": 0.87311447, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89474922, + "num_input_tokens_seen": 107809595, + "step": 5013, + "time_per_iteration": 2.4812512397766113 + }, + { + "auxiliary_loss_clip": 0.01107591, + "auxiliary_loss_mlp": 0.01047325, + "balance_loss_clip": 1.04383719, + "balance_loss_mlp": 1.02770281, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 2.8376924339131664, + "language_loss": 0.82869661, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.85024571, + "num_input_tokens_seen": 107827230, + "step": 5014, + "time_per_iteration": 3.91120982170105 + }, + { + "auxiliary_loss_clip": 0.01090462, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.045784, + "balance_loss_mlp": 1.02535486, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.5379268162300745, + "language_loss": 0.6837967, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.7051239, + "num_input_tokens_seen": 107847195, + "step": 5015, + "time_per_iteration": 2.5861940383911133 + }, + { + "auxiliary_loss_clip": 0.01108581, + "auxiliary_loss_mlp": 0.0103589, + "balance_loss_clip": 1.04680753, + "balance_loss_mlp": 1.01967168, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 3.4891791683063134, + "language_loss": 0.75034726, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.77179193, + "num_input_tokens_seen": 107866420, + "step": 5016, + "time_per_iteration": 2.540588855743408 + }, + { + "auxiliary_loss_clip": 0.01126164, + "auxiliary_loss_mlp": 0.0103761, + "balance_loss_clip": 1.04735529, + "balance_loss_mlp": 1.02213645, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 2.106984321978422, + "language_loss": 0.65675789, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67839563, + "num_input_tokens_seen": 107889090, + "step": 5017, + "time_per_iteration": 2.5641114711761475 + }, + { + "auxiliary_loss_clip": 0.01097453, + "auxiliary_loss_mlp": 0.01051742, + "balance_loss_clip": 1.04973161, + "balance_loss_mlp": 1.03409863, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 2.0157483127875184, + "language_loss": 0.68887591, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.7103678, + "num_input_tokens_seen": 107907520, + "step": 5018, + "time_per_iteration": 3.923494577407837 + }, + { + "auxiliary_loss_clip": 0.01129174, + "auxiliary_loss_mlp": 0.01044044, + "balance_loss_clip": 1.04529071, + "balance_loss_mlp": 1.03010893, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 2.1578167382580227, + "language_loss": 0.79096758, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81269979, + "num_input_tokens_seen": 107925650, + "step": 5019, + "time_per_iteration": 2.474189281463623 + }, + { + "auxiliary_loss_clip": 0.01107469, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.04696059, + "balance_loss_mlp": 1.02582777, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 1.995824270528882, + "language_loss": 0.69660342, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.71808898, + "num_input_tokens_seen": 107943975, + "step": 5020, + "time_per_iteration": 2.529404640197754 + }, + { + "auxiliary_loss_clip": 0.01137148, + "auxiliary_loss_mlp": 0.01044588, + "balance_loss_clip": 1.04687035, + "balance_loss_mlp": 1.02954364, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 2.553762834764847, + "language_loss": 0.78794825, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80976564, + "num_input_tokens_seen": 107962950, + "step": 5021, + "time_per_iteration": 3.829279899597168 + }, + { + "auxiliary_loss_clip": 0.01122893, + "auxiliary_loss_mlp": 0.01033362, + "balance_loss_clip": 1.04386508, + "balance_loss_mlp": 1.01852083, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 3.2362313691780247, + "language_loss": 0.76031899, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.78188157, + "num_input_tokens_seen": 107979700, + "step": 5022, + "time_per_iteration": 3.8037078380584717 + }, + { + "auxiliary_loss_clip": 0.01135612, + "auxiliary_loss_mlp": 0.01041764, + "balance_loss_clip": 1.0463872, + "balance_loss_mlp": 1.02574229, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 2.4269824877013972, + "language_loss": 0.69675899, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.71853268, + "num_input_tokens_seen": 107996645, + "step": 5023, + "time_per_iteration": 2.4245004653930664 + }, + { + "auxiliary_loss_clip": 0.0111028, + "auxiliary_loss_mlp": 0.0103742, + "balance_loss_clip": 1.04728556, + "balance_loss_mlp": 1.0223875, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 1.9877278401528358, + "language_loss": 0.71320581, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.7346828, + "num_input_tokens_seen": 108015020, + "step": 5024, + "time_per_iteration": 2.506767511367798 + }, + { + "auxiliary_loss_clip": 0.01120946, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.04641795, + "balance_loss_mlp": 1.02581358, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 1.7897725998702476, + "language_loss": 0.74343699, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76505935, + "num_input_tokens_seen": 108036430, + "step": 5025, + "time_per_iteration": 2.5245823860168457 + }, + { + "auxiliary_loss_clip": 0.01122714, + "auxiliary_loss_mlp": 0.01040391, + "balance_loss_clip": 1.04840565, + "balance_loss_mlp": 1.02568614, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.9753713599213931, + "language_loss": 0.67020118, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.69183218, + "num_input_tokens_seen": 108054250, + "step": 5026, + "time_per_iteration": 2.499821901321411 + }, + { + "auxiliary_loss_clip": 0.01124726, + "auxiliary_loss_mlp": 0.0104079, + "balance_loss_clip": 1.04773068, + "balance_loss_mlp": 1.02560258, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.7883321615784689, + "language_loss": 0.85508227, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87673748, + "num_input_tokens_seen": 108071495, + "step": 5027, + "time_per_iteration": 2.488584280014038 + }, + { + "auxiliary_loss_clip": 0.01102362, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.05059242, + "balance_loss_mlp": 1.02606213, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.655398840970718, + "language_loss": 0.79208744, + "learning_rate": 3.271315635661351e-06, + "loss": 0.8135134, + "num_input_tokens_seen": 108092135, + "step": 5028, + "time_per_iteration": 2.5646305084228516 + }, + { + "auxiliary_loss_clip": 0.01108867, + "auxiliary_loss_mlp": 0.01044683, + "balance_loss_clip": 1.04677677, + "balance_loss_mlp": 1.02911472, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 1.74509112611865, + "language_loss": 0.77176285, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.79329836, + "num_input_tokens_seen": 108112945, + "step": 5029, + "time_per_iteration": 2.638576030731201 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01043424, + "balance_loss_clip": 1.04503465, + "balance_loss_mlp": 1.02637756, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 1.8270087008066194, + "language_loss": 0.8208282, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.8422997, + "num_input_tokens_seen": 108130325, + "step": 5030, + "time_per_iteration": 2.5418989658355713 + }, + { + "auxiliary_loss_clip": 0.01087978, + "auxiliary_loss_mlp": 0.00792286, + "balance_loss_clip": 1.04779077, + "balance_loss_mlp": 1.01254249, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 1.5971244985502155, + "language_loss": 0.69760072, + "learning_rate": 3.270413459468905e-06, + "loss": 0.71640337, + "num_input_tokens_seen": 108150300, + "step": 5031, + "time_per_iteration": 2.6196177005767822 + }, + { + "auxiliary_loss_clip": 0.01120808, + "auxiliary_loss_mlp": 0.01037504, + "balance_loss_clip": 1.05005407, + "balance_loss_mlp": 1.02197075, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.9695219055555078, + "language_loss": 0.82472879, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84631193, + "num_input_tokens_seen": 108170330, + "step": 5032, + "time_per_iteration": 2.505718946456909 + }, + { + "auxiliary_loss_clip": 0.01101418, + "auxiliary_loss_mlp": 0.01051626, + "balance_loss_clip": 1.05340934, + "balance_loss_mlp": 1.03347087, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.8175084237099606, + "language_loss": 0.73096377, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75249422, + "num_input_tokens_seen": 108191265, + "step": 5033, + "time_per_iteration": 2.608933210372925 + }, + { + "auxiliary_loss_clip": 0.01119856, + "auxiliary_loss_mlp": 0.01048618, + "balance_loss_clip": 1.04748499, + "balance_loss_mlp": 1.0324651, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.469774908239953, + "language_loss": 0.73938739, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76107216, + "num_input_tokens_seen": 108211615, + "step": 5034, + "time_per_iteration": 2.516589879989624 + }, + { + "auxiliary_loss_clip": 0.01134623, + "auxiliary_loss_mlp": 0.01035083, + "balance_loss_clip": 1.04735935, + "balance_loss_mlp": 1.01980042, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.6855038005244924, + "language_loss": 0.71866202, + "learning_rate": 3.269209883493352e-06, + "loss": 0.74035919, + "num_input_tokens_seen": 108231080, + "step": 5035, + "time_per_iteration": 2.5019819736480713 + }, + { + "auxiliary_loss_clip": 0.01119043, + "auxiliary_loss_mlp": 0.01037384, + "balance_loss_clip": 1.04488468, + "balance_loss_mlp": 1.02359128, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 10.817542083870004, + "language_loss": 0.87663853, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89820278, + "num_input_tokens_seen": 108251125, + "step": 5036, + "time_per_iteration": 2.525693655014038 + }, + { + "auxiliary_loss_clip": 0.01093018, + "auxiliary_loss_mlp": 0.01047245, + "balance_loss_clip": 1.04703236, + "balance_loss_mlp": 1.03094947, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 1.588445482514792, + "language_loss": 0.77619648, + "learning_rate": 3.268607806688536e-06, + "loss": 0.7975992, + "num_input_tokens_seen": 108272545, + "step": 5037, + "time_per_iteration": 2.571694850921631 + }, + { + "auxiliary_loss_clip": 0.01104658, + "auxiliary_loss_mlp": 0.01046705, + "balance_loss_clip": 1.05111504, + "balance_loss_mlp": 1.03108907, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 4.076112072211769, + "language_loss": 0.77324855, + "learning_rate": 3.268306696121816e-06, + "loss": 0.79476219, + "num_input_tokens_seen": 108289725, + "step": 5038, + "time_per_iteration": 2.5026886463165283 + }, + { + "auxiliary_loss_clip": 0.01110972, + "auxiliary_loss_mlp": 0.0103866, + "balance_loss_clip": 1.04870868, + "balance_loss_mlp": 1.02365112, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 2.3488857310617965, + "language_loss": 0.74242091, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76391727, + "num_input_tokens_seen": 108310690, + "step": 5039, + "time_per_iteration": 2.5622355937957764 + }, + { + "auxiliary_loss_clip": 0.0113231, + "auxiliary_loss_mlp": 0.00789317, + "balance_loss_clip": 1.04859507, + "balance_loss_mlp": 1.00950956, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 1.7824040690755476, + "language_loss": 0.79765821, + "learning_rate": 3.267704330716847e-06, + "loss": 0.8168745, + "num_input_tokens_seen": 108328905, + "step": 5040, + "time_per_iteration": 2.4578700065612793 + }, + { + "auxiliary_loss_clip": 0.01111208, + "auxiliary_loss_mlp": 0.0104063, + "balance_loss_clip": 1.05015945, + "balance_loss_mlp": 1.02638423, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 2.9039556321710083, + "language_loss": 0.82131392, + "learning_rate": 3.267403075901438e-06, + "loss": 0.84283227, + "num_input_tokens_seen": 108346680, + "step": 5041, + "time_per_iteration": 2.533137798309326 + }, + { + "auxiliary_loss_clip": 0.01027592, + "auxiliary_loss_mlp": 0.01005447, + "balance_loss_clip": 1.03622508, + "balance_loss_mlp": 1.00365877, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7545294962064434, + "language_loss": 0.59510523, + "learning_rate": 3.267101773025978e-06, + "loss": 0.6154356, + "num_input_tokens_seen": 108413885, + "step": 5042, + "time_per_iteration": 3.2799031734466553 + }, + { + "auxiliary_loss_clip": 0.01138036, + "auxiliary_loss_mlp": 0.01036116, + "balance_loss_clip": 1.04932892, + "balance_loss_mlp": 1.02099395, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.7929005585552833, + "language_loss": 0.7137714, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73551291, + "num_input_tokens_seen": 108433640, + "step": 5043, + "time_per_iteration": 2.486018657684326 + }, + { + "auxiliary_loss_clip": 0.01090154, + "auxiliary_loss_mlp": 0.01032985, + "balance_loss_clip": 1.04562855, + "balance_loss_mlp": 1.01825702, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 1.759179725413549, + "language_loss": 0.69137061, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71260202, + "num_input_tokens_seen": 108452640, + "step": 5044, + "time_per_iteration": 2.5620553493499756 + }, + { + "auxiliary_loss_clip": 0.01121967, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_clip": 1.04672647, + "balance_loss_mlp": 1.02316999, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.3922843749442546, + "language_loss": 0.77247328, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79407954, + "num_input_tokens_seen": 108472470, + "step": 5045, + "time_per_iteration": 2.49916934967041 + }, + { + "auxiliary_loss_clip": 0.01133578, + "auxiliary_loss_mlp": 0.0079288, + "balance_loss_clip": 1.04670262, + "balance_loss_mlp": 1.01428211, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.5981280459189566, + "language_loss": 0.7277652, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74702978, + "num_input_tokens_seen": 108493025, + "step": 5046, + "time_per_iteration": 2.4955668449401855 + }, + { + "auxiliary_loss_clip": 0.01129494, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.04668891, + "balance_loss_mlp": 1.02180791, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 2.022494985953209, + "language_loss": 0.80976367, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.83145273, + "num_input_tokens_seen": 108513480, + "step": 5047, + "time_per_iteration": 2.5077357292175293 + }, + { + "auxiliary_loss_clip": 0.01076565, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.04660368, + "balance_loss_mlp": 1.02864122, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 1.74240832908338, + "language_loss": 0.72610414, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74730587, + "num_input_tokens_seen": 108533155, + "step": 5048, + "time_per_iteration": 2.605783700942993 + }, + { + "auxiliary_loss_clip": 0.01109763, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.04464102, + "balance_loss_mlp": 1.01777101, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 2.109818493952084, + "language_loss": 0.75320429, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.77462137, + "num_input_tokens_seen": 108551900, + "step": 5049, + "time_per_iteration": 2.5007612705230713 + }, + { + "auxiliary_loss_clip": 0.01123869, + "auxiliary_loss_mlp": 0.01035761, + "balance_loss_clip": 1.0444746, + "balance_loss_mlp": 1.02084827, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.6689503744029792, + "language_loss": 0.81846428, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84006059, + "num_input_tokens_seen": 108574005, + "step": 5050, + "time_per_iteration": 2.531283378601074 + }, + { + "auxiliary_loss_clip": 0.01097535, + "auxiliary_loss_mlp": 0.01038065, + "balance_loss_clip": 1.04568982, + "balance_loss_mlp": 1.02179301, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.1983781286528616, + "language_loss": 0.7373625, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.75871855, + "num_input_tokens_seen": 108592715, + "step": 5051, + "time_per_iteration": 2.5411248207092285 + }, + { + "auxiliary_loss_clip": 0.01075062, + "auxiliary_loss_mlp": 0.00792748, + "balance_loss_clip": 1.04527879, + "balance_loss_mlp": 1.01166964, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 1.6831582637027438, + "language_loss": 0.76367122, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78234935, + "num_input_tokens_seen": 108611770, + "step": 5052, + "time_per_iteration": 2.618191957473755 + }, + { + "auxiliary_loss_clip": 0.01135883, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.04660511, + "balance_loss_mlp": 1.02824903, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 2.8262253379026636, + "language_loss": 0.82604003, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.84783787, + "num_input_tokens_seen": 108629070, + "step": 5053, + "time_per_iteration": 3.827930450439453 + }, + { + "auxiliary_loss_clip": 0.0111127, + "auxiliary_loss_mlp": 0.01037155, + "balance_loss_clip": 1.04815972, + "balance_loss_mlp": 1.02140749, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.5932889222189386, + "language_loss": 0.70945603, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.73094028, + "num_input_tokens_seen": 108646315, + "step": 5054, + "time_per_iteration": 2.5225436687469482 + }, + { + "auxiliary_loss_clip": 0.01134548, + "auxiliary_loss_mlp": 0.01039848, + "balance_loss_clip": 1.04748714, + "balance_loss_mlp": 1.02410674, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 2.0166861590588443, + "language_loss": 0.69648457, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71822852, + "num_input_tokens_seen": 108665920, + "step": 5055, + "time_per_iteration": 2.4945602416992188 + }, + { + "auxiliary_loss_clip": 0.01112124, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.05117643, + "balance_loss_mlp": 1.02093148, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 1.7982401703821733, + "language_loss": 0.68174469, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.70323837, + "num_input_tokens_seen": 108683485, + "step": 5056, + "time_per_iteration": 3.88393235206604 + }, + { + "auxiliary_loss_clip": 0.01109559, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.04798222, + "balance_loss_mlp": 1.02721667, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.895815421600517, + "language_loss": 0.82551587, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84703624, + "num_input_tokens_seen": 108702700, + "step": 5057, + "time_per_iteration": 2.5427966117858887 + }, + { + "auxiliary_loss_clip": 0.0110859, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.04384065, + "balance_loss_mlp": 1.0211488, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 1.695535926361367, + "language_loss": 0.8886981, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91015029, + "num_input_tokens_seen": 108721860, + "step": 5058, + "time_per_iteration": 2.544731616973877 + }, + { + "auxiliary_loss_clip": 0.01099867, + "auxiliary_loss_mlp": 0.0103833, + "balance_loss_clip": 1.04681528, + "balance_loss_mlp": 1.0227437, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 1.912162141114911, + "language_loss": 0.71224678, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73362875, + "num_input_tokens_seen": 108743215, + "step": 5059, + "time_per_iteration": 3.988891839981079 + }, + { + "auxiliary_loss_clip": 0.01075123, + "auxiliary_loss_mlp": 0.01036582, + "balance_loss_clip": 1.04522324, + "balance_loss_mlp": 1.02160943, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 1.5256326920730554, + "language_loss": 0.72842723, + "learning_rate": 3.26167011603268e-06, + "loss": 0.74954426, + "num_input_tokens_seen": 108765505, + "step": 5060, + "time_per_iteration": 4.0895915031433105 + }, + { + "auxiliary_loss_clip": 0.0113517, + "auxiliary_loss_mlp": 0.01036365, + "balance_loss_clip": 1.04798388, + "balance_loss_mlp": 1.02126122, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 2.0403968921607216, + "language_loss": 0.76854384, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79025924, + "num_input_tokens_seen": 108783370, + "step": 5061, + "time_per_iteration": 2.4672601222991943 + }, + { + "auxiliary_loss_clip": 0.01101666, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.0544219, + "balance_loss_mlp": 1.02262068, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 4.613734378002912, + "language_loss": 0.81691206, + "learning_rate": 3.261065640514415e-06, + "loss": 0.83832258, + "num_input_tokens_seen": 108797430, + "step": 5062, + "time_per_iteration": 2.538012981414795 + }, + { + "auxiliary_loss_clip": 0.01128933, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.0445447, + "balance_loss_mlp": 1.01490736, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 2.415443361972339, + "language_loss": 0.74785197, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76943111, + "num_input_tokens_seen": 108816945, + "step": 5063, + "time_per_iteration": 2.493812084197998 + }, + { + "auxiliary_loss_clip": 0.01122729, + "auxiliary_loss_mlp": 0.00793142, + "balance_loss_clip": 1.0479188, + "balance_loss_mlp": 1.0148586, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.5590367185508835, + "language_loss": 0.84223461, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86139333, + "num_input_tokens_seen": 108836615, + "step": 5064, + "time_per_iteration": 2.545823574066162 + }, + { + "auxiliary_loss_clip": 0.01123941, + "auxiliary_loss_mlp": 0.01035402, + "balance_loss_clip": 1.04884815, + "balance_loss_mlp": 1.01945138, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 2.152686934158601, + "language_loss": 0.7569294, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.77852273, + "num_input_tokens_seen": 108855165, + "step": 5065, + "time_per_iteration": 2.475019693374634 + }, + { + "auxiliary_loss_clip": 0.01111469, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.04949522, + "balance_loss_mlp": 1.02088904, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 2.130802735175459, + "language_loss": 0.62170458, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64318681, + "num_input_tokens_seen": 108874690, + "step": 5066, + "time_per_iteration": 2.6204943656921387 + }, + { + "auxiliary_loss_clip": 0.01116099, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.05782771, + "balance_loss_mlp": 1.03217566, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 1.9852801388067143, + "language_loss": 0.82529271, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.84693301, + "num_input_tokens_seen": 108893140, + "step": 5067, + "time_per_iteration": 2.5068275928497314 + }, + { + "auxiliary_loss_clip": 0.01134223, + "auxiliary_loss_mlp": 0.01038643, + "balance_loss_clip": 1.04961383, + "balance_loss_mlp": 1.02350366, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 1.845595786188185, + "language_loss": 0.63034922, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65207791, + "num_input_tokens_seen": 108911880, + "step": 5068, + "time_per_iteration": 2.4847700595855713 + }, + { + "auxiliary_loss_clip": 0.01126455, + "auxiliary_loss_mlp": 0.01033221, + "balance_loss_clip": 1.04755592, + "balance_loss_mlp": 1.01761651, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.6034764769250842, + "language_loss": 0.74536997, + "learning_rate": 3.258948470480793e-06, + "loss": 0.7669667, + "num_input_tokens_seen": 108930440, + "step": 5069, + "time_per_iteration": 2.4868061542510986 + }, + { + "auxiliary_loss_clip": 0.01106698, + "auxiliary_loss_mlp": 0.01045708, + "balance_loss_clip": 1.04889965, + "balance_loss_mlp": 1.03099179, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 2.909800978394936, + "language_loss": 0.76006079, + "learning_rate": 3.258645826569261e-06, + "loss": 0.78158474, + "num_input_tokens_seen": 108949125, + "step": 5070, + "time_per_iteration": 2.5581939220428467 + }, + { + "auxiliary_loss_clip": 0.01138893, + "auxiliary_loss_mlp": 0.00791519, + "balance_loss_clip": 1.04957914, + "balance_loss_mlp": 1.01184595, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.6507559660289683, + "language_loss": 0.81460655, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83391058, + "num_input_tokens_seen": 108972190, + "step": 5071, + "time_per_iteration": 2.52705979347229 + }, + { + "auxiliary_loss_clip": 0.0111047, + "auxiliary_loss_mlp": 0.01044877, + "balance_loss_clip": 1.05056286, + "balance_loss_mlp": 1.02825928, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.6053323569643994, + "language_loss": 0.7587418, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78029525, + "num_input_tokens_seen": 108990325, + "step": 5072, + "time_per_iteration": 2.5522186756134033 + }, + { + "auxiliary_loss_clip": 0.01100834, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_clip": 1.05258882, + "balance_loss_mlp": 1.03095388, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 1.9747044809637038, + "language_loss": 0.71797055, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73945069, + "num_input_tokens_seen": 109009505, + "step": 5073, + "time_per_iteration": 2.545243978500366 + }, + { + "auxiliary_loss_clip": 0.01131541, + "auxiliary_loss_mlp": 0.01048105, + "balance_loss_clip": 1.0511663, + "balance_loss_mlp": 1.03203571, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.0679825755195047, + "language_loss": 0.7684505, + "learning_rate": 3.257434773758163e-06, + "loss": 0.79024696, + "num_input_tokens_seen": 109026350, + "step": 5074, + "time_per_iteration": 2.462851047515869 + }, + { + "auxiliary_loss_clip": 0.01116082, + "auxiliary_loss_mlp": 0.01032161, + "balance_loss_clip": 1.05245876, + "balance_loss_mlp": 1.01736689, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 2.170565671155647, + "language_loss": 0.74483377, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76631618, + "num_input_tokens_seen": 109044165, + "step": 5075, + "time_per_iteration": 2.5491833686828613 + }, + { + "auxiliary_loss_clip": 0.0113848, + "auxiliary_loss_mlp": 0.01046444, + "balance_loss_clip": 1.04955614, + "balance_loss_mlp": 1.02887249, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.141357883454221, + "language_loss": 0.75773489, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77958411, + "num_input_tokens_seen": 109060665, + "step": 5076, + "time_per_iteration": 2.4443511962890625 + }, + { + "auxiliary_loss_clip": 0.0112506, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.05398703, + "balance_loss_mlp": 1.02194643, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.3949293248851937, + "language_loss": 0.79628909, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81791461, + "num_input_tokens_seen": 109080035, + "step": 5077, + "time_per_iteration": 2.523547649383545 + }, + { + "auxiliary_loss_clip": 0.01090009, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.05000806, + "balance_loss_mlp": 1.02030575, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.5383924912404239, + "language_loss": 0.74878854, + "learning_rate": 3.256222958034259e-06, + "loss": 0.77003813, + "num_input_tokens_seen": 109097385, + "step": 5078, + "time_per_iteration": 2.5725393295288086 + }, + { + "auxiliary_loss_clip": 0.01087735, + "auxiliary_loss_mlp": 0.0105293, + "balance_loss_clip": 1.05243587, + "balance_loss_mlp": 1.03696179, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 1.9493642566642697, + "language_loss": 0.66645539, + "learning_rate": 3.255919884984307e-06, + "loss": 0.68786204, + "num_input_tokens_seen": 109115495, + "step": 5079, + "time_per_iteration": 2.5597622394561768 + }, + { + "auxiliary_loss_clip": 0.01125055, + "auxiliary_loss_mlp": 0.01037471, + "balance_loss_clip": 1.04903173, + "balance_loss_mlp": 1.02276051, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 1.9441253088566703, + "language_loss": 0.79768336, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.81930864, + "num_input_tokens_seen": 109134235, + "step": 5080, + "time_per_iteration": 2.509190082550049 + }, + { + "auxiliary_loss_clip": 0.01124915, + "auxiliary_loss_mlp": 0.00791136, + "balance_loss_clip": 1.05104709, + "balance_loss_mlp": 1.01269221, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 2.8986846170070724, + "language_loss": 0.81030858, + "learning_rate": 3.255313596022074e-06, + "loss": 0.82946908, + "num_input_tokens_seen": 109152760, + "step": 5081, + "time_per_iteration": 2.529574155807495 + }, + { + "auxiliary_loss_clip": 0.01121627, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_clip": 1.05218542, + "balance_loss_mlp": 1.02803993, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.6520753483990296, + "language_loss": 0.71927911, + "learning_rate": 3.255010380132783e-06, + "loss": 0.74092585, + "num_input_tokens_seen": 109173925, + "step": 5082, + "time_per_iteration": 2.562056541442871 + }, + { + "auxiliary_loss_clip": 0.01125385, + "auxiliary_loss_mlp": 0.01036816, + "balance_loss_clip": 1.04792964, + "balance_loss_mlp": 1.02022195, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.8211269826107046, + "language_loss": 0.72992283, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75154483, + "num_input_tokens_seen": 109192510, + "step": 5083, + "time_per_iteration": 2.5121631622314453 + }, + { + "auxiliary_loss_clip": 0.01107363, + "auxiliary_loss_mlp": 0.00791623, + "balance_loss_clip": 1.04900575, + "balance_loss_mlp": 1.00923359, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 1.7299783538864115, + "language_loss": 0.70932376, + "learning_rate": 3.254403805595344e-06, + "loss": 0.72831368, + "num_input_tokens_seen": 109210885, + "step": 5084, + "time_per_iteration": 2.52413010597229 + }, + { + "auxiliary_loss_clip": 0.01100134, + "auxiliary_loss_mlp": 0.0104252, + "balance_loss_clip": 1.04889369, + "balance_loss_mlp": 1.02448392, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 1.9101004536408241, + "language_loss": 0.78309613, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80452263, + "num_input_tokens_seen": 109229180, + "step": 5085, + "time_per_iteration": 2.5433623790740967 + }, + { + "auxiliary_loss_clip": 0.0112963, + "auxiliary_loss_mlp": 0.01035988, + "balance_loss_clip": 1.0462749, + "balance_loss_mlp": 1.02143264, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.566912184449944, + "language_loss": 0.78487313, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80652928, + "num_input_tokens_seen": 109249510, + "step": 5086, + "time_per_iteration": 2.4827637672424316 + }, + { + "auxiliary_loss_clip": 0.01105398, + "auxiliary_loss_mlp": 0.01045685, + "balance_loss_clip": 1.05138218, + "balance_loss_mlp": 1.02816749, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 2.239845278641564, + "language_loss": 0.76728064, + "learning_rate": 3.253493587064563e-06, + "loss": 0.78879148, + "num_input_tokens_seen": 109268200, + "step": 5087, + "time_per_iteration": 2.527249336242676 + }, + { + "auxiliary_loss_clip": 0.01122796, + "auxiliary_loss_mlp": 0.01040038, + "balance_loss_clip": 1.04530907, + "balance_loss_mlp": 1.02387357, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 1.9662993075073387, + "language_loss": 0.72105432, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.7426827, + "num_input_tokens_seen": 109288370, + "step": 5088, + "time_per_iteration": 2.5317161083221436 + }, + { + "auxiliary_loss_clip": 0.01124637, + "auxiliary_loss_mlp": 0.01038311, + "balance_loss_clip": 1.04650855, + "balance_loss_mlp": 1.02218151, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.2247932546184495, + "language_loss": 0.79311979, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81474924, + "num_input_tokens_seen": 109306730, + "step": 5089, + "time_per_iteration": 2.4795587062835693 + }, + { + "auxiliary_loss_clip": 0.01110333, + "auxiliary_loss_mlp": 0.01039859, + "balance_loss_clip": 1.05140638, + "balance_loss_mlp": 1.02414107, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 1.7530407327365238, + "language_loss": 0.77307272, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79457462, + "num_input_tokens_seen": 109327360, + "step": 5090, + "time_per_iteration": 2.5709004402160645 + }, + { + "auxiliary_loss_clip": 0.01114679, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_clip": 1.04628253, + "balance_loss_mlp": 1.03403854, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.8669347185967908, + "language_loss": 0.7638979, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78554583, + "num_input_tokens_seen": 109348135, + "step": 5091, + "time_per_iteration": 4.047200918197632 + }, + { + "auxiliary_loss_clip": 0.01068696, + "auxiliary_loss_mlp": 0.01043239, + "balance_loss_clip": 1.04349279, + "balance_loss_mlp": 1.02540553, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 1.8826241953480907, + "language_loss": 0.71840894, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.73952836, + "num_input_tokens_seen": 109366220, + "step": 5092, + "time_per_iteration": 2.656700849533081 + }, + { + "auxiliary_loss_clip": 0.01111082, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.04739928, + "balance_loss_mlp": 1.02256536, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 1.7260120045400271, + "language_loss": 0.82041371, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84189373, + "num_input_tokens_seen": 109385260, + "step": 5093, + "time_per_iteration": 2.5346693992614746 + }, + { + "auxiliary_loss_clip": 0.01133573, + "auxiliary_loss_mlp": 0.00790223, + "balance_loss_clip": 1.0484128, + "balance_loss_mlp": 1.01192605, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 1.8713740157555094, + "language_loss": 0.75356257, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.77280056, + "num_input_tokens_seen": 109405025, + "step": 5094, + "time_per_iteration": 2.4772167205810547 + }, + { + "auxiliary_loss_clip": 0.01112545, + "auxiliary_loss_mlp": 0.01039696, + "balance_loss_clip": 1.05037808, + "balance_loss_mlp": 1.02527213, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 2.1314616778106443, + "language_loss": 0.76144707, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78296947, + "num_input_tokens_seen": 109422465, + "step": 5095, + "time_per_iteration": 3.911518096923828 + }, + { + "auxiliary_loss_clip": 0.01120638, + "auxiliary_loss_mlp": 0.01043504, + "balance_loss_clip": 1.04719055, + "balance_loss_mlp": 1.02860284, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 1.6541401228138783, + "language_loss": 0.80679047, + "learning_rate": 3.250760365955042e-06, + "loss": 0.8284319, + "num_input_tokens_seen": 109440575, + "step": 5096, + "time_per_iteration": 2.4915823936462402 + }, + { + "auxiliary_loss_clip": 0.01125517, + "auxiliary_loss_mlp": 0.01035554, + "balance_loss_clip": 1.04893947, + "balance_loss_mlp": 1.02078414, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 1.9411747073422465, + "language_loss": 0.81598175, + "learning_rate": 3.250456437422258e-06, + "loss": 0.83759242, + "num_input_tokens_seen": 109459050, + "step": 5097, + "time_per_iteration": 2.465376615524292 + }, + { + "auxiliary_loss_clip": 0.01135781, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_clip": 1.04894066, + "balance_loss_mlp": 1.02482486, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 1.93054872045021, + "language_loss": 0.77965021, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80142295, + "num_input_tokens_seen": 109475860, + "step": 5098, + "time_per_iteration": 5.191356897354126 + }, + { + "auxiliary_loss_clip": 0.01096508, + "auxiliary_loss_mlp": 0.01038897, + "balance_loss_clip": 1.05094504, + "balance_loss_mlp": 1.02330399, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 1.806936067939058, + "language_loss": 0.84155428, + "learning_rate": 3.249848438115917e-06, + "loss": 0.86290836, + "num_input_tokens_seen": 109494760, + "step": 5099, + "time_per_iteration": 2.616321086883545 + }, + { + "auxiliary_loss_clip": 0.01137024, + "auxiliary_loss_mlp": 0.01042706, + "balance_loss_clip": 1.04791307, + "balance_loss_mlp": 1.0272032, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 1.5669882924396057, + "language_loss": 0.85359192, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87538922, + "num_input_tokens_seen": 109516480, + "step": 5100, + "time_per_iteration": 2.5169858932495117 + }, + { + "auxiliary_loss_clip": 0.01099296, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.04454994, + "balance_loss_mlp": 1.01904178, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 1.6930061357631445, + "language_loss": 0.78801191, + "learning_rate": 3.249240249232065e-06, + "loss": 0.80936015, + "num_input_tokens_seen": 109534615, + "step": 5101, + "time_per_iteration": 2.5151965618133545 + }, + { + "auxiliary_loss_clip": 0.01102271, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.05182481, + "balance_loss_mlp": 1.02558827, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.8354918150005812, + "language_loss": 0.79608208, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.81753576, + "num_input_tokens_seen": 109554040, + "step": 5102, + "time_per_iteration": 2.5565876960754395 + }, + { + "auxiliary_loss_clip": 0.01141002, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_clip": 1.05367947, + "balance_loss_mlp": 1.02634764, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.8010833685374177, + "language_loss": 0.88698995, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.90883195, + "num_input_tokens_seen": 109574345, + "step": 5103, + "time_per_iteration": 2.4712846279144287 + }, + { + "auxiliary_loss_clip": 0.01120604, + "auxiliary_loss_mlp": 0.01041874, + "balance_loss_clip": 1.05347633, + "balance_loss_mlp": 1.02596498, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 1.6834241723641357, + "language_loss": 0.73778701, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.75941181, + "num_input_tokens_seen": 109593670, + "step": 5104, + "time_per_iteration": 2.5241332054138184 + }, + { + "auxiliary_loss_clip": 0.01124122, + "auxiliary_loss_mlp": 0.00794398, + "balance_loss_clip": 1.04693866, + "balance_loss_mlp": 1.0146991, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 1.8491751783777972, + "language_loss": 0.72913712, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.74832237, + "num_input_tokens_seen": 109613385, + "step": 5105, + "time_per_iteration": 2.522522211074829 + }, + { + "auxiliary_loss_clip": 0.01115083, + "auxiliary_loss_mlp": 0.01041374, + "balance_loss_clip": 1.05421078, + "balance_loss_mlp": 1.02469611, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 1.8758791585399968, + "language_loss": 0.87503719, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.8966018, + "num_input_tokens_seen": 109632395, + "step": 5106, + "time_per_iteration": 2.5765206813812256 + }, + { + "auxiliary_loss_clip": 0.01107248, + "auxiliary_loss_mlp": 0.01048216, + "balance_loss_clip": 1.04846263, + "balance_loss_mlp": 1.03139567, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.151922630430958, + "language_loss": 0.71787798, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73943257, + "num_input_tokens_seen": 109651380, + "step": 5107, + "time_per_iteration": 2.549423933029175 + }, + { + "auxiliary_loss_clip": 0.01100851, + "auxiliary_loss_mlp": 0.01052218, + "balance_loss_clip": 1.04795861, + "balance_loss_mlp": 1.03629148, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 2.1866667587905955, + "language_loss": 0.72334552, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74487627, + "num_input_tokens_seen": 109670240, + "step": 5108, + "time_per_iteration": 2.5428662300109863 + }, + { + "auxiliary_loss_clip": 0.0111299, + "auxiliary_loss_mlp": 0.01038792, + "balance_loss_clip": 1.0524236, + "balance_loss_mlp": 1.02285981, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.5255138991223882, + "language_loss": 0.85776103, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87927878, + "num_input_tokens_seen": 109690810, + "step": 5109, + "time_per_iteration": 2.5429224967956543 + }, + { + "auxiliary_loss_clip": 0.01109672, + "auxiliary_loss_mlp": 0.01035608, + "balance_loss_clip": 1.04936838, + "balance_loss_mlp": 1.0202539, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 2.1514407454316293, + "language_loss": 0.67383152, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69528437, + "num_input_tokens_seen": 109711145, + "step": 5110, + "time_per_iteration": 2.5701611042022705 + }, + { + "auxiliary_loss_clip": 0.01122323, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_clip": 1.04767251, + "balance_loss_mlp": 1.02108884, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.600201760907957, + "language_loss": 0.77334571, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79492408, + "num_input_tokens_seen": 109731425, + "step": 5111, + "time_per_iteration": 2.552011489868164 + }, + { + "auxiliary_loss_clip": 0.01136135, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.04920578, + "balance_loss_mlp": 1.02643371, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 1.7096686040645723, + "language_loss": 0.67159903, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69337714, + "num_input_tokens_seen": 109752720, + "step": 5112, + "time_per_iteration": 2.5191309452056885 + }, + { + "auxiliary_loss_clip": 0.01131201, + "auxiliary_loss_mlp": 0.01042866, + "balance_loss_clip": 1.05047989, + "balance_loss_mlp": 1.02529418, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 1.7431548811247024, + "language_loss": 0.79471433, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81645501, + "num_input_tokens_seen": 109772840, + "step": 5113, + "time_per_iteration": 2.5691511631011963 + }, + { + "auxiliary_loss_clip": 0.01106529, + "auxiliary_loss_mlp": 0.00791205, + "balance_loss_clip": 1.051126, + "balance_loss_mlp": 1.01013219, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 2.056844458065421, + "language_loss": 0.77346069, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.79243803, + "num_input_tokens_seen": 109790150, + "step": 5114, + "time_per_iteration": 2.5351390838623047 + }, + { + "auxiliary_loss_clip": 0.01100895, + "auxiliary_loss_mlp": 0.01039742, + "balance_loss_clip": 1.04842448, + "balance_loss_mlp": 1.02239728, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 1.7859074538728787, + "language_loss": 0.62136102, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64276743, + "num_input_tokens_seen": 109807985, + "step": 5115, + "time_per_iteration": 2.5567257404327393 + }, + { + "auxiliary_loss_clip": 0.01128583, + "auxiliary_loss_mlp": 0.01044244, + "balance_loss_clip": 1.04797733, + "balance_loss_mlp": 1.02856815, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 1.8624379822359596, + "language_loss": 0.82865864, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.85038686, + "num_input_tokens_seen": 109825920, + "step": 5116, + "time_per_iteration": 2.5321226119995117 + }, + { + "auxiliary_loss_clip": 0.01113358, + "auxiliary_loss_mlp": 0.01041353, + "balance_loss_clip": 1.04987669, + "balance_loss_mlp": 1.02555776, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 1.7580104221247757, + "language_loss": 0.76169503, + "learning_rate": 3.244367924446952e-06, + "loss": 0.78324217, + "num_input_tokens_seen": 109846220, + "step": 5117, + "time_per_iteration": 2.5518136024475098 + }, + { + "auxiliary_loss_clip": 0.01096852, + "auxiliary_loss_mlp": 0.01041227, + "balance_loss_clip": 1.0507102, + "balance_loss_mlp": 1.0226903, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 2.507907047117693, + "language_loss": 0.71942323, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.74080408, + "num_input_tokens_seen": 109863870, + "step": 5118, + "time_per_iteration": 2.5447747707366943 + }, + { + "auxiliary_loss_clip": 0.01091868, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.04947543, + "balance_loss_mlp": 1.02165008, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.6267558602890053, + "language_loss": 0.74420536, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76549792, + "num_input_tokens_seen": 109883500, + "step": 5119, + "time_per_iteration": 2.60062313079834 + }, + { + "auxiliary_loss_clip": 0.01128956, + "auxiliary_loss_mlp": 0.01052199, + "balance_loss_clip": 1.04961932, + "balance_loss_mlp": 1.03473532, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 1.8159886058043166, + "language_loss": 0.80395091, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82576251, + "num_input_tokens_seen": 109904620, + "step": 5120, + "time_per_iteration": 2.5535271167755127 + }, + { + "auxiliary_loss_clip": 0.0112246, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.04454899, + "balance_loss_mlp": 1.02459598, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.5933604436822906, + "language_loss": 0.79914868, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82076979, + "num_input_tokens_seen": 109922275, + "step": 5121, + "time_per_iteration": 2.4733738899230957 + }, + { + "auxiliary_loss_clip": 0.01105071, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.05426407, + "balance_loss_mlp": 1.01998281, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.437674598612212, + "language_loss": 0.82692528, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84832978, + "num_input_tokens_seen": 109944265, + "step": 5122, + "time_per_iteration": 2.5651676654815674 + }, + { + "auxiliary_loss_clip": 0.01046516, + "auxiliary_loss_mlp": 0.01004003, + "balance_loss_clip": 1.02295542, + "balance_loss_mlp": 1.00183296, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7403419678183805, + "language_loss": 0.58598638, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60649157, + "num_input_tokens_seen": 110014160, + "step": 5123, + "time_per_iteration": 3.26607346534729 + }, + { + "auxiliary_loss_clip": 0.01131117, + "auxiliary_loss_mlp": 0.00791954, + "balance_loss_clip": 1.04846776, + "balance_loss_mlp": 1.01004481, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 2.131265699194189, + "language_loss": 0.83690172, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85613251, + "num_input_tokens_seen": 110034865, + "step": 5124, + "time_per_iteration": 2.5511813163757324 + }, + { + "auxiliary_loss_clip": 0.01139511, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.04996657, + "balance_loss_mlp": 1.0232718, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 2.002212414980544, + "language_loss": 0.79170126, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.81348538, + "num_input_tokens_seen": 110052930, + "step": 5125, + "time_per_iteration": 2.5060949325561523 + }, + { + "auxiliary_loss_clip": 0.01124322, + "auxiliary_loss_mlp": 0.01042048, + "balance_loss_clip": 1.04877329, + "balance_loss_mlp": 1.02411926, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 1.8925401924383416, + "language_loss": 0.64502287, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66668665, + "num_input_tokens_seen": 110071765, + "step": 5126, + "time_per_iteration": 2.465609073638916 + }, + { + "auxiliary_loss_clip": 0.0108997, + "auxiliary_loss_mlp": 0.01039946, + "balance_loss_clip": 1.04512811, + "balance_loss_mlp": 1.02438879, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.5359451701440519, + "language_loss": 0.86318147, + "learning_rate": 3.241316584201646e-06, + "loss": 0.8844806, + "num_input_tokens_seen": 110092660, + "step": 5127, + "time_per_iteration": 2.6120364665985107 + }, + { + "auxiliary_loss_clip": 0.01088781, + "auxiliary_loss_mlp": 0.0104144, + "balance_loss_clip": 1.04430413, + "balance_loss_mlp": 1.02529883, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 1.751514660264318, + "language_loss": 0.69138956, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.71269178, + "num_input_tokens_seen": 110114960, + "step": 5128, + "time_per_iteration": 2.6283555030822754 + }, + { + "auxiliary_loss_clip": 0.01124886, + "auxiliary_loss_mlp": 0.00790957, + "balance_loss_clip": 1.04735279, + "balance_loss_mlp": 1.00913072, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.9199537416804526, + "language_loss": 0.71370411, + "learning_rate": 3.240705750931993e-06, + "loss": 0.73286253, + "num_input_tokens_seen": 110135750, + "step": 5129, + "time_per_iteration": 2.535334825515747 + }, + { + "auxiliary_loss_clip": 0.01021848, + "auxiliary_loss_mlp": 0.01004726, + "balance_loss_clip": 1.01862228, + "balance_loss_mlp": 1.00237751, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.8308423309343258, + "language_loss": 0.59223545, + "learning_rate": 3.240400263719846e-06, + "loss": 0.6125012, + "num_input_tokens_seen": 110189480, + "step": 5130, + "time_per_iteration": 4.4779157638549805 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01038897, + "balance_loss_clip": 1.0476954, + "balance_loss_mlp": 1.0217073, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.2726027329164356, + "language_loss": 0.72954559, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75108331, + "num_input_tokens_seen": 110206445, + "step": 5131, + "time_per_iteration": 2.5157718658447266 + }, + { + "auxiliary_loss_clip": 0.01097355, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.04471791, + "balance_loss_mlp": 1.02164114, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.4976716853155425, + "language_loss": 0.70680654, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.72814673, + "num_input_tokens_seen": 110226845, + "step": 5132, + "time_per_iteration": 2.578185558319092 + }, + { + "auxiliary_loss_clip": 0.01131491, + "auxiliary_loss_mlp": 0.00791007, + "balance_loss_clip": 1.04714143, + "balance_loss_mlp": 1.00972545, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.6878551322865813, + "language_loss": 0.89759606, + "learning_rate": 3.239483519913136e-06, + "loss": 0.916821, + "num_input_tokens_seen": 110244095, + "step": 5133, + "time_per_iteration": 2.4394192695617676 + }, + { + "auxiliary_loss_clip": 0.01116362, + "auxiliary_loss_mlp": 0.01045, + "balance_loss_clip": 1.0451231, + "balance_loss_mlp": 1.02867997, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 2.0616612895315996, + "language_loss": 0.67234379, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69395745, + "num_input_tokens_seen": 110264240, + "step": 5134, + "time_per_iteration": 3.992511034011841 + }, + { + "auxiliary_loss_clip": 0.01122975, + "auxiliary_loss_mlp": 0.01043144, + "balance_loss_clip": 1.04870009, + "balance_loss_mlp": 1.02612114, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 1.8803038549908033, + "language_loss": 0.82805687, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.84971809, + "num_input_tokens_seen": 110282450, + "step": 5135, + "time_per_iteration": 2.466834306716919 + }, + { + "auxiliary_loss_clip": 0.01018859, + "auxiliary_loss_mlp": 0.01008721, + "balance_loss_clip": 1.01553559, + "balance_loss_mlp": 1.00664663, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.7027327457366329, + "language_loss": 0.553177, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57345283, + "num_input_tokens_seen": 110343715, + "step": 5136, + "time_per_iteration": 3.186471939086914 + }, + { + "auxiliary_loss_clip": 0.01111712, + "auxiliary_loss_mlp": 0.00791801, + "balance_loss_clip": 1.04443812, + "balance_loss_mlp": 1.01110983, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 3.6141860830208703, + "language_loss": 0.76095974, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.7799949, + "num_input_tokens_seen": 110368430, + "step": 5137, + "time_per_iteration": 5.66956901550293 + }, + { + "auxiliary_loss_clip": 0.01096748, + "auxiliary_loss_mlp": 0.01035591, + "balance_loss_clip": 1.04324245, + "balance_loss_mlp": 1.0208863, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 1.611292880470574, + "language_loss": 0.79480058, + "learning_rate": 3.237954673696424e-06, + "loss": 0.81612396, + "num_input_tokens_seen": 110386735, + "step": 5138, + "time_per_iteration": 2.544736623764038 + }, + { + "auxiliary_loss_clip": 0.01080991, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.04888475, + "balance_loss_mlp": 1.02918911, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.4685043975493102, + "language_loss": 0.81352645, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83480167, + "num_input_tokens_seen": 110406820, + "step": 5139, + "time_per_iteration": 2.615164279937744 + }, + { + "auxiliary_loss_clip": 0.01124264, + "auxiliary_loss_mlp": 0.01043013, + "balance_loss_clip": 1.04623747, + "balance_loss_mlp": 1.02514327, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.026754591581606, + "language_loss": 0.77583021, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79750299, + "num_input_tokens_seen": 110424225, + "step": 5140, + "time_per_iteration": 2.475653886795044 + }, + { + "auxiliary_loss_clip": 0.01095993, + "auxiliary_loss_mlp": 0.01048035, + "balance_loss_clip": 1.04417276, + "balance_loss_mlp": 1.03270459, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 2.5521180203314375, + "language_loss": 0.78292257, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80436289, + "num_input_tokens_seen": 110443310, + "step": 5141, + "time_per_iteration": 2.537550687789917 + }, + { + "auxiliary_loss_clip": 0.01121276, + "auxiliary_loss_mlp": 0.01043065, + "balance_loss_clip": 1.04823518, + "balance_loss_mlp": 1.02662611, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.189522433261539, + "language_loss": 0.87047809, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89212149, + "num_input_tokens_seen": 110460215, + "step": 5142, + "time_per_iteration": 2.516071319580078 + }, + { + "auxiliary_loss_clip": 0.01123133, + "auxiliary_loss_mlp": 0.01043572, + "balance_loss_clip": 1.04517365, + "balance_loss_mlp": 1.02788424, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 1.828304621497409, + "language_loss": 0.791161, + "learning_rate": 3.23642465389567e-06, + "loss": 0.81282806, + "num_input_tokens_seen": 110479385, + "step": 5143, + "time_per_iteration": 2.470850706100464 + }, + { + "auxiliary_loss_clip": 0.01103064, + "auxiliary_loss_mlp": 0.01038895, + "balance_loss_clip": 1.04684007, + "balance_loss_mlp": 1.02259874, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 1.6279856290545514, + "language_loss": 0.71849293, + "learning_rate": 3.236118509233055e-06, + "loss": 0.73991251, + "num_input_tokens_seen": 110499885, + "step": 5144, + "time_per_iteration": 2.5860989093780518 + }, + { + "auxiliary_loss_clip": 0.01125529, + "auxiliary_loss_mlp": 0.01042406, + "balance_loss_clip": 1.04461944, + "balance_loss_mlp": 1.02647996, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.6875416683525042, + "language_loss": 0.7436403, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76531965, + "num_input_tokens_seen": 110519690, + "step": 5145, + "time_per_iteration": 2.518439531326294 + }, + { + "auxiliary_loss_clip": 0.0111389, + "auxiliary_loss_mlp": 0.01049004, + "balance_loss_clip": 1.04682899, + "balance_loss_mlp": 1.03260112, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 4.610241339881009, + "language_loss": 0.76425201, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78588092, + "num_input_tokens_seen": 110540520, + "step": 5146, + "time_per_iteration": 2.547069549560547 + }, + { + "auxiliary_loss_clip": 0.0110674, + "auxiliary_loss_mlp": 0.01038721, + "balance_loss_clip": 1.04276359, + "balance_loss_mlp": 1.02328908, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 1.7872494046337706, + "language_loss": 0.66106921, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68252385, + "num_input_tokens_seen": 110557950, + "step": 5147, + "time_per_iteration": 2.4985668659210205 + }, + { + "auxiliary_loss_clip": 0.01128481, + "auxiliary_loss_mlp": 0.01041074, + "balance_loss_clip": 1.04848289, + "balance_loss_mlp": 1.02642322, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 2.0710085825006144, + "language_loss": 0.74743724, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.76913279, + "num_input_tokens_seen": 110578215, + "step": 5148, + "time_per_iteration": 2.5143988132476807 + }, + { + "auxiliary_loss_clip": 0.01129538, + "auxiliary_loss_mlp": 0.01046454, + "balance_loss_clip": 1.04654431, + "balance_loss_mlp": 1.03009832, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.6134496639525837, + "language_loss": 0.72102737, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74278736, + "num_input_tokens_seen": 110592990, + "step": 5149, + "time_per_iteration": 2.420346736907959 + }, + { + "auxiliary_loss_clip": 0.01091789, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.04417312, + "balance_loss_mlp": 1.02280688, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 1.8349776678294214, + "language_loss": 0.84674132, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.86805868, + "num_input_tokens_seen": 110612130, + "step": 5150, + "time_per_iteration": 2.5832021236419678 + }, + { + "auxiliary_loss_clip": 0.0108541, + "auxiliary_loss_mlp": 0.01045762, + "balance_loss_clip": 1.04637694, + "balance_loss_mlp": 1.02833378, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.933581818493374, + "language_loss": 0.78832352, + "learning_rate": 3.233974184780424e-06, + "loss": 0.80963528, + "num_input_tokens_seen": 110632045, + "step": 5151, + "time_per_iteration": 2.5732645988464355 + }, + { + "auxiliary_loss_clip": 0.01126874, + "auxiliary_loss_mlp": 0.01042425, + "balance_loss_clip": 1.04638541, + "balance_loss_mlp": 1.02490151, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 3.2489503518730807, + "language_loss": 0.66898882, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69068182, + "num_input_tokens_seen": 110649340, + "step": 5152, + "time_per_iteration": 2.4647090435028076 + }, + { + "auxiliary_loss_clip": 0.01078987, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.04586983, + "balance_loss_mlp": 1.02673817, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 2.0314345938037883, + "language_loss": 0.82136017, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.8425802, + "num_input_tokens_seen": 110668450, + "step": 5153, + "time_per_iteration": 2.63582706451416 + }, + { + "auxiliary_loss_clip": 0.01110126, + "auxiliary_loss_mlp": 0.00795524, + "balance_loss_clip": 1.0471034, + "balance_loss_mlp": 1.01592326, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 1.9398322491677165, + "language_loss": 0.74210793, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.76116443, + "num_input_tokens_seen": 110689410, + "step": 5154, + "time_per_iteration": 2.5508196353912354 + }, + { + "auxiliary_loss_clip": 0.0112657, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_clip": 1.04715347, + "balance_loss_mlp": 1.01976526, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 2.909941683543604, + "language_loss": 0.76229084, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78391904, + "num_input_tokens_seen": 110707350, + "step": 5155, + "time_per_iteration": 2.460416793823242 + }, + { + "auxiliary_loss_clip": 0.01125444, + "auxiliary_loss_mlp": 0.01039499, + "balance_loss_clip": 1.0519228, + "balance_loss_mlp": 1.02288115, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 2.3342258746411493, + "language_loss": 0.79106486, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81271434, + "num_input_tokens_seen": 110724910, + "step": 5156, + "time_per_iteration": 2.507646083831787 + }, + { + "auxiliary_loss_clip": 0.01124735, + "auxiliary_loss_mlp": 0.01044663, + "balance_loss_clip": 1.04945016, + "balance_loss_mlp": 1.02774715, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 2.166944937986495, + "language_loss": 0.75024378, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77193773, + "num_input_tokens_seen": 110744010, + "step": 5157, + "time_per_iteration": 2.5005106925964355 + }, + { + "auxiliary_loss_clip": 0.01099555, + "auxiliary_loss_mlp": 0.01041818, + "balance_loss_clip": 1.04467249, + "balance_loss_mlp": 1.02616549, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.5696421822926878, + "language_loss": 0.69579256, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71720624, + "num_input_tokens_seen": 110765835, + "step": 5158, + "time_per_iteration": 2.599217414855957 + }, + { + "auxiliary_loss_clip": 0.01088321, + "auxiliary_loss_mlp": 0.01037809, + "balance_loss_clip": 1.04163313, + "balance_loss_mlp": 1.02296734, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 2.4712486636239137, + "language_loss": 0.84994459, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.87120587, + "num_input_tokens_seen": 110784655, + "step": 5159, + "time_per_iteration": 2.5482935905456543 + }, + { + "auxiliary_loss_clip": 0.01112414, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.04609013, + "balance_loss_mlp": 1.02757919, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.7317939008168393, + "language_loss": 0.85188442, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87344694, + "num_input_tokens_seen": 110802545, + "step": 5160, + "time_per_iteration": 2.5181961059570312 + }, + { + "auxiliary_loss_clip": 0.01123575, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.04579222, + "balance_loss_mlp": 1.02354002, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 1.8977180283170363, + "language_loss": 0.7595709, + "learning_rate": 3.230906887766584e-06, + "loss": 0.78119624, + "num_input_tokens_seen": 110820265, + "step": 5161, + "time_per_iteration": 2.4689128398895264 + }, + { + "auxiliary_loss_clip": 0.01126974, + "auxiliary_loss_mlp": 0.01040086, + "balance_loss_clip": 1.0475775, + "balance_loss_mlp": 1.02377796, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 2.0087336665127205, + "language_loss": 0.81976295, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.84143353, + "num_input_tokens_seen": 110836195, + "step": 5162, + "time_per_iteration": 2.501729965209961 + }, + { + "auxiliary_loss_clip": 0.01122791, + "auxiliary_loss_mlp": 0.01036481, + "balance_loss_clip": 1.04750407, + "balance_loss_mlp": 1.02222919, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.7623342700956923, + "language_loss": 0.83124697, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85283971, + "num_input_tokens_seen": 110856420, + "step": 5163, + "time_per_iteration": 2.493786334991455 + }, + { + "auxiliary_loss_clip": 0.01139342, + "auxiliary_loss_mlp": 0.01042755, + "balance_loss_clip": 1.04878068, + "balance_loss_mlp": 1.02688789, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 1.8421898312037124, + "language_loss": 0.75828815, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78010905, + "num_input_tokens_seen": 110876650, + "step": 5164, + "time_per_iteration": 2.483548879623413 + }, + { + "auxiliary_loss_clip": 0.01101012, + "auxiliary_loss_mlp": 0.01040371, + "balance_loss_clip": 1.0478344, + "balance_loss_mlp": 1.02400351, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.8030089877008284, + "language_loss": 0.74250633, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76392019, + "num_input_tokens_seen": 110894445, + "step": 5165, + "time_per_iteration": 2.544454574584961 + }, + { + "auxiliary_loss_clip": 0.01100525, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.05296779, + "balance_loss_mlp": 1.02886653, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 1.4650576728571867, + "language_loss": 0.76115322, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78262436, + "num_input_tokens_seen": 110912855, + "step": 5166, + "time_per_iteration": 2.5684804916381836 + }, + { + "auxiliary_loss_clip": 0.01117025, + "auxiliary_loss_mlp": 0.01041485, + "balance_loss_clip": 1.04738986, + "balance_loss_mlp": 1.02471232, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 3.122357064853797, + "language_loss": 0.73346955, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75505465, + "num_input_tokens_seen": 110928025, + "step": 5167, + "time_per_iteration": 2.47337007522583 + }, + { + "auxiliary_loss_clip": 0.01028684, + "auxiliary_loss_mlp": 0.01057931, + "balance_loss_clip": 1.05697787, + "balance_loss_mlp": 1.05545127, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.7205830673479034, + "language_loss": 0.52990001, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55076617, + "num_input_tokens_seen": 110992215, + "step": 5168, + "time_per_iteration": 4.648841619491577 + }, + { + "auxiliary_loss_clip": 0.01128057, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.04900575, + "balance_loss_mlp": 1.02473664, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 2.611895628033281, + "language_loss": 0.78243387, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.80412877, + "num_input_tokens_seen": 111010400, + "step": 5169, + "time_per_iteration": 2.602570056915283 + }, + { + "auxiliary_loss_clip": 0.01115759, + "auxiliary_loss_mlp": 0.01040141, + "balance_loss_clip": 1.04538488, + "balance_loss_mlp": 1.02415502, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.484947480576105, + "language_loss": 0.63738811, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.65894711, + "num_input_tokens_seen": 111033960, + "step": 5170, + "time_per_iteration": 2.627995491027832 + }, + { + "auxiliary_loss_clip": 0.01102077, + "auxiliary_loss_mlp": 0.00794083, + "balance_loss_clip": 1.05027497, + "balance_loss_mlp": 1.01183414, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.394576409658571, + "language_loss": 0.78137171, + "learning_rate": 3.22783492314295e-06, + "loss": 0.80033332, + "num_input_tokens_seen": 111053265, + "step": 5171, + "time_per_iteration": 2.617152690887451 + }, + { + "auxiliary_loss_clip": 0.01100952, + "auxiliary_loss_mlp": 0.01054239, + "balance_loss_clip": 1.05047846, + "balance_loss_mlp": 1.03733563, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.8125294233796099, + "language_loss": 0.83897364, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.86052561, + "num_input_tokens_seen": 111071130, + "step": 5172, + "time_per_iteration": 3.9540529251098633 + }, + { + "auxiliary_loss_clip": 0.01092156, + "auxiliary_loss_mlp": 0.01042326, + "balance_loss_clip": 1.05263233, + "balance_loss_mlp": 1.02541041, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 2.063805517736345, + "language_loss": 0.8437928, + "learning_rate": 3.227219971129842e-06, + "loss": 0.86513764, + "num_input_tokens_seen": 111089560, + "step": 5173, + "time_per_iteration": 2.568983316421509 + }, + { + "auxiliary_loss_clip": 0.0113316, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.04829097, + "balance_loss_mlp": 1.02085984, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.8270525156300017, + "language_loss": 0.83645487, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85814452, + "num_input_tokens_seen": 111109960, + "step": 5174, + "time_per_iteration": 2.517989158630371 + }, + { + "auxiliary_loss_clip": 0.01115388, + "auxiliary_loss_mlp": 0.01043589, + "balance_loss_clip": 1.04775548, + "balance_loss_mlp": 1.02777028, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.0653210626358343, + "language_loss": 0.85410631, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.87569606, + "num_input_tokens_seen": 111127960, + "step": 5175, + "time_per_iteration": 3.965095043182373 + }, + { + "auxiliary_loss_clip": 0.01080534, + "auxiliary_loss_mlp": 0.01051819, + "balance_loss_clip": 1.04671264, + "balance_loss_mlp": 1.03428352, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.9875723257700844, + "language_loss": 0.8326515, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85397494, + "num_input_tokens_seen": 111146730, + "step": 5176, + "time_per_iteration": 4.029592514038086 + }, + { + "auxiliary_loss_clip": 0.01117786, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.04273641, + "balance_loss_mlp": 1.0238061, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 1.7975039537812783, + "language_loss": 0.8059147, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.82750213, + "num_input_tokens_seen": 111166295, + "step": 5177, + "time_per_iteration": 2.499922037124634 + }, + { + "auxiliary_loss_clip": 0.01124084, + "auxiliary_loss_mlp": 0.00794124, + "balance_loss_clip": 1.04842472, + "balance_loss_mlp": 1.0129987, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.603204742098221, + "language_loss": 0.80778694, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82696897, + "num_input_tokens_seen": 111185665, + "step": 5178, + "time_per_iteration": 2.486328125 + }, + { + "auxiliary_loss_clip": 0.01111816, + "auxiliary_loss_mlp": 0.01045516, + "balance_loss_clip": 1.04913521, + "balance_loss_mlp": 1.0301621, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 1.7696422578055893, + "language_loss": 0.8110491, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83262247, + "num_input_tokens_seen": 111201615, + "step": 5179, + "time_per_iteration": 2.519951820373535 + }, + { + "auxiliary_loss_clip": 0.01102373, + "auxiliary_loss_mlp": 0.01048377, + "balance_loss_clip": 1.04727674, + "balance_loss_mlp": 1.03215837, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 1.7544835344503964, + "language_loss": 0.7843743, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.80588174, + "num_input_tokens_seen": 111220515, + "step": 5180, + "time_per_iteration": 2.5336902141571045 + }, + { + "auxiliary_loss_clip": 0.01100611, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.0486815, + "balance_loss_mlp": 1.02274334, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.701483802239429, + "language_loss": 0.82973057, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85112232, + "num_input_tokens_seen": 111240395, + "step": 5181, + "time_per_iteration": 2.6240293979644775 + }, + { + "auxiliary_loss_clip": 0.01104904, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_clip": 1.04833639, + "balance_loss_mlp": 1.03190184, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.5838323050963508, + "language_loss": 0.73865235, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76016974, + "num_input_tokens_seen": 111261100, + "step": 5182, + "time_per_iteration": 2.6070053577423096 + }, + { + "auxiliary_loss_clip": 0.01091651, + "auxiliary_loss_mlp": 0.00794192, + "balance_loss_clip": 1.05143619, + "balance_loss_mlp": 1.01375818, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 2.0452499501317933, + "language_loss": 0.70419228, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72305071, + "num_input_tokens_seen": 111281320, + "step": 5183, + "time_per_iteration": 2.6470866203308105 + }, + { + "auxiliary_loss_clip": 0.01032647, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.03655267, + "balance_loss_mlp": 1.0250144, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.948472405818638, + "language_loss": 0.59616172, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61675906, + "num_input_tokens_seen": 111341405, + "step": 5184, + "time_per_iteration": 3.1720921993255615 + }, + { + "auxiliary_loss_clip": 0.01110505, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_clip": 1.04540801, + "balance_loss_mlp": 1.03231096, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.221267139509657, + "language_loss": 0.7008751, + "learning_rate": 3.223526353268311e-06, + "loss": 0.72245967, + "num_input_tokens_seen": 111358975, + "step": 5185, + "time_per_iteration": 2.4985976219177246 + }, + { + "auxiliary_loss_clip": 0.01117424, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.05180252, + "balance_loss_mlp": 1.03086674, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.2806412933373763, + "language_loss": 0.64010477, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.661753, + "num_input_tokens_seen": 111375845, + "step": 5186, + "time_per_iteration": 2.55678129196167 + }, + { + "auxiliary_loss_clip": 0.0111856, + "auxiliary_loss_mlp": 0.01046602, + "balance_loss_clip": 1.04853773, + "balance_loss_mlp": 1.02965021, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.7785771110718502, + "language_loss": 0.87207204, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.89372361, + "num_input_tokens_seen": 111394150, + "step": 5187, + "time_per_iteration": 2.5398917198181152 + }, + { + "auxiliary_loss_clip": 0.0113274, + "auxiliary_loss_mlp": 0.00793539, + "balance_loss_clip": 1.04485118, + "balance_loss_mlp": 1.01490855, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.6758342796593326, + "language_loss": 0.63002717, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.64928997, + "num_input_tokens_seen": 111418355, + "step": 5188, + "time_per_iteration": 2.626851797103882 + }, + { + "auxiliary_loss_clip": 0.0110431, + "auxiliary_loss_mlp": 0.01038539, + "balance_loss_clip": 1.05511713, + "balance_loss_mlp": 1.02296984, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.5274316357320616, + "language_loss": 0.8294239, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85085237, + "num_input_tokens_seen": 111435445, + "step": 5189, + "time_per_iteration": 2.5121705532073975 + }, + { + "auxiliary_loss_clip": 0.01034089, + "auxiliary_loss_mlp": 0.01040511, + "balance_loss_clip": 1.05023551, + "balance_loss_mlp": 1.02367854, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.7172832206746698, + "language_loss": 0.79305208, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81379807, + "num_input_tokens_seen": 111453430, + "step": 5190, + "time_per_iteration": 2.8487329483032227 + }, + { + "auxiliary_loss_clip": 0.01083361, + "auxiliary_loss_mlp": 0.0105712, + "balance_loss_clip": 1.04773653, + "balance_loss_mlp": 1.03911948, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.54532643164188, + "language_loss": 0.75090659, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77231145, + "num_input_tokens_seen": 111475325, + "step": 5191, + "time_per_iteration": 2.8717501163482666 + }, + { + "auxiliary_loss_clip": 0.01040946, + "auxiliary_loss_mlp": 0.00950794, + "balance_loss_clip": 1.01857841, + "balance_loss_mlp": 1.3100661, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8479726393477088, + "language_loss": 0.6389336, + "learning_rate": 3.221368656205247e-06, + "loss": 0.65885103, + "num_input_tokens_seen": 111533960, + "step": 5192, + "time_per_iteration": 3.204542875289917 + }, + { + "auxiliary_loss_clip": 0.01126482, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.04712534, + "balance_loss_mlp": 1.022686, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 1.6343880469172782, + "language_loss": 0.80250365, + "learning_rate": 3.221060228416446e-06, + "loss": 0.8241632, + "num_input_tokens_seen": 111554055, + "step": 5193, + "time_per_iteration": 2.5219171047210693 + }, + { + "auxiliary_loss_clip": 0.01107096, + "auxiliary_loss_mlp": 0.01049755, + "balance_loss_clip": 1.04404974, + "balance_loss_mlp": 1.03155184, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 2.712552235021476, + "language_loss": 0.72168684, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74325538, + "num_input_tokens_seen": 111574305, + "step": 5194, + "time_per_iteration": 2.5647292137145996 + }, + { + "auxiliary_loss_clip": 0.01137359, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.04955387, + "balance_loss_mlp": 1.02481389, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.4868172919614784, + "language_loss": 0.76554722, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78732008, + "num_input_tokens_seen": 111595680, + "step": 5195, + "time_per_iteration": 2.486654281616211 + }, + { + "auxiliary_loss_clip": 0.01134167, + "auxiliary_loss_mlp": 0.01038979, + "balance_loss_clip": 1.04447699, + "balance_loss_mlp": 1.02372026, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.3433196674166674, + "language_loss": 0.77865672, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80038822, + "num_input_tokens_seen": 111618135, + "step": 5196, + "time_per_iteration": 2.5166382789611816 + }, + { + "auxiliary_loss_clip": 0.01036089, + "auxiliary_loss_mlp": 0.00882511, + "balance_loss_clip": 1.03528142, + "balance_loss_mlp": 1.19397402, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7686288548926228, + "language_loss": 0.54783052, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56701654, + "num_input_tokens_seen": 111682220, + "step": 5197, + "time_per_iteration": 3.1676833629608154 + }, + { + "auxiliary_loss_clip": 0.01133354, + "auxiliary_loss_mlp": 0.01039928, + "balance_loss_clip": 1.04703832, + "balance_loss_mlp": 1.02497935, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.6689481791373746, + "language_loss": 0.66449761, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68623042, + "num_input_tokens_seen": 111700815, + "step": 5198, + "time_per_iteration": 2.4347949028015137 + }, + { + "auxiliary_loss_clip": 0.01097468, + "auxiliary_loss_mlp": 0.01044707, + "balance_loss_clip": 1.0480839, + "balance_loss_mlp": 1.02832723, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.0696562408039165, + "language_loss": 0.69305652, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71447825, + "num_input_tokens_seen": 111718195, + "step": 5199, + "time_per_iteration": 2.5272793769836426 + }, + { + "auxiliary_loss_clip": 0.01125201, + "auxiliary_loss_mlp": 0.01045065, + "balance_loss_clip": 1.04653728, + "balance_loss_mlp": 1.02868581, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 2.025599793661542, + "language_loss": 0.78516227, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.80686492, + "num_input_tokens_seen": 111734440, + "step": 5200, + "time_per_iteration": 2.4818742275238037 + }, + { + "auxiliary_loss_clip": 0.011226, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.04734683, + "balance_loss_mlp": 1.02069747, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 2.111240648917942, + "language_loss": 0.83747625, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.85906422, + "num_input_tokens_seen": 111751960, + "step": 5201, + "time_per_iteration": 2.5128326416015625 + }, + { + "auxiliary_loss_clip": 0.01137302, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.05001748, + "balance_loss_mlp": 1.02639413, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 2.169256429211964, + "language_loss": 0.69747806, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71927673, + "num_input_tokens_seen": 111769585, + "step": 5202, + "time_per_iteration": 2.450622797012329 + }, + { + "auxiliary_loss_clip": 0.01136343, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.0467298, + "balance_loss_mlp": 1.02138734, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.690486940878734, + "language_loss": 0.84001487, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86173034, + "num_input_tokens_seen": 111787880, + "step": 5203, + "time_per_iteration": 2.4644951820373535 + }, + { + "auxiliary_loss_clip": 0.01086346, + "auxiliary_loss_mlp": 0.01040608, + "balance_loss_clip": 1.05260599, + "balance_loss_mlp": 1.02448487, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.0364480852359685, + "language_loss": 0.60868442, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62995398, + "num_input_tokens_seen": 111805950, + "step": 5204, + "time_per_iteration": 2.636850118637085 + }, + { + "auxiliary_loss_clip": 0.01102569, + "auxiliary_loss_mlp": 0.01041293, + "balance_loss_clip": 1.04868245, + "balance_loss_mlp": 1.0271486, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 2.4592486453288656, + "language_loss": 0.65781641, + "learning_rate": 3.217355486684887e-06, + "loss": 0.67925501, + "num_input_tokens_seen": 111826135, + "step": 5205, + "time_per_iteration": 2.537339925765991 + }, + { + "auxiliary_loss_clip": 0.01126912, + "auxiliary_loss_mlp": 0.01045894, + "balance_loss_clip": 1.04904854, + "balance_loss_mlp": 1.0296216, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.5442540569090208, + "language_loss": 0.76642656, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.7881546, + "num_input_tokens_seen": 111844700, + "step": 5206, + "time_per_iteration": 2.5433506965637207 + }, + { + "auxiliary_loss_clip": 0.01134291, + "auxiliary_loss_mlp": 0.01037572, + "balance_loss_clip": 1.04710698, + "balance_loss_mlp": 1.02232492, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 2.0491924909917825, + "language_loss": 0.83346403, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85518265, + "num_input_tokens_seen": 111861585, + "step": 5207, + "time_per_iteration": 4.298353672027588 + }, + { + "auxiliary_loss_clip": 0.01120945, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.04804385, + "balance_loss_mlp": 1.02761316, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.785733933756896, + "language_loss": 0.71241498, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73403931, + "num_input_tokens_seen": 111882950, + "step": 5208, + "time_per_iteration": 2.5257740020751953 + }, + { + "auxiliary_loss_clip": 0.01113951, + "auxiliary_loss_mlp": 0.01043194, + "balance_loss_clip": 1.04694748, + "balance_loss_mlp": 1.02757788, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.9897925635685254, + "language_loss": 0.74732858, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76889998, + "num_input_tokens_seen": 111901640, + "step": 5209, + "time_per_iteration": 2.5442450046539307 + }, + { + "auxiliary_loss_clip": 0.01134312, + "auxiliary_loss_mlp": 0.01044211, + "balance_loss_clip": 1.04607558, + "balance_loss_mlp": 1.02956009, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.7292165498821463, + "language_loss": 0.77616775, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79795301, + "num_input_tokens_seen": 111919615, + "step": 5210, + "time_per_iteration": 2.485924005508423 + }, + { + "auxiliary_loss_clip": 0.01118425, + "auxiliary_loss_mlp": 0.01039822, + "balance_loss_clip": 1.04493189, + "balance_loss_mlp": 1.02495706, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.645380940549625, + "language_loss": 0.79459143, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81617391, + "num_input_tokens_seen": 111938485, + "step": 5211, + "time_per_iteration": 3.9138388633728027 + }, + { + "auxiliary_loss_clip": 0.01120589, + "auxiliary_loss_mlp": 0.01037009, + "balance_loss_clip": 1.0448072, + "balance_loss_mlp": 1.02248931, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.7210271460844149, + "language_loss": 0.79722607, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.81880212, + "num_input_tokens_seen": 111956425, + "step": 5212, + "time_per_iteration": 2.508111000061035 + }, + { + "auxiliary_loss_clip": 0.01122775, + "auxiliary_loss_mlp": 0.01047595, + "balance_loss_clip": 1.04892015, + "balance_loss_mlp": 1.03197837, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.1240796469420564, + "language_loss": 0.7106024, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73230612, + "num_input_tokens_seen": 111975915, + "step": 5213, + "time_per_iteration": 4.019595146179199 + }, + { + "auxiliary_loss_clip": 0.01127092, + "auxiliary_loss_mlp": 0.01045638, + "balance_loss_clip": 1.05091286, + "balance_loss_mlp": 1.03040349, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 2.1363712396767416, + "language_loss": 0.77613342, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79786074, + "num_input_tokens_seen": 111995055, + "step": 5214, + "time_per_iteration": 3.8577523231506348 + }, + { + "auxiliary_loss_clip": 0.0108869, + "auxiliary_loss_mlp": 0.01031607, + "balance_loss_clip": 1.0465889, + "balance_loss_mlp": 1.01761806, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.6006245861298873, + "language_loss": 0.82704324, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84824622, + "num_input_tokens_seen": 112015830, + "step": 5215, + "time_per_iteration": 2.6515002250671387 + }, + { + "auxiliary_loss_clip": 0.01124746, + "auxiliary_loss_mlp": 0.01038224, + "balance_loss_clip": 1.04784989, + "balance_loss_mlp": 1.02275038, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.0668590643427236, + "language_loss": 0.79566288, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81729257, + "num_input_tokens_seen": 112035065, + "step": 5216, + "time_per_iteration": 2.4976272583007812 + }, + { + "auxiliary_loss_clip": 0.01113977, + "auxiliary_loss_mlp": 0.01046292, + "balance_loss_clip": 1.04644322, + "balance_loss_mlp": 1.0295074, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 2.4569149726963775, + "language_loss": 0.68258768, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70419037, + "num_input_tokens_seen": 112058405, + "step": 5217, + "time_per_iteration": 2.603549003601074 + }, + { + "auxiliary_loss_clip": 0.01114681, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.04653287, + "balance_loss_mlp": 1.0209105, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.5895465305197436, + "language_loss": 0.80903047, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.83053529, + "num_input_tokens_seen": 112076420, + "step": 5218, + "time_per_iteration": 2.492814302444458 + }, + { + "auxiliary_loss_clip": 0.01133986, + "auxiliary_loss_mlp": 0.01041928, + "balance_loss_clip": 1.04660439, + "balance_loss_mlp": 1.02551925, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.2846070132117826, + "language_loss": 0.6874423, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.70920146, + "num_input_tokens_seen": 112090775, + "step": 5219, + "time_per_iteration": 2.4423274993896484 + }, + { + "auxiliary_loss_clip": 0.01113631, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.04880488, + "balance_loss_mlp": 1.02168107, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 1.9857780337760358, + "language_loss": 0.79602402, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.81751657, + "num_input_tokens_seen": 112110980, + "step": 5220, + "time_per_iteration": 2.5338916778564453 + }, + { + "auxiliary_loss_clip": 0.01125815, + "auxiliary_loss_mlp": 0.01037908, + "balance_loss_clip": 1.04784584, + "balance_loss_mlp": 1.02340019, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.6462924290557253, + "language_loss": 0.72998732, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75162458, + "num_input_tokens_seen": 112129020, + "step": 5221, + "time_per_iteration": 2.4895148277282715 + }, + { + "auxiliary_loss_clip": 0.01106429, + "auxiliary_loss_mlp": 0.0104227, + "balance_loss_clip": 1.05023623, + "balance_loss_mlp": 1.02719557, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.7472916931131102, + "language_loss": 0.81640959, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.83789653, + "num_input_tokens_seen": 112147865, + "step": 5222, + "time_per_iteration": 2.505892753601074 + }, + { + "auxiliary_loss_clip": 0.0112837, + "auxiliary_loss_mlp": 0.01043166, + "balance_loss_clip": 1.04827607, + "balance_loss_mlp": 1.02633417, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 1.9738925164338472, + "language_loss": 0.69880319, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.72051847, + "num_input_tokens_seen": 112166745, + "step": 5223, + "time_per_iteration": 2.478740930557251 + }, + { + "auxiliary_loss_clip": 0.01119965, + "auxiliary_loss_mlp": 0.00947269, + "balance_loss_clip": 1.0488162, + "balance_loss_mlp": 1.31515539, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.489419243883012, + "language_loss": 0.80575806, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82643044, + "num_input_tokens_seen": 112185895, + "step": 5224, + "time_per_iteration": 2.537212371826172 + }, + { + "auxiliary_loss_clip": 0.01135137, + "auxiliary_loss_mlp": 0.01039503, + "balance_loss_clip": 1.05306709, + "balance_loss_mlp": 1.02350497, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 2.019719196773591, + "language_loss": 0.5812881, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.6030345, + "num_input_tokens_seen": 112204465, + "step": 5225, + "time_per_iteration": 2.542659282684326 + }, + { + "auxiliary_loss_clip": 0.01093186, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.04801822, + "balance_loss_mlp": 1.02054787, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.7888639176679975, + "language_loss": 0.81622255, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.83750296, + "num_input_tokens_seen": 112221635, + "step": 5226, + "time_per_iteration": 2.5989136695861816 + }, + { + "auxiliary_loss_clip": 0.01121127, + "auxiliary_loss_mlp": 0.01052735, + "balance_loss_clip": 1.04862201, + "balance_loss_mlp": 1.03587866, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 2.1020661112241776, + "language_loss": 0.74108303, + "learning_rate": 3.210546210126141e-06, + "loss": 0.76282161, + "num_input_tokens_seen": 112241240, + "step": 5227, + "time_per_iteration": 2.55983567237854 + }, + { + "auxiliary_loss_clip": 0.01126166, + "auxiliary_loss_mlp": 0.01036659, + "balance_loss_clip": 1.05428696, + "balance_loss_mlp": 1.02095902, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 1.6075864047314838, + "language_loss": 0.6770004, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69862866, + "num_input_tokens_seen": 112262350, + "step": 5228, + "time_per_iteration": 2.6122848987579346 + }, + { + "auxiliary_loss_clip": 0.01113854, + "auxiliary_loss_mlp": 0.01040202, + "balance_loss_clip": 1.04726195, + "balance_loss_mlp": 1.02512217, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 1.7897783144060964, + "language_loss": 0.79885, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82039058, + "num_input_tokens_seen": 112283710, + "step": 5229, + "time_per_iteration": 2.6531636714935303 + }, + { + "auxiliary_loss_clip": 0.0111344, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.04995275, + "balance_loss_mlp": 1.0194838, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.746706910547038, + "language_loss": 0.70040798, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72189391, + "num_input_tokens_seen": 112304285, + "step": 5230, + "time_per_iteration": 2.6111514568328857 + }, + { + "auxiliary_loss_clip": 0.01092543, + "auxiliary_loss_mlp": 0.01048427, + "balance_loss_clip": 1.04470325, + "balance_loss_mlp": 1.03048611, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.5305369504833517, + "language_loss": 0.79399204, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81540167, + "num_input_tokens_seen": 112325110, + "step": 5231, + "time_per_iteration": 2.692861557006836 + }, + { + "auxiliary_loss_clip": 0.01109043, + "auxiliary_loss_mlp": 0.01041283, + "balance_loss_clip": 1.05148053, + "balance_loss_mlp": 1.02499938, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.814438173731319, + "language_loss": 0.84686923, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.8683725, + "num_input_tokens_seen": 112339855, + "step": 5232, + "time_per_iteration": 2.5521678924560547 + }, + { + "auxiliary_loss_clip": 0.01078151, + "auxiliary_loss_mlp": 0.0106228, + "balance_loss_clip": 1.04185772, + "balance_loss_mlp": 1.04542434, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.5429441291037251, + "language_loss": 0.80080622, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82221055, + "num_input_tokens_seen": 112358480, + "step": 5233, + "time_per_iteration": 2.6107914447784424 + }, + { + "auxiliary_loss_clip": 0.01094156, + "auxiliary_loss_mlp": 0.01038063, + "balance_loss_clip": 1.04938102, + "balance_loss_mlp": 1.02281618, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 1.694335972621516, + "language_loss": 0.70801699, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.72933924, + "num_input_tokens_seen": 112382350, + "step": 5234, + "time_per_iteration": 2.9402360916137695 + }, + { + "auxiliary_loss_clip": 0.01103781, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.05213642, + "balance_loss_mlp": 1.02124047, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 2.129473387359332, + "language_loss": 0.72344303, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74485016, + "num_input_tokens_seen": 112400260, + "step": 5235, + "time_per_iteration": 2.6120810508728027 + }, + { + "auxiliary_loss_clip": 0.01126101, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.047979, + "balance_loss_mlp": 1.02139735, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 1.9028456087878876, + "language_loss": 0.79160875, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.81323904, + "num_input_tokens_seen": 112419400, + "step": 5236, + "time_per_iteration": 2.5714783668518066 + }, + { + "auxiliary_loss_clip": 0.01138928, + "auxiliary_loss_mlp": 0.01041531, + "balance_loss_clip": 1.04921556, + "balance_loss_mlp": 1.02584291, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.5615004086982307, + "language_loss": 0.75871325, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78051782, + "num_input_tokens_seen": 112440825, + "step": 5237, + "time_per_iteration": 2.5645601749420166 + }, + { + "auxiliary_loss_clip": 0.01131906, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.04820347, + "balance_loss_mlp": 1.02196932, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 1.710891802267315, + "language_loss": 0.80179644, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82347441, + "num_input_tokens_seen": 112459180, + "step": 5238, + "time_per_iteration": 2.5048158168792725 + }, + { + "auxiliary_loss_clip": 0.01044726, + "auxiliary_loss_mlp": 0.01002198, + "balance_loss_clip": 1.02142191, + "balance_loss_mlp": 0.99979037, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8349325091043307, + "language_loss": 0.67923057, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69969988, + "num_input_tokens_seen": 112516680, + "step": 5239, + "time_per_iteration": 3.1070854663848877 + }, + { + "auxiliary_loss_clip": 0.01116887, + "auxiliary_loss_mlp": 0.01041061, + "balance_loss_clip": 1.04814386, + "balance_loss_mlp": 1.02428877, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.241149557686202, + "language_loss": 0.83087802, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.85245752, + "num_input_tokens_seen": 112535895, + "step": 5240, + "time_per_iteration": 2.5836524963378906 + }, + { + "auxiliary_loss_clip": 0.01113925, + "auxiliary_loss_mlp": 0.00842079, + "balance_loss_clip": 1.05128574, + "balance_loss_mlp": 1.10492444, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 1.6545194853361676, + "language_loss": 0.81287932, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83243942, + "num_input_tokens_seen": 112557490, + "step": 5241, + "time_per_iteration": 2.578056812286377 + }, + { + "auxiliary_loss_clip": 0.01136684, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.05189133, + "balance_loss_mlp": 1.0254482, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.544403946190781, + "language_loss": 0.74196118, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76373315, + "num_input_tokens_seen": 112577075, + "step": 5242, + "time_per_iteration": 2.511141061782837 + }, + { + "auxiliary_loss_clip": 0.01102535, + "auxiliary_loss_mlp": 0.01040126, + "balance_loss_clip": 1.04516459, + "balance_loss_mlp": 1.02375901, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.9585354240595878, + "language_loss": 0.74000096, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.76142758, + "num_input_tokens_seen": 112597620, + "step": 5243, + "time_per_iteration": 2.5963993072509766 + }, + { + "auxiliary_loss_clip": 0.01124346, + "auxiliary_loss_mlp": 0.01039557, + "balance_loss_clip": 1.04975796, + "balance_loss_mlp": 1.02440584, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 1.6939482005602378, + "language_loss": 0.64090401, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66254306, + "num_input_tokens_seen": 112617150, + "step": 5244, + "time_per_iteration": 2.532068967819214 + }, + { + "auxiliary_loss_clip": 0.01088929, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.0511024, + "balance_loss_mlp": 1.022995, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.0787849358504777, + "language_loss": 0.9110043, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93226689, + "num_input_tokens_seen": 112631090, + "step": 5245, + "time_per_iteration": 4.014739274978638 + }, + { + "auxiliary_loss_clip": 0.011276, + "auxiliary_loss_mlp": 0.01042944, + "balance_loss_clip": 1.04998124, + "balance_loss_mlp": 1.02754807, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 1.612091820394434, + "language_loss": 0.75035632, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77206177, + "num_input_tokens_seen": 112651220, + "step": 5246, + "time_per_iteration": 2.5744895935058594 + }, + { + "auxiliary_loss_clip": 0.01137869, + "auxiliary_loss_mlp": 0.01042182, + "balance_loss_clip": 1.05014086, + "balance_loss_mlp": 1.02705455, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.4928941340316568, + "language_loss": 0.61130077, + "learning_rate": 3.204336675750321e-06, + "loss": 0.63310128, + "num_input_tokens_seen": 112671560, + "step": 5247, + "time_per_iteration": 2.6202356815338135 + }, + { + "auxiliary_loss_clip": 0.01127343, + "auxiliary_loss_mlp": 0.0104193, + "balance_loss_clip": 1.04911995, + "balance_loss_mlp": 1.02683771, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.423156577136237, + "language_loss": 0.82155997, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84325266, + "num_input_tokens_seen": 112689790, + "step": 5248, + "time_per_iteration": 2.4615566730499268 + }, + { + "auxiliary_loss_clip": 0.01117163, + "auxiliary_loss_mlp": 0.01050852, + "balance_loss_clip": 1.04990673, + "balance_loss_mlp": 1.03442538, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 2.278820933064458, + "language_loss": 0.84688592, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.86856604, + "num_input_tokens_seen": 112708265, + "step": 5249, + "time_per_iteration": 2.5241925716400146 + }, + { + "auxiliary_loss_clip": 0.01112543, + "auxiliary_loss_mlp": 0.01043778, + "balance_loss_clip": 1.05397642, + "balance_loss_mlp": 1.02741051, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 1.6921497749167889, + "language_loss": 0.85471606, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87627929, + "num_input_tokens_seen": 112727820, + "step": 5250, + "time_per_iteration": 3.967754602432251 + }, + { + "auxiliary_loss_clip": 0.01111459, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.05130935, + "balance_loss_mlp": 1.02826476, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 2.4246814535551846, + "language_loss": 0.68575168, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70730585, + "num_input_tokens_seen": 112743140, + "step": 5251, + "time_per_iteration": 2.525578498840332 + }, + { + "auxiliary_loss_clip": 0.01138099, + "auxiliary_loss_mlp": 0.01044718, + "balance_loss_clip": 1.05258727, + "balance_loss_mlp": 1.02998412, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.87358731195123, + "language_loss": 0.78980613, + "learning_rate": 3.202781434189246e-06, + "loss": 0.8116343, + "num_input_tokens_seen": 112764705, + "step": 5252, + "time_per_iteration": 3.916820764541626 + }, + { + "auxiliary_loss_clip": 0.01123084, + "auxiliary_loss_mlp": 0.01056635, + "balance_loss_clip": 1.05221581, + "balance_loss_mlp": 1.04013658, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.5234422657884983, + "language_loss": 0.74199605, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76379329, + "num_input_tokens_seen": 112785310, + "step": 5253, + "time_per_iteration": 3.900736093521118 + }, + { + "auxiliary_loss_clip": 0.01122104, + "auxiliary_loss_mlp": 0.0104031, + "balance_loss_clip": 1.05114126, + "balance_loss_mlp": 1.0249083, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.886348174412362, + "language_loss": 0.73591691, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75754106, + "num_input_tokens_seen": 112802905, + "step": 5254, + "time_per_iteration": 2.555074691772461 + }, + { + "auxiliary_loss_clip": 0.01128137, + "auxiliary_loss_mlp": 0.01042581, + "balance_loss_clip": 1.05179739, + "balance_loss_mlp": 1.02750659, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 1.7992620430986601, + "language_loss": 0.77799451, + "learning_rate": 3.201847741843128e-06, + "loss": 0.79970169, + "num_input_tokens_seen": 112820305, + "step": 5255, + "time_per_iteration": 2.4864346981048584 + }, + { + "auxiliary_loss_clip": 0.01113901, + "auxiliary_loss_mlp": 0.01043039, + "balance_loss_clip": 1.05044961, + "balance_loss_mlp": 1.02624285, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 1.9767084181228005, + "language_loss": 0.78144288, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80301237, + "num_input_tokens_seen": 112841185, + "step": 5256, + "time_per_iteration": 2.531808853149414 + }, + { + "auxiliary_loss_clip": 0.01095894, + "auxiliary_loss_mlp": 0.01038001, + "balance_loss_clip": 1.05213141, + "balance_loss_mlp": 1.02404106, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.5674103021189727, + "language_loss": 0.71607172, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73741066, + "num_input_tokens_seen": 112860570, + "step": 5257, + "time_per_iteration": 2.58862566947937 + }, + { + "auxiliary_loss_clip": 0.01130246, + "auxiliary_loss_mlp": 0.01043109, + "balance_loss_clip": 1.0523386, + "balance_loss_mlp": 1.02700412, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 1.863668782142547, + "language_loss": 0.76839578, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.79012936, + "num_input_tokens_seen": 112877975, + "step": 5258, + "time_per_iteration": 2.479698896408081 + }, + { + "auxiliary_loss_clip": 0.01107451, + "auxiliary_loss_mlp": 0.01045763, + "balance_loss_clip": 1.04908991, + "balance_loss_mlp": 1.02974093, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 2.180268787874204, + "language_loss": 0.72608805, + "learning_rate": 3.200602180731467e-06, + "loss": 0.74762022, + "num_input_tokens_seen": 112896170, + "step": 5259, + "time_per_iteration": 2.5537657737731934 + }, + { + "auxiliary_loss_clip": 0.01114652, + "auxiliary_loss_mlp": 0.00811725, + "balance_loss_clip": 1.05337918, + "balance_loss_mlp": 1.04783964, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.912829640413767, + "language_loss": 0.66653097, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68579477, + "num_input_tokens_seen": 112916180, + "step": 5260, + "time_per_iteration": 2.571277618408203 + }, + { + "auxiliary_loss_clip": 0.01126576, + "auxiliary_loss_mlp": 0.01032097, + "balance_loss_clip": 1.04818964, + "balance_loss_mlp": 1.01693392, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 1.8024607866830336, + "language_loss": 0.72465134, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74623805, + "num_input_tokens_seen": 112936745, + "step": 5261, + "time_per_iteration": 2.5579285621643066 + }, + { + "auxiliary_loss_clip": 0.01043865, + "auxiliary_loss_mlp": 0.01003024, + "balance_loss_clip": 1.02235746, + "balance_loss_mlp": 1.00068712, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7534593805933323, + "language_loss": 0.50716299, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52763188, + "num_input_tokens_seen": 112994845, + "step": 5262, + "time_per_iteration": 3.1014809608459473 + }, + { + "auxiliary_loss_clip": 0.01128076, + "auxiliary_loss_mlp": 0.01038795, + "balance_loss_clip": 1.05631959, + "balance_loss_mlp": 1.02357197, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.627835951349252, + "language_loss": 0.85454941, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87621808, + "num_input_tokens_seen": 113015125, + "step": 5263, + "time_per_iteration": 2.5535800457000732 + }, + { + "auxiliary_loss_clip": 0.01109893, + "auxiliary_loss_mlp": 0.01040706, + "balance_loss_clip": 1.05085254, + "balance_loss_mlp": 1.02637732, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.7541471067806753, + "language_loss": 0.81845057, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.83995658, + "num_input_tokens_seen": 113035535, + "step": 5264, + "time_per_iteration": 2.5572218894958496 + }, + { + "auxiliary_loss_clip": 0.01119606, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.05175257, + "balance_loss_mlp": 1.01947355, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 2.0435652096615864, + "language_loss": 0.79744875, + "learning_rate": 3.19873247349167e-06, + "loss": 0.81900132, + "num_input_tokens_seen": 113052720, + "step": 5265, + "time_per_iteration": 2.556729793548584 + }, + { + "auxiliary_loss_clip": 0.01132794, + "auxiliary_loss_mlp": 0.01039446, + "balance_loss_clip": 1.05561113, + "balance_loss_mlp": 1.02374589, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.8881313936000494, + "language_loss": 0.75263774, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77436012, + "num_input_tokens_seen": 113071435, + "step": 5266, + "time_per_iteration": 2.508272409439087 + }, + { + "auxiliary_loss_clip": 0.01108979, + "auxiliary_loss_mlp": 0.01039507, + "balance_loss_clip": 1.05234218, + "balance_loss_mlp": 1.02417636, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.0441762575836893, + "language_loss": 0.79174393, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81322879, + "num_input_tokens_seen": 113088645, + "step": 5267, + "time_per_iteration": 2.5711276531219482 + }, + { + "auxiliary_loss_clip": 0.0103884, + "auxiliary_loss_mlp": 0.01001225, + "balance_loss_clip": 1.0280776, + "balance_loss_mlp": 0.99892408, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7385054836274102, + "language_loss": 0.57819247, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59859312, + "num_input_tokens_seen": 113152775, + "step": 5268, + "time_per_iteration": 3.141050338745117 + }, + { + "auxiliary_loss_clip": 0.01142816, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.05329776, + "balance_loss_mlp": 1.02314305, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 2.187262618804117, + "language_loss": 0.73038757, + "learning_rate": 3.197485092719815e-06, + "loss": 0.7522006, + "num_input_tokens_seen": 113171410, + "step": 5269, + "time_per_iteration": 2.4694061279296875 + }, + { + "auxiliary_loss_clip": 0.01109208, + "auxiliary_loss_mlp": 0.0104558, + "balance_loss_clip": 1.05336595, + "balance_loss_mlp": 1.03007102, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 1.92615857898234, + "language_loss": 0.80228651, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82383442, + "num_input_tokens_seen": 113189965, + "step": 5270, + "time_per_iteration": 2.568838119506836 + }, + { + "auxiliary_loss_clip": 0.01145612, + "auxiliary_loss_mlp": 0.01044615, + "balance_loss_clip": 1.05459249, + "balance_loss_mlp": 1.02792609, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 2.641262920522099, + "language_loss": 0.79612768, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.81803, + "num_input_tokens_seen": 113206355, + "step": 5271, + "time_per_iteration": 2.449613571166992 + }, + { + "auxiliary_loss_clip": 0.01142161, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.05404818, + "balance_loss_mlp": 1.02460766, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 2.2520840173029733, + "language_loss": 0.73129141, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75311548, + "num_input_tokens_seen": 113225440, + "step": 5272, + "time_per_iteration": 2.4721462726593018 + }, + { + "auxiliary_loss_clip": 0.01119199, + "auxiliary_loss_mlp": 0.01045983, + "balance_loss_clip": 1.0494206, + "balance_loss_mlp": 1.02864981, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 2.0834405479839497, + "language_loss": 0.68894649, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71059823, + "num_input_tokens_seen": 113248840, + "step": 5273, + "time_per_iteration": 2.72156023979187 + }, + { + "auxiliary_loss_clip": 0.01126228, + "auxiliary_loss_mlp": 0.00804013, + "balance_loss_clip": 1.0506283, + "balance_loss_mlp": 1.03286171, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 2.2261655645941594, + "language_loss": 0.67693901, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69624138, + "num_input_tokens_seen": 113269630, + "step": 5274, + "time_per_iteration": 2.535186767578125 + }, + { + "auxiliary_loss_clip": 0.01094616, + "auxiliary_loss_mlp": 0.01061929, + "balance_loss_clip": 1.05058467, + "balance_loss_mlp": 1.04481101, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.4858629231712213, + "language_loss": 0.81048024, + "learning_rate": 3.195612659536081e-06, + "loss": 0.83204567, + "num_input_tokens_seen": 113291200, + "step": 5275, + "time_per_iteration": 2.5741326808929443 + }, + { + "auxiliary_loss_clip": 0.01128344, + "auxiliary_loss_mlp": 0.01045838, + "balance_loss_clip": 1.04925823, + "balance_loss_mlp": 1.03026938, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 1.8129936355138583, + "language_loss": 0.72861731, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.75035912, + "num_input_tokens_seen": 113310170, + "step": 5276, + "time_per_iteration": 2.4873099327087402 + }, + { + "auxiliary_loss_clip": 0.01114059, + "auxiliary_loss_mlp": 0.01040161, + "balance_loss_clip": 1.04892099, + "balance_loss_mlp": 1.02558148, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.4706361100939296, + "language_loss": 0.78108144, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80262363, + "num_input_tokens_seen": 113331140, + "step": 5277, + "time_per_iteration": 2.557978630065918 + }, + { + "auxiliary_loss_clip": 0.01109114, + "auxiliary_loss_mlp": 0.01049966, + "balance_loss_clip": 1.04867578, + "balance_loss_mlp": 1.03104806, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.6391296860853455, + "language_loss": 0.7918067, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.81339753, + "num_input_tokens_seen": 113350030, + "step": 5278, + "time_per_iteration": 2.5151944160461426 + }, + { + "auxiliary_loss_clip": 0.01044267, + "auxiliary_loss_mlp": 0.01013422, + "balance_loss_clip": 1.02925158, + "balance_loss_mlp": 1.01088333, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8785415952885072, + "language_loss": 0.62846756, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64904439, + "num_input_tokens_seen": 113395820, + "step": 5279, + "time_per_iteration": 2.8708362579345703 + }, + { + "auxiliary_loss_clip": 0.01144465, + "auxiliary_loss_mlp": 0.01047525, + "balance_loss_clip": 1.0523169, + "balance_loss_mlp": 1.0304898, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.8091601046855692, + "language_loss": 0.80960584, + "learning_rate": 3.194051051653053e-06, + "loss": 0.8315258, + "num_input_tokens_seen": 113416835, + "step": 5280, + "time_per_iteration": 2.526648998260498 + }, + { + "auxiliary_loss_clip": 0.01107991, + "auxiliary_loss_mlp": 0.01045975, + "balance_loss_clip": 1.05423522, + "balance_loss_mlp": 1.03134799, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.9958910987416494, + "language_loss": 0.77758092, + "learning_rate": 3.19373859419346e-06, + "loss": 0.79912055, + "num_input_tokens_seen": 113440850, + "step": 5281, + "time_per_iteration": 2.658937454223633 + }, + { + "auxiliary_loss_clip": 0.01118647, + "auxiliary_loss_mlp": 0.01043743, + "balance_loss_clip": 1.05239916, + "balance_loss_mlp": 1.02719104, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.5867537340751843, + "language_loss": 0.78144634, + "learning_rate": 3.193426091467179e-06, + "loss": 0.80307019, + "num_input_tokens_seen": 113461000, + "step": 5282, + "time_per_iteration": 2.559037923812866 + }, + { + "auxiliary_loss_clip": 0.01117286, + "auxiliary_loss_mlp": 0.01052174, + "balance_loss_clip": 1.05065203, + "balance_loss_mlp": 1.03454316, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 3.343983957483309, + "language_loss": 0.67286217, + "learning_rate": 3.193113543486061e-06, + "loss": 0.69455671, + "num_input_tokens_seen": 113480820, + "step": 5283, + "time_per_iteration": 2.559706687927246 + }, + { + "auxiliary_loss_clip": 0.01040139, + "auxiliary_loss_mlp": 0.01006966, + "balance_loss_clip": 1.02919245, + "balance_loss_mlp": 1.0049988, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7513958323200097, + "language_loss": 0.52788138, + "learning_rate": 3.192800950261958e-06, + "loss": 0.54835248, + "num_input_tokens_seen": 113536910, + "step": 5284, + "time_per_iteration": 4.431357383728027 + }, + { + "auxiliary_loss_clip": 0.01122683, + "auxiliary_loss_mlp": 0.01037918, + "balance_loss_clip": 1.05562294, + "balance_loss_mlp": 1.02287376, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.6448547737019368, + "language_loss": 0.70651275, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72811872, + "num_input_tokens_seen": 113555480, + "step": 5285, + "time_per_iteration": 2.5310556888580322 + }, + { + "auxiliary_loss_clip": 0.01048465, + "auxiliary_loss_mlp": 0.01002051, + "balance_loss_clip": 1.01780212, + "balance_loss_mlp": 0.99988097, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8206012940865721, + "language_loss": 0.60512167, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.6256268, + "num_input_tokens_seen": 113616790, + "step": 5286, + "time_per_iteration": 3.0809075832366943 + }, + { + "auxiliary_loss_clip": 0.01139758, + "auxiliary_loss_mlp": 0.01043456, + "balance_loss_clip": 1.05014896, + "balance_loss_mlp": 1.02757764, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 1.816893621436008, + "language_loss": 0.72385061, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74568278, + "num_input_tokens_seen": 113635320, + "step": 5287, + "time_per_iteration": 2.4503214359283447 + }, + { + "auxiliary_loss_clip": 0.01129737, + "auxiliary_loss_mlp": 0.01047353, + "balance_loss_clip": 1.05023813, + "balance_loss_mlp": 1.02976942, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 2.276961695995455, + "language_loss": 0.75566912, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77744007, + "num_input_tokens_seen": 113654000, + "step": 5288, + "time_per_iteration": 2.478504180908203 + }, + { + "auxiliary_loss_clip": 0.01124303, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.04785633, + "balance_loss_mlp": 1.02043498, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 1.6277909052519561, + "language_loss": 0.87596345, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89755583, + "num_input_tokens_seen": 113672375, + "step": 5289, + "time_per_iteration": 2.508985757827759 + }, + { + "auxiliary_loss_clip": 0.01124259, + "auxiliary_loss_mlp": 0.01037444, + "balance_loss_clip": 1.05566287, + "balance_loss_mlp": 1.02322221, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.7222326762923867, + "language_loss": 0.68135965, + "learning_rate": 3.190924441478572e-06, + "loss": 0.70297658, + "num_input_tokens_seen": 113692385, + "step": 5290, + "time_per_iteration": 3.8745224475860596 + }, + { + "auxiliary_loss_clip": 0.01119708, + "auxiliary_loss_mlp": 0.01045289, + "balance_loss_clip": 1.05047846, + "balance_loss_mlp": 1.02962506, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 1.913246041662527, + "language_loss": 0.79831827, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.81996828, + "num_input_tokens_seen": 113712145, + "step": 5291, + "time_per_iteration": 3.9618608951568604 + }, + { + "auxiliary_loss_clip": 0.01101437, + "auxiliary_loss_mlp": 0.01039213, + "balance_loss_clip": 1.05205846, + "balance_loss_mlp": 1.02226186, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.111259424987871, + "language_loss": 0.79828715, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.81969362, + "num_input_tokens_seen": 113731435, + "step": 5292, + "time_per_iteration": 3.992375135421753 + }, + { + "auxiliary_loss_clip": 0.01128024, + "auxiliary_loss_mlp": 0.01037086, + "balance_loss_clip": 1.05051756, + "balance_loss_mlp": 1.02261412, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.7149714046775084, + "language_loss": 0.74992615, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.77157724, + "num_input_tokens_seen": 113750825, + "step": 5293, + "time_per_iteration": 2.511645555496216 + }, + { + "auxiliary_loss_clip": 0.0112787, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_clip": 1.0545733, + "balance_loss_mlp": 1.02655578, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 2.0920477972873273, + "language_loss": 0.74227959, + "learning_rate": 3.189672532265379e-06, + "loss": 0.76397157, + "num_input_tokens_seen": 113770010, + "step": 5294, + "time_per_iteration": 2.5681374073028564 + }, + { + "auxiliary_loss_clip": 0.01143118, + "auxiliary_loss_mlp": 0.01038744, + "balance_loss_clip": 1.0529449, + "balance_loss_mlp": 1.02179301, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 1.8813171065727299, + "language_loss": 0.7593506, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78116918, + "num_input_tokens_seen": 113788640, + "step": 5295, + "time_per_iteration": 2.4738216400146484 + }, + { + "auxiliary_loss_clip": 0.01115236, + "auxiliary_loss_mlp": 0.01042768, + "balance_loss_clip": 1.05407917, + "balance_loss_mlp": 1.02688968, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 1.472895175501562, + "language_loss": 0.69145858, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71303868, + "num_input_tokens_seen": 113809515, + "step": 5296, + "time_per_iteration": 2.58266544342041 + }, + { + "auxiliary_loss_clip": 0.01114129, + "auxiliary_loss_mlp": 0.0103982, + "balance_loss_clip": 1.05432558, + "balance_loss_mlp": 1.02481151, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 1.566883757979649, + "language_loss": 0.77718604, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79872561, + "num_input_tokens_seen": 113829770, + "step": 5297, + "time_per_iteration": 2.5843825340270996 + }, + { + "auxiliary_loss_clip": 0.01105769, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.05038452, + "balance_loss_mlp": 1.01627886, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.830209427058319, + "language_loss": 0.79226089, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81363875, + "num_input_tokens_seen": 113849320, + "step": 5298, + "time_per_iteration": 2.609867811203003 + }, + { + "auxiliary_loss_clip": 0.01124506, + "auxiliary_loss_mlp": 0.01042221, + "balance_loss_clip": 1.05293727, + "balance_loss_mlp": 1.0269382, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 1.6854594378553205, + "language_loss": 0.74064636, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.7623136, + "num_input_tokens_seen": 113867860, + "step": 5299, + "time_per_iteration": 2.554365634918213 + }, + { + "auxiliary_loss_clip": 0.01123902, + "auxiliary_loss_mlp": 0.01047246, + "balance_loss_clip": 1.05051339, + "balance_loss_mlp": 1.03156996, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 1.9470045963142832, + "language_loss": 0.78165495, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80336642, + "num_input_tokens_seen": 113886375, + "step": 5300, + "time_per_iteration": 2.597076177597046 + }, + { + "auxiliary_loss_clip": 0.01117275, + "auxiliary_loss_mlp": 0.01042573, + "balance_loss_clip": 1.05001724, + "balance_loss_mlp": 1.02528751, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 2.077998744691648, + "language_loss": 0.8387593, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86035776, + "num_input_tokens_seen": 113904065, + "step": 5301, + "time_per_iteration": 2.5344371795654297 + }, + { + "auxiliary_loss_clip": 0.01129491, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_clip": 1.05492163, + "balance_loss_mlp": 1.02754235, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.365335892215333, + "language_loss": 0.77421021, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79593933, + "num_input_tokens_seen": 113918415, + "step": 5302, + "time_per_iteration": 2.4855964183807373 + }, + { + "auxiliary_loss_clip": 0.01139179, + "auxiliary_loss_mlp": 0.01040018, + "balance_loss_clip": 1.05411005, + "balance_loss_mlp": 1.02410388, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.7209117075342533, + "language_loss": 0.80104727, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.82283926, + "num_input_tokens_seen": 113938135, + "step": 5303, + "time_per_iteration": 2.6323611736297607 + }, + { + "auxiliary_loss_clip": 0.01134059, + "auxiliary_loss_mlp": 0.01042795, + "balance_loss_clip": 1.05625963, + "balance_loss_mlp": 1.02647519, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 3.1932599498675076, + "language_loss": 0.73010331, + "learning_rate": 3.186539603020047e-06, + "loss": 0.75187182, + "num_input_tokens_seen": 113957125, + "step": 5304, + "time_per_iteration": 2.4961016178131104 + }, + { + "auxiliary_loss_clip": 0.01106329, + "auxiliary_loss_mlp": 0.01042053, + "balance_loss_clip": 1.05100846, + "balance_loss_mlp": 1.0271039, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 2.05337772032714, + "language_loss": 0.71786052, + "learning_rate": 3.186226062434068e-06, + "loss": 0.7393443, + "num_input_tokens_seen": 113974875, + "step": 5305, + "time_per_iteration": 2.6048390865325928 + }, + { + "auxiliary_loss_clip": 0.01118405, + "auxiliary_loss_mlp": 0.0103943, + "balance_loss_clip": 1.05146778, + "balance_loss_mlp": 1.02539277, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.672105837745802, + "language_loss": 0.64849806, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.67007643, + "num_input_tokens_seen": 113994450, + "step": 5306, + "time_per_iteration": 2.5436275005340576 + }, + { + "auxiliary_loss_clip": 0.0111645, + "auxiliary_loss_mlp": 0.01046225, + "balance_loss_clip": 1.05560458, + "balance_loss_mlp": 1.03034639, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.2694018674988277, + "language_loss": 0.79467201, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81629872, + "num_input_tokens_seen": 114013945, + "step": 5307, + "time_per_iteration": 2.588643789291382 + }, + { + "auxiliary_loss_clip": 0.01111101, + "auxiliary_loss_mlp": 0.01043503, + "balance_loss_clip": 1.05141211, + "balance_loss_mlp": 1.02712345, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.6861403002693403, + "language_loss": 0.77331603, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.79486209, + "num_input_tokens_seen": 114031375, + "step": 5308, + "time_per_iteration": 2.50050687789917 + }, + { + "auxiliary_loss_clip": 0.01143981, + "auxiliary_loss_mlp": 0.01055543, + "balance_loss_clip": 1.05885863, + "balance_loss_mlp": 1.03774524, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 2.2221068614279353, + "language_loss": 0.74622744, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76822269, + "num_input_tokens_seen": 114048465, + "step": 5309, + "time_per_iteration": 2.4778504371643066 + }, + { + "auxiliary_loss_clip": 0.01129853, + "auxiliary_loss_mlp": 0.01040547, + "balance_loss_clip": 1.05107045, + "balance_loss_mlp": 1.02562773, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.7513964125083117, + "language_loss": 0.83157957, + "learning_rate": 3.184657685014856e-06, + "loss": 0.85328352, + "num_input_tokens_seen": 114068415, + "step": 5310, + "time_per_iteration": 2.5016674995422363 + }, + { + "auxiliary_loss_clip": 0.01116101, + "auxiliary_loss_mlp": 0.01038631, + "balance_loss_clip": 1.05126595, + "balance_loss_mlp": 1.02484465, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.5550780244996725, + "language_loss": 0.78235042, + "learning_rate": 3.184343874716412e-06, + "loss": 0.80389774, + "num_input_tokens_seen": 114088565, + "step": 5311, + "time_per_iteration": 2.587149143218994 + }, + { + "auxiliary_loss_clip": 0.01104121, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_clip": 1.04928446, + "balance_loss_mlp": 1.02771628, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 1.7254339291869705, + "language_loss": 0.84810984, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86958611, + "num_input_tokens_seen": 114107160, + "step": 5312, + "time_per_iteration": 2.5641889572143555 + }, + { + "auxiliary_loss_clip": 0.01098054, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_clip": 1.05305123, + "balance_loss_mlp": 1.0354768, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.491688850830187, + "language_loss": 0.78091627, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.80242527, + "num_input_tokens_seen": 114123420, + "step": 5313, + "time_per_iteration": 2.5364279747009277 + }, + { + "auxiliary_loss_clip": 0.01128056, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.05131292, + "balance_loss_mlp": 1.02170467, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.3651395868594536, + "language_loss": 0.86310947, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88475692, + "num_input_tokens_seen": 114139230, + "step": 5314, + "time_per_iteration": 2.4928359985351562 + }, + { + "auxiliary_loss_clip": 0.01112235, + "auxiliary_loss_mlp": 0.01055268, + "balance_loss_clip": 1.04897904, + "balance_loss_mlp": 1.0377686, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.7504162293059833, + "language_loss": 0.79702604, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.81870109, + "num_input_tokens_seen": 114159290, + "step": 5315, + "time_per_iteration": 2.549226999282837 + }, + { + "auxiliary_loss_clip": 0.01104542, + "auxiliary_loss_mlp": 0.01057435, + "balance_loss_clip": 1.04989696, + "balance_loss_mlp": 1.03982735, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 1.9168739887784296, + "language_loss": 0.67594182, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69756162, + "num_input_tokens_seen": 114177655, + "step": 5316, + "time_per_iteration": 2.5399420261383057 + }, + { + "auxiliary_loss_clip": 0.01129004, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.05007148, + "balance_loss_mlp": 1.02466953, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.5121073909309855, + "language_loss": 0.69194674, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71362698, + "num_input_tokens_seen": 114200880, + "step": 5317, + "time_per_iteration": 2.5801501274108887 + }, + { + "auxiliary_loss_clip": 0.01031301, + "auxiliary_loss_mlp": 0.01008252, + "balance_loss_clip": 1.02216911, + "balance_loss_mlp": 1.00598681, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.72898209453558, + "language_loss": 0.5301075, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55050308, + "num_input_tokens_seen": 114267145, + "step": 5318, + "time_per_iteration": 3.263522148132324 + }, + { + "auxiliary_loss_clip": 0.01137497, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.05191064, + "balance_loss_mlp": 1.0202148, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 2.615331686557557, + "language_loss": 0.84313393, + "learning_rate": 3.181831776553012e-06, + "loss": 0.86485505, + "num_input_tokens_seen": 114284630, + "step": 5319, + "time_per_iteration": 2.469848394393921 + }, + { + "auxiliary_loss_clip": 0.01124364, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.04898071, + "balance_loss_mlp": 1.02430665, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.6478802560791717, + "language_loss": 0.63309526, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65473187, + "num_input_tokens_seen": 114305830, + "step": 5320, + "time_per_iteration": 2.6162517070770264 + }, + { + "auxiliary_loss_clip": 0.01122192, + "auxiliary_loss_mlp": 0.01039653, + "balance_loss_clip": 1.05026591, + "balance_loss_mlp": 1.02457333, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 1.910989794333111, + "language_loss": 0.7054162, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72703469, + "num_input_tokens_seen": 114325165, + "step": 5321, + "time_per_iteration": 2.5638370513916016 + }, + { + "auxiliary_loss_clip": 0.01146889, + "auxiliary_loss_mlp": 0.00799634, + "balance_loss_clip": 1.05285335, + "balance_loss_mlp": 1.02231359, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 3.1853128897146705, + "language_loss": 0.86737156, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88683677, + "num_input_tokens_seen": 114341310, + "step": 5322, + "time_per_iteration": 2.4539544582366943 + }, + { + "auxiliary_loss_clip": 0.01115555, + "auxiliary_loss_mlp": 0.01036456, + "balance_loss_clip": 1.05041933, + "balance_loss_mlp": 1.02150702, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.6770846466241798, + "language_loss": 0.83398426, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.85550433, + "num_input_tokens_seen": 114360355, + "step": 5323, + "time_per_iteration": 4.040741920471191 + }, + { + "auxiliary_loss_clip": 0.01130072, + "auxiliary_loss_mlp": 0.01035825, + "balance_loss_clip": 1.04988706, + "balance_loss_mlp": 1.01966047, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.901155423949736, + "language_loss": 0.77968752, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.80134642, + "num_input_tokens_seen": 114379220, + "step": 5324, + "time_per_iteration": 2.5038719177246094 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.04962683, + "balance_loss_mlp": 1.01945698, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 2.1939150842033075, + "language_loss": 0.8018952, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82339877, + "num_input_tokens_seen": 114396365, + "step": 5325, + "time_per_iteration": 2.5125627517700195 + }, + { + "auxiliary_loss_clip": 0.01128205, + "auxiliary_loss_mlp": 0.01040941, + "balance_loss_clip": 1.05028224, + "balance_loss_mlp": 1.0260042, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.9755504813661542, + "language_loss": 0.74862838, + "learning_rate": 3.179631337655037e-06, + "loss": 0.77031988, + "num_input_tokens_seen": 114416780, + "step": 5326, + "time_per_iteration": 2.5965280532836914 + }, + { + "auxiliary_loss_clip": 0.01101284, + "auxiliary_loss_mlp": 0.0104144, + "balance_loss_clip": 1.05248737, + "balance_loss_mlp": 1.02600193, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.5091304415708018, + "language_loss": 0.80718458, + "learning_rate": 3.179316810218701e-06, + "loss": 0.82861185, + "num_input_tokens_seen": 114437405, + "step": 5327, + "time_per_iteration": 2.6063945293426514 + }, + { + "auxiliary_loss_clip": 0.01110497, + "auxiliary_loss_mlp": 0.01041107, + "balance_loss_clip": 1.05058551, + "balance_loss_mlp": 1.02526414, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.4696409075538084, + "language_loss": 0.77938831, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80090433, + "num_input_tokens_seen": 114458505, + "step": 5328, + "time_per_iteration": 2.6059303283691406 + }, + { + "auxiliary_loss_clip": 0.01087584, + "auxiliary_loss_mlp": 0.01040674, + "balance_loss_clip": 1.05348194, + "balance_loss_mlp": 1.02402008, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 1.8322120430742486, + "language_loss": 0.74189055, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76317316, + "num_input_tokens_seen": 114479050, + "step": 5329, + "time_per_iteration": 4.044207334518433 + }, + { + "auxiliary_loss_clip": 0.01107438, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.04697418, + "balance_loss_mlp": 1.01823723, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 1.7709617261076471, + "language_loss": 0.70846403, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.72986424, + "num_input_tokens_seen": 114497415, + "step": 5330, + "time_per_iteration": 3.90510630607605 + }, + { + "auxiliary_loss_clip": 0.01089577, + "auxiliary_loss_mlp": 0.01054094, + "balance_loss_clip": 1.05086565, + "balance_loss_mlp": 1.03518701, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.8801279049244408, + "language_loss": 0.79911876, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82055545, + "num_input_tokens_seen": 114518785, + "step": 5331, + "time_per_iteration": 4.064019441604614 + }, + { + "auxiliary_loss_clip": 0.01035931, + "auxiliary_loss_mlp": 0.01006097, + "balance_loss_clip": 1.02032804, + "balance_loss_mlp": 1.00418997, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8238478946142479, + "language_loss": 0.57755977, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59798002, + "num_input_tokens_seen": 114577710, + "step": 5332, + "time_per_iteration": 3.0372300148010254 + }, + { + "auxiliary_loss_clip": 0.0110207, + "auxiliary_loss_mlp": 0.01040571, + "balance_loss_clip": 1.05117512, + "balance_loss_mlp": 1.02517521, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.621781334019397, + "language_loss": 0.72995144, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75137782, + "num_input_tokens_seen": 114598640, + "step": 5333, + "time_per_iteration": 2.6352834701538086 + }, + { + "auxiliary_loss_clip": 0.01114954, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.04790461, + "balance_loss_mlp": 1.0242548, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.562523781130142, + "language_loss": 0.70477021, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.72631931, + "num_input_tokens_seen": 114618780, + "step": 5334, + "time_per_iteration": 2.5306146144866943 + }, + { + "auxiliary_loss_clip": 0.0109793, + "auxiliary_loss_mlp": 0.01040398, + "balance_loss_clip": 1.0482707, + "balance_loss_mlp": 1.02493656, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 1.8278808276735463, + "language_loss": 0.77262294, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.79400623, + "num_input_tokens_seen": 114637525, + "step": 5335, + "time_per_iteration": 2.5924525260925293 + }, + { + "auxiliary_loss_clip": 0.01127958, + "auxiliary_loss_mlp": 0.01041475, + "balance_loss_clip": 1.05114889, + "balance_loss_mlp": 1.02640748, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.4928934711136235, + "language_loss": 0.68518162, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70687604, + "num_input_tokens_seen": 114659705, + "step": 5336, + "time_per_iteration": 2.6331582069396973 + }, + { + "auxiliary_loss_clip": 0.01101639, + "auxiliary_loss_mlp": 0.01044756, + "balance_loss_clip": 1.05025053, + "balance_loss_mlp": 1.02867436, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 1.641478190657808, + "language_loss": 0.78838027, + "learning_rate": 3.176169078234487e-06, + "loss": 0.80984426, + "num_input_tokens_seen": 114678340, + "step": 5337, + "time_per_iteration": 2.5509026050567627 + }, + { + "auxiliary_loss_clip": 0.01120776, + "auxiliary_loss_mlp": 0.01035852, + "balance_loss_clip": 1.04822421, + "balance_loss_mlp": 1.02163029, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.5052151443654458, + "language_loss": 0.74244797, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76401424, + "num_input_tokens_seen": 114696980, + "step": 5338, + "time_per_iteration": 2.575389862060547 + }, + { + "auxiliary_loss_clip": 0.01122734, + "auxiliary_loss_mlp": 0.01042738, + "balance_loss_clip": 1.04887116, + "balance_loss_mlp": 1.02665114, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 2.1612925551730626, + "language_loss": 0.63173604, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65339071, + "num_input_tokens_seen": 114717330, + "step": 5339, + "time_per_iteration": 2.555581569671631 + }, + { + "auxiliary_loss_clip": 0.01139641, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.05055857, + "balance_loss_mlp": 1.02215457, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 2.164286259137005, + "language_loss": 0.81401825, + "learning_rate": 3.175223888387192e-06, + "loss": 0.8357895, + "num_input_tokens_seen": 114736320, + "step": 5340, + "time_per_iteration": 2.4604363441467285 + }, + { + "auxiliary_loss_clip": 0.01111345, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.04795384, + "balance_loss_mlp": 1.03152823, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 1.955691480428128, + "language_loss": 0.76468194, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78625816, + "num_input_tokens_seen": 114754575, + "step": 5341, + "time_per_iteration": 2.5424680709838867 + }, + { + "auxiliary_loss_clip": 0.01101984, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.05153632, + "balance_loss_mlp": 1.02109015, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.7240146233484375, + "language_loss": 0.7895838, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.81096476, + "num_input_tokens_seen": 114773590, + "step": 5342, + "time_per_iteration": 2.5626773834228516 + }, + { + "auxiliary_loss_clip": 0.01115964, + "auxiliary_loss_mlp": 0.01037755, + "balance_loss_clip": 1.05250466, + "balance_loss_mlp": 1.021626, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 2.496211081863201, + "language_loss": 0.7521311, + "learning_rate": 3.174278297458438e-06, + "loss": 0.77366829, + "num_input_tokens_seen": 114790775, + "step": 5343, + "time_per_iteration": 2.5242292881011963 + }, + { + "auxiliary_loss_clip": 0.01075985, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.04667902, + "balance_loss_mlp": 1.01988316, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.5357212026823333, + "language_loss": 0.82633555, + "learning_rate": 3.173963011408748e-06, + "loss": 0.84745491, + "num_input_tokens_seen": 114809835, + "step": 5344, + "time_per_iteration": 2.6325573921203613 + }, + { + "auxiliary_loss_clip": 0.01086207, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.05107391, + "balance_loss_mlp": 1.01874375, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 2.1405973518978594, + "language_loss": 0.79600996, + "learning_rate": 3.173647680842262e-06, + "loss": 0.8172155, + "num_input_tokens_seen": 114826505, + "step": 5345, + "time_per_iteration": 2.5617942810058594 + }, + { + "auxiliary_loss_clip": 0.01110149, + "auxiliary_loss_mlp": 0.01039298, + "balance_loss_clip": 1.04668641, + "balance_loss_mlp": 1.02439046, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.6892866632581776, + "language_loss": 0.83098543, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85247982, + "num_input_tokens_seen": 114846140, + "step": 5346, + "time_per_iteration": 2.5525989532470703 + }, + { + "auxiliary_loss_clip": 0.01113606, + "auxiliary_loss_mlp": 0.01038043, + "balance_loss_clip": 1.04918456, + "balance_loss_mlp": 1.02189612, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.4445545823993287, + "language_loss": 0.8136065, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83512306, + "num_input_tokens_seen": 114866660, + "step": 5347, + "time_per_iteration": 2.5763821601867676 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01041363, + "balance_loss_clip": 1.04748619, + "balance_loss_mlp": 1.02470958, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 1.9302074279259138, + "language_loss": 0.79531425, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.81696057, + "num_input_tokens_seen": 114882820, + "step": 5348, + "time_per_iteration": 2.4493649005889893 + }, + { + "auxiliary_loss_clip": 0.01114097, + "auxiliary_loss_mlp": 0.01051344, + "balance_loss_clip": 1.05181479, + "balance_loss_mlp": 1.03525114, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 1.9598161818453348, + "language_loss": 0.848028, + "learning_rate": 3.172385913647542e-06, + "loss": 0.86968243, + "num_input_tokens_seen": 114900745, + "step": 5349, + "time_per_iteration": 2.524142026901245 + }, + { + "auxiliary_loss_clip": 0.01109744, + "auxiliary_loss_mlp": 0.01046802, + "balance_loss_clip": 1.04919636, + "balance_loss_mlp": 1.03145981, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 2.058989106487354, + "language_loss": 0.80391401, + "learning_rate": 3.172070360676475e-06, + "loss": 0.82547951, + "num_input_tokens_seen": 114917940, + "step": 5350, + "time_per_iteration": 2.4989986419677734 + }, + { + "auxiliary_loss_clip": 0.01125949, + "auxiliary_loss_mlp": 0.01043299, + "balance_loss_clip": 1.04984593, + "balance_loss_mlp": 1.02862453, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 1.5445946804618296, + "language_loss": 0.80306184, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82475436, + "num_input_tokens_seen": 114937735, + "step": 5351, + "time_per_iteration": 2.5497188568115234 + }, + { + "auxiliary_loss_clip": 0.01103731, + "auxiliary_loss_mlp": 0.01045088, + "balance_loss_clip": 1.05010498, + "balance_loss_mlp": 1.02891099, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.669699508514137, + "language_loss": 0.75719404, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.77868223, + "num_input_tokens_seen": 114956630, + "step": 5352, + "time_per_iteration": 2.5581398010253906 + }, + { + "auxiliary_loss_clip": 0.0109709, + "auxiliary_loss_mlp": 0.01041892, + "balance_loss_clip": 1.05276668, + "balance_loss_mlp": 1.02532208, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 2.085890552989524, + "language_loss": 0.8239221, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.84531188, + "num_input_tokens_seen": 114976470, + "step": 5353, + "time_per_iteration": 2.574118137359619 + }, + { + "auxiliary_loss_clip": 0.01074557, + "auxiliary_loss_mlp": 0.01039199, + "balance_loss_clip": 1.05164945, + "balance_loss_mlp": 1.02341616, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.6226918520960243, + "language_loss": 0.73130733, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75244486, + "num_input_tokens_seen": 114996710, + "step": 5354, + "time_per_iteration": 2.69602632522583 + }, + { + "auxiliary_loss_clip": 0.0110178, + "auxiliary_loss_mlp": 0.01033349, + "balance_loss_clip": 1.04649448, + "balance_loss_mlp": 1.0189482, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 1.6632501529294739, + "language_loss": 0.83869004, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.86004132, + "num_input_tokens_seen": 115015775, + "step": 5355, + "time_per_iteration": 2.5932302474975586 + }, + { + "auxiliary_loss_clip": 0.01142714, + "auxiliary_loss_mlp": 0.01045623, + "balance_loss_clip": 1.05234218, + "balance_loss_mlp": 1.03026938, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 1.990411790303877, + "language_loss": 0.71202266, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73390603, + "num_input_tokens_seen": 115034265, + "step": 5356, + "time_per_iteration": 2.4818005561828613 + }, + { + "auxiliary_loss_clip": 0.01099335, + "auxiliary_loss_mlp": 0.01042472, + "balance_loss_clip": 1.05169332, + "balance_loss_mlp": 1.02585387, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.241121934273558, + "language_loss": 0.67085183, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.69226992, + "num_input_tokens_seen": 115051945, + "step": 5357, + "time_per_iteration": 2.607755422592163 + }, + { + "auxiliary_loss_clip": 0.0103618, + "auxiliary_loss_mlp": 0.01003302, + "balance_loss_clip": 1.02293539, + "balance_loss_mlp": 1.00119209, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7103009869340805, + "language_loss": 0.58286023, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60325503, + "num_input_tokens_seen": 115119090, + "step": 5358, + "time_per_iteration": 3.2216641902923584 + }, + { + "auxiliary_loss_clip": 0.01078088, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_clip": 1.05014145, + "balance_loss_mlp": 1.02426553, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 1.7872036174454586, + "language_loss": 0.83999765, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.86118054, + "num_input_tokens_seen": 115137755, + "step": 5359, + "time_per_iteration": 2.636096715927124 + }, + { + "auxiliary_loss_clip": 0.01126916, + "auxiliary_loss_mlp": 0.01037289, + "balance_loss_clip": 1.04760993, + "balance_loss_mlp": 1.02204227, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 1.6149637296728692, + "language_loss": 0.79648942, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81813145, + "num_input_tokens_seen": 115158150, + "step": 5360, + "time_per_iteration": 2.5184459686279297 + }, + { + "auxiliary_loss_clip": 0.01042647, + "auxiliary_loss_mlp": 0.01005537, + "balance_loss_clip": 1.0236603, + "balance_loss_mlp": 1.00363004, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.6567233708891804, + "language_loss": 0.56999826, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59048009, + "num_input_tokens_seen": 115212755, + "step": 5361, + "time_per_iteration": 2.9626264572143555 + }, + { + "auxiliary_loss_clip": 0.0107903, + "auxiliary_loss_mlp": 0.01041335, + "balance_loss_clip": 1.04830849, + "balance_loss_mlp": 1.02548027, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.7762274513735208, + "language_loss": 0.71125782, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73246151, + "num_input_tokens_seen": 115233090, + "step": 5362, + "time_per_iteration": 4.108921527862549 + }, + { + "auxiliary_loss_clip": 0.01122888, + "auxiliary_loss_mlp": 0.01045759, + "balance_loss_clip": 1.04927039, + "balance_loss_mlp": 1.0295465, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.9001845790567662, + "language_loss": 0.74009669, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76178318, + "num_input_tokens_seen": 115252645, + "step": 5363, + "time_per_iteration": 2.535872459411621 + }, + { + "auxiliary_loss_clip": 0.01130156, + "auxiliary_loss_mlp": 0.01044466, + "balance_loss_clip": 1.04879737, + "balance_loss_mlp": 1.02911234, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.366702561285205, + "language_loss": 0.76950383, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79125005, + "num_input_tokens_seen": 115269085, + "step": 5364, + "time_per_iteration": 2.5045273303985596 + }, + { + "auxiliary_loss_clip": 0.01116345, + "auxiliary_loss_mlp": 0.01041332, + "balance_loss_clip": 1.04888296, + "balance_loss_mlp": 1.02527416, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 2.4014437908881545, + "language_loss": 0.77007008, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79164684, + "num_input_tokens_seen": 115286470, + "step": 5365, + "time_per_iteration": 2.5036580562591553 + }, + { + "auxiliary_loss_clip": 0.01123807, + "auxiliary_loss_mlp": 0.0104668, + "balance_loss_clip": 1.05610359, + "balance_loss_mlp": 1.03100348, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 1.6573394377358865, + "language_loss": 0.7671634, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.78886825, + "num_input_tokens_seen": 115307000, + "step": 5366, + "time_per_iteration": 2.5649216175079346 + }, + { + "auxiliary_loss_clip": 0.01110219, + "auxiliary_loss_mlp": 0.01037726, + "balance_loss_clip": 1.04839659, + "balance_loss_mlp": 1.0220201, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 2.291122162140556, + "language_loss": 0.72452486, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74600434, + "num_input_tokens_seen": 115325925, + "step": 5367, + "time_per_iteration": 3.936999559402466 + }, + { + "auxiliary_loss_clip": 0.01136197, + "auxiliary_loss_mlp": 0.01039917, + "balance_loss_clip": 1.0500015, + "balance_loss_mlp": 1.02532589, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 1.7984507437287314, + "language_loss": 0.74638617, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76814729, + "num_input_tokens_seen": 115343705, + "step": 5368, + "time_per_iteration": 3.825505495071411 + }, + { + "auxiliary_loss_clip": 0.0110321, + "auxiliary_loss_mlp": 0.01046874, + "balance_loss_clip": 1.04709005, + "balance_loss_mlp": 1.03147805, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.5525925577263395, + "language_loss": 0.78619456, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.80769545, + "num_input_tokens_seen": 115364170, + "step": 5369, + "time_per_iteration": 2.581200361251831 + }, + { + "auxiliary_loss_clip": 0.0109956, + "auxiliary_loss_mlp": 0.01036801, + "balance_loss_clip": 1.05190003, + "balance_loss_mlp": 1.02235258, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 1.8485307558252508, + "language_loss": 0.83428025, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85564387, + "num_input_tokens_seen": 115382495, + "step": 5370, + "time_per_iteration": 4.0455451011657715 + }, + { + "auxiliary_loss_clip": 0.01138555, + "auxiliary_loss_mlp": 0.01038871, + "balance_loss_clip": 1.05154407, + "balance_loss_mlp": 1.02369523, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 3.7810677752852175, + "language_loss": 0.8308112, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85258543, + "num_input_tokens_seen": 115399450, + "step": 5371, + "time_per_iteration": 2.4780356884002686 + }, + { + "auxiliary_loss_clip": 0.01130216, + "auxiliary_loss_mlp": 0.00802373, + "balance_loss_clip": 1.05070186, + "balance_loss_mlp": 1.0266993, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 2.0557034085707735, + "language_loss": 0.88297474, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.9023006, + "num_input_tokens_seen": 115417700, + "step": 5372, + "time_per_iteration": 2.4758169651031494 + }, + { + "auxiliary_loss_clip": 0.01139235, + "auxiliary_loss_mlp": 0.01045029, + "balance_loss_clip": 1.05218077, + "balance_loss_mlp": 1.02954328, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 1.916731588964195, + "language_loss": 0.73154128, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75338393, + "num_input_tokens_seen": 115435840, + "step": 5373, + "time_per_iteration": 2.4670889377593994 + }, + { + "auxiliary_loss_clip": 0.01109568, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.05046034, + "balance_loss_mlp": 1.0201329, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 1.9366577151247066, + "language_loss": 0.81443393, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83588195, + "num_input_tokens_seen": 115454210, + "step": 5374, + "time_per_iteration": 2.512446165084839 + }, + { + "auxiliary_loss_clip": 0.01091712, + "auxiliary_loss_mlp": 0.01037771, + "balance_loss_clip": 1.04674721, + "balance_loss_mlp": 1.02228534, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.1493220726945865, + "language_loss": 0.8738156, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89511043, + "num_input_tokens_seen": 115471785, + "step": 5375, + "time_per_iteration": 2.596646547317505 + }, + { + "auxiliary_loss_clip": 0.01139871, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.04915297, + "balance_loss_mlp": 1.02195919, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 2.059544517882633, + "language_loss": 0.75919485, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78097105, + "num_input_tokens_seen": 115491405, + "step": 5376, + "time_per_iteration": 2.4884703159332275 + }, + { + "auxiliary_loss_clip": 0.01099338, + "auxiliary_loss_mlp": 0.01032746, + "balance_loss_clip": 1.04974127, + "balance_loss_mlp": 1.01879287, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 1.4961108394962088, + "language_loss": 0.66752285, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.68884373, + "num_input_tokens_seen": 115511555, + "step": 5377, + "time_per_iteration": 2.5662660598754883 + }, + { + "auxiliary_loss_clip": 0.01100977, + "auxiliary_loss_mlp": 0.01050163, + "balance_loss_clip": 1.0465579, + "balance_loss_mlp": 1.03236496, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.40495165123392, + "language_loss": 0.72054923, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74206066, + "num_input_tokens_seen": 115532860, + "step": 5378, + "time_per_iteration": 2.600036382675171 + }, + { + "auxiliary_loss_clip": 0.0112097, + "auxiliary_loss_mlp": 0.01037339, + "balance_loss_clip": 1.04930127, + "balance_loss_mlp": 1.02254486, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 2.196769333378244, + "language_loss": 0.82174897, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84333211, + "num_input_tokens_seen": 115553850, + "step": 5379, + "time_per_iteration": 2.6044538021087646 + }, + { + "auxiliary_loss_clip": 0.01136481, + "auxiliary_loss_mlp": 0.01039561, + "balance_loss_clip": 1.05703568, + "balance_loss_mlp": 1.02506518, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.871747138226317, + "language_loss": 0.78792357, + "learning_rate": 3.162583158454388e-06, + "loss": 0.80968398, + "num_input_tokens_seen": 115575530, + "step": 5380, + "time_per_iteration": 2.5817179679870605 + }, + { + "auxiliary_loss_clip": 0.01122744, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.05090952, + "balance_loss_mlp": 1.02402472, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.6978516562193577, + "language_loss": 0.77136946, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.7929821, + "num_input_tokens_seen": 115594885, + "step": 5381, + "time_per_iteration": 2.5174336433410645 + }, + { + "auxiliary_loss_clip": 0.01120248, + "auxiliary_loss_mlp": 0.01037438, + "balance_loss_clip": 1.04743195, + "balance_loss_mlp": 1.02362716, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 1.7718215324592568, + "language_loss": 0.7114076, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.73298442, + "num_input_tokens_seen": 115614080, + "step": 5382, + "time_per_iteration": 2.5249316692352295 + }, + { + "auxiliary_loss_clip": 0.01117105, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.04847646, + "balance_loss_mlp": 1.02975094, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.5260313995271755, + "language_loss": 0.71033585, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.73196477, + "num_input_tokens_seen": 115632820, + "step": 5383, + "time_per_iteration": 2.548656702041626 + }, + { + "auxiliary_loss_clip": 0.01120953, + "auxiliary_loss_mlp": 0.01039501, + "balance_loss_clip": 1.04777932, + "balance_loss_mlp": 1.02587533, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 1.7984304122905868, + "language_loss": 0.78510875, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80671328, + "num_input_tokens_seen": 115652860, + "step": 5384, + "time_per_iteration": 2.5162243843078613 + }, + { + "auxiliary_loss_clip": 0.01080513, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.05090392, + "balance_loss_mlp": 1.0260942, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 2.6447946742184496, + "language_loss": 0.74938333, + "learning_rate": 3.16099809186998e-06, + "loss": 0.77061725, + "num_input_tokens_seen": 115670940, + "step": 5385, + "time_per_iteration": 2.588855504989624 + }, + { + "auxiliary_loss_clip": 0.01110195, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.05471933, + "balance_loss_mlp": 1.02387226, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.7155073105094663, + "language_loss": 0.71663594, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.73812628, + "num_input_tokens_seen": 115691155, + "step": 5386, + "time_per_iteration": 2.6090614795684814 + }, + { + "auxiliary_loss_clip": 0.01137753, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.04708409, + "balance_loss_mlp": 1.02227223, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 2.0879008397508803, + "language_loss": 0.948394, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.97014797, + "num_input_tokens_seen": 115710340, + "step": 5387, + "time_per_iteration": 2.4672648906707764 + }, + { + "auxiliary_loss_clip": 0.01128423, + "auxiliary_loss_mlp": 0.01046175, + "balance_loss_clip": 1.04990029, + "balance_loss_mlp": 1.03041542, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 2.014040467567363, + "language_loss": 0.77483904, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79658496, + "num_input_tokens_seen": 115726745, + "step": 5388, + "time_per_iteration": 2.4844744205474854 + }, + { + "auxiliary_loss_clip": 0.01110109, + "auxiliary_loss_mlp": 0.01035677, + "balance_loss_clip": 1.04855311, + "balance_loss_mlp": 1.02029848, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 1.833516454365437, + "language_loss": 0.71096838, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.73242629, + "num_input_tokens_seen": 115749385, + "step": 5389, + "time_per_iteration": 2.6484270095825195 + }, + { + "auxiliary_loss_clip": 0.01098278, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.04968739, + "balance_loss_mlp": 1.02626598, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 1.6988650048734761, + "language_loss": 0.80835056, + "learning_rate": 3.159411924656557e-06, + "loss": 0.82975256, + "num_input_tokens_seen": 115768105, + "step": 5390, + "time_per_iteration": 2.5615243911743164 + }, + { + "auxiliary_loss_clip": 0.01110034, + "auxiliary_loss_mlp": 0.01047814, + "balance_loss_clip": 1.05075431, + "balance_loss_mlp": 1.03247154, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 1.8010368707071807, + "language_loss": 0.73385483, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75543332, + "num_input_tokens_seen": 115787340, + "step": 5391, + "time_per_iteration": 2.550978422164917 + }, + { + "auxiliary_loss_clip": 0.01111601, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.04781353, + "balance_loss_mlp": 1.02645862, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.5497678118827314, + "language_loss": 0.77146083, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79298639, + "num_input_tokens_seen": 115805565, + "step": 5392, + "time_per_iteration": 2.515350103378296 + }, + { + "auxiliary_loss_clip": 0.01109367, + "auxiliary_loss_mlp": 0.01046567, + "balance_loss_clip": 1.04539609, + "balance_loss_mlp": 1.02900708, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 2.2128977512186236, + "language_loss": 0.62530446, + "learning_rate": 3.158459696652067e-06, + "loss": 0.64686382, + "num_input_tokens_seen": 115826725, + "step": 5393, + "time_per_iteration": 2.594237804412842 + }, + { + "auxiliary_loss_clip": 0.01119538, + "auxiliary_loss_mlp": 0.01039975, + "balance_loss_clip": 1.05036557, + "balance_loss_mlp": 1.02506816, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.5329086985066331, + "language_loss": 0.82812119, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84971631, + "num_input_tokens_seen": 115846955, + "step": 5394, + "time_per_iteration": 2.535346031188965 + }, + { + "auxiliary_loss_clip": 0.01111743, + "auxiliary_loss_mlp": 0.01046653, + "balance_loss_clip": 1.04980302, + "balance_loss_mlp": 1.03200173, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.6991828606616228, + "language_loss": 0.82083082, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.84241474, + "num_input_tokens_seen": 115865975, + "step": 5395, + "time_per_iteration": 2.540802478790283 + }, + { + "auxiliary_loss_clip": 0.01122544, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_clip": 1.04965329, + "balance_loss_mlp": 1.02741051, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 1.7295680706852552, + "language_loss": 0.83241618, + "learning_rate": 3.157507073287417e-06, + "loss": 0.85406232, + "num_input_tokens_seen": 115884950, + "step": 5396, + "time_per_iteration": 2.523560047149658 + }, + { + "auxiliary_loss_clip": 0.0110063, + "auxiliary_loss_mlp": 0.01048673, + "balance_loss_clip": 1.04907346, + "balance_loss_mlp": 1.03101778, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 1.7984303761815954, + "language_loss": 0.7567296, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.77822262, + "num_input_tokens_seen": 115904170, + "step": 5397, + "time_per_iteration": 2.5513064861297607 + }, + { + "auxiliary_loss_clip": 0.01101676, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.04938245, + "balance_loss_mlp": 1.01792479, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.2606459417251124, + "language_loss": 0.67263281, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.69397807, + "num_input_tokens_seen": 115919255, + "step": 5398, + "time_per_iteration": 2.5438036918640137 + }, + { + "auxiliary_loss_clip": 0.01108012, + "auxiliary_loss_mlp": 0.01037053, + "balance_loss_clip": 1.04749513, + "balance_loss_mlp": 1.02154374, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.3847606855341306, + "language_loss": 0.73305732, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75450802, + "num_input_tokens_seen": 115938535, + "step": 5399, + "time_per_iteration": 2.568788766860962 + }, + { + "auxiliary_loss_clip": 0.01100833, + "auxiliary_loss_mlp": 0.01042608, + "balance_loss_clip": 1.04767179, + "balance_loss_mlp": 1.02678919, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.02864900707376, + "language_loss": 0.71010518, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73153961, + "num_input_tokens_seen": 115955005, + "step": 5400, + "time_per_iteration": 2.567824602127075 + }, + { + "auxiliary_loss_clip": 0.0112763, + "auxiliary_loss_mlp": 0.01038398, + "balance_loss_clip": 1.04844737, + "balance_loss_mlp": 1.02331781, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 1.9672805156507402, + "language_loss": 0.79801124, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81967151, + "num_input_tokens_seen": 115975305, + "step": 5401, + "time_per_iteration": 3.9444713592529297 + }, + { + "auxiliary_loss_clip": 0.01109869, + "auxiliary_loss_mlp": 0.0104594, + "balance_loss_clip": 1.04949403, + "balance_loss_mlp": 1.02938235, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.4911067207729836, + "language_loss": 0.87500536, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89656341, + "num_input_tokens_seen": 115994810, + "step": 5402, + "time_per_iteration": 2.5793704986572266 + }, + { + "auxiliary_loss_clip": 0.01078728, + "auxiliary_loss_mlp": 0.01047086, + "balance_loss_clip": 1.04276025, + "balance_loss_mlp": 1.03030133, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 1.8597282856031376, + "language_loss": 0.84581208, + "learning_rate": 3.155282749751332e-06, + "loss": 0.8670702, + "num_input_tokens_seen": 116011095, + "step": 5403, + "time_per_iteration": 2.5595030784606934 + }, + { + "auxiliary_loss_clip": 0.01104991, + "auxiliary_loss_mlp": 0.01041625, + "balance_loss_clip": 1.04949415, + "balance_loss_mlp": 1.0271349, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 2.1034827846191497, + "language_loss": 0.87582874, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89729482, + "num_input_tokens_seen": 116028805, + "step": 5404, + "time_per_iteration": 2.5495753288269043 + }, + { + "auxiliary_loss_clip": 0.011242, + "auxiliary_loss_mlp": 0.01039234, + "balance_loss_clip": 1.0506103, + "balance_loss_mlp": 1.02454805, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.6080156173463307, + "language_loss": 0.72640991, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74804419, + "num_input_tokens_seen": 116047765, + "step": 5405, + "time_per_iteration": 3.980363607406616 + }, + { + "auxiliary_loss_clip": 0.01095306, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.05306613, + "balance_loss_mlp": 1.02086926, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.7464790925081126, + "language_loss": 0.83020937, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85152066, + "num_input_tokens_seen": 116068385, + "step": 5406, + "time_per_iteration": 2.584517240524292 + }, + { + "auxiliary_loss_clip": 0.01134825, + "auxiliary_loss_mlp": 0.01032415, + "balance_loss_clip": 1.05068731, + "balance_loss_mlp": 1.01855707, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 1.6595417558651204, + "language_loss": 0.87602508, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.89769745, + "num_input_tokens_seen": 116085350, + "step": 5407, + "time_per_iteration": 3.7864737510681152 + }, + { + "auxiliary_loss_clip": 0.01112192, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.05003524, + "balance_loss_mlp": 1.02455592, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.311560653971944, + "language_loss": 0.69671822, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71823502, + "num_input_tokens_seen": 116107560, + "step": 5408, + "time_per_iteration": 4.010082244873047 + }, + { + "auxiliary_loss_clip": 0.01126731, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.04720068, + "balance_loss_mlp": 1.01800895, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 1.7032492064776585, + "language_loss": 0.77659476, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79818946, + "num_input_tokens_seen": 116125980, + "step": 5409, + "time_per_iteration": 2.4756855964660645 + }, + { + "auxiliary_loss_clip": 0.01079239, + "auxiliary_loss_mlp": 0.0104366, + "balance_loss_clip": 1.04248738, + "balance_loss_mlp": 1.02811515, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 1.8782008808424644, + "language_loss": 0.83318877, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85441774, + "num_input_tokens_seen": 116146530, + "step": 5410, + "time_per_iteration": 2.66770601272583 + }, + { + "auxiliary_loss_clip": 0.01084414, + "auxiliary_loss_mlp": 0.01035035, + "balance_loss_clip": 1.0484302, + "balance_loss_mlp": 1.02063417, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.6059131418251358, + "language_loss": 0.71326685, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73446137, + "num_input_tokens_seen": 116165695, + "step": 5411, + "time_per_iteration": 2.596151828765869 + }, + { + "auxiliary_loss_clip": 0.01091775, + "auxiliary_loss_mlp": 0.01040423, + "balance_loss_clip": 1.05682755, + "balance_loss_mlp": 1.02633262, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.490604705913697, + "language_loss": 0.83218026, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85350227, + "num_input_tokens_seen": 116185375, + "step": 5412, + "time_per_iteration": 2.6373658180236816 + }, + { + "auxiliary_loss_clip": 0.01102876, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.04616272, + "balance_loss_mlp": 1.02163482, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 1.7095817306612302, + "language_loss": 0.81044722, + "learning_rate": 3.152101422008203e-06, + "loss": 0.83185315, + "num_input_tokens_seen": 116204335, + "step": 5413, + "time_per_iteration": 2.5852599143981934 + }, + { + "auxiliary_loss_clip": 0.01110685, + "auxiliary_loss_mlp": 0.01036913, + "balance_loss_clip": 1.04697657, + "balance_loss_mlp": 1.02099824, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 1.5573136731519273, + "language_loss": 0.76794767, + "learning_rate": 3.151783048751864e-06, + "loss": 0.78942358, + "num_input_tokens_seen": 116222840, + "step": 5414, + "time_per_iteration": 2.5623910427093506 + }, + { + "auxiliary_loss_clip": 0.01020807, + "auxiliary_loss_mlp": 0.01010425, + "balance_loss_clip": 1.02352691, + "balance_loss_mlp": 1.00862527, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9085066961436206, + "language_loss": 0.64000785, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66032016, + "num_input_tokens_seen": 116274940, + "step": 5415, + "time_per_iteration": 3.066683292388916 + }, + { + "auxiliary_loss_clip": 0.01089423, + "auxiliary_loss_mlp": 0.01040066, + "balance_loss_clip": 1.04952765, + "balance_loss_mlp": 1.02417564, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 2.155493165527598, + "language_loss": 0.7401213, + "learning_rate": 3.151146171224075e-06, + "loss": 0.7614162, + "num_input_tokens_seen": 116297300, + "step": 5416, + "time_per_iteration": 2.6062815189361572 + }, + { + "auxiliary_loss_clip": 0.01045711, + "auxiliary_loss_mlp": 0.01004378, + "balance_loss_clip": 1.01606297, + "balance_loss_mlp": 1.00255382, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7789619128187606, + "language_loss": 0.57983875, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.60033959, + "num_input_tokens_seen": 116362370, + "step": 5417, + "time_per_iteration": 3.1594011783599854 + }, + { + "auxiliary_loss_clip": 0.01027927, + "auxiliary_loss_mlp": 0.01003781, + "balance_loss_clip": 1.01858234, + "balance_loss_mlp": 1.0017184, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.8042363563572404, + "language_loss": 0.63395613, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65427321, + "num_input_tokens_seen": 116430365, + "step": 5418, + "time_per_iteration": 3.233729362487793 + }, + { + "auxiliary_loss_clip": 0.0111788, + "auxiliary_loss_mlp": 0.01042594, + "balance_loss_clip": 1.05879796, + "balance_loss_mlp": 1.02790761, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 2.10447166520173, + "language_loss": 0.69397229, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71557701, + "num_input_tokens_seen": 116447525, + "step": 5419, + "time_per_iteration": 2.5184731483459473 + }, + { + "auxiliary_loss_clip": 0.0113196, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.05446255, + "balance_loss_mlp": 1.02288711, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 1.6998999618423507, + "language_loss": 0.77501321, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79672098, + "num_input_tokens_seen": 116466310, + "step": 5420, + "time_per_iteration": 2.5011355876922607 + }, + { + "auxiliary_loss_clip": 0.01122544, + "auxiliary_loss_mlp": 0.00801869, + "balance_loss_clip": 1.04578781, + "balance_loss_mlp": 1.02611065, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.4607310284309045, + "language_loss": 0.80082506, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82006919, + "num_input_tokens_seen": 116487825, + "step": 5421, + "time_per_iteration": 2.535216808319092 + }, + { + "auxiliary_loss_clip": 0.01132063, + "auxiliary_loss_mlp": 0.01038576, + "balance_loss_clip": 1.04777586, + "balance_loss_mlp": 1.02429509, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 2.2485898833957925, + "language_loss": 0.75884289, + "learning_rate": 3.149234491389381e-06, + "loss": 0.78054929, + "num_input_tokens_seen": 116509950, + "step": 5422, + "time_per_iteration": 2.5255470275878906 + }, + { + "auxiliary_loss_clip": 0.01101564, + "auxiliary_loss_mlp": 0.00797715, + "balance_loss_clip": 1.04816091, + "balance_loss_mlp": 1.02164102, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 2.1068187865673935, + "language_loss": 0.62819993, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64719272, + "num_input_tokens_seen": 116527695, + "step": 5423, + "time_per_iteration": 2.5277020931243896 + }, + { + "auxiliary_loss_clip": 0.01097629, + "auxiliary_loss_mlp": 0.01035189, + "balance_loss_clip": 1.04460049, + "balance_loss_mlp": 1.02171874, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 1.660461862129649, + "language_loss": 0.74589765, + "learning_rate": 3.148596916016224e-06, + "loss": 0.7672258, + "num_input_tokens_seen": 116547800, + "step": 5424, + "time_per_iteration": 2.5676016807556152 + }, + { + "auxiliary_loss_clip": 0.01103056, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.04837132, + "balance_loss_mlp": 1.02098346, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.6098004539572819, + "language_loss": 0.77129918, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79268062, + "num_input_tokens_seen": 116568460, + "step": 5425, + "time_per_iteration": 2.549919843673706 + }, + { + "auxiliary_loss_clip": 0.01105564, + "auxiliary_loss_mlp": 0.01044419, + "balance_loss_clip": 1.04815006, + "balance_loss_mlp": 1.02777767, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 2.215612796829964, + "language_loss": 0.77805722, + "learning_rate": 3.147959166423428e-06, + "loss": 0.79955697, + "num_input_tokens_seen": 116588705, + "step": 5426, + "time_per_iteration": 2.604257345199585 + }, + { + "auxiliary_loss_clip": 0.01088375, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.04580176, + "balance_loss_mlp": 1.02336287, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.712001980523705, + "language_loss": 0.7452848, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76656681, + "num_input_tokens_seen": 116608845, + "step": 5427, + "time_per_iteration": 2.6226346492767334 + }, + { + "auxiliary_loss_clip": 0.01101648, + "auxiliary_loss_mlp": 0.01047673, + "balance_loss_clip": 1.04464817, + "balance_loss_mlp": 1.03185356, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 1.5602753641467806, + "language_loss": 0.79263842, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.81413168, + "num_input_tokens_seen": 116628145, + "step": 5428, + "time_per_iteration": 2.547834873199463 + }, + { + "auxiliary_loss_clip": 0.01119582, + "auxiliary_loss_mlp": 0.01041212, + "balance_loss_clip": 1.04523325, + "balance_loss_mlp": 1.02656114, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.5508952122638706, + "language_loss": 0.71127367, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73288161, + "num_input_tokens_seen": 116646920, + "step": 5429, + "time_per_iteration": 2.4773359298706055 + }, + { + "auxiliary_loss_clip": 0.01098365, + "auxiliary_loss_mlp": 0.01039668, + "balance_loss_clip": 1.04940236, + "balance_loss_mlp": 1.02560127, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.6249751062640256, + "language_loss": 0.78498638, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80636668, + "num_input_tokens_seen": 116665100, + "step": 5430, + "time_per_iteration": 2.552614450454712 + }, + { + "auxiliary_loss_clip": 0.01084611, + "auxiliary_loss_mlp": 0.01036515, + "balance_loss_clip": 1.05107343, + "balance_loss_mlp": 1.02036238, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 2.0332998859557017, + "language_loss": 0.8394624, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86067367, + "num_input_tokens_seen": 116682205, + "step": 5431, + "time_per_iteration": 2.6155076026916504 + }, + { + "auxiliary_loss_clip": 0.01116859, + "auxiliary_loss_mlp": 0.01038375, + "balance_loss_clip": 1.04940915, + "balance_loss_mlp": 1.02385497, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.9017484206687336, + "language_loss": 0.70642662, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72797894, + "num_input_tokens_seen": 116702575, + "step": 5432, + "time_per_iteration": 2.527967929840088 + }, + { + "auxiliary_loss_clip": 0.0107445, + "auxiliary_loss_mlp": 0.01041284, + "balance_loss_clip": 1.04551268, + "balance_loss_mlp": 1.0251193, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.369103485547817, + "language_loss": 0.84196103, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86311841, + "num_input_tokens_seen": 116720885, + "step": 5433, + "time_per_iteration": 2.610870122909546 + }, + { + "auxiliary_loss_clip": 0.01111584, + "auxiliary_loss_mlp": 0.01034753, + "balance_loss_clip": 1.05118942, + "balance_loss_mlp": 1.01999533, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.3559662597362263, + "language_loss": 0.85867852, + "learning_rate": 3.145406427790931e-06, + "loss": 0.88014185, + "num_input_tokens_seen": 116740395, + "step": 5434, + "time_per_iteration": 2.5827043056488037 + }, + { + "auxiliary_loss_clip": 0.01112306, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.04578376, + "balance_loss_mlp": 1.02221906, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.7947325827272111, + "language_loss": 0.87822509, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.8997274, + "num_input_tokens_seen": 116758870, + "step": 5435, + "time_per_iteration": 2.5690951347351074 + }, + { + "auxiliary_loss_clip": 0.01131822, + "auxiliary_loss_mlp": 0.01035188, + "balance_loss_clip": 1.04706204, + "balance_loss_mlp": 1.02027452, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.5481723738390882, + "language_loss": 0.76462311, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78629321, + "num_input_tokens_seen": 116773440, + "step": 5436, + "time_per_iteration": 2.4680943489074707 + }, + { + "auxiliary_loss_clip": 0.01133053, + "auxiliary_loss_mlp": 0.01038672, + "balance_loss_clip": 1.04929042, + "balance_loss_mlp": 1.02405143, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.5794215836469658, + "language_loss": 0.72003484, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74175215, + "num_input_tokens_seen": 116794375, + "step": 5437, + "time_per_iteration": 2.5115909576416016 + }, + { + "auxiliary_loss_clip": 0.01100683, + "auxiliary_loss_mlp": 0.01039723, + "balance_loss_clip": 1.04990578, + "balance_loss_mlp": 1.0233438, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.7105248071197225, + "language_loss": 0.63879526, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66019928, + "num_input_tokens_seen": 116815095, + "step": 5438, + "time_per_iteration": 2.5647690296173096 + }, + { + "auxiliary_loss_clip": 0.0112281, + "auxiliary_loss_mlp": 0.01038046, + "balance_loss_clip": 1.04870951, + "balance_loss_mlp": 1.02248907, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 2.366430480314041, + "language_loss": 0.74460363, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76621211, + "num_input_tokens_seen": 116836630, + "step": 5439, + "time_per_iteration": 2.5459823608398438 + }, + { + "auxiliary_loss_clip": 0.0112639, + "auxiliary_loss_mlp": 0.01048965, + "balance_loss_clip": 1.0502249, + "balance_loss_mlp": 1.03284156, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 1.9649315998866363, + "language_loss": 0.74867594, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77042949, + "num_input_tokens_seen": 116856880, + "step": 5440, + "time_per_iteration": 3.930870532989502 + }, + { + "auxiliary_loss_clip": 0.01119808, + "auxiliary_loss_mlp": 0.00798639, + "balance_loss_clip": 1.04802608, + "balance_loss_mlp": 1.01993322, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 1.9309873447422525, + "language_loss": 0.85071123, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.8698957, + "num_input_tokens_seen": 116873770, + "step": 5441, + "time_per_iteration": 2.497464179992676 + }, + { + "auxiliary_loss_clip": 0.01119024, + "auxiliary_loss_mlp": 0.01042892, + "balance_loss_clip": 1.04390383, + "balance_loss_mlp": 1.02683449, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 1.9894046400053618, + "language_loss": 0.86729056, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88890976, + "num_input_tokens_seen": 116891225, + "step": 5442, + "time_per_iteration": 2.4799108505249023 + }, + { + "auxiliary_loss_clip": 0.01104169, + "auxiliary_loss_mlp": 0.01042778, + "balance_loss_clip": 1.05224967, + "balance_loss_mlp": 1.02576685, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 2.513213694436785, + "language_loss": 0.77500743, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79647684, + "num_input_tokens_seen": 116912300, + "step": 5443, + "time_per_iteration": 3.9552462100982666 + }, + { + "auxiliary_loss_clip": 0.01104283, + "auxiliary_loss_mlp": 0.00797833, + "balance_loss_clip": 1.04864311, + "balance_loss_mlp": 1.01857197, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.0536627263236076, + "language_loss": 0.81485456, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83387572, + "num_input_tokens_seen": 116929425, + "step": 5444, + "time_per_iteration": 2.5245606899261475 + }, + { + "auxiliary_loss_clip": 0.01090457, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.04617047, + "balance_loss_mlp": 1.02422345, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 1.9770537238621573, + "language_loss": 0.5920319, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61333036, + "num_input_tokens_seen": 116948255, + "step": 5445, + "time_per_iteration": 2.598482131958008 + }, + { + "auxiliary_loss_clip": 0.01126528, + "auxiliary_loss_mlp": 0.01044067, + "balance_loss_clip": 1.050138, + "balance_loss_mlp": 1.02768743, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.238430077055999, + "language_loss": 0.88429976, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90600574, + "num_input_tokens_seen": 116964905, + "step": 5446, + "time_per_iteration": 3.8755714893341064 + }, + { + "auxiliary_loss_clip": 0.01123276, + "auxiliary_loss_mlp": 0.01045038, + "balance_loss_clip": 1.04955482, + "balance_loss_mlp": 1.02756238, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.5563423811742216, + "language_loss": 0.78900486, + "learning_rate": 3.141252301538802e-06, + "loss": 0.81068802, + "num_input_tokens_seen": 116983650, + "step": 5447, + "time_per_iteration": 3.9296629428863525 + }, + { + "auxiliary_loss_clip": 0.0110489, + "auxiliary_loss_mlp": 0.00794229, + "balance_loss_clip": 1.04587197, + "balance_loss_mlp": 1.01126957, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 1.7718813235975865, + "language_loss": 0.73357046, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75256169, + "num_input_tokens_seen": 117003265, + "step": 5448, + "time_per_iteration": 2.5481104850769043 + }, + { + "auxiliary_loss_clip": 0.01134639, + "auxiliary_loss_mlp": 0.0104581, + "balance_loss_clip": 1.04820228, + "balance_loss_mlp": 1.02959728, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.3685217537393823, + "language_loss": 0.66924495, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.6910494, + "num_input_tokens_seen": 117025370, + "step": 5449, + "time_per_iteration": 2.5778377056121826 + }, + { + "auxiliary_loss_clip": 0.01099978, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.04685152, + "balance_loss_mlp": 1.02384531, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 1.443796868871506, + "language_loss": 0.65302283, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67440712, + "num_input_tokens_seen": 117044350, + "step": 5450, + "time_per_iteration": 2.6024248600006104 + }, + { + "auxiliary_loss_clip": 0.01131805, + "auxiliary_loss_mlp": 0.01046314, + "balance_loss_clip": 1.05242252, + "balance_loss_mlp": 1.03031635, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.4275192776950087, + "language_loss": 0.77370751, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.79548866, + "num_input_tokens_seen": 117064450, + "step": 5451, + "time_per_iteration": 2.531707286834717 + }, + { + "auxiliary_loss_clip": 0.01124713, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.04846179, + "balance_loss_mlp": 1.0289284, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.0463258689533883, + "language_loss": 0.70519453, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72689772, + "num_input_tokens_seen": 117083060, + "step": 5452, + "time_per_iteration": 2.5307023525238037 + }, + { + "auxiliary_loss_clip": 0.01108002, + "auxiliary_loss_mlp": 0.01038461, + "balance_loss_clip": 1.04552972, + "balance_loss_mlp": 1.02330375, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.5988798846792438, + "language_loss": 0.78585583, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.80732048, + "num_input_tokens_seen": 117101860, + "step": 5453, + "time_per_iteration": 2.560089349746704 + }, + { + "auxiliary_loss_clip": 0.01128259, + "auxiliary_loss_mlp": 0.0103206, + "balance_loss_clip": 1.05044651, + "balance_loss_mlp": 1.01687837, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 1.9969561284658575, + "language_loss": 0.75538445, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77698767, + "num_input_tokens_seen": 117123100, + "step": 5454, + "time_per_iteration": 2.5719680786132812 + }, + { + "auxiliary_loss_clip": 0.01073759, + "auxiliary_loss_mlp": 0.01042311, + "balance_loss_clip": 1.04614854, + "balance_loss_mlp": 1.02767217, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 1.8239852247166373, + "language_loss": 0.77065706, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79181778, + "num_input_tokens_seen": 117140515, + "step": 5455, + "time_per_iteration": 2.5900113582611084 + }, + { + "auxiliary_loss_clip": 0.01134019, + "auxiliary_loss_mlp": 0.01043351, + "balance_loss_clip": 1.0518527, + "balance_loss_mlp": 1.02686453, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.6660752178931932, + "language_loss": 0.73867655, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76045024, + "num_input_tokens_seen": 117161485, + "step": 5456, + "time_per_iteration": 2.560990333557129 + }, + { + "auxiliary_loss_clip": 0.01137242, + "auxiliary_loss_mlp": 0.0104692, + "balance_loss_clip": 1.05023348, + "balance_loss_mlp": 1.03167319, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.4650735136819832, + "language_loss": 0.78203988, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80388147, + "num_input_tokens_seen": 117181870, + "step": 5457, + "time_per_iteration": 2.490001678466797 + }, + { + "auxiliary_loss_clip": 0.01100581, + "auxiliary_loss_mlp": 0.01040461, + "balance_loss_clip": 1.04724693, + "balance_loss_mlp": 1.02556038, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 2.260344235111836, + "language_loss": 0.78703022, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.80844069, + "num_input_tokens_seen": 117201380, + "step": 5458, + "time_per_iteration": 2.5651304721832275 + }, + { + "auxiliary_loss_clip": 0.01121333, + "auxiliary_loss_mlp": 0.01039801, + "balance_loss_clip": 1.05292201, + "balance_loss_mlp": 1.02433956, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 1.6807926449982848, + "language_loss": 0.73032379, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75193512, + "num_input_tokens_seen": 117221040, + "step": 5459, + "time_per_iteration": 2.5186305046081543 + }, + { + "auxiliary_loss_clip": 0.01117413, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_clip": 1.05157149, + "balance_loss_mlp": 1.02517223, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.9309464297881194, + "language_loss": 0.84114075, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86271572, + "num_input_tokens_seen": 117241395, + "step": 5460, + "time_per_iteration": 2.6068003177642822 + }, + { + "auxiliary_loss_clip": 0.01135913, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.04918694, + "balance_loss_mlp": 1.02144241, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 1.8227541916770955, + "language_loss": 0.77131879, + "learning_rate": 3.136770448642288e-06, + "loss": 0.79303896, + "num_input_tokens_seen": 117259340, + "step": 5461, + "time_per_iteration": 2.518712043762207 + }, + { + "auxiliary_loss_clip": 0.01120089, + "auxiliary_loss_mlp": 0.01040536, + "balance_loss_clip": 1.05017662, + "balance_loss_mlp": 1.02315497, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 1.9794243956468336, + "language_loss": 0.62745833, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.64906454, + "num_input_tokens_seen": 117282375, + "step": 5462, + "time_per_iteration": 2.6235275268554688 + }, + { + "auxiliary_loss_clip": 0.01135017, + "auxiliary_loss_mlp": 0.00799753, + "balance_loss_clip": 1.05078566, + "balance_loss_mlp": 1.02517772, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 1.362325422267679, + "language_loss": 0.78294134, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80228907, + "num_input_tokens_seen": 117303830, + "step": 5463, + "time_per_iteration": 2.5316519737243652 + }, + { + "auxiliary_loss_clip": 0.01103147, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.04568648, + "balance_loss_mlp": 1.02405155, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 1.8656324518084828, + "language_loss": 0.69784272, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.71927208, + "num_input_tokens_seen": 117320665, + "step": 5464, + "time_per_iteration": 2.498030185699463 + }, + { + "auxiliary_loss_clip": 0.01126028, + "auxiliary_loss_mlp": 0.01039802, + "balance_loss_clip": 1.05301952, + "balance_loss_mlp": 1.02475786, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.6248117913851012, + "language_loss": 0.72404552, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74570388, + "num_input_tokens_seen": 117339795, + "step": 5465, + "time_per_iteration": 2.5114364624023438 + }, + { + "auxiliary_loss_clip": 0.01111335, + "auxiliary_loss_mlp": 0.010484, + "balance_loss_clip": 1.05029297, + "balance_loss_mlp": 1.03153169, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.4150920753040914, + "language_loss": 0.82816505, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.84976244, + "num_input_tokens_seen": 117359525, + "step": 5466, + "time_per_iteration": 2.518624782562256 + }, + { + "auxiliary_loss_clip": 0.01118214, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.05259144, + "balance_loss_mlp": 1.02105546, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 1.919160953585714, + "language_loss": 0.79567707, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81722212, + "num_input_tokens_seen": 117380320, + "step": 5467, + "time_per_iteration": 2.5562973022460938 + }, + { + "auxiliary_loss_clip": 0.01112083, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.047364, + "balance_loss_mlp": 1.02036786, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.5575206248847773, + "language_loss": 0.74513215, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76660943, + "num_input_tokens_seen": 117400695, + "step": 5468, + "time_per_iteration": 2.560326337814331 + }, + { + "auxiliary_loss_clip": 0.01112494, + "auxiliary_loss_mlp": 0.0104384, + "balance_loss_clip": 1.04939926, + "balance_loss_mlp": 1.02666199, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 2.103159118987981, + "language_loss": 0.78183293, + "learning_rate": 3.134205594339942e-06, + "loss": 0.80339634, + "num_input_tokens_seen": 117418800, + "step": 5469, + "time_per_iteration": 2.506467819213867 + }, + { + "auxiliary_loss_clip": 0.0110807, + "auxiliary_loss_mlp": 0.01039681, + "balance_loss_clip": 1.0493933, + "balance_loss_mlp": 1.02528012, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.7485832352108162, + "language_loss": 0.81797308, + "learning_rate": 3.133884793883107e-06, + "loss": 0.8394506, + "num_input_tokens_seen": 117438220, + "step": 5470, + "time_per_iteration": 2.5368640422821045 + }, + { + "auxiliary_loss_clip": 0.01135829, + "auxiliary_loss_mlp": 0.01043031, + "balance_loss_clip": 1.04765964, + "balance_loss_mlp": 1.02747416, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.8208787496516754, + "language_loss": 0.67756569, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.69935435, + "num_input_tokens_seen": 117462560, + "step": 5471, + "time_per_iteration": 2.688544511795044 + }, + { + "auxiliary_loss_clip": 0.0114126, + "auxiliary_loss_mlp": 0.01045364, + "balance_loss_clip": 1.05116487, + "balance_loss_mlp": 1.02809083, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 1.7151019450587666, + "language_loss": 0.64985889, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67172515, + "num_input_tokens_seen": 117483665, + "step": 5472, + "time_per_iteration": 2.493891477584839 + }, + { + "auxiliary_loss_clip": 0.01122796, + "auxiliary_loss_mlp": 0.01052658, + "balance_loss_clip": 1.05241513, + "balance_loss_mlp": 1.03561151, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.6980640494651191, + "language_loss": 0.88304245, + "learning_rate": 3.13292213457912e-06, + "loss": 0.90479702, + "num_input_tokens_seen": 117503565, + "step": 5473, + "time_per_iteration": 2.4941635131835938 + }, + { + "auxiliary_loss_clip": 0.01099416, + "auxiliary_loss_mlp": 0.01041865, + "balance_loss_clip": 1.0486846, + "balance_loss_mlp": 1.02469885, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 1.610818880910699, + "language_loss": 0.77934992, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80076277, + "num_input_tokens_seen": 117521460, + "step": 5474, + "time_per_iteration": 2.5681374073028564 + }, + { + "auxiliary_loss_clip": 0.01033964, + "auxiliary_loss_mlp": 0.01004579, + "balance_loss_clip": 1.02615333, + "balance_loss_mlp": 1.00250518, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.8086427566173015, + "language_loss": 0.60197896, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62236434, + "num_input_tokens_seen": 117580550, + "step": 5475, + "time_per_iteration": 3.1031100749969482 + }, + { + "auxiliary_loss_clip": 0.01090035, + "auxiliary_loss_mlp": 0.01064085, + "balance_loss_clip": 1.04547465, + "balance_loss_mlp": 1.04391456, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 2.8020759869830494, + "language_loss": 0.76461595, + "learning_rate": 3.131959088630455e-06, + "loss": 0.78615719, + "num_input_tokens_seen": 117600645, + "step": 5476, + "time_per_iteration": 2.651751756668091 + }, + { + "auxiliary_loss_clip": 0.01101078, + "auxiliary_loss_mlp": 0.01043622, + "balance_loss_clip": 1.05127311, + "balance_loss_mlp": 1.02788043, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 1.7600088659618365, + "language_loss": 0.747738, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76918501, + "num_input_tokens_seen": 117618880, + "step": 5477, + "time_per_iteration": 2.545360803604126 + }, + { + "auxiliary_loss_clip": 0.0113016, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.04784095, + "balance_loss_mlp": 1.02369285, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 2.1056019745433394, + "language_loss": 0.76023293, + "learning_rate": 3.131316843357713e-06, + "loss": 0.78191, + "num_input_tokens_seen": 117636445, + "step": 5478, + "time_per_iteration": 2.4514975547790527 + }, + { + "auxiliary_loss_clip": 0.01123547, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.04846287, + "balance_loss_mlp": 1.02303922, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 1.6813502205150128, + "language_loss": 0.80191004, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82352614, + "num_input_tokens_seen": 117653105, + "step": 5479, + "time_per_iteration": 3.837000608444214 + }, + { + "auxiliary_loss_clip": 0.01031, + "auxiliary_loss_mlp": 0.01002052, + "balance_loss_clip": 1.02065659, + "balance_loss_mlp": 0.99990654, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.742993163833018, + "language_loss": 0.56541169, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58574218, + "num_input_tokens_seen": 117719225, + "step": 5480, + "time_per_iteration": 3.1774446964263916 + }, + { + "auxiliary_loss_clip": 0.0111852, + "auxiliary_loss_mlp": 0.00799337, + "balance_loss_clip": 1.044438, + "balance_loss_mlp": 1.02188611, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 1.5796536615293784, + "language_loss": 0.76773661, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.78691512, + "num_input_tokens_seen": 117738725, + "step": 5481, + "time_per_iteration": 2.520198345184326 + }, + { + "auxiliary_loss_clip": 0.01115462, + "auxiliary_loss_mlp": 0.01039739, + "balance_loss_clip": 1.04979312, + "balance_loss_mlp": 1.02512407, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.6892566733503858, + "language_loss": 0.78874493, + "learning_rate": 3.130031838113899e-06, + "loss": 0.81029695, + "num_input_tokens_seen": 117757765, + "step": 5482, + "time_per_iteration": 3.9484097957611084 + }, + { + "auxiliary_loss_clip": 0.01125318, + "auxiliary_loss_mlp": 0.0104137, + "balance_loss_clip": 1.04666817, + "balance_loss_mlp": 1.02522278, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 1.700504979403609, + "language_loss": 0.73801476, + "learning_rate": 3.129710479645185e-06, + "loss": 0.75968158, + "num_input_tokens_seen": 117776810, + "step": 5483, + "time_per_iteration": 2.492429256439209 + }, + { + "auxiliary_loss_clip": 0.01118639, + "auxiliary_loss_mlp": 0.01040903, + "balance_loss_clip": 1.04778647, + "balance_loss_mlp": 1.02504802, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 2.0638169605851466, + "language_loss": 0.75515503, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77675045, + "num_input_tokens_seen": 117797730, + "step": 5484, + "time_per_iteration": 3.972377061843872 + }, + { + "auxiliary_loss_clip": 0.01136109, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.05119514, + "balance_loss_mlp": 1.02743912, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 1.9152973663677686, + "language_loss": 0.71589315, + "learning_rate": 3.129067634203742e-06, + "loss": 0.73767716, + "num_input_tokens_seen": 117815365, + "step": 5485, + "time_per_iteration": 2.453735828399658 + }, + { + "auxiliary_loss_clip": 0.01079009, + "auxiliary_loss_mlp": 0.01043771, + "balance_loss_clip": 1.04968822, + "balance_loss_mlp": 1.0291853, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.5433290961866533, + "language_loss": 0.80142546, + "learning_rate": 3.128746147255388e-06, + "loss": 0.82265329, + "num_input_tokens_seen": 117836095, + "step": 5486, + "time_per_iteration": 4.129487037658691 + }, + { + "auxiliary_loss_clip": 0.01103452, + "auxiliary_loss_mlp": 0.01043017, + "balance_loss_clip": 1.04526496, + "balance_loss_mlp": 1.02693594, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 2.3778133122296223, + "language_loss": 0.84491414, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86637884, + "num_input_tokens_seen": 117854655, + "step": 5487, + "time_per_iteration": 2.5164694786071777 + }, + { + "auxiliary_loss_clip": 0.01086335, + "auxiliary_loss_mlp": 0.01047651, + "balance_loss_clip": 1.04621315, + "balance_loss_mlp": 1.03029442, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 4.638093314421904, + "language_loss": 0.7443555, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76569539, + "num_input_tokens_seen": 117873300, + "step": 5488, + "time_per_iteration": 2.578690528869629 + }, + { + "auxiliary_loss_clip": 0.01135722, + "auxiliary_loss_mlp": 0.01046155, + "balance_loss_clip": 1.04939377, + "balance_loss_mlp": 1.030604, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 1.9638895865791772, + "language_loss": 0.72278631, + "learning_rate": 3.127781429646098e-06, + "loss": 0.74460506, + "num_input_tokens_seen": 117891540, + "step": 5489, + "time_per_iteration": 2.4486260414123535 + }, + { + "auxiliary_loss_clip": 0.01129293, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.04441094, + "balance_loss_mlp": 1.02222323, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 3.104893868167829, + "language_loss": 0.88910258, + "learning_rate": 3.127459771562238e-06, + "loss": 0.91076654, + "num_input_tokens_seen": 117907690, + "step": 5490, + "time_per_iteration": 2.4767656326293945 + }, + { + "auxiliary_loss_clip": 0.01121905, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.04669809, + "balance_loss_mlp": 1.02192378, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 2.09878642216636, + "language_loss": 0.83256477, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85415089, + "num_input_tokens_seen": 117925640, + "step": 5491, + "time_per_iteration": 2.4777300357818604 + }, + { + "auxiliary_loss_clip": 0.01104845, + "auxiliary_loss_mlp": 0.01042442, + "balance_loss_clip": 1.04622126, + "balance_loss_mlp": 1.02684903, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 2.1355899255731976, + "language_loss": 0.77415788, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79563075, + "num_input_tokens_seen": 117944525, + "step": 5492, + "time_per_iteration": 2.550823211669922 + }, + { + "auxiliary_loss_clip": 0.01139042, + "auxiliary_loss_mlp": 0.0104954, + "balance_loss_clip": 1.0506618, + "balance_loss_mlp": 1.03301787, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.0169818605182495, + "language_loss": 0.74499428, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76688015, + "num_input_tokens_seen": 117962515, + "step": 5493, + "time_per_iteration": 2.442042112350464 + }, + { + "auxiliary_loss_clip": 0.01001518, + "auxiliary_loss_mlp": 0.0101201, + "balance_loss_clip": 1.02339768, + "balance_loss_mlp": 1.0100435, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.8203395257930374, + "language_loss": 0.53945386, + "learning_rate": 3.12617271181492e-06, + "loss": 0.55958915, + "num_input_tokens_seen": 118018780, + "step": 5494, + "time_per_iteration": 3.1261751651763916 + }, + { + "auxiliary_loss_clip": 0.0111175, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.0459758, + "balance_loss_mlp": 1.02304733, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.4864332269576475, + "language_loss": 0.86923397, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89073741, + "num_input_tokens_seen": 118038610, + "step": 5495, + "time_per_iteration": 2.5393624305725098 + }, + { + "auxiliary_loss_clip": 0.0110108, + "auxiliary_loss_mlp": 0.01049143, + "balance_loss_clip": 1.05028534, + "balance_loss_mlp": 1.03208458, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 1.935474501440665, + "language_loss": 0.74103951, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.76254177, + "num_input_tokens_seen": 118055905, + "step": 5496, + "time_per_iteration": 2.632340431213379 + }, + { + "auxiliary_loss_clip": 0.01101942, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.04539526, + "balance_loss_mlp": 1.02139401, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.010564387297931, + "language_loss": 0.72076875, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74214786, + "num_input_tokens_seen": 118073695, + "step": 5497, + "time_per_iteration": 2.5323538780212402 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01037398, + "balance_loss_clip": 1.04930186, + "balance_loss_mlp": 1.02232957, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 1.7679144991047733, + "language_loss": 0.80004895, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82150304, + "num_input_tokens_seen": 118094030, + "step": 5498, + "time_per_iteration": 2.582070827484131 + }, + { + "auxiliary_loss_clip": 0.01118463, + "auxiliary_loss_mlp": 0.01040138, + "balance_loss_clip": 1.0480516, + "balance_loss_mlp": 1.02414012, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 1.882264977955824, + "language_loss": 0.75789982, + "learning_rate": 3.12456292636927e-06, + "loss": 0.77948582, + "num_input_tokens_seen": 118111665, + "step": 5499, + "time_per_iteration": 2.4868319034576416 + }, + { + "auxiliary_loss_clip": 0.01112597, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.04704332, + "balance_loss_mlp": 1.02191901, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.5065644749892309, + "language_loss": 0.78804672, + "learning_rate": 3.124240841300681e-06, + "loss": 0.80954385, + "num_input_tokens_seen": 118132435, + "step": 5500, + "time_per_iteration": 2.6027326583862305 + }, + { + "auxiliary_loss_clip": 0.01124093, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.04829073, + "balance_loss_mlp": 1.01775098, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 2.369776800035216, + "language_loss": 0.65798068, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.67956173, + "num_input_tokens_seen": 118155255, + "step": 5501, + "time_per_iteration": 2.625359058380127 + }, + { + "auxiliary_loss_clip": 0.01124953, + "auxiliary_loss_mlp": 0.01042176, + "balance_loss_clip": 1.04852605, + "balance_loss_mlp": 1.0264703, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.025538435449352, + "language_loss": 0.77407002, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79574132, + "num_input_tokens_seen": 118169865, + "step": 5502, + "time_per_iteration": 2.4394137859344482 + }, + { + "auxiliary_loss_clip": 0.01112345, + "auxiliary_loss_mlp": 0.01037162, + "balance_loss_clip": 1.05330682, + "balance_loss_mlp": 1.02119946, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 1.675536065866582, + "language_loss": 0.7223888, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74388385, + "num_input_tokens_seen": 118190760, + "step": 5503, + "time_per_iteration": 2.576261520385742 + }, + { + "auxiliary_loss_clip": 0.01106336, + "auxiliary_loss_mlp": 0.01039226, + "balance_loss_clip": 1.04572856, + "balance_loss_mlp": 1.02251315, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.491195717388614, + "language_loss": 0.75358522, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77504086, + "num_input_tokens_seen": 118213620, + "step": 5504, + "time_per_iteration": 2.628840446472168 + }, + { + "auxiliary_loss_clip": 0.0111595, + "auxiliary_loss_mlp": 0.01040822, + "balance_loss_clip": 1.04766142, + "balance_loss_mlp": 1.02623641, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.5506213646942966, + "language_loss": 0.69952393, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72109157, + "num_input_tokens_seen": 118235010, + "step": 5505, + "time_per_iteration": 2.584027051925659 + }, + { + "auxiliary_loss_clip": 0.01121121, + "auxiliary_loss_mlp": 0.01043607, + "balance_loss_clip": 1.05177379, + "balance_loss_mlp": 1.02830088, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 2.3302474517869074, + "language_loss": 0.81918979, + "learning_rate": 3.122307436058899e-06, + "loss": 0.840837, + "num_input_tokens_seen": 118255820, + "step": 5506, + "time_per_iteration": 2.5205061435699463 + }, + { + "auxiliary_loss_clip": 0.01122387, + "auxiliary_loss_mlp": 0.01036392, + "balance_loss_clip": 1.04939198, + "balance_loss_mlp": 1.02075195, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.7511576720064914, + "language_loss": 0.79148108, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81306887, + "num_input_tokens_seen": 118274160, + "step": 5507, + "time_per_iteration": 2.5357484817504883 + }, + { + "auxiliary_loss_clip": 0.01108696, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.04426932, + "balance_loss_mlp": 1.0304929, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.4683645976318707, + "language_loss": 0.71438825, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73593557, + "num_input_tokens_seen": 118294385, + "step": 5508, + "time_per_iteration": 2.5443015098571777 + }, + { + "auxiliary_loss_clip": 0.01104344, + "auxiliary_loss_mlp": 0.01034319, + "balance_loss_clip": 1.04953694, + "balance_loss_mlp": 1.01971579, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 1.88543403506718, + "language_loss": 0.71630645, + "learning_rate": 3.12134015873989e-06, + "loss": 0.73769307, + "num_input_tokens_seen": 118313105, + "step": 5509, + "time_per_iteration": 2.5685813426971436 + }, + { + "auxiliary_loss_clip": 0.0111882, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.05020237, + "balance_loss_mlp": 1.01719546, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.6244961402668463, + "language_loss": 0.72920591, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75071555, + "num_input_tokens_seen": 118335250, + "step": 5510, + "time_per_iteration": 2.567974328994751 + }, + { + "auxiliary_loss_clip": 0.010981, + "auxiliary_loss_mlp": 0.01038685, + "balance_loss_clip": 1.04612899, + "balance_loss_mlp": 1.02391493, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 9.010627189791242, + "language_loss": 0.88039297, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.90176082, + "num_input_tokens_seen": 118351470, + "step": 5511, + "time_per_iteration": 2.5276591777801514 + }, + { + "auxiliary_loss_clip": 0.01073807, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.04407167, + "balance_loss_mlp": 1.02567554, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.6778755553351334, + "language_loss": 0.72861725, + "learning_rate": 3.12037249872891e-06, + "loss": 0.74976385, + "num_input_tokens_seen": 118370970, + "step": 5512, + "time_per_iteration": 2.5764427185058594 + }, + { + "auxiliary_loss_clip": 0.01089968, + "auxiliary_loss_mlp": 0.0103928, + "balance_loss_clip": 1.04646754, + "balance_loss_mlp": 1.0249157, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.898640615628319, + "language_loss": 0.72444129, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.7457338, + "num_input_tokens_seen": 118393125, + "step": 5513, + "time_per_iteration": 2.707489490509033 + }, + { + "auxiliary_loss_clip": 0.0110083, + "auxiliary_loss_mlp": 0.01035725, + "balance_loss_clip": 1.05102122, + "balance_loss_mlp": 1.02026367, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 2.002162609009222, + "language_loss": 0.68207383, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70343941, + "num_input_tokens_seen": 118410860, + "step": 5514, + "time_per_iteration": 2.567573070526123 + }, + { + "auxiliary_loss_clip": 0.01107486, + "auxiliary_loss_mlp": 0.01042726, + "balance_loss_clip": 1.05014658, + "balance_loss_mlp": 1.02573872, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.3633346886566207, + "language_loss": 0.66522485, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.68672705, + "num_input_tokens_seen": 118429570, + "step": 5515, + "time_per_iteration": 2.5206732749938965 + }, + { + "auxiliary_loss_clip": 0.01117267, + "auxiliary_loss_mlp": 0.01036797, + "balance_loss_clip": 1.04919469, + "balance_loss_mlp": 1.02145493, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.7031284162875397, + "language_loss": 0.68992853, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71146917, + "num_input_tokens_seen": 118450285, + "step": 5516, + "time_per_iteration": 2.4967901706695557 + }, + { + "auxiliary_loss_clip": 0.01122491, + "auxiliary_loss_mlp": 0.01039129, + "balance_loss_clip": 1.04584599, + "balance_loss_mlp": 1.0242939, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 2.875536071878683, + "language_loss": 0.80750042, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82911658, + "num_input_tokens_seen": 118468270, + "step": 5517, + "time_per_iteration": 2.471675157546997 + }, + { + "auxiliary_loss_clip": 0.01112029, + "auxiliary_loss_mlp": 0.01037728, + "balance_loss_clip": 1.04419458, + "balance_loss_mlp": 1.02232611, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 1.682872415106335, + "language_loss": 0.74316704, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76466459, + "num_input_tokens_seen": 118486615, + "step": 5518, + "time_per_iteration": 3.852931261062622 + }, + { + "auxiliary_loss_clip": 0.01028092, + "auxiliary_loss_mlp": 0.01009363, + "balance_loss_clip": 1.01982927, + "balance_loss_mlp": 1.00719392, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6218892697076962, + "language_loss": 0.54361409, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56398863, + "num_input_tokens_seen": 118553580, + "step": 5519, + "time_per_iteration": 3.2361485958099365 + }, + { + "auxiliary_loss_clip": 0.01119478, + "auxiliary_loss_mlp": 0.01038891, + "balance_loss_clip": 1.04561961, + "balance_loss_mlp": 1.02357316, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 2.447614561413274, + "language_loss": 0.78667092, + "learning_rate": 3.117790203606336e-06, + "loss": 0.8082546, + "num_input_tokens_seen": 118570280, + "step": 5520, + "time_per_iteration": 2.484006404876709 + }, + { + "auxiliary_loss_clip": 0.01106089, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.04800844, + "balance_loss_mlp": 1.01973403, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 1.77765321226945, + "language_loss": 0.76303041, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.7844339, + "num_input_tokens_seen": 118590455, + "step": 5521, + "time_per_iteration": 3.977576971054077 + }, + { + "auxiliary_loss_clip": 0.01120946, + "auxiliary_loss_mlp": 0.0104277, + "balance_loss_clip": 1.04569387, + "balance_loss_mlp": 1.02692688, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 1.851364202467023, + "language_loss": 0.7035324, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72516954, + "num_input_tokens_seen": 118609495, + "step": 5522, + "time_per_iteration": 2.494180202484131 + }, + { + "auxiliary_loss_clip": 0.01107221, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_clip": 1.0460093, + "balance_loss_mlp": 1.01864171, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 2.0169205581593617, + "language_loss": 0.73715079, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.75855082, + "num_input_tokens_seen": 118628720, + "step": 5523, + "time_per_iteration": 4.068978309631348 + }, + { + "auxiliary_loss_clip": 0.0110103, + "auxiliary_loss_mlp": 0.01034125, + "balance_loss_clip": 1.04377627, + "balance_loss_mlp": 1.01887846, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.8147235516209936, + "language_loss": 0.81997728, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84132886, + "num_input_tokens_seen": 118645955, + "step": 5524, + "time_per_iteration": 2.500979423522949 + }, + { + "auxiliary_loss_clip": 0.01088461, + "auxiliary_loss_mlp": 0.00795264, + "balance_loss_clip": 1.04533839, + "balance_loss_mlp": 1.0185895, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 2.136733840343463, + "language_loss": 0.82797927, + "learning_rate": 3.116174891188636e-06, + "loss": 0.84681648, + "num_input_tokens_seen": 118665605, + "step": 5525, + "time_per_iteration": 3.9312376976013184 + }, + { + "auxiliary_loss_clip": 0.01044639, + "auxiliary_loss_mlp": 0.01004166, + "balance_loss_clip": 1.01571834, + "balance_loss_mlp": 1.00182998, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7696809073821715, + "language_loss": 0.52713746, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54762548, + "num_input_tokens_seen": 118728155, + "step": 5526, + "time_per_iteration": 3.037161350250244 + }, + { + "auxiliary_loss_clip": 0.01091373, + "auxiliary_loss_mlp": 0.00797034, + "balance_loss_clip": 1.04797339, + "balance_loss_mlp": 1.01835501, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.280891534753282, + "language_loss": 0.77977747, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79866153, + "num_input_tokens_seen": 118743955, + "step": 5527, + "time_per_iteration": 2.5080807209014893 + }, + { + "auxiliary_loss_clip": 0.0108338, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.04566407, + "balance_loss_mlp": 1.02744269, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 1.7771673260392324, + "language_loss": 0.71714807, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.73839861, + "num_input_tokens_seen": 118763275, + "step": 5528, + "time_per_iteration": 2.589385747909546 + }, + { + "auxiliary_loss_clip": 0.01106957, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.04449821, + "balance_loss_mlp": 1.01852846, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 1.6043166301368745, + "language_loss": 0.82613438, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84752715, + "num_input_tokens_seen": 118781110, + "step": 5529, + "time_per_iteration": 2.496351718902588 + }, + { + "auxiliary_loss_clip": 0.01108919, + "auxiliary_loss_mlp": 0.00798967, + "balance_loss_clip": 1.05034125, + "balance_loss_mlp": 1.02298546, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 1.752755529474781, + "language_loss": 0.69842148, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71750033, + "num_input_tokens_seen": 118800620, + "step": 5530, + "time_per_iteration": 2.5433077812194824 + }, + { + "auxiliary_loss_clip": 0.01118635, + "auxiliary_loss_mlp": 0.010425, + "balance_loss_clip": 1.04502964, + "balance_loss_mlp": 1.0268718, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.6615035578931145, + "language_loss": 0.76388711, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.7854985, + "num_input_tokens_seen": 118818725, + "step": 5531, + "time_per_iteration": 2.481428861618042 + }, + { + "auxiliary_loss_clip": 0.01111239, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.04772425, + "balance_loss_mlp": 1.02194083, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 1.7330229239142028, + "language_loss": 0.73223042, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.7537179, + "num_input_tokens_seen": 118839390, + "step": 5532, + "time_per_iteration": 2.588430404663086 + }, + { + "auxiliary_loss_clip": 0.01109561, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.050596, + "balance_loss_mlp": 1.01672876, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 2.956639341064044, + "language_loss": 0.65750384, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.67891312, + "num_input_tokens_seen": 118856275, + "step": 5533, + "time_per_iteration": 2.4874658584594727 + }, + { + "auxiliary_loss_clip": 0.0107763, + "auxiliary_loss_mlp": 0.01036218, + "balance_loss_clip": 1.04673672, + "balance_loss_mlp": 1.02085185, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 1.6137793026537122, + "language_loss": 0.70894378, + "learning_rate": 3.113264663362451e-06, + "loss": 0.73008227, + "num_input_tokens_seen": 118873830, + "step": 5534, + "time_per_iteration": 2.6081204414367676 + }, + { + "auxiliary_loss_clip": 0.01091323, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.0479908, + "balance_loss_mlp": 1.02381516, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.741078651829642, + "language_loss": 0.67151392, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.6928128, + "num_input_tokens_seen": 118891560, + "step": 5535, + "time_per_iteration": 2.5801644325256348 + }, + { + "auxiliary_loss_clip": 0.01118069, + "auxiliary_loss_mlp": 0.00792287, + "balance_loss_clip": 1.04384804, + "balance_loss_mlp": 1.01301372, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 2.0671010275432335, + "language_loss": 0.73064828, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.74975181, + "num_input_tokens_seen": 118910260, + "step": 5536, + "time_per_iteration": 2.5389885902404785 + }, + { + "auxiliary_loss_clip": 0.01119597, + "auxiliary_loss_mlp": 0.01037095, + "balance_loss_clip": 1.04611707, + "balance_loss_mlp": 1.02311754, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.5191004388462481, + "language_loss": 0.81648958, + "learning_rate": 3.112293827106917e-06, + "loss": 0.83805645, + "num_input_tokens_seen": 118929985, + "step": 5537, + "time_per_iteration": 2.509449005126953 + }, + { + "auxiliary_loss_clip": 0.01124985, + "auxiliary_loss_mlp": 0.01040009, + "balance_loss_clip": 1.04992783, + "balance_loss_mlp": 1.02475059, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 2.049255837108728, + "language_loss": 0.72100633, + "learning_rate": 3.111970130648789e-06, + "loss": 0.74265629, + "num_input_tokens_seen": 118951355, + "step": 5538, + "time_per_iteration": 2.572175979614258 + }, + { + "auxiliary_loss_clip": 0.01115995, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.0454154, + "balance_loss_mlp": 1.01885426, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 1.9370743266214556, + "language_loss": 0.74407899, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76557064, + "num_input_tokens_seen": 118970910, + "step": 5539, + "time_per_iteration": 2.5058515071868896 + }, + { + "auxiliary_loss_clip": 0.01137562, + "auxiliary_loss_mlp": 0.01045255, + "balance_loss_clip": 1.04889858, + "balance_loss_mlp": 1.02996063, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.7925757816038796, + "language_loss": 0.71246433, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.73429251, + "num_input_tokens_seen": 118989200, + "step": 5540, + "time_per_iteration": 2.4454381465911865 + }, + { + "auxiliary_loss_clip": 0.01116634, + "auxiliary_loss_mlp": 0.0103525, + "balance_loss_clip": 1.04290128, + "balance_loss_mlp": 1.02064705, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 1.5790520466337679, + "language_loss": 0.6082598, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62977868, + "num_input_tokens_seen": 119011030, + "step": 5541, + "time_per_iteration": 2.636073589324951 + }, + { + "auxiliary_loss_clip": 0.01111752, + "auxiliary_loss_mlp": 0.01043605, + "balance_loss_clip": 1.04666042, + "balance_loss_mlp": 1.0276072, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.6298898735155893, + "language_loss": 0.68772066, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.70927423, + "num_input_tokens_seen": 119030620, + "step": 5542, + "time_per_iteration": 2.5379576683044434 + }, + { + "auxiliary_loss_clip": 0.01118784, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.045367, + "balance_loss_mlp": 1.0254786, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.8484241478748256, + "language_loss": 0.7508536, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77243853, + "num_input_tokens_seen": 119048015, + "step": 5543, + "time_per_iteration": 2.449554204940796 + }, + { + "auxiliary_loss_clip": 0.01056054, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.04643583, + "balance_loss_mlp": 1.02790082, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 2.06389028723006, + "language_loss": 0.74882305, + "learning_rate": 3.110027066843348e-06, + "loss": 0.76982975, + "num_input_tokens_seen": 119066280, + "step": 5544, + "time_per_iteration": 2.7156589031219482 + }, + { + "auxiliary_loss_clip": 0.01128864, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.04639554, + "balance_loss_mlp": 1.01706696, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.5600650451093327, + "language_loss": 0.70888126, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.73047924, + "num_input_tokens_seen": 119087680, + "step": 5545, + "time_per_iteration": 2.5162851810455322 + }, + { + "auxiliary_loss_clip": 0.01093825, + "auxiliary_loss_mlp": 0.01036526, + "balance_loss_clip": 1.04973483, + "balance_loss_mlp": 1.02223241, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.753661732973528, + "language_loss": 0.69056153, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.71186501, + "num_input_tokens_seen": 119105820, + "step": 5546, + "time_per_iteration": 2.543632984161377 + }, + { + "auxiliary_loss_clip": 0.01095664, + "auxiliary_loss_mlp": 0.01036968, + "balance_loss_clip": 1.04292643, + "balance_loss_mlp": 1.02212656, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.5377883972955524, + "language_loss": 0.64645171, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.66777802, + "num_input_tokens_seen": 119126630, + "step": 5547, + "time_per_iteration": 2.613478183746338 + }, + { + "auxiliary_loss_clip": 0.01109725, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.04822612, + "balance_loss_mlp": 1.01918697, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.132337652019009, + "language_loss": 0.8573246, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87875009, + "num_input_tokens_seen": 119143375, + "step": 5548, + "time_per_iteration": 2.508316993713379 + }, + { + "auxiliary_loss_clip": 0.01121636, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.04691625, + "balance_loss_mlp": 1.02140307, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 1.9634304924993615, + "language_loss": 0.74345231, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76504016, + "num_input_tokens_seen": 119166450, + "step": 5549, + "time_per_iteration": 2.657221555709839 + }, + { + "auxiliary_loss_clip": 0.01123783, + "auxiliary_loss_mlp": 0.0104177, + "balance_loss_clip": 1.04752827, + "balance_loss_mlp": 1.02597439, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.9283372487541417, + "language_loss": 0.68697447, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70862997, + "num_input_tokens_seen": 119189645, + "step": 5550, + "time_per_iteration": 2.6794331073760986 + }, + { + "auxiliary_loss_clip": 0.01089728, + "auxiliary_loss_mlp": 0.01044037, + "balance_loss_clip": 1.04591775, + "balance_loss_mlp": 1.02873647, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 1.7774529748197163, + "language_loss": 0.60163587, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.6229735, + "num_input_tokens_seen": 119208045, + "step": 5551, + "time_per_iteration": 2.527299404144287 + }, + { + "auxiliary_loss_clip": 0.01090977, + "auxiliary_loss_mlp": 0.01036147, + "balance_loss_clip": 1.04778361, + "balance_loss_mlp": 1.02181196, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.7950518390477725, + "language_loss": 0.70595276, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72722399, + "num_input_tokens_seen": 119224910, + "step": 5552, + "time_per_iteration": 2.574974775314331 + }, + { + "auxiliary_loss_clip": 0.01096507, + "auxiliary_loss_mlp": 0.01035807, + "balance_loss_clip": 1.04485524, + "balance_loss_mlp": 1.02123356, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 1.9877689899076725, + "language_loss": 0.82570982, + "learning_rate": 3.107109630732192e-06, + "loss": 0.8470329, + "num_input_tokens_seen": 119243290, + "step": 5553, + "time_per_iteration": 2.5344736576080322 + }, + { + "auxiliary_loss_clip": 0.01112315, + "auxiliary_loss_mlp": 0.00794505, + "balance_loss_clip": 1.0499388, + "balance_loss_mlp": 1.01665354, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 1.8657876204818176, + "language_loss": 0.80720991, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.82627815, + "num_input_tokens_seen": 119261195, + "step": 5554, + "time_per_iteration": 2.5218353271484375 + }, + { + "auxiliary_loss_clip": 0.01121488, + "auxiliary_loss_mlp": 0.01041057, + "balance_loss_clip": 1.04838514, + "balance_loss_mlp": 1.02685308, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.4919463605556968, + "language_loss": 0.81442487, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83605033, + "num_input_tokens_seen": 119282845, + "step": 5555, + "time_per_iteration": 2.542682409286499 + }, + { + "auxiliary_loss_clip": 0.0111842, + "auxiliary_loss_mlp": 0.01040672, + "balance_loss_clip": 1.04644299, + "balance_loss_mlp": 1.02645588, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.7407386105171407, + "language_loss": 0.74456275, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76615369, + "num_input_tokens_seen": 119304430, + "step": 5556, + "time_per_iteration": 3.956327438354492 + }, + { + "auxiliary_loss_clip": 0.01120097, + "auxiliary_loss_mlp": 0.01038467, + "balance_loss_clip": 1.04759526, + "balance_loss_mlp": 1.02457333, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.420117107680523, + "language_loss": 0.82470226, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84628797, + "num_input_tokens_seen": 119323830, + "step": 5557, + "time_per_iteration": 2.537945508956909 + }, + { + "auxiliary_loss_clip": 0.01111923, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.04855561, + "balance_loss_mlp": 1.02265704, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.5381210144178443, + "language_loss": 0.80001342, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82150733, + "num_input_tokens_seen": 119346340, + "step": 5558, + "time_per_iteration": 2.6029341220855713 + }, + { + "auxiliary_loss_clip": 0.0110716, + "auxiliary_loss_mlp": 0.01042119, + "balance_loss_clip": 1.04868066, + "balance_loss_mlp": 1.02776659, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.586762045545281, + "language_loss": 0.81584889, + "learning_rate": 3.105162783594788e-06, + "loss": 0.83734167, + "num_input_tokens_seen": 119367285, + "step": 5559, + "time_per_iteration": 2.571357011795044 + }, + { + "auxiliary_loss_clip": 0.01095886, + "auxiliary_loss_mlp": 0.0104257, + "balance_loss_clip": 1.04834604, + "balance_loss_mlp": 1.0279789, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 1.6821669582844299, + "language_loss": 0.71647573, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.73786026, + "num_input_tokens_seen": 119385370, + "step": 5560, + "time_per_iteration": 3.9117767810821533 + }, + { + "auxiliary_loss_clip": 0.01114421, + "auxiliary_loss_mlp": 0.01044046, + "balance_loss_clip": 1.04834878, + "balance_loss_mlp": 1.02909684, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 1.5268256802125977, + "language_loss": 0.75024223, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77182686, + "num_input_tokens_seen": 119409150, + "step": 5561, + "time_per_iteration": 2.5881669521331787 + }, + { + "auxiliary_loss_clip": 0.01110901, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.05188227, + "balance_loss_mlp": 1.01893342, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 1.7589936363340548, + "language_loss": 0.69297361, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71442103, + "num_input_tokens_seen": 119426475, + "step": 5562, + "time_per_iteration": 3.9318058490753174 + }, + { + "auxiliary_loss_clip": 0.01123188, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.05061877, + "balance_loss_mlp": 1.02641749, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.649529676026584, + "language_loss": 0.64662284, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.66825455, + "num_input_tokens_seen": 119446900, + "step": 5563, + "time_per_iteration": 2.523895025253296 + }, + { + "auxiliary_loss_clip": 0.01077403, + "auxiliary_loss_mlp": 0.01044019, + "balance_loss_clip": 1.05174661, + "balance_loss_mlp": 1.02840304, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 1.360535054000048, + "language_loss": 0.74472356, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76593781, + "num_input_tokens_seen": 119470945, + "step": 5564, + "time_per_iteration": 4.205468654632568 + }, + { + "auxiliary_loss_clip": 0.01023826, + "auxiliary_loss_mlp": 0.01008086, + "balance_loss_clip": 1.0218792, + "balance_loss_mlp": 1.005988, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.778106716183215, + "language_loss": 0.55579448, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57611358, + "num_input_tokens_seen": 119529925, + "step": 5565, + "time_per_iteration": 3.114863395690918 + }, + { + "auxiliary_loss_clip": 0.01134208, + "auxiliary_loss_mlp": 0.01037207, + "balance_loss_clip": 1.05202937, + "balance_loss_mlp": 1.02309895, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.8526630471417884, + "language_loss": 0.65061134, + "learning_rate": 3.102889555312721e-06, + "loss": 0.67232549, + "num_input_tokens_seen": 119550700, + "step": 5566, + "time_per_iteration": 2.61820387840271 + }, + { + "auxiliary_loss_clip": 0.01114922, + "auxiliary_loss_mlp": 0.01040118, + "balance_loss_clip": 1.05024755, + "balance_loss_mlp": 1.0253005, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 1.7838722494845207, + "language_loss": 0.77318799, + "learning_rate": 3.102564641030016e-06, + "loss": 0.79473841, + "num_input_tokens_seen": 119569295, + "step": 5567, + "time_per_iteration": 2.5359787940979004 + }, + { + "auxiliary_loss_clip": 0.01111773, + "auxiliary_loss_mlp": 0.01036391, + "balance_loss_clip": 1.04934549, + "balance_loss_mlp": 1.02070296, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.6429466092176976, + "language_loss": 0.76172352, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78320515, + "num_input_tokens_seen": 119587375, + "step": 5568, + "time_per_iteration": 2.499683380126953 + }, + { + "auxiliary_loss_clip": 0.01098225, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.05071259, + "balance_loss_mlp": 1.02482319, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 1.9446663549803862, + "language_loss": 0.71489513, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73627645, + "num_input_tokens_seen": 119604530, + "step": 5569, + "time_per_iteration": 2.5983738899230957 + }, + { + "auxiliary_loss_clip": 0.01095767, + "auxiliary_loss_mlp": 0.01036052, + "balance_loss_clip": 1.0444659, + "balance_loss_mlp": 1.01945794, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.9522498508532315, + "language_loss": 0.89482403, + "learning_rate": 3.10158964737502e-06, + "loss": 0.91614217, + "num_input_tokens_seen": 119621025, + "step": 5570, + "time_per_iteration": 2.5375895500183105 + }, + { + "auxiliary_loss_clip": 0.01099642, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.04970312, + "balance_loss_mlp": 1.01899457, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.448152600262487, + "language_loss": 0.7962206, + "learning_rate": 3.101264565928808e-06, + "loss": 0.81755936, + "num_input_tokens_seen": 119641725, + "step": 5571, + "time_per_iteration": 2.600584030151367 + }, + { + "auxiliary_loss_clip": 0.01048437, + "auxiliary_loss_mlp": 0.00798853, + "balance_loss_clip": 1.01875937, + "balance_loss_mlp": 1.04525566, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.90130250939553, + "language_loss": 0.55901939, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.5774923, + "num_input_tokens_seen": 119693560, + "step": 5572, + "time_per_iteration": 3.036715507507324 + }, + { + "auxiliary_loss_clip": 0.01135315, + "auxiliary_loss_mlp": 0.01046303, + "balance_loss_clip": 1.05176187, + "balance_loss_mlp": 1.03098512, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 1.7792847980775874, + "language_loss": 0.78341997, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.8052361, + "num_input_tokens_seen": 119712935, + "step": 5573, + "time_per_iteration": 2.5151069164276123 + }, + { + "auxiliary_loss_clip": 0.01102217, + "auxiliary_loss_mlp": 0.01044989, + "balance_loss_clip": 1.04797196, + "balance_loss_mlp": 1.02928901, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.4643976192808883, + "language_loss": 0.724401, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.74587309, + "num_input_tokens_seen": 119731680, + "step": 5574, + "time_per_iteration": 2.658658742904663 + }, + { + "auxiliary_loss_clip": 0.01120474, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.04919863, + "balance_loss_mlp": 1.01795125, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 1.5527502688454318, + "language_loss": 0.88050395, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.90203589, + "num_input_tokens_seen": 119752155, + "step": 5575, + "time_per_iteration": 2.5395758152008057 + }, + { + "auxiliary_loss_clip": 0.01114346, + "auxiliary_loss_mlp": 0.01039097, + "balance_loss_clip": 1.0479871, + "balance_loss_mlp": 1.02225244, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.0553776260834407, + "language_loss": 0.82291591, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84445029, + "num_input_tokens_seen": 119769195, + "step": 5576, + "time_per_iteration": 2.5089094638824463 + }, + { + "auxiliary_loss_clip": 0.01120829, + "auxiliary_loss_mlp": 0.01035573, + "balance_loss_clip": 1.0480938, + "balance_loss_mlp": 1.02008772, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 2.131808297183543, + "language_loss": 0.72159052, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.74315459, + "num_input_tokens_seen": 119786810, + "step": 5577, + "time_per_iteration": 2.522834539413452 + }, + { + "auxiliary_loss_clip": 0.0110506, + "auxiliary_loss_mlp": 0.01037695, + "balance_loss_clip": 1.05348182, + "balance_loss_mlp": 1.0222578, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.5970009651873587, + "language_loss": 0.81792861, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.83935612, + "num_input_tokens_seen": 119805395, + "step": 5578, + "time_per_iteration": 2.5687131881713867 + }, + { + "auxiliary_loss_clip": 0.01072572, + "auxiliary_loss_mlp": 0.00820626, + "balance_loss_clip": 1.04605198, + "balance_loss_mlp": 1.0667789, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 1.740764083642832, + "language_loss": 0.71580797, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73474002, + "num_input_tokens_seen": 119823135, + "step": 5579, + "time_per_iteration": 2.5987608432769775 + }, + { + "auxiliary_loss_clip": 0.01082474, + "auxiliary_loss_mlp": 0.01037523, + "balance_loss_clip": 1.05036891, + "balance_loss_mlp": 1.02135849, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 1.7050273367615514, + "language_loss": 0.81352794, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83472788, + "num_input_tokens_seen": 119842265, + "step": 5580, + "time_per_iteration": 2.6142852306365967 + }, + { + "auxiliary_loss_clip": 0.01118314, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.04947257, + "balance_loss_mlp": 1.01603651, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.7177670393343658, + "language_loss": 0.77889067, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80038643, + "num_input_tokens_seen": 119862500, + "step": 5581, + "time_per_iteration": 2.5574939250946045 + }, + { + "auxiliary_loss_clip": 0.01100682, + "auxiliary_loss_mlp": 0.01045569, + "balance_loss_clip": 1.04710031, + "balance_loss_mlp": 1.02841449, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 2.466293118509431, + "language_loss": 0.7464571, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76791966, + "num_input_tokens_seen": 119880160, + "step": 5582, + "time_per_iteration": 2.5395913124084473 + }, + { + "auxiliary_loss_clip": 0.01108893, + "auxiliary_loss_mlp": 0.01044313, + "balance_loss_clip": 1.04518175, + "balance_loss_mlp": 1.02850592, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.5615663718935355, + "language_loss": 0.81778061, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.83931267, + "num_input_tokens_seen": 119899040, + "step": 5583, + "time_per_iteration": 2.521315336227417 + }, + { + "auxiliary_loss_clip": 0.0111158, + "auxiliary_loss_mlp": 0.0104567, + "balance_loss_clip": 1.0484463, + "balance_loss_mlp": 1.0311439, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.8169661673799984, + "language_loss": 0.77997488, + "learning_rate": 3.097034711451581e-06, + "loss": 0.80154741, + "num_input_tokens_seen": 119921120, + "step": 5584, + "time_per_iteration": 2.6589980125427246 + }, + { + "auxiliary_loss_clip": 0.01112227, + "auxiliary_loss_mlp": 0.01037787, + "balance_loss_clip": 1.0474087, + "balance_loss_mlp": 1.02262402, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.5371972701112662, + "language_loss": 0.76172698, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78322709, + "num_input_tokens_seen": 119940165, + "step": 5585, + "time_per_iteration": 2.54693865776062 + }, + { + "auxiliary_loss_clip": 0.01117767, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.04538321, + "balance_loss_mlp": 1.02404773, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.507248960338949, + "language_loss": 0.7747575, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79633081, + "num_input_tokens_seen": 119959730, + "step": 5586, + "time_per_iteration": 2.5451254844665527 + }, + { + "auxiliary_loss_clip": 0.01098655, + "auxiliary_loss_mlp": 0.01055631, + "balance_loss_clip": 1.05192041, + "balance_loss_mlp": 1.03704643, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.684145280134752, + "language_loss": 0.80838186, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.8299247, + "num_input_tokens_seen": 119979315, + "step": 5587, + "time_per_iteration": 2.556368589401245 + }, + { + "auxiliary_loss_clip": 0.01131141, + "auxiliary_loss_mlp": 0.01042801, + "balance_loss_clip": 1.05027795, + "balance_loss_mlp": 1.02887154, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.7010533526278973, + "language_loss": 0.67026949, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69200897, + "num_input_tokens_seen": 119996140, + "step": 5588, + "time_per_iteration": 2.459420680999756 + }, + { + "auxiliary_loss_clip": 0.01109811, + "auxiliary_loss_mlp": 0.00807538, + "balance_loss_clip": 1.04602671, + "balance_loss_mlp": 1.04038262, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 2.0907640342145704, + "language_loss": 0.70187289, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72104639, + "num_input_tokens_seen": 120017720, + "step": 5589, + "time_per_iteration": 2.610053539276123 + }, + { + "auxiliary_loss_clip": 0.01111767, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.04832494, + "balance_loss_mlp": 1.02929378, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 1.626171432883623, + "language_loss": 0.67269981, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69427341, + "num_input_tokens_seen": 120036335, + "step": 5590, + "time_per_iteration": 2.557910919189453 + }, + { + "auxiliary_loss_clip": 0.01103046, + "auxiliary_loss_mlp": 0.01049558, + "balance_loss_clip": 1.05045378, + "balance_loss_mlp": 1.03430474, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 2.946946472153406, + "language_loss": 0.73538172, + "learning_rate": 3.094754183798047e-06, + "loss": 0.7569077, + "num_input_tokens_seen": 120056120, + "step": 5591, + "time_per_iteration": 2.565479278564453 + }, + { + "auxiliary_loss_clip": 0.01133683, + "auxiliary_loss_mlp": 0.01043082, + "balance_loss_clip": 1.05036747, + "balance_loss_mlp": 1.02838409, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 2.0663762409336983, + "language_loss": 0.69870663, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.72047424, + "num_input_tokens_seen": 120073650, + "step": 5592, + "time_per_iteration": 2.4401934146881104 + }, + { + "auxiliary_loss_clip": 0.01111891, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.05168164, + "balance_loss_mlp": 1.02428555, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 1.9110978116613533, + "language_loss": 0.76674455, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78824985, + "num_input_tokens_seen": 120093260, + "step": 5593, + "time_per_iteration": 2.5452418327331543 + }, + { + "auxiliary_loss_clip": 0.01102706, + "auxiliary_loss_mlp": 0.0079502, + "balance_loss_clip": 1.04755425, + "balance_loss_mlp": 1.01427686, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 1.9597891829019363, + "language_loss": 0.72036928, + "learning_rate": 3.093776191858731e-06, + "loss": 0.7393465, + "num_input_tokens_seen": 120111830, + "step": 5594, + "time_per_iteration": 2.552299976348877 + }, + { + "auxiliary_loss_clip": 0.01081368, + "auxiliary_loss_mlp": 0.00799073, + "balance_loss_clip": 1.04908395, + "balance_loss_mlp": 1.02090669, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.524403703537189, + "language_loss": 0.800354, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.81915838, + "num_input_tokens_seen": 120130470, + "step": 5595, + "time_per_iteration": 3.9465320110321045 + }, + { + "auxiliary_loss_clip": 0.01112716, + "auxiliary_loss_mlp": 0.01036032, + "balance_loss_clip": 1.05174124, + "balance_loss_mlp": 1.02254319, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.5329593644891475, + "language_loss": 0.81005889, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83154637, + "num_input_tokens_seen": 120150735, + "step": 5596, + "time_per_iteration": 2.5364816188812256 + }, + { + "auxiliary_loss_clip": 0.01113971, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_clip": 1.05156195, + "balance_loss_mlp": 1.02699518, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.860812093866987, + "language_loss": 0.75630754, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.77785915, + "num_input_tokens_seen": 120173230, + "step": 5597, + "time_per_iteration": 2.597223997116089 + }, + { + "auxiliary_loss_clip": 0.01124911, + "auxiliary_loss_mlp": 0.01035959, + "balance_loss_clip": 1.05158329, + "balance_loss_mlp": 1.02152872, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 1.657119266375199, + "language_loss": 0.78443009, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.8060388, + "num_input_tokens_seen": 120191860, + "step": 5598, + "time_per_iteration": 3.9629456996917725 + }, + { + "auxiliary_loss_clip": 0.01141324, + "auxiliary_loss_mlp": 0.01039923, + "balance_loss_clip": 1.05237591, + "balance_loss_mlp": 1.02372265, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.4033009644008196, + "language_loss": 0.64717185, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66898429, + "num_input_tokens_seen": 120219195, + "step": 5599, + "time_per_iteration": 2.6622812747955322 + }, + { + "auxiliary_loss_clip": 0.01102727, + "auxiliary_loss_mlp": 0.0105331, + "balance_loss_clip": 1.04849589, + "balance_loss_mlp": 1.03532124, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 2.5421862549736733, + "language_loss": 0.82373261, + "learning_rate": 3.091819088459249e-06, + "loss": 0.84529305, + "num_input_tokens_seen": 120232950, + "step": 5600, + "time_per_iteration": 3.998006820678711 + }, + { + "auxiliary_loss_clip": 0.01130753, + "auxiliary_loss_mlp": 0.01047723, + "balance_loss_clip": 1.05226827, + "balance_loss_mlp": 1.03139734, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 2.176504053391658, + "language_loss": 0.83086812, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.85265285, + "num_input_tokens_seen": 120248865, + "step": 5601, + "time_per_iteration": 2.49052095413208 + }, + { + "auxiliary_loss_clip": 0.01127646, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.05613232, + "balance_loss_mlp": 1.02495408, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.672505156808659, + "language_loss": 0.8374517, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.85912007, + "num_input_tokens_seen": 120267820, + "step": 5602, + "time_per_iteration": 3.850393533706665 + }, + { + "auxiliary_loss_clip": 0.01136329, + "auxiliary_loss_mlp": 0.01054165, + "balance_loss_clip": 1.05029893, + "balance_loss_mlp": 1.03925204, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.842382696239513, + "language_loss": 0.69786656, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.71977144, + "num_input_tokens_seen": 120286540, + "step": 5603, + "time_per_iteration": 2.4520161151885986 + }, + { + "auxiliary_loss_clip": 0.01116307, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_clip": 1.05347061, + "balance_loss_mlp": 1.02311826, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.4271207909988064, + "language_loss": 0.83068252, + "learning_rate": 3.090513524656898e-06, + "loss": 0.85222864, + "num_input_tokens_seen": 120307305, + "step": 5604, + "time_per_iteration": 2.5279109477996826 + }, + { + "auxiliary_loss_clip": 0.01098917, + "auxiliary_loss_mlp": 0.01044586, + "balance_loss_clip": 1.05080986, + "balance_loss_mlp": 1.02899337, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 1.7583111438084957, + "language_loss": 0.73505312, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75648814, + "num_input_tokens_seen": 120327845, + "step": 5605, + "time_per_iteration": 2.55694842338562 + }, + { + "auxiliary_loss_clip": 0.01121052, + "auxiliary_loss_mlp": 0.01038281, + "balance_loss_clip": 1.05290556, + "balance_loss_mlp": 1.02312946, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.511059501331598, + "language_loss": 0.83051419, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85210752, + "num_input_tokens_seen": 120343255, + "step": 5606, + "time_per_iteration": 2.51479434967041 + }, + { + "auxiliary_loss_clip": 0.01111624, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.04885507, + "balance_loss_mlp": 1.02338696, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.5069068165700772, + "language_loss": 0.67864496, + "learning_rate": 3.089533917561809e-06, + "loss": 0.70014244, + "num_input_tokens_seen": 120361745, + "step": 5607, + "time_per_iteration": 2.563803195953369 + }, + { + "auxiliary_loss_clip": 0.01120993, + "auxiliary_loss_mlp": 0.01053168, + "balance_loss_clip": 1.04845631, + "balance_loss_mlp": 1.0363003, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 1.8483661854527487, + "language_loss": 0.71059465, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73233628, + "num_input_tokens_seen": 120380565, + "step": 5608, + "time_per_iteration": 2.600996494293213 + }, + { + "auxiliary_loss_clip": 0.01059139, + "auxiliary_loss_mlp": 0.01041256, + "balance_loss_clip": 1.05147934, + "balance_loss_mlp": 1.02631342, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 1.7490388323601793, + "language_loss": 0.7947253, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81572926, + "num_input_tokens_seen": 120399235, + "step": 5609, + "time_per_iteration": 2.6472108364105225 + }, + { + "auxiliary_loss_clip": 0.01127094, + "auxiliary_loss_mlp": 0.01042523, + "balance_loss_clip": 1.05203962, + "balance_loss_mlp": 1.02662086, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 1.6756335469309984, + "language_loss": 0.82870519, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.8504014, + "num_input_tokens_seen": 120420095, + "step": 5610, + "time_per_iteration": 2.544128179550171 + }, + { + "auxiliary_loss_clip": 0.01124734, + "auxiliary_loss_mlp": 0.01039408, + "balance_loss_clip": 1.05186987, + "balance_loss_mlp": 1.02318382, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 1.7585377162940603, + "language_loss": 0.81921387, + "learning_rate": 3.088227196412879e-06, + "loss": 0.8408553, + "num_input_tokens_seen": 120437690, + "step": 5611, + "time_per_iteration": 2.4829158782958984 + }, + { + "auxiliary_loss_clip": 0.01119892, + "auxiliary_loss_mlp": 0.01040604, + "balance_loss_clip": 1.0546484, + "balance_loss_mlp": 1.0237, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.6092575759631285, + "language_loss": 0.79475796, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81636286, + "num_input_tokens_seen": 120459240, + "step": 5612, + "time_per_iteration": 2.5990817546844482 + }, + { + "auxiliary_loss_clip": 0.01080523, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.04838264, + "balance_loss_mlp": 1.01961362, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 5.218837287786804, + "language_loss": 0.70304585, + "learning_rate": 3.087573588194753e-06, + "loss": 0.72420013, + "num_input_tokens_seen": 120481090, + "step": 5613, + "time_per_iteration": 2.747260808944702 + }, + { + "auxiliary_loss_clip": 0.01118501, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.05151856, + "balance_loss_mlp": 1.01621521, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 1.6778090013766145, + "language_loss": 0.79668695, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81819254, + "num_input_tokens_seen": 120500045, + "step": 5614, + "time_per_iteration": 2.554368019104004 + }, + { + "auxiliary_loss_clip": 0.01105165, + "auxiliary_loss_mlp": 0.01040649, + "balance_loss_clip": 1.04732358, + "balance_loss_mlp": 1.02388835, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.6209061594699703, + "language_loss": 0.91321743, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93467557, + "num_input_tokens_seen": 120521125, + "step": 5615, + "time_per_iteration": 2.5925652980804443 + }, + { + "auxiliary_loss_clip": 0.01119322, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.04715121, + "balance_loss_mlp": 1.01812196, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.6616081461857979, + "language_loss": 0.80633163, + "learning_rate": 3.086592866591809e-06, + "loss": 0.8278479, + "num_input_tokens_seen": 120539180, + "step": 5616, + "time_per_iteration": 2.504072666168213 + }, + { + "auxiliary_loss_clip": 0.01126561, + "auxiliary_loss_mlp": 0.00811505, + "balance_loss_clip": 1.04820061, + "balance_loss_mlp": 1.04626822, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 1.6969100693799928, + "language_loss": 0.83831745, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.85769808, + "num_input_tokens_seen": 120556280, + "step": 5617, + "time_per_iteration": 2.5130770206451416 + }, + { + "auxiliary_loss_clip": 0.0106806, + "auxiliary_loss_mlp": 0.01040065, + "balance_loss_clip": 1.04471517, + "balance_loss_mlp": 1.02387691, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.5580699097318684, + "language_loss": 0.80092418, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82200539, + "num_input_tokens_seen": 120575395, + "step": 5618, + "time_per_iteration": 2.647524118423462 + }, + { + "auxiliary_loss_clip": 0.01090069, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.04767537, + "balance_loss_mlp": 1.02165604, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.6197731468412164, + "language_loss": 0.70302761, + "learning_rate": 3.085611774155481e-06, + "loss": 0.72429609, + "num_input_tokens_seen": 120596075, + "step": 5619, + "time_per_iteration": 2.641116142272949 + }, + { + "auxiliary_loss_clip": 0.0111271, + "auxiliary_loss_mlp": 0.0104642, + "balance_loss_clip": 1.04772329, + "balance_loss_mlp": 1.03142321, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 2.3026418000394764, + "language_loss": 0.69961345, + "learning_rate": 3.085284660993821e-06, + "loss": 0.7212047, + "num_input_tokens_seen": 120614195, + "step": 5620, + "time_per_iteration": 2.604933500289917 + }, + { + "auxiliary_loss_clip": 0.01132738, + "auxiliary_loss_mlp": 0.01041644, + "balance_loss_clip": 1.04900551, + "balance_loss_mlp": 1.02747619, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 1.64566407212675, + "language_loss": 0.68022966, + "learning_rate": 3.084957506678058e-06, + "loss": 0.70197344, + "num_input_tokens_seen": 120634475, + "step": 5621, + "time_per_iteration": 2.5334160327911377 + }, + { + "auxiliary_loss_clip": 0.01106209, + "auxiliary_loss_mlp": 0.01037532, + "balance_loss_clip": 1.05052209, + "balance_loss_mlp": 1.02364469, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.6166646234589637, + "language_loss": 0.82590806, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.84734547, + "num_input_tokens_seen": 120654980, + "step": 5622, + "time_per_iteration": 2.5767462253570557 + }, + { + "auxiliary_loss_clip": 0.01097359, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.04623485, + "balance_loss_mlp": 1.02047706, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.459819892968621, + "language_loss": 0.73540354, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75672376, + "num_input_tokens_seen": 120676245, + "step": 5623, + "time_per_iteration": 2.646000862121582 + }, + { + "auxiliary_loss_clip": 0.01031737, + "auxiliary_loss_mlp": 0.01002853, + "balance_loss_clip": 1.02056813, + "balance_loss_mlp": 1.00073111, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7446325139577281, + "language_loss": 0.5490734, + "learning_rate": 3.083975796930215e-06, + "loss": 0.56941932, + "num_input_tokens_seen": 120741965, + "step": 5624, + "time_per_iteration": 3.2677204608917236 + }, + { + "auxiliary_loss_clip": 0.01090429, + "auxiliary_loss_mlp": 0.01050483, + "balance_loss_clip": 1.04867184, + "balance_loss_mlp": 1.03347182, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 2.451999518187238, + "language_loss": 0.73290044, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75430954, + "num_input_tokens_seen": 120760410, + "step": 5625, + "time_per_iteration": 2.604811668395996 + }, + { + "auxiliary_loss_clip": 0.01124962, + "auxiliary_loss_mlp": 0.01040454, + "balance_loss_clip": 1.04822373, + "balance_loss_mlp": 1.02457523, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 1.8628169889081865, + "language_loss": 0.70656943, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72822362, + "num_input_tokens_seen": 120777705, + "step": 5626, + "time_per_iteration": 2.4951937198638916 + }, + { + "auxiliary_loss_clip": 0.01109109, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.04828393, + "balance_loss_mlp": 1.0188657, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.5739589405002028, + "language_loss": 0.8094635, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83089983, + "num_input_tokens_seen": 120798660, + "step": 5627, + "time_per_iteration": 2.6195735931396484 + }, + { + "auxiliary_loss_clip": 0.01126243, + "auxiliary_loss_mlp": 0.00795753, + "balance_loss_clip": 1.05038667, + "balance_loss_mlp": 1.01896024, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 1.6796445860542126, + "language_loss": 0.80545831, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.8246783, + "num_input_tokens_seen": 120816705, + "step": 5628, + "time_per_iteration": 2.529750347137451 + }, + { + "auxiliary_loss_clip": 0.01079856, + "auxiliary_loss_mlp": 0.01042395, + "balance_loss_clip": 1.0477562, + "balance_loss_mlp": 1.02546692, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.9767443800089315, + "language_loss": 0.77003777, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79126024, + "num_input_tokens_seen": 120835375, + "step": 5629, + "time_per_iteration": 2.6379587650299072 + }, + { + "auxiliary_loss_clip": 0.01112735, + "auxiliary_loss_mlp": 0.0104129, + "balance_loss_clip": 1.04561138, + "balance_loss_mlp": 1.02519703, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 1.7596117868287955, + "language_loss": 0.84723222, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.86877245, + "num_input_tokens_seen": 120854260, + "step": 5630, + "time_per_iteration": 2.585627555847168 + }, + { + "auxiliary_loss_clip": 0.01083255, + "auxiliary_loss_mlp": 0.01053979, + "balance_loss_clip": 1.04597759, + "balance_loss_mlp": 1.03845739, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 1.8719353275883275, + "language_loss": 0.72145414, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.74282646, + "num_input_tokens_seen": 120871590, + "step": 5631, + "time_per_iteration": 2.592647075653076 + }, + { + "auxiliary_loss_clip": 0.01029, + "auxiliary_loss_mlp": 0.01002137, + "balance_loss_clip": 1.02049911, + "balance_loss_mlp": 1.00003898, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.8596488056296706, + "language_loss": 0.56211817, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58242953, + "num_input_tokens_seen": 120925550, + "step": 5632, + "time_per_iteration": 3.202526092529297 + }, + { + "auxiliary_loss_clip": 0.01115278, + "auxiliary_loss_mlp": 0.01036377, + "balance_loss_clip": 1.04706883, + "balance_loss_mlp": 1.02099848, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 2.033594353082862, + "language_loss": 0.80482674, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.8263433, + "num_input_tokens_seen": 120947620, + "step": 5633, + "time_per_iteration": 2.5611870288848877 + }, + { + "auxiliary_loss_clip": 0.0109425, + "auxiliary_loss_mlp": 0.01040498, + "balance_loss_clip": 1.04265249, + "balance_loss_mlp": 1.02507257, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 1.8851048010864473, + "language_loss": 0.58898383, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.6103313, + "num_input_tokens_seen": 120965205, + "step": 5634, + "time_per_iteration": 4.003119468688965 + }, + { + "auxiliary_loss_clip": 0.01099103, + "auxiliary_loss_mlp": 0.010393, + "balance_loss_clip": 1.04838395, + "balance_loss_mlp": 1.02430916, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.8212017000633076, + "language_loss": 0.92448294, + "learning_rate": 3.080373032026589e-06, + "loss": 0.945867, + "num_input_tokens_seen": 120983560, + "step": 5635, + "time_per_iteration": 2.5260612964630127 + }, + { + "auxiliary_loss_clip": 0.01085007, + "auxiliary_loss_mlp": 0.01033717, + "balance_loss_clip": 1.04764175, + "balance_loss_mlp": 1.01878035, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 1.8010296583263754, + "language_loss": 0.75782037, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.77900755, + "num_input_tokens_seen": 121001400, + "step": 5636, + "time_per_iteration": 2.544010877609253 + }, + { + "auxiliary_loss_clip": 0.01118783, + "auxiliary_loss_mlp": 0.01041531, + "balance_loss_clip": 1.04614305, + "balance_loss_mlp": 1.02648056, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.9056574527925345, + "language_loss": 0.83514988, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85675299, + "num_input_tokens_seen": 121021760, + "step": 5637, + "time_per_iteration": 3.8791682720184326 + }, + { + "auxiliary_loss_clip": 0.01084417, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.04610491, + "balance_loss_mlp": 1.02585459, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.7180383549100888, + "language_loss": 0.70061815, + "learning_rate": 3.079389598759495e-06, + "loss": 0.7219041, + "num_input_tokens_seen": 121041070, + "step": 5638, + "time_per_iteration": 4.015904188156128 + }, + { + "auxiliary_loss_clip": 0.01103636, + "auxiliary_loss_mlp": 0.01055274, + "balance_loss_clip": 1.04724908, + "balance_loss_mlp": 1.0397054, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.7274015247509964, + "language_loss": 0.81084299, + "learning_rate": 3.079061705792765e-06, + "loss": 0.83243203, + "num_input_tokens_seen": 121060890, + "step": 5639, + "time_per_iteration": 2.594317674636841 + }, + { + "auxiliary_loss_clip": 0.011338, + "auxiliary_loss_mlp": 0.01048015, + "balance_loss_clip": 1.0469203, + "balance_loss_mlp": 1.03247023, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.2522383901580962, + "language_loss": 0.67834413, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70016229, + "num_input_tokens_seen": 121079135, + "step": 5640, + "time_per_iteration": 2.4960238933563232 + }, + { + "auxiliary_loss_clip": 0.0110738, + "auxiliary_loss_mlp": 0.01041151, + "balance_loss_clip": 1.04692912, + "balance_loss_mlp": 1.02629137, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.6752566842807726, + "language_loss": 0.69855046, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72003573, + "num_input_tokens_seen": 121097685, + "step": 5641, + "time_per_iteration": 3.927908420562744 + }, + { + "auxiliary_loss_clip": 0.01134664, + "auxiliary_loss_mlp": 0.01046569, + "balance_loss_clip": 1.04909289, + "balance_loss_mlp": 1.03167367, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 1.7959978619916872, + "language_loss": 0.8766107, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89842302, + "num_input_tokens_seen": 121115640, + "step": 5642, + "time_per_iteration": 2.5210633277893066 + }, + { + "auxiliary_loss_clip": 0.01113924, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.04663229, + "balance_loss_mlp": 1.01874006, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.7100258677102338, + "language_loss": 0.83968282, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86113775, + "num_input_tokens_seen": 121132485, + "step": 5643, + "time_per_iteration": 2.525709867477417 + }, + { + "auxiliary_loss_clip": 0.01104341, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.04680371, + "balance_loss_mlp": 1.03332055, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 1.5086519357633559, + "language_loss": 0.77168912, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79321122, + "num_input_tokens_seen": 121152935, + "step": 5644, + "time_per_iteration": 2.642129421234131 + }, + { + "auxiliary_loss_clip": 0.01112996, + "auxiliary_loss_mlp": 0.01044638, + "balance_loss_clip": 1.04521394, + "balance_loss_mlp": 1.03013015, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 4.768041122555143, + "language_loss": 0.63128334, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65285969, + "num_input_tokens_seen": 121169835, + "step": 5645, + "time_per_iteration": 2.4716014862060547 + }, + { + "auxiliary_loss_clip": 0.01117364, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.04601395, + "balance_loss_mlp": 1.02759576, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 2.530394190132794, + "language_loss": 0.76728874, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78887856, + "num_input_tokens_seen": 121190290, + "step": 5646, + "time_per_iteration": 2.572298765182495 + }, + { + "auxiliary_loss_clip": 0.01118916, + "auxiliary_loss_mlp": 0.01042535, + "balance_loss_clip": 1.04768991, + "balance_loss_mlp": 1.02712107, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 3.1281602088508187, + "language_loss": 0.78474402, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.80635858, + "num_input_tokens_seen": 121209060, + "step": 5647, + "time_per_iteration": 2.550107717514038 + }, + { + "auxiliary_loss_clip": 0.01107105, + "auxiliary_loss_mlp": 0.00796549, + "balance_loss_clip": 1.05308437, + "balance_loss_mlp": 1.02542078, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 1.8367939846392098, + "language_loss": 0.77445722, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79349375, + "num_input_tokens_seen": 121227480, + "step": 5648, + "time_per_iteration": 2.58085298538208 + }, + { + "auxiliary_loss_clip": 0.00994624, + "auxiliary_loss_mlp": 0.01005494, + "balance_loss_clip": 1.02797484, + "balance_loss_mlp": 1.00328827, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7885268529206491, + "language_loss": 0.56328243, + "learning_rate": 3.075780527680754e-06, + "loss": 0.5832836, + "num_input_tokens_seen": 121291305, + "step": 5649, + "time_per_iteration": 3.436791181564331 + }, + { + "auxiliary_loss_clip": 0.0110223, + "auxiliary_loss_mlp": 0.00797917, + "balance_loss_clip": 1.04392338, + "balance_loss_mlp": 1.01979637, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.3750576955834768, + "language_loss": 0.85620612, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87520754, + "num_input_tokens_seen": 121312740, + "step": 5650, + "time_per_iteration": 2.7819693088531494 + }, + { + "auxiliary_loss_clip": 0.01119457, + "auxiliary_loss_mlp": 0.01033573, + "balance_loss_clip": 1.04568434, + "balance_loss_mlp": 1.01961327, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.560562624615696, + "language_loss": 0.70549858, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.72702891, + "num_input_tokens_seen": 121334220, + "step": 5651, + "time_per_iteration": 2.6545698642730713 + }, + { + "auxiliary_loss_clip": 0.01082435, + "auxiliary_loss_mlp": 0.01040478, + "balance_loss_clip": 1.04599142, + "balance_loss_mlp": 1.02591109, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 1.993898850316831, + "language_loss": 0.81301177, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83424085, + "num_input_tokens_seen": 121351870, + "step": 5652, + "time_per_iteration": 2.5853075981140137 + }, + { + "auxiliary_loss_clip": 0.01135844, + "auxiliary_loss_mlp": 0.01036146, + "balance_loss_clip": 1.04973626, + "balance_loss_mlp": 1.02120948, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.6504144076819252, + "language_loss": 0.76946211, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79118204, + "num_input_tokens_seen": 121373400, + "step": 5653, + "time_per_iteration": 2.5696804523468018 + }, + { + "auxiliary_loss_clip": 0.01114578, + "auxiliary_loss_mlp": 0.01046614, + "balance_loss_clip": 1.04516721, + "balance_loss_mlp": 1.03152204, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 2.7524037642974943, + "language_loss": 0.85278833, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.8744002, + "num_input_tokens_seen": 121385225, + "step": 5654, + "time_per_iteration": 2.4556944370269775 + }, + { + "auxiliary_loss_clip": 0.011162, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.04835129, + "balance_loss_mlp": 1.02166128, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 2.336365321478241, + "language_loss": 0.65092331, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67245054, + "num_input_tokens_seen": 121404735, + "step": 5655, + "time_per_iteration": 2.5661087036132812 + }, + { + "auxiliary_loss_clip": 0.01122172, + "auxiliary_loss_mlp": 0.01039703, + "balance_loss_clip": 1.05074382, + "balance_loss_mlp": 1.02605379, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.5668395255716543, + "language_loss": 0.76854104, + "learning_rate": 3.073481275036697e-06, + "loss": 0.79015982, + "num_input_tokens_seen": 121426780, + "step": 5656, + "time_per_iteration": 2.6227197647094727 + }, + { + "auxiliary_loss_clip": 0.0109813, + "auxiliary_loss_mlp": 0.01039892, + "balance_loss_clip": 1.04587126, + "balance_loss_mlp": 1.02422786, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 1.7656283860282789, + "language_loss": 0.8339889, + "learning_rate": 3.073152647447525e-06, + "loss": 0.85536909, + "num_input_tokens_seen": 121447245, + "step": 5657, + "time_per_iteration": 2.5967695713043213 + }, + { + "auxiliary_loss_clip": 0.01104815, + "auxiliary_loss_mlp": 0.01039551, + "balance_loss_clip": 1.04742038, + "balance_loss_mlp": 1.0261997, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.9021899847783224, + "language_loss": 0.85538173, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87682533, + "num_input_tokens_seen": 121468165, + "step": 5658, + "time_per_iteration": 2.5734925270080566 + }, + { + "auxiliary_loss_clip": 0.01037994, + "auxiliary_loss_mlp": 0.01022975, + "balance_loss_clip": 1.01932549, + "balance_loss_mlp": 1.0207938, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.817092038647145, + "language_loss": 0.60063612, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62124586, + "num_input_tokens_seen": 121523795, + "step": 5659, + "time_per_iteration": 3.069321870803833 + }, + { + "auxiliary_loss_clip": 0.01129145, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.04882216, + "balance_loss_mlp": 1.02192843, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.9257947345530304, + "language_loss": 0.6749649, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.69660985, + "num_input_tokens_seen": 121542950, + "step": 5660, + "time_per_iteration": 2.5340828895568848 + }, + { + "auxiliary_loss_clip": 0.01134185, + "auxiliary_loss_mlp": 0.01045857, + "balance_loss_clip": 1.05167961, + "balance_loss_mlp": 1.03097963, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 2.0367146888013137, + "language_loss": 0.67046511, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69226551, + "num_input_tokens_seen": 121562765, + "step": 5661, + "time_per_iteration": 2.559863328933716 + }, + { + "auxiliary_loss_clip": 0.01108903, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.04771423, + "balance_loss_mlp": 1.02567434, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 1.7239698492965985, + "language_loss": 0.78973424, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81121755, + "num_input_tokens_seen": 121581610, + "step": 5662, + "time_per_iteration": 2.558148145675659 + }, + { + "auxiliary_loss_clip": 0.01098955, + "auxiliary_loss_mlp": 0.01038596, + "balance_loss_clip": 1.05207086, + "balance_loss_mlp": 1.02348018, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 2.052164418592098, + "language_loss": 0.73377538, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75515079, + "num_input_tokens_seen": 121601885, + "step": 5663, + "time_per_iteration": 2.5971908569335938 + }, + { + "auxiliary_loss_clip": 0.01087352, + "auxiliary_loss_mlp": 0.01038047, + "balance_loss_clip": 1.04658115, + "balance_loss_mlp": 1.02482033, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 1.6984642450855774, + "language_loss": 0.86486512, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88611913, + "num_input_tokens_seen": 121621335, + "step": 5664, + "time_per_iteration": 2.5762150287628174 + }, + { + "auxiliary_loss_clip": 0.01133178, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.04900193, + "balance_loss_mlp": 1.0224818, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 1.8640738577462617, + "language_loss": 0.68651927, + "learning_rate": 3.070522162795235e-06, + "loss": 0.7082144, + "num_input_tokens_seen": 121641310, + "step": 5665, + "time_per_iteration": 2.4929463863372803 + }, + { + "auxiliary_loss_clip": 0.01133303, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.04895532, + "balance_loss_mlp": 1.02404714, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.5037769846412226, + "language_loss": 0.73076272, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.75249636, + "num_input_tokens_seen": 121659625, + "step": 5666, + "time_per_iteration": 2.4856858253479004 + }, + { + "auxiliary_loss_clip": 0.01125123, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.04790783, + "balance_loss_mlp": 1.02224898, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 1.5060264965876597, + "language_loss": 0.73254758, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75416327, + "num_input_tokens_seen": 121679205, + "step": 5667, + "time_per_iteration": 2.537642478942871 + }, + { + "auxiliary_loss_clip": 0.01038983, + "auxiliary_loss_mlp": 0.01013917, + "balance_loss_clip": 1.02748215, + "balance_loss_mlp": 1.01152074, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8434254783907549, + "language_loss": 0.63303459, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65356362, + "num_input_tokens_seen": 121751085, + "step": 5668, + "time_per_iteration": 3.292513370513916 + }, + { + "auxiliary_loss_clip": 0.01043531, + "auxiliary_loss_mlp": 0.01040841, + "balance_loss_clip": 1.05030179, + "balance_loss_mlp": 1.02492654, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 2.209231736193042, + "language_loss": 0.72364616, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.74448985, + "num_input_tokens_seen": 121768565, + "step": 5669, + "time_per_iteration": 2.8021633625030518 + }, + { + "auxiliary_loss_clip": 0.01097468, + "auxiliary_loss_mlp": 0.00792473, + "balance_loss_clip": 1.04791343, + "balance_loss_mlp": 1.01242328, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 1.9300508828493999, + "language_loss": 0.80458981, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82348919, + "num_input_tokens_seen": 121784925, + "step": 5670, + "time_per_iteration": 3.548823595046997 + }, + { + "auxiliary_loss_clip": 0.01084916, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.0431633, + "balance_loss_mlp": 1.0206691, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.8327681834622422, + "language_loss": 0.77530444, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79650652, + "num_input_tokens_seen": 121804425, + "step": 5671, + "time_per_iteration": 2.6493420600891113 + }, + { + "auxiliary_loss_clip": 0.0113367, + "auxiliary_loss_mlp": 0.00793258, + "balance_loss_clip": 1.04912543, + "balance_loss_mlp": 1.01493692, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 2.1735867572850687, + "language_loss": 0.74009454, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.75936383, + "num_input_tokens_seen": 121825145, + "step": 5672, + "time_per_iteration": 2.505359411239624 + }, + { + "auxiliary_loss_clip": 0.01116932, + "auxiliary_loss_mlp": 0.0104162, + "balance_loss_clip": 1.04614782, + "balance_loss_mlp": 1.02607548, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.872715424960896, + "language_loss": 0.73440754, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75599307, + "num_input_tokens_seen": 121842185, + "step": 5673, + "time_per_iteration": 4.238344669342041 + }, + { + "auxiliary_loss_clip": 0.01122337, + "auxiliary_loss_mlp": 0.01036204, + "balance_loss_clip": 1.04915595, + "balance_loss_mlp": 1.02151752, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.7698245463823359, + "language_loss": 0.79754424, + "learning_rate": 3.067559762415682e-06, + "loss": 0.81912965, + "num_input_tokens_seen": 121862260, + "step": 5674, + "time_per_iteration": 2.549967050552368 + }, + { + "auxiliary_loss_clip": 0.01048861, + "auxiliary_loss_mlp": 0.01002786, + "balance_loss_clip": 1.02037501, + "balance_loss_mlp": 1.00025868, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 1.245975616558794, + "language_loss": 0.56058705, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58110356, + "num_input_tokens_seen": 121923560, + "step": 5675, + "time_per_iteration": 3.234285593032837 + }, + { + "auxiliary_loss_clip": 0.01111321, + "auxiliary_loss_mlp": 0.00792577, + "balance_loss_clip": 1.04987335, + "balance_loss_mlp": 1.01455426, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.8667831734763414, + "language_loss": 0.79397911, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.81301808, + "num_input_tokens_seen": 121943515, + "step": 5676, + "time_per_iteration": 3.9644792079925537 + }, + { + "auxiliary_loss_clip": 0.01116355, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.0438205, + "balance_loss_mlp": 1.0158782, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 2.063781887135893, + "language_loss": 0.85403764, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87551296, + "num_input_tokens_seen": 121962540, + "step": 5677, + "time_per_iteration": 3.9514260292053223 + }, + { + "auxiliary_loss_clip": 0.01106829, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.04931426, + "balance_loss_mlp": 1.02235746, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 1.851689470051099, + "language_loss": 0.79373002, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81517327, + "num_input_tokens_seen": 121979830, + "step": 5678, + "time_per_iteration": 2.5567758083343506 + }, + { + "auxiliary_loss_clip": 0.01117989, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.04474688, + "balance_loss_mlp": 1.01770043, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.7922020358948438, + "language_loss": 0.74979281, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.77129698, + "num_input_tokens_seen": 121999055, + "step": 5679, + "time_per_iteration": 3.929081678390503 + }, + { + "auxiliary_loss_clip": 0.01041389, + "auxiliary_loss_mlp": 0.01002144, + "balance_loss_clip": 1.02421856, + "balance_loss_mlp": 0.99999827, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7146282807856524, + "language_loss": 0.59485769, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61529303, + "num_input_tokens_seen": 122067015, + "step": 5680, + "time_per_iteration": 3.1907830238342285 + }, + { + "auxiliary_loss_clip": 0.01106596, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.04587626, + "balance_loss_mlp": 1.01654291, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 1.7535210232065053, + "language_loss": 0.72432548, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74569786, + "num_input_tokens_seen": 122085295, + "step": 5681, + "time_per_iteration": 2.5356457233428955 + }, + { + "auxiliary_loss_clip": 0.01108874, + "auxiliary_loss_mlp": 0.01044194, + "balance_loss_clip": 1.04768205, + "balance_loss_mlp": 1.03000855, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 3.4828876579297767, + "language_loss": 0.71029687, + "learning_rate": 3.064923764577233e-06, + "loss": 0.7318275, + "num_input_tokens_seen": 122104020, + "step": 5682, + "time_per_iteration": 2.5861222743988037 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01037754, + "balance_loss_clip": 1.0455128, + "balance_loss_mlp": 1.02305579, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.6045328575230626, + "language_loss": 0.83801365, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.85968637, + "num_input_tokens_seen": 122125080, + "step": 5683, + "time_per_iteration": 2.530492067337036 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01047235, + "balance_loss_clip": 1.04813743, + "balance_loss_mlp": 1.03218484, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 1.6631889328829361, + "language_loss": 0.70458078, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72616798, + "num_input_tokens_seen": 122146350, + "step": 5684, + "time_per_iteration": 2.629939317703247 + }, + { + "auxiliary_loss_clip": 0.01128772, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.04715002, + "balance_loss_mlp": 1.01769042, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.3529969663233177, + "language_loss": 0.75151646, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77311951, + "num_input_tokens_seen": 122168085, + "step": 5685, + "time_per_iteration": 2.5329973697662354 + }, + { + "auxiliary_loss_clip": 0.01112496, + "auxiliary_loss_mlp": 0.01041535, + "balance_loss_clip": 1.0444175, + "balance_loss_mlp": 1.02691436, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 2.1529275824714267, + "language_loss": 0.70438361, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.7259239, + "num_input_tokens_seen": 122191040, + "step": 5686, + "time_per_iteration": 2.5817360877990723 + }, + { + "auxiliary_loss_clip": 0.01120411, + "auxiliary_loss_mlp": 0.01044068, + "balance_loss_clip": 1.0471158, + "balance_loss_mlp": 1.02877307, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 1.9392015101583955, + "language_loss": 0.7749486, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79659337, + "num_input_tokens_seen": 122209225, + "step": 5687, + "time_per_iteration": 2.4860329627990723 + }, + { + "auxiliary_loss_clip": 0.01103214, + "auxiliary_loss_mlp": 0.01036005, + "balance_loss_clip": 1.04832828, + "balance_loss_mlp": 1.02118707, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 1.750346626851128, + "language_loss": 0.86822718, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88961935, + "num_input_tokens_seen": 122226160, + "step": 5688, + "time_per_iteration": 2.5126590728759766 + }, + { + "auxiliary_loss_clip": 0.01119674, + "auxiliary_loss_mlp": 0.01038386, + "balance_loss_clip": 1.04751062, + "balance_loss_mlp": 1.02284098, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 1.6894257675095075, + "language_loss": 0.79903233, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82061291, + "num_input_tokens_seen": 122243115, + "step": 5689, + "time_per_iteration": 2.515993356704712 + }, + { + "auxiliary_loss_clip": 0.01123746, + "auxiliary_loss_mlp": 0.01039358, + "balance_loss_clip": 1.04777002, + "balance_loss_mlp": 1.02437329, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 1.92160976041539, + "language_loss": 0.7376225, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75925356, + "num_input_tokens_seen": 122261105, + "step": 5690, + "time_per_iteration": 2.4873716831207275 + }, + { + "auxiliary_loss_clip": 0.0111335, + "auxiliary_loss_mlp": 0.0103636, + "balance_loss_clip": 1.04337406, + "balance_loss_mlp": 1.02130353, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 1.7211127251650544, + "language_loss": 0.75679612, + "learning_rate": 3.061955178104237e-06, + "loss": 0.77829319, + "num_input_tokens_seen": 122279995, + "step": 5691, + "time_per_iteration": 2.5226752758026123 + }, + { + "auxiliary_loss_clip": 0.01115209, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.045187, + "balance_loss_mlp": 1.02133286, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.8235103448354448, + "language_loss": 0.6795457, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70104498, + "num_input_tokens_seen": 122299070, + "step": 5692, + "time_per_iteration": 2.522111177444458 + }, + { + "auxiliary_loss_clip": 0.01119891, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.0467689, + "balance_loss_mlp": 1.02169359, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.108045862238099, + "language_loss": 0.72894788, + "learning_rate": 3.06129504893632e-06, + "loss": 0.75051355, + "num_input_tokens_seen": 122316800, + "step": 5693, + "time_per_iteration": 2.4905612468719482 + }, + { + "auxiliary_loss_clip": 0.01087615, + "auxiliary_loss_mlp": 0.01041149, + "balance_loss_clip": 1.04446435, + "balance_loss_mlp": 1.02713013, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 7.5947264208531085, + "language_loss": 0.75097442, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.77226204, + "num_input_tokens_seen": 122335275, + "step": 5694, + "time_per_iteration": 2.561955451965332 + }, + { + "auxiliary_loss_clip": 0.01089518, + "auxiliary_loss_mlp": 0.010373, + "balance_loss_clip": 1.04769492, + "balance_loss_mlp": 1.02375841, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.849644282803971, + "language_loss": 0.79685074, + "learning_rate": 3.060634758790747e-06, + "loss": 0.81811887, + "num_input_tokens_seen": 122353215, + "step": 5695, + "time_per_iteration": 2.5597431659698486 + }, + { + "auxiliary_loss_clip": 0.01077687, + "auxiliary_loss_mlp": 0.01040897, + "balance_loss_clip": 1.04354024, + "balance_loss_mlp": 1.02640128, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 1.6324055721219808, + "language_loss": 0.7356804, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75686622, + "num_input_tokens_seen": 122372495, + "step": 5696, + "time_per_iteration": 2.624439001083374 + }, + { + "auxiliary_loss_clip": 0.01094564, + "auxiliary_loss_mlp": 0.01055637, + "balance_loss_clip": 1.04829872, + "balance_loss_mlp": 1.03990149, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 1.6897783954598737, + "language_loss": 0.70765203, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.72915399, + "num_input_tokens_seen": 122394600, + "step": 5697, + "time_per_iteration": 2.6139001846313477 + }, + { + "auxiliary_loss_clip": 0.01111009, + "auxiliary_loss_mlp": 0.01032898, + "balance_loss_clip": 1.04961157, + "balance_loss_mlp": 1.01870036, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 1.94705850959536, + "language_loss": 0.82419479, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84563386, + "num_input_tokens_seen": 122414700, + "step": 5698, + "time_per_iteration": 2.5791268348693848 + }, + { + "auxiliary_loss_clip": 0.01079567, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_clip": 1.04466009, + "balance_loss_mlp": 1.03151441, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 1.7503673228858105, + "language_loss": 0.69028735, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71158254, + "num_input_tokens_seen": 122432760, + "step": 5699, + "time_per_iteration": 2.6564457416534424 + }, + { + "auxiliary_loss_clip": 0.01108427, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.04644513, + "balance_loss_mlp": 1.02148223, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.0294204050599003, + "language_loss": 0.72586113, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74730098, + "num_input_tokens_seen": 122449105, + "step": 5700, + "time_per_iteration": 2.579179048538208 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.0489527, + "balance_loss_mlp": 1.01965237, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 1.808419279637055, + "language_loss": 0.82007027, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.84143019, + "num_input_tokens_seen": 122468700, + "step": 5701, + "time_per_iteration": 2.5421142578125 + }, + { + "auxiliary_loss_clip": 0.01113807, + "auxiliary_loss_mlp": 0.01032218, + "balance_loss_clip": 1.04644346, + "balance_loss_mlp": 1.0183177, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.6211516508197004, + "language_loss": 0.71206844, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73352861, + "num_input_tokens_seen": 122488160, + "step": 5702, + "time_per_iteration": 2.513777494430542 + }, + { + "auxiliary_loss_clip": 0.01035916, + "auxiliary_loss_mlp": 0.01006506, + "balance_loss_clip": 1.03588808, + "balance_loss_mlp": 1.0043366, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.7929841012532431, + "language_loss": 0.57401109, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59443533, + "num_input_tokens_seen": 122542890, + "step": 5703, + "time_per_iteration": 3.0223841667175293 + }, + { + "auxiliary_loss_clip": 0.01119227, + "auxiliary_loss_mlp": 0.01040621, + "balance_loss_clip": 1.04743934, + "balance_loss_mlp": 1.0249989, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 1.7997286723572108, + "language_loss": 0.74340427, + "learning_rate": 3.057661463723086e-06, + "loss": 0.76500273, + "num_input_tokens_seen": 122561770, + "step": 5704, + "time_per_iteration": 2.521563768386841 + }, + { + "auxiliary_loss_clip": 0.01100733, + "auxiliary_loss_mlp": 0.01041631, + "balance_loss_clip": 1.04660726, + "balance_loss_mlp": 1.02860093, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 1.9523696512974578, + "language_loss": 0.72830284, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.74972641, + "num_input_tokens_seen": 122580580, + "step": 5705, + "time_per_iteration": 2.597683906555176 + }, + { + "auxiliary_loss_clip": 0.01085546, + "auxiliary_loss_mlp": 0.01035446, + "balance_loss_clip": 1.04298496, + "balance_loss_mlp": 1.02101541, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 1.98706037545995, + "language_loss": 0.80058694, + "learning_rate": 3.057000289991289e-06, + "loss": 0.82179689, + "num_input_tokens_seen": 122599810, + "step": 5706, + "time_per_iteration": 2.589869737625122 + }, + { + "auxiliary_loss_clip": 0.01113707, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.04834807, + "balance_loss_mlp": 1.01878083, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 3.3997122089974274, + "language_loss": 0.82996064, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85142982, + "num_input_tokens_seen": 122616035, + "step": 5707, + "time_per_iteration": 2.4882290363311768 + }, + { + "auxiliary_loss_clip": 0.01120917, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.04911935, + "balance_loss_mlp": 1.01712477, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.8754806185084165, + "language_loss": 0.74905574, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77057755, + "num_input_tokens_seen": 122633785, + "step": 5708, + "time_per_iteration": 2.4715514183044434 + }, + { + "auxiliary_loss_clip": 0.01097198, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.04433513, + "balance_loss_mlp": 1.02314353, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.6356228858761876, + "language_loss": 0.81198543, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.8333258, + "num_input_tokens_seen": 122652100, + "step": 5709, + "time_per_iteration": 2.561706304550171 + }, + { + "auxiliary_loss_clip": 0.01109225, + "auxiliary_loss_mlp": 0.01041432, + "balance_loss_clip": 1.04690897, + "balance_loss_mlp": 1.02567863, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.095245398137087, + "language_loss": 0.78742772, + "learning_rate": 3.055677461649329e-06, + "loss": 0.80893427, + "num_input_tokens_seen": 122669720, + "step": 5710, + "time_per_iteration": 2.5305063724517822 + }, + { + "auxiliary_loss_clip": 0.01120379, + "auxiliary_loss_mlp": 0.01036491, + "balance_loss_clip": 1.04589629, + "balance_loss_mlp": 1.02122593, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 1.8374974785896627, + "language_loss": 0.70443261, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72600126, + "num_input_tokens_seen": 122688715, + "step": 5711, + "time_per_iteration": 3.9133830070495605 + }, + { + "auxiliary_loss_clip": 0.01094939, + "auxiliary_loss_mlp": 0.00793821, + "balance_loss_clip": 1.04353321, + "balance_loss_mlp": 1.01639199, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.7961413868105884, + "language_loss": 0.66999364, + "learning_rate": 3.055015807239812e-06, + "loss": 0.68888128, + "num_input_tokens_seen": 122706970, + "step": 5712, + "time_per_iteration": 2.5162129402160645 + }, + { + "auxiliary_loss_clip": 0.01018826, + "auxiliary_loss_mlp": 0.01011539, + "balance_loss_clip": 1.02230716, + "balance_loss_mlp": 1.00950098, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8469514412611118, + "language_loss": 0.58112222, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60142589, + "num_input_tokens_seen": 122758095, + "step": 5713, + "time_per_iteration": 3.1269378662109375 + }, + { + "auxiliary_loss_clip": 0.01130573, + "auxiliary_loss_mlp": 0.01039821, + "balance_loss_clip": 1.04695427, + "balance_loss_mlp": 1.02540803, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.7848104135568157, + "language_loss": 0.80742419, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82912809, + "num_input_tokens_seen": 122777815, + "step": 5714, + "time_per_iteration": 3.9391515254974365 + }, + { + "auxiliary_loss_clip": 0.01129218, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.04644632, + "balance_loss_mlp": 1.02446985, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 1.8432215189574783, + "language_loss": 0.72314036, + "learning_rate": 3.05402302560962e-06, + "loss": 0.74482644, + "num_input_tokens_seen": 122797555, + "step": 5715, + "time_per_iteration": 3.881129503250122 + }, + { + "auxiliary_loss_clip": 0.01032908, + "auxiliary_loss_mlp": 0.0100322, + "balance_loss_clip": 1.020015, + "balance_loss_mlp": 1.00106263, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.894190752780206, + "language_loss": 0.65825307, + "learning_rate": 3.053692018445505e-06, + "loss": 0.67861432, + "num_input_tokens_seen": 122863955, + "step": 5716, + "time_per_iteration": 3.132615804672241 + }, + { + "auxiliary_loss_clip": 0.01118548, + "auxiliary_loss_mlp": 0.01042479, + "balance_loss_clip": 1.05299509, + "balance_loss_mlp": 1.02757812, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 1.7165858454587368, + "language_loss": 0.73807061, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.75968087, + "num_input_tokens_seen": 122883000, + "step": 5717, + "time_per_iteration": 2.5186049938201904 + }, + { + "auxiliary_loss_clip": 0.01075882, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.04658663, + "balance_loss_mlp": 1.0232091, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 1.6239063083827567, + "language_loss": 0.7564429, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77757031, + "num_input_tokens_seen": 122903265, + "step": 5718, + "time_per_iteration": 4.009954214096069 + }, + { + "auxiliary_loss_clip": 0.0109336, + "auxiliary_loss_mlp": 0.01046333, + "balance_loss_clip": 1.04817104, + "balance_loss_mlp": 1.03115189, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.8350036771739286, + "language_loss": 0.6398294, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66122639, + "num_input_tokens_seen": 122923860, + "step": 5719, + "time_per_iteration": 2.6446497440338135 + }, + { + "auxiliary_loss_clip": 0.01087221, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.04416358, + "balance_loss_mlp": 1.02190387, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 1.6799516539082462, + "language_loss": 0.73821795, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75947618, + "num_input_tokens_seen": 122945305, + "step": 5720, + "time_per_iteration": 2.6081628799438477 + }, + { + "auxiliary_loss_clip": 0.01113698, + "auxiliary_loss_mlp": 0.01049324, + "balance_loss_clip": 1.0437721, + "balance_loss_mlp": 1.03251576, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.6222564078771202, + "language_loss": 0.74273765, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76436788, + "num_input_tokens_seen": 122962535, + "step": 5721, + "time_per_iteration": 2.474538564682007 + }, + { + "auxiliary_loss_clip": 0.01111049, + "auxiliary_loss_mlp": 0.00796407, + "balance_loss_clip": 1.05333543, + "balance_loss_mlp": 1.01797044, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 1.9726514238743618, + "language_loss": 0.79943669, + "learning_rate": 3.051705136821992e-06, + "loss": 0.81851125, + "num_input_tokens_seen": 122979750, + "step": 5722, + "time_per_iteration": 2.5066146850585938 + }, + { + "auxiliary_loss_clip": 0.01079007, + "auxiliary_loss_mlp": 0.01032068, + "balance_loss_clip": 1.04562068, + "balance_loss_mlp": 1.01846611, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 2.212307655994471, + "language_loss": 0.81790084, + "learning_rate": 3.051373850228801e-06, + "loss": 0.83901155, + "num_input_tokens_seen": 122998955, + "step": 5723, + "time_per_iteration": 2.6435256004333496 + }, + { + "auxiliary_loss_clip": 0.0109358, + "auxiliary_loss_mlp": 0.01050386, + "balance_loss_clip": 1.04387426, + "balance_loss_mlp": 1.03409004, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.8472328405445384, + "language_loss": 0.80935365, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83079338, + "num_input_tokens_seen": 123016165, + "step": 5724, + "time_per_iteration": 2.515774965286255 + }, + { + "auxiliary_loss_clip": 0.01102317, + "auxiliary_loss_mlp": 0.01043338, + "balance_loss_clip": 1.04343581, + "balance_loss_mlp": 1.02682769, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 1.71552655143069, + "language_loss": 0.68730938, + "learning_rate": 3.05071115745038e-06, + "loss": 0.70876598, + "num_input_tokens_seen": 123036900, + "step": 5725, + "time_per_iteration": 2.60569429397583 + }, + { + "auxiliary_loss_clip": 0.01120135, + "auxiliary_loss_mlp": 0.01044824, + "balance_loss_clip": 1.04667759, + "balance_loss_mlp": 1.02792013, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.3409587827898828, + "language_loss": 0.69242626, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71407586, + "num_input_tokens_seen": 123057480, + "step": 5726, + "time_per_iteration": 2.5077943801879883 + }, + { + "auxiliary_loss_clip": 0.01099304, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.04617119, + "balance_loss_mlp": 1.0280093, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.6817234128163239, + "language_loss": 0.73005056, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.75146192, + "num_input_tokens_seen": 123076890, + "step": 5727, + "time_per_iteration": 2.597428321838379 + }, + { + "auxiliary_loss_clip": 0.01090028, + "auxiliary_loss_mlp": 0.01039791, + "balance_loss_clip": 1.04428828, + "balance_loss_mlp": 1.02435303, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 1.7785759388353075, + "language_loss": 0.88125885, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90255702, + "num_input_tokens_seen": 123092530, + "step": 5728, + "time_per_iteration": 2.548021078109741 + }, + { + "auxiliary_loss_clip": 0.01086334, + "auxiliary_loss_mlp": 0.01043323, + "balance_loss_clip": 1.05194914, + "balance_loss_mlp": 1.02857649, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 1.937217359294808, + "language_loss": 0.70012105, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72141767, + "num_input_tokens_seen": 123110560, + "step": 5729, + "time_per_iteration": 2.61147403717041 + }, + { + "auxiliary_loss_clip": 0.01115147, + "auxiliary_loss_mlp": 0.01032406, + "balance_loss_clip": 1.04565525, + "balance_loss_mlp": 1.01780331, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 2.2589660532263927, + "language_loss": 0.73904788, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.76052344, + "num_input_tokens_seen": 123128655, + "step": 5730, + "time_per_iteration": 2.465240716934204 + }, + { + "auxiliary_loss_clip": 0.01091006, + "auxiliary_loss_mlp": 0.01048322, + "balance_loss_clip": 1.04208827, + "balance_loss_mlp": 1.03193116, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 2.043328677408356, + "language_loss": 0.79610717, + "learning_rate": 3.048722123283578e-06, + "loss": 0.81750053, + "num_input_tokens_seen": 123145130, + "step": 5731, + "time_per_iteration": 2.5481083393096924 + }, + { + "auxiliary_loss_clip": 0.01118139, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.04501152, + "balance_loss_mlp": 1.02233028, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 2.0607995050496863, + "language_loss": 0.78710258, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.8086524, + "num_input_tokens_seen": 123162265, + "step": 5732, + "time_per_iteration": 2.4600865840911865 + }, + { + "auxiliary_loss_clip": 0.01016771, + "auxiliary_loss_mlp": 0.01003446, + "balance_loss_clip": 1.0188992, + "balance_loss_mlp": 1.00147939, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.7427556907238304, + "language_loss": 0.53545117, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55565333, + "num_input_tokens_seen": 123218620, + "step": 5733, + "time_per_iteration": 3.1845645904541016 + }, + { + "auxiliary_loss_clip": 0.01111863, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.04751742, + "balance_loss_mlp": 1.02314782, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 2.055627067095104, + "language_loss": 0.83521259, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85671401, + "num_input_tokens_seen": 123237325, + "step": 5734, + "time_per_iteration": 2.525757312774658 + }, + { + "auxiliary_loss_clip": 0.01108431, + "auxiliary_loss_mlp": 0.01034507, + "balance_loss_clip": 1.04631579, + "balance_loss_mlp": 1.01990414, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 1.8371320533577185, + "language_loss": 0.92572904, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.9471584, + "num_input_tokens_seen": 123258650, + "step": 5735, + "time_per_iteration": 2.580904960632324 + }, + { + "auxiliary_loss_clip": 0.01090967, + "auxiliary_loss_mlp": 0.01038424, + "balance_loss_clip": 1.04761338, + "balance_loss_mlp": 1.0231117, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 2.1082194557465805, + "language_loss": 0.76913357, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.79042745, + "num_input_tokens_seen": 123277155, + "step": 5736, + "time_per_iteration": 2.6083786487579346 + }, + { + "auxiliary_loss_clip": 0.01109842, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.04582214, + "balance_loss_mlp": 1.02478611, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.583783552807668, + "language_loss": 0.78856671, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.81005687, + "num_input_tokens_seen": 123297640, + "step": 5737, + "time_per_iteration": 2.5422370433807373 + }, + { + "auxiliary_loss_clip": 0.01084691, + "auxiliary_loss_mlp": 0.01048588, + "balance_loss_clip": 1.04155731, + "balance_loss_mlp": 1.03093326, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 1.7780168905197622, + "language_loss": 0.71362531, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73495805, + "num_input_tokens_seen": 123314370, + "step": 5738, + "time_per_iteration": 2.569952964782715 + }, + { + "auxiliary_loss_clip": 0.01092197, + "auxiliary_loss_mlp": 0.0104044, + "balance_loss_clip": 1.04213834, + "balance_loss_mlp": 1.02439475, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 2.054872494098109, + "language_loss": 0.81606579, + "learning_rate": 3.046067851209389e-06, + "loss": 0.83739215, + "num_input_tokens_seen": 123336085, + "step": 5739, + "time_per_iteration": 2.6183180809020996 + }, + { + "auxiliary_loss_clip": 0.0109791, + "auxiliary_loss_mlp": 0.0103903, + "balance_loss_clip": 1.04581046, + "balance_loss_mlp": 1.02318728, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 1.993210083260518, + "language_loss": 0.83063483, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85200429, + "num_input_tokens_seen": 123354460, + "step": 5740, + "time_per_iteration": 2.563612461090088 + }, + { + "auxiliary_loss_clip": 0.01121295, + "auxiliary_loss_mlp": 0.01036688, + "balance_loss_clip": 1.04820204, + "balance_loss_mlp": 1.0203445, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 4.075478657409653, + "language_loss": 0.76558262, + "learning_rate": 3.045403886269181e-06, + "loss": 0.78716242, + "num_input_tokens_seen": 123373420, + "step": 5741, + "time_per_iteration": 2.495298385620117 + }, + { + "auxiliary_loss_clip": 0.01106825, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.04618788, + "balance_loss_mlp": 1.01694012, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 2.188410888477557, + "language_loss": 0.77120763, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79259253, + "num_input_tokens_seen": 123394730, + "step": 5742, + "time_per_iteration": 2.581979274749756 + }, + { + "auxiliary_loss_clip": 0.01118132, + "auxiliary_loss_mlp": 0.01035684, + "balance_loss_clip": 1.04389036, + "balance_loss_mlp": 1.02091432, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 2.046197052403095, + "language_loss": 0.75556755, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.77710575, + "num_input_tokens_seen": 123412895, + "step": 5743, + "time_per_iteration": 2.5174050331115723 + }, + { + "auxiliary_loss_clip": 0.01117526, + "auxiliary_loss_mlp": 0.01038812, + "balance_loss_clip": 1.04565287, + "balance_loss_mlp": 1.02477479, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 2.0706179695993274, + "language_loss": 0.7048558, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72641921, + "num_input_tokens_seen": 123432320, + "step": 5744, + "time_per_iteration": 2.564758539199829 + }, + { + "auxiliary_loss_clip": 0.01127736, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.04577875, + "balance_loss_mlp": 1.01846027, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 1.6240809421625528, + "language_loss": 0.79789662, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81950915, + "num_input_tokens_seen": 123450980, + "step": 5745, + "time_per_iteration": 2.4474313259124756 + }, + { + "auxiliary_loss_clip": 0.0108774, + "auxiliary_loss_mlp": 0.01038068, + "balance_loss_clip": 1.04627085, + "balance_loss_mlp": 1.02247524, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.997813699322109, + "language_loss": 0.89171344, + "learning_rate": 3.043743280407182e-06, + "loss": 0.9129715, + "num_input_tokens_seen": 123469365, + "step": 5746, + "time_per_iteration": 2.6046202182769775 + }, + { + "auxiliary_loss_clip": 0.01124175, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.04847503, + "balance_loss_mlp": 1.0228498, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 1.8169452728700954, + "language_loss": 0.64456427, + "learning_rate": 3.043411040447849e-06, + "loss": 0.66618979, + "num_input_tokens_seen": 123489425, + "step": 5747, + "time_per_iteration": 2.492297410964966 + }, + { + "auxiliary_loss_clip": 0.01109849, + "auxiliary_loss_mlp": 0.01034783, + "balance_loss_clip": 1.04472148, + "balance_loss_mlp": 1.02013254, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5217327850553932, + "language_loss": 0.73002928, + "learning_rate": 3.043078760922264e-06, + "loss": 0.75147557, + "num_input_tokens_seen": 123509970, + "step": 5748, + "time_per_iteration": 2.6488916873931885 + }, + { + "auxiliary_loss_clip": 0.0107795, + "auxiliary_loss_mlp": 0.01035976, + "balance_loss_clip": 1.04922724, + "balance_loss_mlp": 1.02225494, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 3.022688996848376, + "language_loss": 0.7558614, + "learning_rate": 3.042746441843029e-06, + "loss": 0.77700067, + "num_input_tokens_seen": 123531055, + "step": 5749, + "time_per_iteration": 2.632967233657837 + }, + { + "auxiliary_loss_clip": 0.01027975, + "auxiliary_loss_mlp": 0.01004699, + "balance_loss_clip": 1.01792622, + "balance_loss_mlp": 1.00244546, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8827388039971793, + "language_loss": 0.62660539, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64693213, + "num_input_tokens_seen": 123584720, + "step": 5750, + "time_per_iteration": 4.349289417266846 + }, + { + "auxiliary_loss_clip": 0.01104448, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.04689884, + "balance_loss_mlp": 1.0198369, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 1.7625318178431404, + "language_loss": 0.80857599, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82995939, + "num_input_tokens_seen": 123604465, + "step": 5751, + "time_per_iteration": 2.544708728790283 + }, + { + "auxiliary_loss_clip": 0.01125335, + "auxiliary_loss_mlp": 0.01042696, + "balance_loss_clip": 1.04442215, + "balance_loss_mlp": 1.02790177, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 2.2304129204516587, + "language_loss": 0.83836287, + "learning_rate": 3.041749247409439e-06, + "loss": 0.86004317, + "num_input_tokens_seen": 123622320, + "step": 5752, + "time_per_iteration": 2.4377377033233643 + }, + { + "auxiliary_loss_clip": 0.01022518, + "auxiliary_loss_mlp": 0.00861033, + "balance_loss_clip": 1.01479959, + "balance_loss_mlp": 1.17114151, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7599886160242099, + "language_loss": 0.63112593, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.64996141, + "num_input_tokens_seen": 123678010, + "step": 5753, + "time_per_iteration": 4.451346158981323 + }, + { + "auxiliary_loss_clip": 0.01102927, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.04592156, + "balance_loss_mlp": 1.02096605, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.9421549691834499, + "language_loss": 0.70807165, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.72946167, + "num_input_tokens_seen": 123696830, + "step": 5754, + "time_per_iteration": 3.9976775646209717 + }, + { + "auxiliary_loss_clip": 0.0111903, + "auxiliary_loss_mlp": 0.01035601, + "balance_loss_clip": 1.04542267, + "balance_loss_mlp": 1.02084327, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 1.7656342507534244, + "language_loss": 0.72987127, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75141758, + "num_input_tokens_seen": 123714360, + "step": 5755, + "time_per_iteration": 2.4671053886413574 + }, + { + "auxiliary_loss_clip": 0.01117104, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.04573309, + "balance_loss_mlp": 1.0181278, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.7309596881342568, + "language_loss": 0.72141677, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74291146, + "num_input_tokens_seen": 123739250, + "step": 5756, + "time_per_iteration": 4.001556634902954 + }, + { + "auxiliary_loss_clip": 0.01033947, + "auxiliary_loss_mlp": 0.010035, + "balance_loss_clip": 1.01482129, + "balance_loss_mlp": 1.00140178, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.718414390385618, + "language_loss": 0.62559074, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64596522, + "num_input_tokens_seen": 123802845, + "step": 5757, + "time_per_iteration": 3.1140048503875732 + }, + { + "auxiliary_loss_clip": 0.01016922, + "auxiliary_loss_mlp": 0.00850105, + "balance_loss_clip": 1.01830029, + "balance_loss_mlp": 1.14248848, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8611784330914724, + "language_loss": 0.59212428, + "learning_rate": 3.039753792295362e-06, + "loss": 0.6107946, + "num_input_tokens_seen": 123861805, + "step": 5758, + "time_per_iteration": 3.134035587310791 + }, + { + "auxiliary_loss_clip": 0.01112544, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.04846597, + "balance_loss_mlp": 1.02516222, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.8005421841541807, + "language_loss": 0.71731806, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73883533, + "num_input_tokens_seen": 123881820, + "step": 5759, + "time_per_iteration": 2.5734035968780518 + }, + { + "auxiliary_loss_clip": 0.01077395, + "auxiliary_loss_mlp": 0.01050056, + "balance_loss_clip": 1.0420351, + "balance_loss_mlp": 1.03360546, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 1.6516304241136652, + "language_loss": 0.82822049, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.84949499, + "num_input_tokens_seen": 123903700, + "step": 5760, + "time_per_iteration": 2.6013007164001465 + }, + { + "auxiliary_loss_clip": 0.01013407, + "auxiliary_loss_mlp": 0.01003671, + "balance_loss_clip": 1.01572824, + "balance_loss_mlp": 1.00148952, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8308538304551683, + "language_loss": 0.56556827, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58573902, + "num_input_tokens_seen": 123960075, + "step": 5761, + "time_per_iteration": 3.245985507965088 + }, + { + "auxiliary_loss_clip": 0.01113061, + "auxiliary_loss_mlp": 0.00870884, + "balance_loss_clip": 1.04109764, + "balance_loss_mlp": 1.1615802, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.0128922236619107, + "language_loss": 0.94299078, + "learning_rate": 3.038422700166474e-06, + "loss": 0.96283019, + "num_input_tokens_seen": 123975805, + "step": 5762, + "time_per_iteration": 2.4791805744171143 + }, + { + "auxiliary_loss_clip": 0.01098874, + "auxiliary_loss_mlp": 0.01041792, + "balance_loss_clip": 1.04270649, + "balance_loss_mlp": 1.02691448, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 1.596381651265668, + "language_loss": 0.6982922, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71969891, + "num_input_tokens_seen": 123997530, + "step": 5763, + "time_per_iteration": 2.622706651687622 + }, + { + "auxiliary_loss_clip": 0.01120603, + "auxiliary_loss_mlp": 0.01044863, + "balance_loss_clip": 1.0452944, + "balance_loss_mlp": 1.02791142, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 1.743155019310353, + "language_loss": 0.84081578, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.86247039, + "num_input_tokens_seen": 124016375, + "step": 5764, + "time_per_iteration": 2.5066659450531006 + }, + { + "auxiliary_loss_clip": 0.01094207, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.04269552, + "balance_loss_mlp": 1.02469122, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.8018435424433927, + "language_loss": 0.675928, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.6972658, + "num_input_tokens_seen": 124033975, + "step": 5765, + "time_per_iteration": 2.501084089279175 + }, + { + "auxiliary_loss_clip": 0.01106865, + "auxiliary_loss_mlp": 0.0104891, + "balance_loss_clip": 1.04815078, + "balance_loss_mlp": 1.03205407, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 1.8691693206537439, + "language_loss": 0.77180433, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79336214, + "num_input_tokens_seen": 124051930, + "step": 5766, + "time_per_iteration": 2.52838397026062 + }, + { + "auxiliary_loss_clip": 0.01076385, + "auxiliary_loss_mlp": 0.01041478, + "balance_loss_clip": 1.04993629, + "balance_loss_mlp": 1.02704823, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.5288652104895275, + "language_loss": 0.73265833, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75383699, + "num_input_tokens_seen": 124071220, + "step": 5767, + "time_per_iteration": 2.5667567253112793 + }, + { + "auxiliary_loss_clip": 0.01103977, + "auxiliary_loss_mlp": 0.01043759, + "balance_loss_clip": 1.04609942, + "balance_loss_mlp": 1.02764165, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 2.1591011923880408, + "language_loss": 0.7798444, + "learning_rate": 3.036424880912893e-06, + "loss": 0.80132174, + "num_input_tokens_seen": 124090140, + "step": 5768, + "time_per_iteration": 2.5613629817962646 + }, + { + "auxiliary_loss_clip": 0.01031981, + "auxiliary_loss_mlp": 0.01006437, + "balance_loss_clip": 1.01456833, + "balance_loss_mlp": 1.00443411, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7706090516906573, + "language_loss": 0.57498252, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59536672, + "num_input_tokens_seen": 124152025, + "step": 5769, + "time_per_iteration": 3.115565299987793 + }, + { + "auxiliary_loss_clip": 0.01104151, + "auxiliary_loss_mlp": 0.01038226, + "balance_loss_clip": 1.04746962, + "balance_loss_mlp": 1.02077413, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.668064288145012, + "language_loss": 0.86062962, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.88205338, + "num_input_tokens_seen": 124165795, + "step": 5770, + "time_per_iteration": 2.5307531356811523 + }, + { + "auxiliary_loss_clip": 0.0102093, + "auxiliary_loss_mlp": 0.01019774, + "balance_loss_clip": 1.01796949, + "balance_loss_mlp": 1.0174731, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7724028224132267, + "language_loss": 0.59851867, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61892575, + "num_input_tokens_seen": 124222925, + "step": 5771, + "time_per_iteration": 2.8933849334716797 + }, + { + "auxiliary_loss_clip": 0.01119235, + "auxiliary_loss_mlp": 0.01049123, + "balance_loss_clip": 1.04439998, + "balance_loss_mlp": 1.0339359, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 1.8465390109194797, + "language_loss": 0.71433222, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.7360158, + "num_input_tokens_seen": 124240915, + "step": 5772, + "time_per_iteration": 2.5881481170654297 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.00864464, + "balance_loss_clip": 1.0436089, + "balance_loss_mlp": 1.14564204, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.7220906261969382, + "language_loss": 0.76238954, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78204536, + "num_input_tokens_seen": 124262770, + "step": 5773, + "time_per_iteration": 2.6080851554870605 + }, + { + "auxiliary_loss_clip": 0.01120899, + "auxiliary_loss_mlp": 0.01041316, + "balance_loss_clip": 1.0450263, + "balance_loss_mlp": 1.0255568, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.054189394926556, + "language_loss": 0.69683087, + "learning_rate": 3.034425646811396e-06, + "loss": 0.71845305, + "num_input_tokens_seen": 124280950, + "step": 5774, + "time_per_iteration": 2.524069309234619 + }, + { + "auxiliary_loss_clip": 0.01108204, + "auxiliary_loss_mlp": 0.00857915, + "balance_loss_clip": 1.04639173, + "balance_loss_mlp": 1.13276374, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.577680715417953, + "language_loss": 0.7659058, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.78556705, + "num_input_tokens_seen": 124299540, + "step": 5775, + "time_per_iteration": 2.5543456077575684 + }, + { + "auxiliary_loss_clip": 0.01109875, + "auxiliary_loss_mlp": 0.01042385, + "balance_loss_clip": 1.04392266, + "balance_loss_mlp": 1.02595842, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 1.9336255500658956, + "language_loss": 0.77338493, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.79490757, + "num_input_tokens_seen": 124316285, + "step": 5776, + "time_per_iteration": 2.5105631351470947 + }, + { + "auxiliary_loss_clip": 0.01020786, + "auxiliary_loss_mlp": 0.01007584, + "balance_loss_clip": 1.01212549, + "balance_loss_mlp": 1.00545049, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.84534566984862, + "language_loss": 0.63361847, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65390223, + "num_input_tokens_seen": 124376650, + "step": 5777, + "time_per_iteration": 3.1819326877593994 + }, + { + "auxiliary_loss_clip": 0.01093596, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_clip": 1.04659152, + "balance_loss_mlp": 1.0292511, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 2.01819356058731, + "language_loss": 0.65127444, + "learning_rate": 3.033092039398119e-06, + "loss": 0.67266357, + "num_input_tokens_seen": 124396475, + "step": 5778, + "time_per_iteration": 2.6209211349487305 + }, + { + "auxiliary_loss_clip": 0.01109957, + "auxiliary_loss_mlp": 0.0105062, + "balance_loss_clip": 1.04618764, + "balance_loss_mlp": 1.0352416, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 1.707816447829896, + "language_loss": 0.71077502, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73238081, + "num_input_tokens_seen": 124416480, + "step": 5779, + "time_per_iteration": 2.7074155807495117 + }, + { + "auxiliary_loss_clip": 0.01136369, + "auxiliary_loss_mlp": 0.01048829, + "balance_loss_clip": 1.0480268, + "balance_loss_mlp": 1.03330827, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 1.9760099889815457, + "language_loss": 0.6216538, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64350569, + "num_input_tokens_seen": 124435950, + "step": 5780, + "time_per_iteration": 2.5170717239379883 + }, + { + "auxiliary_loss_clip": 0.01092941, + "auxiliary_loss_mlp": 0.01047586, + "balance_loss_clip": 1.04467273, + "balance_loss_mlp": 1.032673, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.8609861672635897, + "language_loss": 0.72238052, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74378574, + "num_input_tokens_seen": 124455410, + "step": 5781, + "time_per_iteration": 2.594478130340576 + }, + { + "auxiliary_loss_clip": 0.0107505, + "auxiliary_loss_mlp": 0.01052134, + "balance_loss_clip": 1.04395282, + "balance_loss_mlp": 1.03484249, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 2.033933592376054, + "language_loss": 0.76910055, + "learning_rate": 3.031757805185612e-06, + "loss": 0.79037243, + "num_input_tokens_seen": 124474870, + "step": 5782, + "time_per_iteration": 2.596741199493408 + }, + { + "auxiliary_loss_clip": 0.01102929, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.04703379, + "balance_loss_mlp": 1.02042556, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 1.9138508804933967, + "language_loss": 0.62517834, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64656436, + "num_input_tokens_seen": 124494105, + "step": 5783, + "time_per_iteration": 2.526453971862793 + }, + { + "auxiliary_loss_clip": 0.01089424, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.04706788, + "balance_loss_mlp": 1.01807988, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.7739881976679155, + "language_loss": 0.88447928, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90569842, + "num_input_tokens_seen": 124512030, + "step": 5784, + "time_per_iteration": 2.6694540977478027 + }, + { + "auxiliary_loss_clip": 0.01084108, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.04946375, + "balance_loss_mlp": 1.02036428, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.9744060591984811, + "language_loss": 0.81722569, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83842099, + "num_input_tokens_seen": 124530980, + "step": 5785, + "time_per_iteration": 2.610063076019287 + }, + { + "auxiliary_loss_clip": 0.01111672, + "auxiliary_loss_mlp": 0.01043322, + "balance_loss_clip": 1.05535197, + "balance_loss_mlp": 1.02803946, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 2.021910221983646, + "language_loss": 0.80195284, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82350278, + "num_input_tokens_seen": 124549330, + "step": 5786, + "time_per_iteration": 2.530327796936035 + }, + { + "auxiliary_loss_clip": 0.01133143, + "auxiliary_loss_mlp": 0.00799112, + "balance_loss_clip": 1.04982138, + "balance_loss_mlp": 1.02110016, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.831576835483207, + "language_loss": 0.74861562, + "learning_rate": 3.030089132216836e-06, + "loss": 0.76793814, + "num_input_tokens_seen": 124567200, + "step": 5787, + "time_per_iteration": 2.4620914459228516 + }, + { + "auxiliary_loss_clip": 0.01109155, + "auxiliary_loss_mlp": 0.0079791, + "balance_loss_clip": 1.05137634, + "balance_loss_mlp": 1.01979852, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.8561199133068014, + "language_loss": 0.80901581, + "learning_rate": 3.029755280389203e-06, + "loss": 0.8280865, + "num_input_tokens_seen": 124587025, + "step": 5788, + "time_per_iteration": 2.6011006832122803 + }, + { + "auxiliary_loss_clip": 0.01140229, + "auxiliary_loss_mlp": 0.01036223, + "balance_loss_clip": 1.05208719, + "balance_loss_mlp": 1.01998055, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 1.767061898422289, + "language_loss": 0.85762572, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87939024, + "num_input_tokens_seen": 124605860, + "step": 5789, + "time_per_iteration": 3.871298313140869 + }, + { + "auxiliary_loss_clip": 0.01129005, + "auxiliary_loss_mlp": 0.01056979, + "balance_loss_clip": 1.05255389, + "balance_loss_mlp": 1.04114771, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.7842060875407577, + "language_loss": 0.85311353, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87497342, + "num_input_tokens_seen": 124624270, + "step": 5790, + "time_per_iteration": 2.500088691711426 + }, + { + "auxiliary_loss_clip": 0.01131189, + "auxiliary_loss_mlp": 0.01044778, + "balance_loss_clip": 1.05405951, + "balance_loss_mlp": 1.02855408, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 1.8647454294760415, + "language_loss": 0.81413198, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.83589172, + "num_input_tokens_seen": 124644005, + "step": 5791, + "time_per_iteration": 2.548558473587036 + }, + { + "auxiliary_loss_clip": 0.01125297, + "auxiliary_loss_mlp": 0.01039113, + "balance_loss_clip": 1.04978228, + "balance_loss_mlp": 1.02325833, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 1.9287841113986646, + "language_loss": 0.77678001, + "learning_rate": 3.028419482721056e-06, + "loss": 0.79842407, + "num_input_tokens_seen": 124663020, + "step": 5792, + "time_per_iteration": 5.296192407608032 + }, + { + "auxiliary_loss_clip": 0.01109868, + "auxiliary_loss_mlp": 0.0103031, + "balance_loss_clip": 1.04627991, + "balance_loss_mlp": 1.01533723, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.8579055052979125, + "language_loss": 0.81918228, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.84058404, + "num_input_tokens_seen": 124682975, + "step": 5793, + "time_per_iteration": 2.55381441116333 + }, + { + "auxiliary_loss_clip": 0.01125094, + "auxiliary_loss_mlp": 0.01046942, + "balance_loss_clip": 1.05285621, + "balance_loss_mlp": 1.03096747, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 2.242787635682421, + "language_loss": 0.76371729, + "learning_rate": 3.027751349849706e-06, + "loss": 0.7854377, + "num_input_tokens_seen": 124701340, + "step": 5794, + "time_per_iteration": 2.4923934936523438 + }, + { + "auxiliary_loss_clip": 0.01123688, + "auxiliary_loss_mlp": 0.01043784, + "balance_loss_clip": 1.04934585, + "balance_loss_mlp": 1.02801216, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 1.835860575302882, + "language_loss": 0.57348204, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59515673, + "num_input_tokens_seen": 124719165, + "step": 5795, + "time_per_iteration": 3.8540825843811035 + }, + { + "auxiliary_loss_clip": 0.01108336, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.05127144, + "balance_loss_mlp": 1.01734269, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 1.7925700486875196, + "language_loss": 0.8248685, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84627342, + "num_input_tokens_seen": 124738670, + "step": 5796, + "time_per_iteration": 2.565178394317627 + }, + { + "auxiliary_loss_clip": 0.01121693, + "auxiliary_loss_mlp": 0.01031726, + "balance_loss_clip": 1.05139971, + "balance_loss_mlp": 1.01755238, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.6650060667081392, + "language_loss": 0.83196414, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85349834, + "num_input_tokens_seen": 124758760, + "step": 5797, + "time_per_iteration": 2.561216115951538 + }, + { + "auxiliary_loss_clip": 0.01132828, + "auxiliary_loss_mlp": 0.01038517, + "balance_loss_clip": 1.0504092, + "balance_loss_mlp": 1.02280545, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.6607571104170704, + "language_loss": 0.73319948, + "learning_rate": 3.026414616539167e-06, + "loss": 0.75491291, + "num_input_tokens_seen": 124777765, + "step": 5798, + "time_per_iteration": 2.5022478103637695 + }, + { + "auxiliary_loss_clip": 0.01132102, + "auxiliary_loss_mlp": 0.01042611, + "balance_loss_clip": 1.04675293, + "balance_loss_mlp": 1.0271616, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 2.0449474544609987, + "language_loss": 0.76093674, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78268385, + "num_input_tokens_seen": 124796775, + "step": 5799, + "time_per_iteration": 2.525991201400757 + }, + { + "auxiliary_loss_clip": 0.01074831, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.05060577, + "balance_loss_mlp": 1.01847911, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.7795970406062929, + "language_loss": 0.75494033, + "learning_rate": 3.025746016302734e-06, + "loss": 0.77601969, + "num_input_tokens_seen": 124815825, + "step": 5800, + "time_per_iteration": 2.86460280418396 + }, + { + "auxiliary_loss_clip": 0.0111391, + "auxiliary_loss_mlp": 0.00798067, + "balance_loss_clip": 1.0486275, + "balance_loss_mlp": 1.02049971, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 1.8740935404394683, + "language_loss": 0.67088264, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69000244, + "num_input_tokens_seen": 124838420, + "step": 5801, + "time_per_iteration": 2.8939156532287598 + }, + { + "auxiliary_loss_clip": 0.01107945, + "auxiliary_loss_mlp": 0.01041383, + "balance_loss_clip": 1.05328321, + "balance_loss_mlp": 1.02563608, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 1.7423924733077127, + "language_loss": 0.7668823, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78837562, + "num_input_tokens_seen": 124857320, + "step": 5802, + "time_per_iteration": 2.5549662113189697 + }, + { + "auxiliary_loss_clip": 0.01060221, + "auxiliary_loss_mlp": 0.01037, + "balance_loss_clip": 1.04546428, + "balance_loss_mlp": 1.02183628, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.5844279791073597, + "language_loss": 0.78726411, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.80823636, + "num_input_tokens_seen": 124875685, + "step": 5803, + "time_per_iteration": 2.769475221633911 + }, + { + "auxiliary_loss_clip": 0.01108528, + "auxiliary_loss_mlp": 0.0079545, + "balance_loss_clip": 1.04465592, + "balance_loss_mlp": 1.01478028, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 1.6736892078054633, + "language_loss": 0.67507672, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69411653, + "num_input_tokens_seen": 124895960, + "step": 5804, + "time_per_iteration": 2.696096897125244 + }, + { + "auxiliary_loss_clip": 0.01106645, + "auxiliary_loss_mlp": 0.01040207, + "balance_loss_clip": 1.04962993, + "balance_loss_mlp": 1.02486515, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 1.7141471422876335, + "language_loss": 0.7586593, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78012788, + "num_input_tokens_seen": 124914140, + "step": 5805, + "time_per_iteration": 2.5204193592071533 + }, + { + "auxiliary_loss_clip": 0.01091618, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.05297244, + "balance_loss_mlp": 1.02182603, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 2.54021133736412, + "language_loss": 0.66622251, + "learning_rate": 3.023739282485814e-06, + "loss": 0.68751442, + "num_input_tokens_seen": 124934180, + "step": 5806, + "time_per_iteration": 2.630375862121582 + }, + { + "auxiliary_loss_clip": 0.01118202, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.04937863, + "balance_loss_mlp": 1.02717161, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.7744080983246713, + "language_loss": 0.72320211, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74480897, + "num_input_tokens_seen": 124956060, + "step": 5807, + "time_per_iteration": 2.5776526927948 + }, + { + "auxiliary_loss_clip": 0.01130935, + "auxiliary_loss_mlp": 0.0103572, + "balance_loss_clip": 1.04565489, + "balance_loss_mlp": 1.02048469, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.929717596915313, + "language_loss": 0.74174094, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.76340759, + "num_input_tokens_seen": 124976070, + "step": 5808, + "time_per_iteration": 2.584321975708008 + }, + { + "auxiliary_loss_clip": 0.01130106, + "auxiliary_loss_mlp": 0.01047369, + "balance_loss_clip": 1.04965067, + "balance_loss_mlp": 1.0324564, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.8580407593026143, + "language_loss": 0.84422874, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86600345, + "num_input_tokens_seen": 124996995, + "step": 5809, + "time_per_iteration": 2.479323387145996 + }, + { + "auxiliary_loss_clip": 0.01102738, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.04489481, + "balance_loss_mlp": 1.01965022, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 3.678988773609896, + "language_loss": 0.8056525, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82700956, + "num_input_tokens_seen": 125015600, + "step": 5810, + "time_per_iteration": 2.5717360973358154 + }, + { + "auxiliary_loss_clip": 0.01129266, + "auxiliary_loss_mlp": 0.01041557, + "balance_loss_clip": 1.04611552, + "balance_loss_mlp": 1.02780628, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.9983467185828794, + "language_loss": 0.75277495, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.7744832, + "num_input_tokens_seen": 125035290, + "step": 5811, + "time_per_iteration": 2.5317916870117188 + }, + { + "auxiliary_loss_clip": 0.01111557, + "auxiliary_loss_mlp": 0.01043013, + "balance_loss_clip": 1.04503679, + "balance_loss_mlp": 1.02828443, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.4352897368207767, + "language_loss": 0.80294013, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82448572, + "num_input_tokens_seen": 125057130, + "step": 5812, + "time_per_iteration": 2.593714475631714 + }, + { + "auxiliary_loss_clip": 0.01071222, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.04467392, + "balance_loss_mlp": 1.01834393, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 2.7080529463812573, + "language_loss": 0.69415176, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71520495, + "num_input_tokens_seen": 125073720, + "step": 5813, + "time_per_iteration": 2.5898845195770264 + }, + { + "auxiliary_loss_clip": 0.01099666, + "auxiliary_loss_mlp": 0.00792923, + "balance_loss_clip": 1.04163742, + "balance_loss_mlp": 1.01363087, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 1.9451867545086052, + "language_loss": 0.77130079, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.7902267, + "num_input_tokens_seen": 125090635, + "step": 5814, + "time_per_iteration": 2.537668228149414 + }, + { + "auxiliary_loss_clip": 0.01118534, + "auxiliary_loss_mlp": 0.00792767, + "balance_loss_clip": 1.04918635, + "balance_loss_mlp": 1.01377869, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 2.2087955978273204, + "language_loss": 0.84412062, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86323369, + "num_input_tokens_seen": 125110070, + "step": 5815, + "time_per_iteration": 2.576323986053467 + }, + { + "auxiliary_loss_clip": 0.01113641, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.04621542, + "balance_loss_mlp": 1.01929188, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 1.9922287939743144, + "language_loss": 0.77405334, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79551959, + "num_input_tokens_seen": 125125730, + "step": 5816, + "time_per_iteration": 2.480428695678711 + }, + { + "auxiliary_loss_clip": 0.01116871, + "auxiliary_loss_mlp": 0.01040529, + "balance_loss_clip": 1.04756451, + "balance_loss_mlp": 1.02593756, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 1.9437217181867819, + "language_loss": 0.58660495, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.60817897, + "num_input_tokens_seen": 125146195, + "step": 5817, + "time_per_iteration": 2.5158848762512207 + }, + { + "auxiliary_loss_clip": 0.01045738, + "auxiliary_loss_mlp": 0.01005046, + "balance_loss_clip": 1.01679444, + "balance_loss_mlp": 1.00342524, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8716650635917494, + "language_loss": 0.59910929, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61961722, + "num_input_tokens_seen": 125207790, + "step": 5818, + "time_per_iteration": 3.145456314086914 + }, + { + "auxiliary_loss_clip": 0.01098015, + "auxiliary_loss_mlp": 0.01040814, + "balance_loss_clip": 1.05323076, + "balance_loss_mlp": 1.02569246, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 1.6392515976083797, + "language_loss": 0.83438218, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85577047, + "num_input_tokens_seen": 125226220, + "step": 5819, + "time_per_iteration": 2.5575475692749023 + }, + { + "auxiliary_loss_clip": 0.0110541, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.04469657, + "balance_loss_mlp": 1.01749945, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 1.6533854029478456, + "language_loss": 0.70815057, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.72951955, + "num_input_tokens_seen": 125247485, + "step": 5820, + "time_per_iteration": 2.6225826740264893 + }, + { + "auxiliary_loss_clip": 0.01118616, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.0442735, + "balance_loss_mlp": 1.02675021, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 1.6580439431033682, + "language_loss": 0.7043128, + "learning_rate": 3.018716339744759e-06, + "loss": 0.72590733, + "num_input_tokens_seen": 125268625, + "step": 5821, + "time_per_iteration": 2.6092538833618164 + }, + { + "auxiliary_loss_clip": 0.01124586, + "auxiliary_loss_mlp": 0.01042473, + "balance_loss_clip": 1.04758298, + "balance_loss_mlp": 1.02673149, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.069446735259659, + "language_loss": 0.73758411, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.75925469, + "num_input_tokens_seen": 125287530, + "step": 5822, + "time_per_iteration": 2.504305839538574 + }, + { + "auxiliary_loss_clip": 0.01109761, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.04915643, + "balance_loss_mlp": 1.01863956, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 1.5812623175025033, + "language_loss": 0.78563869, + "learning_rate": 3.018045956403094e-06, + "loss": 0.80707991, + "num_input_tokens_seen": 125307020, + "step": 5823, + "time_per_iteration": 2.5024831295013428 + }, + { + "auxiliary_loss_clip": 0.01033916, + "auxiliary_loss_mlp": 0.01001325, + "balance_loss_clip": 1.01549864, + "balance_loss_mlp": 0.99957287, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 0.7148670750866176, + "language_loss": 0.59281546, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61316788, + "num_input_tokens_seen": 125370445, + "step": 5824, + "time_per_iteration": 3.11738657951355 + }, + { + "auxiliary_loss_clip": 0.0111012, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.04720461, + "balance_loss_mlp": 1.02067828, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 1.7707764067077065, + "language_loss": 0.84662026, + "learning_rate": 3.017375418643811e-06, + "loss": 0.86808825, + "num_input_tokens_seen": 125388900, + "step": 5825, + "time_per_iteration": 2.5378222465515137 + }, + { + "auxiliary_loss_clip": 0.0112165, + "auxiliary_loss_mlp": 0.00795293, + "balance_loss_clip": 1.04814231, + "balance_loss_mlp": 1.01630867, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 2.3081970940901715, + "language_loss": 0.83331585, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.85248536, + "num_input_tokens_seen": 125402675, + "step": 5826, + "time_per_iteration": 2.4486277103424072 + }, + { + "auxiliary_loss_clip": 0.01111345, + "auxiliary_loss_mlp": 0.01045587, + "balance_loss_clip": 1.04954314, + "balance_loss_mlp": 1.03068018, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.7038488778909955, + "language_loss": 0.81126523, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.8328346, + "num_input_tokens_seen": 125421360, + "step": 5827, + "time_per_iteration": 2.537687301635742 + }, + { + "auxiliary_loss_clip": 0.01080721, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.04439294, + "balance_loss_mlp": 1.02807057, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.7796788375227526, + "language_loss": 0.71324801, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73448896, + "num_input_tokens_seen": 125440000, + "step": 5828, + "time_per_iteration": 4.172318458557129 + }, + { + "auxiliary_loss_clip": 0.01125238, + "auxiliary_loss_mlp": 0.01046498, + "balance_loss_clip": 1.04939294, + "balance_loss_mlp": 1.02923644, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.7021763742408673, + "language_loss": 0.79749548, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81921285, + "num_input_tokens_seen": 125460390, + "step": 5829, + "time_per_iteration": 2.573141098022461 + }, + { + "auxiliary_loss_clip": 0.01095526, + "auxiliary_loss_mlp": 0.01051669, + "balance_loss_clip": 1.0508182, + "balance_loss_mlp": 1.03481269, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.8031161144613588, + "language_loss": 0.72547805, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74695003, + "num_input_tokens_seen": 125478410, + "step": 5830, + "time_per_iteration": 3.996628999710083 + }, + { + "auxiliary_loss_clip": 0.01094764, + "auxiliary_loss_mlp": 0.01036608, + "balance_loss_clip": 1.04997563, + "balance_loss_mlp": 1.020473, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.0375878174280686, + "language_loss": 0.88555193, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90686566, + "num_input_tokens_seen": 125495975, + "step": 5831, + "time_per_iteration": 3.968332529067993 + }, + { + "auxiliary_loss_clip": 0.0107764, + "auxiliary_loss_mlp": 0.01048227, + "balance_loss_clip": 1.0463388, + "balance_loss_mlp": 1.0322051, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 2.0451708222432665, + "language_loss": 0.78637874, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80763745, + "num_input_tokens_seen": 125515035, + "step": 5832, + "time_per_iteration": 2.606616973876953 + }, + { + "auxiliary_loss_clip": 0.01097098, + "auxiliary_loss_mlp": 0.01041954, + "balance_loss_clip": 1.04672647, + "balance_loss_mlp": 1.02420378, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.699179386386844, + "language_loss": 0.70954883, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73093939, + "num_input_tokens_seen": 125535555, + "step": 5833, + "time_per_iteration": 2.5976011753082275 + }, + { + "auxiliary_loss_clip": 0.0112194, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.04855418, + "balance_loss_mlp": 1.02032459, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.323501883759618, + "language_loss": 0.80836153, + "learning_rate": 3.014356090536606e-06, + "loss": 0.82993257, + "num_input_tokens_seen": 125558195, + "step": 5834, + "time_per_iteration": 3.9552626609802246 + }, + { + "auxiliary_loss_clip": 0.01084504, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.05150414, + "balance_loss_mlp": 1.03197932, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 1.9878917957925786, + "language_loss": 0.84337652, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.86470163, + "num_input_tokens_seen": 125575375, + "step": 5835, + "time_per_iteration": 2.6010093688964844 + }, + { + "auxiliary_loss_clip": 0.01080928, + "auxiliary_loss_mlp": 0.01044722, + "balance_loss_clip": 1.0477922, + "balance_loss_mlp": 1.02954698, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 1.525277146120332, + "language_loss": 0.76577449, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78703099, + "num_input_tokens_seen": 125596745, + "step": 5836, + "time_per_iteration": 2.6478371620178223 + }, + { + "auxiliary_loss_clip": 0.01096868, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_clip": 1.05297875, + "balance_loss_mlp": 1.02635467, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 1.8907688249638006, + "language_loss": 0.7710861, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79248524, + "num_input_tokens_seen": 125613980, + "step": 5837, + "time_per_iteration": 2.5695292949676514 + }, + { + "auxiliary_loss_clip": 0.01124088, + "auxiliary_loss_mlp": 0.01043963, + "balance_loss_clip": 1.05110526, + "balance_loss_mlp": 1.02883518, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 1.6935788391722322, + "language_loss": 0.68174243, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70342302, + "num_input_tokens_seen": 125632100, + "step": 5838, + "time_per_iteration": 2.5336806774139404 + }, + { + "auxiliary_loss_clip": 0.01134765, + "auxiliary_loss_mlp": 0.01039177, + "balance_loss_clip": 1.04972744, + "balance_loss_mlp": 1.02302456, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 1.771987844612919, + "language_loss": 0.82979786, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85153723, + "num_input_tokens_seen": 125649190, + "step": 5839, + "time_per_iteration": 2.483438014984131 + }, + { + "auxiliary_loss_clip": 0.01129713, + "auxiliary_loss_mlp": 0.01039239, + "balance_loss_clip": 1.04855597, + "balance_loss_mlp": 1.02336025, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 1.6762890116167581, + "language_loss": 0.58736056, + "learning_rate": 3.012341473657572e-06, + "loss": 0.60905004, + "num_input_tokens_seen": 125668680, + "step": 5840, + "time_per_iteration": 2.543370008468628 + }, + { + "auxiliary_loss_clip": 0.01093623, + "auxiliary_loss_mlp": 0.01040527, + "balance_loss_clip": 1.04599118, + "balance_loss_mlp": 1.02455294, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.217167887623517, + "language_loss": 0.87129503, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89263654, + "num_input_tokens_seen": 125686935, + "step": 5841, + "time_per_iteration": 2.575432300567627 + }, + { + "auxiliary_loss_clip": 0.01123106, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.05268312, + "balance_loss_mlp": 1.02630413, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.5778138073697603, + "language_loss": 0.74827182, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.76994282, + "num_input_tokens_seen": 125707180, + "step": 5842, + "time_per_iteration": 2.594163656234741 + }, + { + "auxiliary_loss_clip": 0.01129127, + "auxiliary_loss_mlp": 0.01042584, + "balance_loss_clip": 1.0499351, + "balance_loss_mlp": 1.02713406, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 2.137753022959898, + "language_loss": 0.6872648, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70898193, + "num_input_tokens_seen": 125722780, + "step": 5843, + "time_per_iteration": 2.4681665897369385 + }, + { + "auxiliary_loss_clip": 0.01136087, + "auxiliary_loss_mlp": 0.01041086, + "balance_loss_clip": 1.05203342, + "balance_loss_mlp": 1.02523136, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 1.8466012567572074, + "language_loss": 0.65399778, + "learning_rate": 3.010997627806655e-06, + "loss": 0.67576945, + "num_input_tokens_seen": 125742110, + "step": 5844, + "time_per_iteration": 2.5207743644714355 + }, + { + "auxiliary_loss_clip": 0.01122659, + "auxiliary_loss_mlp": 0.01044471, + "balance_loss_clip": 1.05318165, + "balance_loss_mlp": 1.02801991, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.449804850926252, + "language_loss": 0.74935693, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77102822, + "num_input_tokens_seen": 125759980, + "step": 5845, + "time_per_iteration": 2.477585554122925 + }, + { + "auxiliary_loss_clip": 0.01123938, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.05140877, + "balance_loss_mlp": 1.02488387, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 2.5118975235667684, + "language_loss": 0.73316616, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75481004, + "num_input_tokens_seen": 125772660, + "step": 5846, + "time_per_iteration": 2.499051570892334 + }, + { + "auxiliary_loss_clip": 0.0110007, + "auxiliary_loss_mlp": 0.01039704, + "balance_loss_clip": 1.04864621, + "balance_loss_mlp": 1.02402782, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.5676039781792857, + "language_loss": 0.75464141, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.77603912, + "num_input_tokens_seen": 125791935, + "step": 5847, + "time_per_iteration": 2.555535316467285 + }, + { + "auxiliary_loss_clip": 0.01109209, + "auxiliary_loss_mlp": 0.01037619, + "balance_loss_clip": 1.04650044, + "balance_loss_mlp": 1.02246165, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 1.9264019898579687, + "language_loss": 0.72563243, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74710065, + "num_input_tokens_seen": 125813455, + "step": 5848, + "time_per_iteration": 2.6338155269622803 + }, + { + "auxiliary_loss_clip": 0.01117002, + "auxiliary_loss_mlp": 0.01045767, + "balance_loss_clip": 1.04781115, + "balance_loss_mlp": 1.02925694, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 6.875120398005647, + "language_loss": 0.89234543, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91397309, + "num_input_tokens_seen": 125827660, + "step": 5849, + "time_per_iteration": 2.475409984588623 + }, + { + "auxiliary_loss_clip": 0.01113607, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.05029523, + "balance_loss_mlp": 1.02103949, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 1.6967685486415967, + "language_loss": 0.74683148, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.76833189, + "num_input_tokens_seen": 125846655, + "step": 5850, + "time_per_iteration": 2.5845816135406494 + }, + { + "auxiliary_loss_clip": 0.01123184, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.05038047, + "balance_loss_mlp": 1.0206697, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.3731079826195953, + "language_loss": 0.75315857, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77475303, + "num_input_tokens_seen": 125866290, + "step": 5851, + "time_per_iteration": 2.5103023052215576 + }, + { + "auxiliary_loss_clip": 0.01109473, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.05027366, + "balance_loss_mlp": 1.02397728, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 1.866952071108202, + "language_loss": 0.87552691, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89703429, + "num_input_tokens_seen": 125884620, + "step": 5852, + "time_per_iteration": 2.6041038036346436 + }, + { + "auxiliary_loss_clip": 0.01131535, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.04928529, + "balance_loss_mlp": 1.01777458, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 2.0272198145544684, + "language_loss": 0.6730237, + "learning_rate": 3.007971733162737e-06, + "loss": 0.69466615, + "num_input_tokens_seen": 125902430, + "step": 5853, + "time_per_iteration": 2.4967010021209717 + }, + { + "auxiliary_loss_clip": 0.01109041, + "auxiliary_loss_mlp": 0.01036568, + "balance_loss_clip": 1.04612279, + "balance_loss_mlp": 1.02073658, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.7281458650852382, + "language_loss": 0.8122561, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83371216, + "num_input_tokens_seen": 125920570, + "step": 5854, + "time_per_iteration": 2.5506367683410645 + }, + { + "auxiliary_loss_clip": 0.0111036, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.04889774, + "balance_loss_mlp": 1.01917291, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.5272123834975948, + "language_loss": 0.73169172, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75312817, + "num_input_tokens_seen": 125939800, + "step": 5855, + "time_per_iteration": 2.5319435596466064 + }, + { + "auxiliary_loss_clip": 0.01128804, + "auxiliary_loss_mlp": 0.01035421, + "balance_loss_clip": 1.04746926, + "balance_loss_mlp": 1.02150965, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 1.8741235134212486, + "language_loss": 0.70963943, + "learning_rate": 3.006962413152691e-06, + "loss": 0.7312817, + "num_input_tokens_seen": 125958720, + "step": 5856, + "time_per_iteration": 2.517080068588257 + }, + { + "auxiliary_loss_clip": 0.01120545, + "auxiliary_loss_mlp": 0.01044875, + "balance_loss_clip": 1.05249798, + "balance_loss_mlp": 1.02811408, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.6767013858738935, + "language_loss": 0.6127106, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63436478, + "num_input_tokens_seen": 125984310, + "step": 5857, + "time_per_iteration": 2.7190990447998047 + }, + { + "auxiliary_loss_clip": 0.0112255, + "auxiliary_loss_mlp": 0.01039791, + "balance_loss_clip": 1.0490396, + "balance_loss_mlp": 1.02407897, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 1.724283252075808, + "language_loss": 0.73867416, + "learning_rate": 3.006289342204152e-06, + "loss": 0.76029754, + "num_input_tokens_seen": 126002410, + "step": 5858, + "time_per_iteration": 2.5223817825317383 + }, + { + "auxiliary_loss_clip": 0.01133931, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.04796326, + "balance_loss_mlp": 1.02264476, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.794022223030072, + "language_loss": 0.76146573, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.78317982, + "num_input_tokens_seen": 126022490, + "step": 5859, + "time_per_iteration": 2.5265674591064453 + }, + { + "auxiliary_loss_clip": 0.01116168, + "auxiliary_loss_mlp": 0.01043438, + "balance_loss_clip": 1.05037916, + "balance_loss_mlp": 1.02662909, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 2.178415419128804, + "language_loss": 0.7196883, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.74128425, + "num_input_tokens_seen": 126042895, + "step": 5860, + "time_per_iteration": 2.5323452949523926 + }, + { + "auxiliary_loss_clip": 0.0110542, + "auxiliary_loss_mlp": 0.01039849, + "balance_loss_clip": 1.05077481, + "balance_loss_mlp": 1.02312422, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 6.363159385561328, + "language_loss": 0.663679, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68513173, + "num_input_tokens_seen": 126060130, + "step": 5861, + "time_per_iteration": 2.520472288131714 + }, + { + "auxiliary_loss_clip": 0.01110679, + "auxiliary_loss_mlp": 0.01033544, + "balance_loss_clip": 1.04919958, + "balance_loss_mlp": 1.01927423, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 2.2330287333610985, + "language_loss": 0.66732037, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68876261, + "num_input_tokens_seen": 126077850, + "step": 5862, + "time_per_iteration": 2.5048139095306396 + }, + { + "auxiliary_loss_clip": 0.01109281, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.05169249, + "balance_loss_mlp": 1.02257431, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 1.8897853429901486, + "language_loss": 0.76899844, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79048002, + "num_input_tokens_seen": 126095985, + "step": 5863, + "time_per_iteration": 2.533298969268799 + }, + { + "auxiliary_loss_clip": 0.01118531, + "auxiliary_loss_mlp": 0.01038936, + "balance_loss_clip": 1.05049229, + "balance_loss_mlp": 1.02432644, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 1.8160782879304753, + "language_loss": 0.75213611, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77371085, + "num_input_tokens_seen": 126116070, + "step": 5864, + "time_per_iteration": 2.5390617847442627 + }, + { + "auxiliary_loss_clip": 0.01120196, + "auxiliary_loss_mlp": 0.01046467, + "balance_loss_clip": 1.04623139, + "balance_loss_mlp": 1.03156555, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 2.9143774887190306, + "language_loss": 0.79060072, + "learning_rate": 3.003932392558793e-06, + "loss": 0.8122673, + "num_input_tokens_seen": 126135205, + "step": 5865, + "time_per_iteration": 2.531221866607666 + }, + { + "auxiliary_loss_clip": 0.01126557, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.05069423, + "balance_loss_mlp": 1.02510512, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 2.0126835174724964, + "language_loss": 0.8122918, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.83396536, + "num_input_tokens_seen": 126151895, + "step": 5866, + "time_per_iteration": 3.898078441619873 + }, + { + "auxiliary_loss_clip": 0.01090459, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.05034804, + "balance_loss_mlp": 1.01985228, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.0978919941418677, + "language_loss": 0.845595, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.86686653, + "num_input_tokens_seen": 126168515, + "step": 5867, + "time_per_iteration": 2.575634717941284 + }, + { + "auxiliary_loss_clip": 0.01136033, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.04980278, + "balance_loss_mlp": 1.03254092, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 1.7278714421264112, + "language_loss": 0.74146622, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76330769, + "num_input_tokens_seen": 126186460, + "step": 5868, + "time_per_iteration": 2.4642903804779053 + }, + { + "auxiliary_loss_clip": 0.01125515, + "auxiliary_loss_mlp": 0.01040246, + "balance_loss_clip": 1.05155182, + "balance_loss_mlp": 1.02457047, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 1.8903506467184195, + "language_loss": 0.61618161, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63783926, + "num_input_tokens_seen": 126206170, + "step": 5869, + "time_per_iteration": 3.8837032318115234 + }, + { + "auxiliary_loss_clip": 0.01123496, + "auxiliary_loss_mlp": 0.01040731, + "balance_loss_clip": 1.04868746, + "balance_loss_mlp": 1.02541852, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 1.948616548591169, + "language_loss": 0.74225891, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76390111, + "num_input_tokens_seen": 126225605, + "step": 5870, + "time_per_iteration": 3.9331157207489014 + }, + { + "auxiliary_loss_clip": 0.01120421, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.04699183, + "balance_loss_mlp": 1.01885319, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.4252127344938181, + "language_loss": 0.71892953, + "learning_rate": 3.001910665140316e-06, + "loss": 0.74047184, + "num_input_tokens_seen": 126250230, + "step": 5871, + "time_per_iteration": 2.7069175243377686 + }, + { + "auxiliary_loss_clip": 0.01114977, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.0459249, + "balance_loss_mlp": 1.0204761, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 2.2074380017367297, + "language_loss": 0.73791277, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.75940716, + "num_input_tokens_seen": 126268315, + "step": 5872, + "time_per_iteration": 4.041405916213989 + }, + { + "auxiliary_loss_clip": 0.01110694, + "auxiliary_loss_mlp": 0.00794405, + "balance_loss_clip": 1.04791975, + "balance_loss_mlp": 1.01620209, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.5770157746660816, + "language_loss": 0.82102799, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84007901, + "num_input_tokens_seen": 126288390, + "step": 5873, + "time_per_iteration": 2.618715763092041 + }, + { + "auxiliary_loss_clip": 0.01114701, + "auxiliary_loss_mlp": 0.0104482, + "balance_loss_clip": 1.04773211, + "balance_loss_mlp": 1.02810645, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.8312039007593897, + "language_loss": 0.65897655, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68057179, + "num_input_tokens_seen": 126305750, + "step": 5874, + "time_per_iteration": 2.621669292449951 + }, + { + "auxiliary_loss_clip": 0.01043001, + "auxiliary_loss_mlp": 0.01002705, + "balance_loss_clip": 1.02338457, + "balance_loss_mlp": 1.00024915, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.7589241803489293, + "language_loss": 0.61531889, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63577598, + "num_input_tokens_seen": 126362495, + "step": 5875, + "time_per_iteration": 3.029635429382324 + }, + { + "auxiliary_loss_clip": 0.01074251, + "auxiliary_loss_mlp": 0.01046761, + "balance_loss_clip": 1.04577875, + "balance_loss_mlp": 1.03163362, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 1.893886986818516, + "language_loss": 0.79803073, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.81924081, + "num_input_tokens_seen": 126378320, + "step": 5876, + "time_per_iteration": 2.5927631855010986 + }, + { + "auxiliary_loss_clip": 0.01030166, + "auxiliary_loss_mlp": 0.00833022, + "balance_loss_clip": 1.03183699, + "balance_loss_mlp": 1.11425138, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 0.6874110913094893, + "language_loss": 0.56770647, + "learning_rate": 2.999887569990088e-06, + "loss": 0.58633828, + "num_input_tokens_seen": 126442735, + "step": 5877, + "time_per_iteration": 3.2602016925811768 + }, + { + "auxiliary_loss_clip": 0.01100188, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.04741549, + "balance_loss_mlp": 1.018381, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.628929703377693, + "language_loss": 0.71471298, + "learning_rate": 2.999550254685024e-06, + "loss": 0.73604894, + "num_input_tokens_seen": 126463090, + "step": 5878, + "time_per_iteration": 2.5671818256378174 + }, + { + "auxiliary_loss_clip": 0.01104026, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.04665756, + "balance_loss_mlp": 1.02128363, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.7268710787497756, + "language_loss": 0.78776789, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80917156, + "num_input_tokens_seen": 126482105, + "step": 5879, + "time_per_iteration": 2.5543177127838135 + }, + { + "auxiliary_loss_clip": 0.01103535, + "auxiliary_loss_mlp": 0.01046811, + "balance_loss_clip": 1.0485642, + "balance_loss_mlp": 1.0295136, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.068417698749051, + "language_loss": 0.63326859, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65477204, + "num_input_tokens_seen": 126502125, + "step": 5880, + "time_per_iteration": 2.566581964492798 + }, + { + "auxiliary_loss_clip": 0.01110784, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.04823625, + "balance_loss_mlp": 1.0179683, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 3.4726871088134543, + "language_loss": 0.65531564, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67675883, + "num_input_tokens_seen": 126521950, + "step": 5881, + "time_per_iteration": 2.545579433441162 + }, + { + "auxiliary_loss_clip": 0.01111761, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.04682302, + "balance_loss_mlp": 1.01876116, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.3711597106571851, + "language_loss": 0.75844944, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77989405, + "num_input_tokens_seen": 126542445, + "step": 5882, + "time_per_iteration": 2.525022268295288 + }, + { + "auxiliary_loss_clip": 0.01107642, + "auxiliary_loss_mlp": 0.01047776, + "balance_loss_clip": 1.04686475, + "balance_loss_mlp": 1.03070498, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.442342580905341, + "language_loss": 0.7051248, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.72667897, + "num_input_tokens_seen": 126560690, + "step": 5883, + "time_per_iteration": 2.586571216583252 + }, + { + "auxiliary_loss_clip": 0.01100512, + "auxiliary_loss_mlp": 0.01037263, + "balance_loss_clip": 1.0495882, + "balance_loss_mlp": 1.02146804, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 2.251627720735446, + "language_loss": 0.78039622, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80177391, + "num_input_tokens_seen": 126577620, + "step": 5884, + "time_per_iteration": 2.535024642944336 + }, + { + "auxiliary_loss_clip": 0.01111462, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.04832029, + "balance_loss_mlp": 1.02187634, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 1.971236449653369, + "language_loss": 0.75404173, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77551764, + "num_input_tokens_seen": 126596235, + "step": 5885, + "time_per_iteration": 2.5211470127105713 + }, + { + "auxiliary_loss_clip": 0.01088748, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.05036426, + "balance_loss_mlp": 1.02408135, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.179476376925401, + "language_loss": 0.83326113, + "learning_rate": 2.996850368809606e-06, + "loss": 0.85455191, + "num_input_tokens_seen": 126612830, + "step": 5886, + "time_per_iteration": 2.6037914752960205 + }, + { + "auxiliary_loss_clip": 0.01131002, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.04753017, + "balance_loss_mlp": 1.01695132, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 3.043492202349756, + "language_loss": 0.78383017, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80546653, + "num_input_tokens_seen": 126630910, + "step": 5887, + "time_per_iteration": 2.4791693687438965 + }, + { + "auxiliary_loss_clip": 0.01075351, + "auxiliary_loss_mlp": 0.01045108, + "balance_loss_clip": 1.0453614, + "balance_loss_mlp": 1.02974212, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 1.70557531171087, + "language_loss": 0.65783507, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67903966, + "num_input_tokens_seen": 126648365, + "step": 5888, + "time_per_iteration": 2.6191439628601074 + }, + { + "auxiliary_loss_clip": 0.01103213, + "auxiliary_loss_mlp": 0.01040734, + "balance_loss_clip": 1.04580677, + "balance_loss_mlp": 1.02584457, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 1.6041871621726798, + "language_loss": 0.76967895, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.79111838, + "num_input_tokens_seen": 126667500, + "step": 5889, + "time_per_iteration": 2.5735135078430176 + }, + { + "auxiliary_loss_clip": 0.01096952, + "auxiliary_loss_mlp": 0.0103385, + "balance_loss_clip": 1.04932606, + "balance_loss_mlp": 1.0192703, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.698733867397149, + "language_loss": 0.8066355, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.8279435, + "num_input_tokens_seen": 126686820, + "step": 5890, + "time_per_iteration": 2.5600216388702393 + }, + { + "auxiliary_loss_clip": 0.01111626, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.04650974, + "balance_loss_mlp": 1.0221231, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.7579952005499295, + "language_loss": 0.79573357, + "learning_rate": 2.99516171119991e-06, + "loss": 0.81719995, + "num_input_tokens_seen": 126706965, + "step": 5891, + "time_per_iteration": 2.5740268230438232 + }, + { + "auxiliary_loss_clip": 0.0109341, + "auxiliary_loss_mlp": 0.01044268, + "balance_loss_clip": 1.0453887, + "balance_loss_mlp": 1.02803206, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 1.8392645668761929, + "language_loss": 0.72995037, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75132716, + "num_input_tokens_seen": 126724015, + "step": 5892, + "time_per_iteration": 2.56139874458313 + }, + { + "auxiliary_loss_clip": 0.01107719, + "auxiliary_loss_mlp": 0.01039443, + "balance_loss_clip": 1.04696393, + "balance_loss_mlp": 1.02420855, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 1.9777894333065336, + "language_loss": 0.67104053, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69251215, + "num_input_tokens_seen": 126737565, + "step": 5893, + "time_per_iteration": 2.528381586074829 + }, + { + "auxiliary_loss_clip": 0.01081039, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.04858756, + "balance_loss_mlp": 1.02089596, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 1.830654292283894, + "language_loss": 0.69861346, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71979052, + "num_input_tokens_seen": 126756095, + "step": 5894, + "time_per_iteration": 2.5980918407440186 + }, + { + "auxiliary_loss_clip": 0.011106, + "auxiliary_loss_mlp": 0.00850742, + "balance_loss_clip": 1.04706383, + "balance_loss_mlp": 1.12833142, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 2.510381342547178, + "language_loss": 0.74569404, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.76530749, + "num_input_tokens_seen": 126775455, + "step": 5895, + "time_per_iteration": 2.572514533996582 + }, + { + "auxiliary_loss_clip": 0.0110526, + "auxiliary_loss_mlp": 0.01039974, + "balance_loss_clip": 1.04980445, + "balance_loss_mlp": 1.02502525, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 2.005942085986966, + "language_loss": 0.8361218, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85757411, + "num_input_tokens_seen": 126792320, + "step": 5896, + "time_per_iteration": 2.532672643661499 + }, + { + "auxiliary_loss_clip": 0.0110629, + "auxiliary_loss_mlp": 0.00845838, + "balance_loss_clip": 1.04585528, + "balance_loss_mlp": 1.11739922, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 2.1804086194929324, + "language_loss": 0.70109695, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.72061819, + "num_input_tokens_seen": 126813680, + "step": 5897, + "time_per_iteration": 2.636375904083252 + }, + { + "auxiliary_loss_clip": 0.01104333, + "auxiliary_loss_mlp": 0.01043822, + "balance_loss_clip": 1.04513574, + "balance_loss_mlp": 1.02729964, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 1.5892179546303355, + "language_loss": 0.81791103, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.8393926, + "num_input_tokens_seen": 126834395, + "step": 5898, + "time_per_iteration": 2.567105770111084 + }, + { + "auxiliary_loss_clip": 0.01125104, + "auxiliary_loss_mlp": 0.01041448, + "balance_loss_clip": 1.0442009, + "balance_loss_mlp": 1.02750611, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 1.4778620150762192, + "language_loss": 0.74151313, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76317859, + "num_input_tokens_seen": 126855145, + "step": 5899, + "time_per_iteration": 2.5139100551605225 + }, + { + "auxiliary_loss_clip": 0.01128217, + "auxiliary_loss_mlp": 0.00815382, + "balance_loss_clip": 1.04502213, + "balance_loss_mlp": 1.06092286, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.8180254913807365, + "language_loss": 0.79545325, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81488931, + "num_input_tokens_seen": 126873790, + "step": 5900, + "time_per_iteration": 2.5379104614257812 + }, + { + "auxiliary_loss_clip": 0.01105229, + "auxiliary_loss_mlp": 0.01040203, + "balance_loss_clip": 1.04336488, + "balance_loss_mlp": 1.02445531, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 1.8692506618451001, + "language_loss": 0.81474996, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83620429, + "num_input_tokens_seen": 126892865, + "step": 5901, + "time_per_iteration": 2.5517306327819824 + }, + { + "auxiliary_loss_clip": 0.01117284, + "auxiliary_loss_mlp": 0.00804576, + "balance_loss_clip": 1.04686153, + "balance_loss_mlp": 1.04012465, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 1.815426327020132, + "language_loss": 0.76073986, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77995849, + "num_input_tokens_seen": 126911935, + "step": 5902, + "time_per_iteration": 2.524336576461792 + }, + { + "auxiliary_loss_clip": 0.01117417, + "auxiliary_loss_mlp": 0.01038539, + "balance_loss_clip": 1.04483438, + "balance_loss_mlp": 1.02487159, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 3.2326892984657474, + "language_loss": 0.70299548, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72455502, + "num_input_tokens_seen": 126930040, + "step": 5903, + "time_per_iteration": 2.478618860244751 + }, + { + "auxiliary_loss_clip": 0.01117487, + "auxiliary_loss_mlp": 0.01034057, + "balance_loss_clip": 1.04408693, + "balance_loss_mlp": 1.01952553, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 2.7251896547937067, + "language_loss": 0.75037169, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.77188706, + "num_input_tokens_seen": 126948390, + "step": 5904, + "time_per_iteration": 3.8400659561157227 + }, + { + "auxiliary_loss_clip": 0.01109055, + "auxiliary_loss_mlp": 0.00797305, + "balance_loss_clip": 1.04735982, + "balance_loss_mlp": 1.02403474, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 1.845250381263542, + "language_loss": 0.78647125, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.80553484, + "num_input_tokens_seen": 126964905, + "step": 5905, + "time_per_iteration": 2.5221595764160156 + }, + { + "auxiliary_loss_clip": 0.01091287, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.04293203, + "balance_loss_mlp": 1.02301955, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 1.9052570640456417, + "language_loss": 0.72377002, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74504203, + "num_input_tokens_seen": 126982000, + "step": 5906, + "time_per_iteration": 2.5466530323028564 + }, + { + "auxiliary_loss_clip": 0.01101101, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.04728007, + "balance_loss_mlp": 1.02021599, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 2.1334005490454646, + "language_loss": 0.74469888, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.76607412, + "num_input_tokens_seen": 126998390, + "step": 5907, + "time_per_iteration": 2.5080583095550537 + }, + { + "auxiliary_loss_clip": 0.01063269, + "auxiliary_loss_mlp": 0.01038131, + "balance_loss_clip": 1.04331481, + "balance_loss_mlp": 1.02039289, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.785159971212328, + "language_loss": 0.75681353, + "learning_rate": 2.989413228164047e-06, + "loss": 0.7778275, + "num_input_tokens_seen": 127020220, + "step": 5908, + "time_per_iteration": 4.0817930698394775 + }, + { + "auxiliary_loss_clip": 0.01106872, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.04797769, + "balance_loss_mlp": 1.02436662, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 2.343384075611453, + "language_loss": 0.68337584, + "learning_rate": 2.989074743819502e-06, + "loss": 0.7048341, + "num_input_tokens_seen": 127038585, + "step": 5909, + "time_per_iteration": 4.025432586669922 + }, + { + "auxiliary_loss_clip": 0.01114331, + "auxiliary_loss_mlp": 0.01037556, + "balance_loss_clip": 1.04907548, + "balance_loss_mlp": 1.02373993, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 1.8363308439312656, + "language_loss": 0.7849791, + "learning_rate": 2.988736221969144e-06, + "loss": 0.80649793, + "num_input_tokens_seen": 127056215, + "step": 5910, + "time_per_iteration": 3.9087181091308594 + }, + { + "auxiliary_loss_clip": 0.01108576, + "auxiliary_loss_mlp": 0.01041932, + "balance_loss_clip": 1.04369974, + "balance_loss_mlp": 1.02553439, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 1.5506054252026462, + "language_loss": 0.70843273, + "learning_rate": 2.98839766262581e-06, + "loss": 0.72993785, + "num_input_tokens_seen": 127075825, + "step": 5911, + "time_per_iteration": 2.5169689655303955 + }, + { + "auxiliary_loss_clip": 0.01115483, + "auxiliary_loss_mlp": 0.01039975, + "balance_loss_clip": 1.0440197, + "balance_loss_mlp": 1.02510905, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 2.2327494856359995, + "language_loss": 0.86784804, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.88940263, + "num_input_tokens_seen": 127091205, + "step": 5912, + "time_per_iteration": 2.4740257263183594 + }, + { + "auxiliary_loss_clip": 0.01108819, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.04750443, + "balance_loss_mlp": 1.02110791, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 4.886429084393904, + "language_loss": 0.77356434, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79500151, + "num_input_tokens_seen": 127109210, + "step": 5913, + "time_per_iteration": 2.5170209407806396 + }, + { + "auxiliary_loss_clip": 0.01095072, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.05080402, + "balance_loss_mlp": 1.0205425, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.300305217949161, + "language_loss": 0.82614881, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.8474499, + "num_input_tokens_seen": 127128400, + "step": 5914, + "time_per_iteration": 2.5719785690307617 + }, + { + "auxiliary_loss_clip": 0.01129615, + "auxiliary_loss_mlp": 0.01033783, + "balance_loss_clip": 1.0464859, + "balance_loss_mlp": 1.0185957, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.0705083939855324, + "language_loss": 0.706617, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72825092, + "num_input_tokens_seen": 127149965, + "step": 5915, + "time_per_iteration": 2.5745949745178223 + }, + { + "auxiliary_loss_clip": 0.01117842, + "auxiliary_loss_mlp": 0.0103844, + "balance_loss_clip": 1.04573226, + "balance_loss_mlp": 1.02430749, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.8996638126963372, + "language_loss": 0.76175886, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78332168, + "num_input_tokens_seen": 127169865, + "step": 5916, + "time_per_iteration": 2.505423069000244 + }, + { + "auxiliary_loss_clip": 0.01097738, + "auxiliary_loss_mlp": 0.01037196, + "balance_loss_clip": 1.04351783, + "balance_loss_mlp": 1.02298641, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 1.7476965516960996, + "language_loss": 0.88649791, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90784729, + "num_input_tokens_seen": 127188075, + "step": 5917, + "time_per_iteration": 2.55876088142395 + }, + { + "auxiliary_loss_clip": 0.01058378, + "auxiliary_loss_mlp": 0.01042279, + "balance_loss_clip": 1.04639721, + "balance_loss_mlp": 1.02462399, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 1.935987539278264, + "language_loss": 0.75020069, + "learning_rate": 2.98602669849771e-06, + "loss": 0.77120721, + "num_input_tokens_seen": 127206065, + "step": 5918, + "time_per_iteration": 2.612309217453003 + }, + { + "auxiliary_loss_clip": 0.01043285, + "auxiliary_loss_mlp": 0.01005142, + "balance_loss_clip": 1.03003931, + "balance_loss_mlp": 1.00317526, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 0.9486154769595739, + "language_loss": 0.63960767, + "learning_rate": 2.985687839672857e-06, + "loss": 0.660092, + "num_input_tokens_seen": 127257885, + "step": 5919, + "time_per_iteration": 2.868290901184082 + }, + { + "auxiliary_loss_clip": 0.01116442, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.04589784, + "balance_loss_mlp": 1.01794314, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 2.2777586381794857, + "language_loss": 0.7377978, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.75929296, + "num_input_tokens_seen": 127275550, + "step": 5920, + "time_per_iteration": 2.51005220413208 + }, + { + "auxiliary_loss_clip": 0.01088301, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.04487062, + "balance_loss_mlp": 1.01954854, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 1.7147366526547945, + "language_loss": 0.77297139, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79419911, + "num_input_tokens_seen": 127295110, + "step": 5921, + "time_per_iteration": 2.5964930057525635 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.04358113, + "balance_loss_mlp": 1.02374625, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 1.723932522326925, + "language_loss": 0.67320609, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.69460994, + "num_input_tokens_seen": 127312865, + "step": 5922, + "time_per_iteration": 2.527249813079834 + }, + { + "auxiliary_loss_clip": 0.01118711, + "auxiliary_loss_mlp": 0.01036312, + "balance_loss_clip": 1.05077577, + "balance_loss_mlp": 1.02111316, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 2.1678192262330875, + "language_loss": 0.79424655, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81579685, + "num_input_tokens_seen": 127331710, + "step": 5923, + "time_per_iteration": 2.5060691833496094 + }, + { + "auxiliary_loss_clip": 0.01114032, + "auxiliary_loss_mlp": 0.0103813, + "balance_loss_clip": 1.05080605, + "balance_loss_mlp": 1.02338338, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 1.6497729199181856, + "language_loss": 0.85539162, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87691319, + "num_input_tokens_seen": 127350950, + "step": 5924, + "time_per_iteration": 2.5426080226898193 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01037452, + "balance_loss_clip": 1.04977393, + "balance_loss_mlp": 1.02160883, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 2.2827636997021648, + "language_loss": 0.77633607, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79778898, + "num_input_tokens_seen": 127369385, + "step": 5925, + "time_per_iteration": 2.598454236984253 + }, + { + "auxiliary_loss_clip": 0.01069496, + "auxiliary_loss_mlp": 0.01044651, + "balance_loss_clip": 1.04355383, + "balance_loss_mlp": 1.02975607, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 1.8641205229310265, + "language_loss": 0.75787646, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.77901793, + "num_input_tokens_seen": 127386965, + "step": 5926, + "time_per_iteration": 2.5960216522216797 + }, + { + "auxiliary_loss_clip": 0.01104279, + "auxiliary_loss_mlp": 0.00838871, + "balance_loss_clip": 1.04748142, + "balance_loss_mlp": 1.09982407, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 1.7908867845668475, + "language_loss": 0.69476068, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71419215, + "num_input_tokens_seen": 127406075, + "step": 5927, + "time_per_iteration": 2.592653751373291 + }, + { + "auxiliary_loss_clip": 0.01129811, + "auxiliary_loss_mlp": 0.01036621, + "balance_loss_clip": 1.0472225, + "balance_loss_mlp": 1.02278137, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 1.9810763922489294, + "language_loss": 0.79522514, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81688941, + "num_input_tokens_seen": 127425350, + "step": 5928, + "time_per_iteration": 2.4980735778808594 + }, + { + "auxiliary_loss_clip": 0.01131312, + "auxiliary_loss_mlp": 0.01036516, + "balance_loss_clip": 1.04841363, + "balance_loss_mlp": 1.0220139, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.5959944680572675, + "language_loss": 0.82120091, + "learning_rate": 2.982297197789215e-06, + "loss": 0.84287924, + "num_input_tokens_seen": 127446335, + "step": 5929, + "time_per_iteration": 2.4944825172424316 + }, + { + "auxiliary_loss_clip": 0.01116979, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.04750919, + "balance_loss_mlp": 1.02147603, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.591366041767244, + "language_loss": 0.7028597, + "learning_rate": 2.981957928520201e-06, + "loss": 0.72438598, + "num_input_tokens_seen": 127462795, + "step": 5930, + "time_per_iteration": 2.4946279525756836 + }, + { + "auxiliary_loss_clip": 0.0111807, + "auxiliary_loss_mlp": 0.01041516, + "balance_loss_clip": 1.04957342, + "balance_loss_mlp": 1.02643645, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 2.0125059655742112, + "language_loss": 0.68054563, + "learning_rate": 2.981618622015244e-06, + "loss": 0.70214152, + "num_input_tokens_seen": 127482675, + "step": 5931, + "time_per_iteration": 2.5273728370666504 + }, + { + "auxiliary_loss_clip": 0.01121462, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.04881334, + "balance_loss_mlp": 1.01869845, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.5793137997960722, + "language_loss": 0.67461705, + "learning_rate": 2.981279278287211e-06, + "loss": 0.69616765, + "num_input_tokens_seen": 127502275, + "step": 5932, + "time_per_iteration": 2.5972237586975098 + }, + { + "auxiliary_loss_clip": 0.01080837, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.05182064, + "balance_loss_mlp": 1.01921225, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 2.0575435670946547, + "language_loss": 0.79205954, + "learning_rate": 2.980939897348969e-06, + "loss": 0.81320488, + "num_input_tokens_seen": 127520195, + "step": 5933, + "time_per_iteration": 2.6086466312408447 + }, + { + "auxiliary_loss_clip": 0.01113546, + "auxiliary_loss_mlp": 0.01048867, + "balance_loss_clip": 1.04311848, + "balance_loss_mlp": 1.03195119, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.3926870747702342, + "language_loss": 0.69856393, + "learning_rate": 2.980600479213388e-06, + "loss": 0.72018802, + "num_input_tokens_seen": 127544495, + "step": 5934, + "time_per_iteration": 2.640439510345459 + }, + { + "auxiliary_loss_clip": 0.0111009, + "auxiliary_loss_mlp": 0.00830075, + "balance_loss_clip": 1.04859114, + "balance_loss_mlp": 1.07722557, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 2.3882184748866293, + "language_loss": 0.71324217, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73264384, + "num_input_tokens_seen": 127563810, + "step": 5935, + "time_per_iteration": 2.6011602878570557 + }, + { + "auxiliary_loss_clip": 0.01101052, + "auxiliary_loss_mlp": 0.01038131, + "balance_loss_clip": 1.0468781, + "balance_loss_mlp": 1.02390969, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.5809213160519056, + "language_loss": 0.78417718, + "learning_rate": 2.979921531401692e-06, + "loss": 0.80556905, + "num_input_tokens_seen": 127579065, + "step": 5936, + "time_per_iteration": 2.5282063484191895 + }, + { + "auxiliary_loss_clip": 0.01119312, + "auxiliary_loss_mlp": 0.00815233, + "balance_loss_clip": 1.04761231, + "balance_loss_mlp": 1.05570483, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 1.4429658904688256, + "language_loss": 0.64316243, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66250789, + "num_input_tokens_seen": 127599105, + "step": 5937, + "time_per_iteration": 2.577720880508423 + }, + { + "auxiliary_loss_clip": 0.01134403, + "auxiliary_loss_mlp": 0.00805491, + "balance_loss_clip": 1.04921257, + "balance_loss_mlp": 1.03754735, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.5692444882079695, + "language_loss": 0.78402746, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80342644, + "num_input_tokens_seen": 127614940, + "step": 5938, + "time_per_iteration": 2.469907283782959 + }, + { + "auxiliary_loss_clip": 0.01098856, + "auxiliary_loss_mlp": 0.0104227, + "balance_loss_clip": 1.0538733, + "balance_loss_mlp": 1.02801228, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 1.5137305530285494, + "language_loss": 0.80483687, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82624805, + "num_input_tokens_seen": 127634960, + "step": 5939, + "time_per_iteration": 2.6085402965545654 + }, + { + "auxiliary_loss_clip": 0.0111103, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.04574907, + "balance_loss_mlp": 1.01982903, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.7771710647606638, + "language_loss": 0.78722101, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.80868053, + "num_input_tokens_seen": 127654545, + "step": 5940, + "time_per_iteration": 2.555312395095825 + }, + { + "auxiliary_loss_clip": 0.01112954, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.05021882, + "balance_loss_mlp": 1.01883757, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 1.933200498289488, + "language_loss": 0.72018665, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74165595, + "num_input_tokens_seen": 127672320, + "step": 5941, + "time_per_iteration": 2.5324909687042236 + }, + { + "auxiliary_loss_clip": 0.01124933, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.05051076, + "balance_loss_mlp": 1.02004421, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 2.12434809799019, + "language_loss": 0.64263141, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66423917, + "num_input_tokens_seen": 127693315, + "step": 5942, + "time_per_iteration": 2.5858397483825684 + }, + { + "auxiliary_loss_clip": 0.0111858, + "auxiliary_loss_mlp": 0.01043422, + "balance_loss_clip": 1.04726613, + "balance_loss_mlp": 1.02814543, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 1.85212196991812, + "language_loss": 0.73755795, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.75917804, + "num_input_tokens_seen": 127711570, + "step": 5943, + "time_per_iteration": 3.895104169845581 + }, + { + "auxiliary_loss_clip": 0.01052329, + "auxiliary_loss_mlp": 0.01008355, + "balance_loss_clip": 1.02331305, + "balance_loss_mlp": 1.00629294, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7942548055395825, + "language_loss": 0.60739923, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.6280061, + "num_input_tokens_seen": 127772475, + "step": 5944, + "time_per_iteration": 3.1618263721466064 + }, + { + "auxiliary_loss_clip": 0.01106056, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.04664624, + "balance_loss_mlp": 1.02102685, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 1.684324793783894, + "language_loss": 0.72055751, + "learning_rate": 2.976864428379655e-06, + "loss": 0.74197257, + "num_input_tokens_seen": 127790940, + "step": 5945, + "time_per_iteration": 2.506046772003174 + }, + { + "auxiliary_loss_clip": 0.01107717, + "auxiliary_loss_mlp": 0.00796761, + "balance_loss_clip": 1.04529107, + "balance_loss_mlp": 1.02169776, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.651420438452578, + "language_loss": 0.81069994, + "learning_rate": 2.976524564880326e-06, + "loss": 0.82974464, + "num_input_tokens_seen": 127808275, + "step": 5946, + "time_per_iteration": 2.546947956085205 + }, + { + "auxiliary_loss_clip": 0.01134005, + "auxiliary_loss_mlp": 0.01045799, + "balance_loss_clip": 1.05035233, + "balance_loss_mlp": 1.03076649, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.663055461537085, + "language_loss": 0.6862157, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.70801371, + "num_input_tokens_seen": 127828840, + "step": 5947, + "time_per_iteration": 5.221043348312378 + }, + { + "auxiliary_loss_clip": 0.01103039, + "auxiliary_loss_mlp": 0.01036612, + "balance_loss_clip": 1.04591787, + "balance_loss_mlp": 1.02244985, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1.608889370412184, + "language_loss": 0.75318611, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77458262, + "num_input_tokens_seen": 127846240, + "step": 5948, + "time_per_iteration": 2.5351667404174805 + }, + { + "auxiliary_loss_clip": 0.01078117, + "auxiliary_loss_mlp": 0.01043158, + "balance_loss_clip": 1.04841256, + "balance_loss_mlp": 1.02860224, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.1862952878685302, + "language_loss": 0.70593643, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72714919, + "num_input_tokens_seen": 127866880, + "step": 5949, + "time_per_iteration": 4.073309898376465 + }, + { + "auxiliary_loss_clip": 0.01108156, + "auxiliary_loss_mlp": 0.01041243, + "balance_loss_clip": 1.04757714, + "balance_loss_mlp": 1.02742708, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 1.762737307740541, + "language_loss": 0.76782548, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.78931946, + "num_input_tokens_seen": 127883560, + "step": 5950, + "time_per_iteration": 2.506887912750244 + }, + { + "auxiliary_loss_clip": 0.01123682, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.04840076, + "balance_loss_mlp": 1.01874924, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.6798555282652263, + "language_loss": 0.72725999, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74883711, + "num_input_tokens_seen": 127902330, + "step": 5951, + "time_per_iteration": 2.490619659423828 + }, + { + "auxiliary_loss_clip": 0.01126228, + "auxiliary_loss_mlp": 0.01042133, + "balance_loss_clip": 1.05055261, + "balance_loss_mlp": 1.02694535, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 3.106985764112784, + "language_loss": 0.69892228, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.72060591, + "num_input_tokens_seen": 127922325, + "step": 5952, + "time_per_iteration": 2.550905704498291 + }, + { + "auxiliary_loss_clip": 0.01079625, + "auxiliary_loss_mlp": 0.01039683, + "balance_loss_clip": 1.04491258, + "balance_loss_mlp": 1.02442431, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 2.6441960633158024, + "language_loss": 0.69395846, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71515155, + "num_input_tokens_seen": 127942635, + "step": 5953, + "time_per_iteration": 2.743317127227783 + }, + { + "auxiliary_loss_clip": 0.01109165, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.04885709, + "balance_loss_mlp": 1.0176959, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.869943660725663, + "language_loss": 0.66622543, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68763483, + "num_input_tokens_seen": 127962520, + "step": 5954, + "time_per_iteration": 2.557973861694336 + }, + { + "auxiliary_loss_clip": 0.01108929, + "auxiliary_loss_mlp": 0.0103941, + "balance_loss_clip": 1.05043018, + "balance_loss_mlp": 1.02588558, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 1.8132767547870885, + "language_loss": 0.75097382, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.77245724, + "num_input_tokens_seen": 127981180, + "step": 5955, + "time_per_iteration": 2.5424773693084717 + }, + { + "auxiliary_loss_clip": 0.01109619, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.04732335, + "balance_loss_mlp": 1.01786041, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.5069628464263818, + "language_loss": 0.76355857, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78496945, + "num_input_tokens_seen": 127999725, + "step": 5956, + "time_per_iteration": 2.5322372913360596 + }, + { + "auxiliary_loss_clip": 0.0112818, + "auxiliary_loss_mlp": 0.01040332, + "balance_loss_clip": 1.04806328, + "balance_loss_mlp": 1.02612805, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.6745704789546216, + "language_loss": 0.72686601, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.74855113, + "num_input_tokens_seen": 128018885, + "step": 5957, + "time_per_iteration": 2.4783859252929688 + }, + { + "auxiliary_loss_clip": 0.01108578, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.04988599, + "balance_loss_mlp": 1.02106452, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 2.1015258464261333, + "language_loss": 0.71293038, + "learning_rate": 2.972443318242726e-06, + "loss": 0.73436344, + "num_input_tokens_seen": 128037875, + "step": 5958, + "time_per_iteration": 2.576857089996338 + }, + { + "auxiliary_loss_clip": 0.01094885, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.05042398, + "balance_loss_mlp": 1.01678848, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 2.112963702113351, + "language_loss": 0.88615716, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90740955, + "num_input_tokens_seen": 128056045, + "step": 5959, + "time_per_iteration": 2.6197562217712402 + }, + { + "auxiliary_loss_clip": 0.01128393, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.04779959, + "balance_loss_mlp": 1.02170038, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.6286782247743752, + "language_loss": 0.58201844, + "learning_rate": 2.971762593615679e-06, + "loss": 0.60366046, + "num_input_tokens_seen": 128077815, + "step": 5960, + "time_per_iteration": 2.5444469451904297 + }, + { + "auxiliary_loss_clip": 0.01130345, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.04821241, + "balance_loss_mlp": 1.02528048, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 2.179227495573977, + "language_loss": 0.76243794, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78414559, + "num_input_tokens_seen": 128095460, + "step": 5961, + "time_per_iteration": 2.4383132457733154 + }, + { + "auxiliary_loss_clip": 0.01096037, + "auxiliary_loss_mlp": 0.01031381, + "balance_loss_clip": 1.04875302, + "balance_loss_mlp": 1.01729035, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.8495525673537105, + "language_loss": 0.70254129, + "learning_rate": 2.971081721591294e-06, + "loss": 0.7238155, + "num_input_tokens_seen": 128118605, + "step": 5962, + "time_per_iteration": 2.68628191947937 + }, + { + "auxiliary_loss_clip": 0.01109136, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.04855895, + "balance_loss_mlp": 1.01987624, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.6595580280926803, + "language_loss": 0.74467379, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76608795, + "num_input_tokens_seen": 128139205, + "step": 5963, + "time_per_iteration": 2.534900426864624 + }, + { + "auxiliary_loss_clip": 0.01129915, + "auxiliary_loss_mlp": 0.01040177, + "balance_loss_clip": 1.04987526, + "balance_loss_mlp": 1.02578831, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.5560822474713363, + "language_loss": 0.78483427, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80653512, + "num_input_tokens_seen": 128158765, + "step": 5964, + "time_per_iteration": 2.5047199726104736 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01038763, + "balance_loss_clip": 1.05050397, + "balance_loss_mlp": 1.0238018, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 1.7920259861153034, + "language_loss": 0.66476274, + "learning_rate": 2.970060137410626e-06, + "loss": 0.68621349, + "num_input_tokens_seen": 128177850, + "step": 5965, + "time_per_iteration": 2.5438201427459717 + }, + { + "auxiliary_loss_clip": 0.0112784, + "auxiliary_loss_mlp": 0.00803733, + "balance_loss_clip": 1.04651546, + "balance_loss_mlp": 1.03835809, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 1.6039219532236417, + "language_loss": 0.78802943, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.80734515, + "num_input_tokens_seen": 128196925, + "step": 5966, + "time_per_iteration": 2.5543289184570312 + }, + { + "auxiliary_loss_clip": 0.01071378, + "auxiliary_loss_mlp": 0.01036487, + "balance_loss_clip": 1.04399395, + "balance_loss_mlp": 1.02124, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 2.0685592800433783, + "language_loss": 0.91245162, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93353021, + "num_input_tokens_seen": 128213955, + "step": 5967, + "time_per_iteration": 2.5927343368530273 + }, + { + "auxiliary_loss_clip": 0.01091079, + "auxiliary_loss_mlp": 0.01049605, + "balance_loss_clip": 1.04512167, + "balance_loss_mlp": 1.03291547, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 2.0134637818794388, + "language_loss": 0.80468196, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.82608879, + "num_input_tokens_seen": 128232980, + "step": 5968, + "time_per_iteration": 2.575441837310791 + }, + { + "auxiliary_loss_clip": 0.01109604, + "auxiliary_loss_mlp": 0.0105013, + "balance_loss_clip": 1.05200207, + "balance_loss_mlp": 1.03433466, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.146694115353564, + "language_loss": 0.84511769, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.86671507, + "num_input_tokens_seen": 128252795, + "step": 5969, + "time_per_iteration": 2.5405983924865723 + }, + { + "auxiliary_loss_clip": 0.01085553, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.04539156, + "balance_loss_mlp": 1.0214529, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.822399821351799, + "language_loss": 0.72436082, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74557412, + "num_input_tokens_seen": 128273115, + "step": 5970, + "time_per_iteration": 2.6523404121398926 + }, + { + "auxiliary_loss_clip": 0.0110633, + "auxiliary_loss_mlp": 0.01032926, + "balance_loss_clip": 1.04624701, + "balance_loss_mlp": 1.01903772, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.8732158741348652, + "language_loss": 0.79546452, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81685704, + "num_input_tokens_seen": 128292220, + "step": 5971, + "time_per_iteration": 2.519878625869751 + }, + { + "auxiliary_loss_clip": 0.01087145, + "auxiliary_loss_mlp": 0.01040368, + "balance_loss_clip": 1.0437746, + "balance_loss_mlp": 1.02513289, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 1.8008861691436284, + "language_loss": 0.7844969, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80577201, + "num_input_tokens_seen": 128310305, + "step": 5972, + "time_per_iteration": 2.5484859943389893 + }, + { + "auxiliary_loss_clip": 0.01091062, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.04887617, + "balance_loss_mlp": 1.02118909, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 1.7911135521298696, + "language_loss": 0.81204593, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83331442, + "num_input_tokens_seen": 128328305, + "step": 5973, + "time_per_iteration": 2.552591562271118 + }, + { + "auxiliary_loss_clip": 0.0103082, + "auxiliary_loss_mlp": 0.01011699, + "balance_loss_clip": 1.02151942, + "balance_loss_mlp": 1.0095408, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9113429665783997, + "language_loss": 0.56650209, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58692729, + "num_input_tokens_seen": 128378380, + "step": 5974, + "time_per_iteration": 2.9953691959381104 + }, + { + "auxiliary_loss_clip": 0.01119213, + "auxiliary_loss_mlp": 0.01039643, + "balance_loss_clip": 1.04670691, + "balance_loss_mlp": 1.02593982, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.7152787712184896, + "language_loss": 0.68930304, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.7108916, + "num_input_tokens_seen": 128394315, + "step": 5975, + "time_per_iteration": 2.4960410594940186 + }, + { + "auxiliary_loss_clip": 0.0112687, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.04644263, + "balance_loss_mlp": 1.02460432, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.6564275226500058, + "language_loss": 0.80328655, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82493639, + "num_input_tokens_seen": 128414515, + "step": 5976, + "time_per_iteration": 2.542158365249634 + }, + { + "auxiliary_loss_clip": 0.01067027, + "auxiliary_loss_mlp": 0.01041434, + "balance_loss_clip": 1.04111814, + "balance_loss_mlp": 1.02592516, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 1.6912765367734666, + "language_loss": 0.78863472, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.80971932, + "num_input_tokens_seen": 128430615, + "step": 5977, + "time_per_iteration": 2.568429470062256 + }, + { + "auxiliary_loss_clip": 0.01089446, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.04696155, + "balance_loss_mlp": 1.02589297, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 1.7619290646588108, + "language_loss": 0.80130219, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.82259083, + "num_input_tokens_seen": 128449480, + "step": 5978, + "time_per_iteration": 2.5663421154022217 + }, + { + "auxiliary_loss_clip": 0.01130052, + "auxiliary_loss_mlp": 0.00792957, + "balance_loss_clip": 1.04811049, + "balance_loss_mlp": 1.01494205, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.8666987706442564, + "language_loss": 0.67573881, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69496882, + "num_input_tokens_seen": 128471465, + "step": 5979, + "time_per_iteration": 2.5441973209381104 + }, + { + "auxiliary_loss_clip": 0.01100597, + "auxiliary_loss_mlp": 0.01037233, + "balance_loss_clip": 1.0440768, + "balance_loss_mlp": 1.02235615, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 2.2184508632961575, + "language_loss": 0.66860783, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.68998617, + "num_input_tokens_seen": 128490645, + "step": 5980, + "time_per_iteration": 2.55603289604187 + }, + { + "auxiliary_loss_clip": 0.01105323, + "auxiliary_loss_mlp": 0.01040223, + "balance_loss_clip": 1.04613113, + "balance_loss_mlp": 1.02464223, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 2.091500102247451, + "language_loss": 0.71031046, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73176599, + "num_input_tokens_seen": 128510225, + "step": 5981, + "time_per_iteration": 2.5656609535217285 + }, + { + "auxiliary_loss_clip": 0.01106431, + "auxiliary_loss_mlp": 0.01042036, + "balance_loss_clip": 1.05072498, + "balance_loss_mlp": 1.02594233, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 1.6427620930588318, + "language_loss": 0.71173775, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73322242, + "num_input_tokens_seen": 128530195, + "step": 5982, + "time_per_iteration": 3.9712226390838623 + }, + { + "auxiliary_loss_clip": 0.01109708, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.04489136, + "balance_loss_mlp": 1.03334582, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.5844041976940482, + "language_loss": 0.75634325, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.77792442, + "num_input_tokens_seen": 128549990, + "step": 5983, + "time_per_iteration": 2.515801429748535 + }, + { + "auxiliary_loss_clip": 0.01136459, + "auxiliary_loss_mlp": 0.01043659, + "balance_loss_clip": 1.05044293, + "balance_loss_mlp": 1.02745819, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 1.6605539248433725, + "language_loss": 0.76364267, + "learning_rate": 2.96358243065131e-06, + "loss": 0.7854439, + "num_input_tokens_seen": 128567925, + "step": 5984, + "time_per_iteration": 2.4650187492370605 + }, + { + "auxiliary_loss_clip": 0.01117923, + "auxiliary_loss_mlp": 0.00795274, + "balance_loss_clip": 1.04816544, + "balance_loss_mlp": 1.01884317, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.7151553068572867, + "language_loss": 0.86074632, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.87987828, + "num_input_tokens_seen": 128585655, + "step": 5985, + "time_per_iteration": 5.33185338973999 + }, + { + "auxiliary_loss_clip": 0.01117258, + "auxiliary_loss_mlp": 0.0104128, + "balance_loss_clip": 1.04771686, + "balance_loss_mlp": 1.02504361, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.4029024307346283, + "language_loss": 0.72453976, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.7461251, + "num_input_tokens_seen": 128604820, + "step": 5986, + "time_per_iteration": 2.515920639038086 + }, + { + "auxiliary_loss_clip": 0.01099703, + "auxiliary_loss_mlp": 0.01040654, + "balance_loss_clip": 1.04463291, + "balance_loss_mlp": 1.02558589, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 2.056530305797743, + "language_loss": 0.73960054, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.76100415, + "num_input_tokens_seen": 128623070, + "step": 5987, + "time_per_iteration": 2.60262131690979 + }, + { + "auxiliary_loss_clip": 0.01136935, + "auxiliary_loss_mlp": 0.01036639, + "balance_loss_clip": 1.05197752, + "balance_loss_mlp": 1.02118945, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 2.0265770174161584, + "language_loss": 0.69553924, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.71727502, + "num_input_tokens_seen": 128642430, + "step": 5988, + "time_per_iteration": 3.848695993423462 + }, + { + "auxiliary_loss_clip": 0.01124841, + "auxiliary_loss_mlp": 0.0104116, + "balance_loss_clip": 1.05010962, + "balance_loss_mlp": 1.02573478, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 3.9584616748660078, + "language_loss": 0.72867572, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75033575, + "num_input_tokens_seen": 128661285, + "step": 5989, + "time_per_iteration": 2.5136044025421143 + }, + { + "auxiliary_loss_clip": 0.01087466, + "auxiliary_loss_mlp": 0.01037738, + "balance_loss_clip": 1.04378819, + "balance_loss_mlp": 1.02240765, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.5512753552241145, + "language_loss": 0.80010039, + "learning_rate": 2.961534094403931e-06, + "loss": 0.82135248, + "num_input_tokens_seen": 128682210, + "step": 5990, + "time_per_iteration": 2.6140153408050537 + }, + { + "auxiliary_loss_clip": 0.01121678, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.04902291, + "balance_loss_mlp": 1.02009451, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 1.7306091506534582, + "language_loss": 0.83772665, + "learning_rate": 2.961192577338698e-06, + "loss": 0.85929728, + "num_input_tokens_seen": 128700445, + "step": 5991, + "time_per_iteration": 2.5591483116149902 + }, + { + "auxiliary_loss_clip": 0.01114515, + "auxiliary_loss_mlp": 0.01042272, + "balance_loss_clip": 1.05248928, + "balance_loss_mlp": 1.026703, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 3.536875879955241, + "language_loss": 0.75711954, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77868742, + "num_input_tokens_seen": 128716855, + "step": 5992, + "time_per_iteration": 2.5057694911956787 + }, + { + "auxiliary_loss_clip": 0.01132042, + "auxiliary_loss_mlp": 0.01038664, + "balance_loss_clip": 1.04953778, + "balance_loss_mlp": 1.02282083, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 1.9265308133832486, + "language_loss": 0.77588069, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79758775, + "num_input_tokens_seen": 128735835, + "step": 5993, + "time_per_iteration": 2.493623971939087 + }, + { + "auxiliary_loss_clip": 0.01107269, + "auxiliary_loss_mlp": 0.01049436, + "balance_loss_clip": 1.04573274, + "balance_loss_mlp": 1.03203154, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 2.0472130161696316, + "language_loss": 0.74881154, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.77037859, + "num_input_tokens_seen": 128752465, + "step": 5994, + "time_per_iteration": 2.5470950603485107 + }, + { + "auxiliary_loss_clip": 0.01095953, + "auxiliary_loss_mlp": 0.01037361, + "balance_loss_clip": 1.05219233, + "balance_loss_mlp": 1.02226257, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.905682320088245, + "language_loss": 0.68991947, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71125263, + "num_input_tokens_seen": 128770865, + "step": 5995, + "time_per_iteration": 2.5763652324676514 + }, + { + "auxiliary_loss_clip": 0.0110508, + "auxiliary_loss_mlp": 0.01042683, + "balance_loss_clip": 1.0459516, + "balance_loss_mlp": 1.02623248, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 1.9305798276998993, + "language_loss": 0.83027709, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.85175472, + "num_input_tokens_seen": 128789730, + "step": 5996, + "time_per_iteration": 2.522080183029175 + }, + { + "auxiliary_loss_clip": 0.01131943, + "auxiliary_loss_mlp": 0.01037147, + "balance_loss_clip": 1.04930615, + "balance_loss_mlp": 1.02166772, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.6467544033834258, + "language_loss": 0.73412925, + "learning_rate": 2.959142709981763e-06, + "loss": 0.75582016, + "num_input_tokens_seen": 128806610, + "step": 5997, + "time_per_iteration": 2.4631779193878174 + }, + { + "auxiliary_loss_clip": 0.01126076, + "auxiliary_loss_mlp": 0.01035183, + "balance_loss_clip": 1.05029583, + "balance_loss_mlp": 1.02092528, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.356465110256469, + "language_loss": 0.69311535, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.71472794, + "num_input_tokens_seen": 128824830, + "step": 5998, + "time_per_iteration": 2.492713689804077 + }, + { + "auxiliary_loss_clip": 0.01082801, + "auxiliary_loss_mlp": 0.01041155, + "balance_loss_clip": 1.04466724, + "balance_loss_mlp": 1.02391696, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.175082646169121, + "language_loss": 0.76987231, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.79111183, + "num_input_tokens_seen": 128838170, + "step": 5999, + "time_per_iteration": 2.5499494075775146 + }, + { + "auxiliary_loss_clip": 0.01096531, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_clip": 1.04950011, + "balance_loss_mlp": 1.02746511, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 1.860556870037676, + "language_loss": 0.78568465, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80707061, + "num_input_tokens_seen": 128855625, + "step": 6000, + "time_per_iteration": 2.5279386043548584 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.05082798, + "balance_loss_mlp": 1.02217555, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.6010085408034473, + "language_loss": 0.78378236, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80516756, + "num_input_tokens_seen": 128873540, + "step": 6001, + "time_per_iteration": 2.577725648880005 + }, + { + "auxiliary_loss_clip": 0.01130795, + "auxiliary_loss_mlp": 0.0079383, + "balance_loss_clip": 1.04910469, + "balance_loss_mlp": 1.01483452, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 2.066788542551071, + "language_loss": 0.83585441, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85510063, + "num_input_tokens_seen": 128889925, + "step": 6002, + "time_per_iteration": 2.4854626655578613 + }, + { + "auxiliary_loss_clip": 0.01103822, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.04623854, + "balance_loss_mlp": 1.02123284, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.2832701920978464, + "language_loss": 0.91142404, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.93281764, + "num_input_tokens_seen": 128906890, + "step": 6003, + "time_per_iteration": 2.550307273864746 + }, + { + "auxiliary_loss_clip": 0.01031238, + "auxiliary_loss_mlp": 0.01003173, + "balance_loss_clip": 1.03934705, + "balance_loss_mlp": 1.0010035, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8685928391510435, + "language_loss": 0.53384614, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55419028, + "num_input_tokens_seen": 128965940, + "step": 6004, + "time_per_iteration": 3.066077947616577 + }, + { + "auxiliary_loss_clip": 0.01112677, + "auxiliary_loss_mlp": 0.00791945, + "balance_loss_clip": 1.04862595, + "balance_loss_mlp": 1.01112723, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.721171306582842, + "language_loss": 0.77763391, + "learning_rate": 2.956407517225883e-06, + "loss": 0.79668009, + "num_input_tokens_seen": 128985835, + "step": 6005, + "time_per_iteration": 2.5557022094726562 + }, + { + "auxiliary_loss_clip": 0.01114566, + "auxiliary_loss_mlp": 0.01047545, + "balance_loss_clip": 1.04751492, + "balance_loss_mlp": 1.03217888, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 2.0635085807579974, + "language_loss": 0.79108024, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81270128, + "num_input_tokens_seen": 129003120, + "step": 6006, + "time_per_iteration": 2.5151889324188232 + }, + { + "auxiliary_loss_clip": 0.01137226, + "auxiliary_loss_mlp": 0.0103935, + "balance_loss_clip": 1.05173552, + "balance_loss_mlp": 1.022053, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 1.7596762562214843, + "language_loss": 0.84508562, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86685139, + "num_input_tokens_seen": 129021645, + "step": 6007, + "time_per_iteration": 2.4855101108551025 + }, + { + "auxiliary_loss_clip": 0.0111788, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.04972982, + "balance_loss_mlp": 1.02153635, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 1.9526049356140924, + "language_loss": 0.72146356, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74302661, + "num_input_tokens_seen": 129038375, + "step": 6008, + "time_per_iteration": 2.521165370941162 + }, + { + "auxiliary_loss_clip": 0.01118094, + "auxiliary_loss_mlp": 0.01037201, + "balance_loss_clip": 1.04504991, + "balance_loss_mlp": 1.02219224, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 1.8763814119767641, + "language_loss": 0.83018947, + "learning_rate": 2.955039050023368e-06, + "loss": 0.85174245, + "num_input_tokens_seen": 129056235, + "step": 6009, + "time_per_iteration": 2.589008331298828 + }, + { + "auxiliary_loss_clip": 0.0110336, + "auxiliary_loss_mlp": 0.01045649, + "balance_loss_clip": 1.04914606, + "balance_loss_mlp": 1.03040195, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 1.8040018740200463, + "language_loss": 0.76398462, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78547472, + "num_input_tokens_seen": 129072405, + "step": 6010, + "time_per_iteration": 2.527660846710205 + }, + { + "auxiliary_loss_clip": 0.01105414, + "auxiliary_loss_mlp": 0.0104053, + "balance_loss_clip": 1.04962349, + "balance_loss_mlp": 1.02574825, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.5026900150312068, + "language_loss": 0.8299917, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85145116, + "num_input_tokens_seen": 129090225, + "step": 6011, + "time_per_iteration": 2.562345027923584 + }, + { + "auxiliary_loss_clip": 0.01139261, + "auxiliary_loss_mlp": 0.01041161, + "balance_loss_clip": 1.05095625, + "balance_loss_mlp": 1.02478123, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 3.2433870495854227, + "language_loss": 0.62401962, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64582384, + "num_input_tokens_seen": 129107685, + "step": 6012, + "time_per_iteration": 2.4611761569976807 + }, + { + "auxiliary_loss_clip": 0.01108331, + "auxiliary_loss_mlp": 0.010409, + "balance_loss_clip": 1.04890013, + "balance_loss_mlp": 1.0258199, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.7079969339152121, + "language_loss": 0.83622134, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.85771364, + "num_input_tokens_seen": 129125315, + "step": 6013, + "time_per_iteration": 2.5608584880828857 + }, + { + "auxiliary_loss_clip": 0.01131872, + "auxiliary_loss_mlp": 0.01040218, + "balance_loss_clip": 1.04759896, + "balance_loss_mlp": 1.02401161, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 1.6688523653116913, + "language_loss": 0.91484773, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93656868, + "num_input_tokens_seen": 129141600, + "step": 6014, + "time_per_iteration": 2.430281639099121 + }, + { + "auxiliary_loss_clip": 0.01129588, + "auxiliary_loss_mlp": 0.01040714, + "balance_loss_clip": 1.04605925, + "balance_loss_mlp": 1.02504969, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 2.3134108439473002, + "language_loss": 0.73833084, + "learning_rate": 2.95298526302391e-06, + "loss": 0.76003385, + "num_input_tokens_seen": 129160665, + "step": 6015, + "time_per_iteration": 2.4982917308807373 + }, + { + "auxiliary_loss_clip": 0.01054677, + "auxiliary_loss_mlp": 0.01045262, + "balance_loss_clip": 1.04395533, + "balance_loss_mlp": 1.02741659, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 1.6534486658128495, + "language_loss": 0.64854622, + "learning_rate": 2.9526428386344e-06, + "loss": 0.66954553, + "num_input_tokens_seen": 129179220, + "step": 6016, + "time_per_iteration": 2.651958465576172 + }, + { + "auxiliary_loss_clip": 0.01124713, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.04994893, + "balance_loss_mlp": 1.02307761, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 1.8047019618565072, + "language_loss": 0.71663344, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.73828167, + "num_input_tokens_seen": 129200385, + "step": 6017, + "time_per_iteration": 2.642540693283081 + }, + { + "auxiliary_loss_clip": 0.01122275, + "auxiliary_loss_mlp": 0.01043508, + "balance_loss_clip": 1.04662895, + "balance_loss_mlp": 1.02787948, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 1.801718550190302, + "language_loss": 0.73705584, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75871366, + "num_input_tokens_seen": 129217395, + "step": 6018, + "time_per_iteration": 2.4781253337860107 + }, + { + "auxiliary_loss_clip": 0.01088521, + "auxiliary_loss_mlp": 0.0103668, + "balance_loss_clip": 1.05030024, + "balance_loss_mlp": 1.02080142, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.995826852710398, + "language_loss": 0.69451702, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71576905, + "num_input_tokens_seen": 129238940, + "step": 6019, + "time_per_iteration": 2.611595869064331 + }, + { + "auxiliary_loss_clip": 0.01113329, + "auxiliary_loss_mlp": 0.01038067, + "balance_loss_clip": 1.04730058, + "balance_loss_mlp": 1.02160454, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 6.341172316559218, + "language_loss": 0.76414269, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78565669, + "num_input_tokens_seen": 129258240, + "step": 6020, + "time_per_iteration": 2.5248849391937256 + }, + { + "auxiliary_loss_clip": 0.01128196, + "auxiliary_loss_mlp": 0.01040535, + "balance_loss_clip": 1.05107415, + "balance_loss_mlp": 1.02409613, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 1.9108090161154, + "language_loss": 0.73680657, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.7584939, + "num_input_tokens_seen": 129279040, + "step": 6021, + "time_per_iteration": 3.899514675140381 + }, + { + "auxiliary_loss_clip": 0.01095906, + "auxiliary_loss_mlp": 0.01038029, + "balance_loss_clip": 1.04940856, + "balance_loss_mlp": 1.02401614, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 2.0714076220474538, + "language_loss": 0.80991471, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83125401, + "num_input_tokens_seen": 129295415, + "step": 6022, + "time_per_iteration": 2.547633647918701 + }, + { + "auxiliary_loss_clip": 0.01119297, + "auxiliary_loss_mlp": 0.01039596, + "balance_loss_clip": 1.04949975, + "balance_loss_mlp": 1.02507615, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.6581115659754537, + "language_loss": 0.81454897, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83613789, + "num_input_tokens_seen": 129312620, + "step": 6023, + "time_per_iteration": 2.513324022293091 + }, + { + "auxiliary_loss_clip": 0.01108311, + "auxiliary_loss_mlp": 0.0103742, + "balance_loss_clip": 1.04768825, + "balance_loss_mlp": 1.02187514, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.597737590737222, + "language_loss": 0.79759592, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81905317, + "num_input_tokens_seen": 129331825, + "step": 6024, + "time_per_iteration": 5.382205247879028 + }, + { + "auxiliary_loss_clip": 0.01096978, + "auxiliary_loss_mlp": 0.0104278, + "balance_loss_clip": 1.04573464, + "balance_loss_mlp": 1.02632928, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.9427386743944617, + "language_loss": 0.74653208, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.76792967, + "num_input_tokens_seen": 129350400, + "step": 6025, + "time_per_iteration": 2.5144035816192627 + }, + { + "auxiliary_loss_clip": 0.01119483, + "auxiliary_loss_mlp": 0.00790726, + "balance_loss_clip": 1.04838967, + "balance_loss_mlp": 1.01212335, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.6523297486278674, + "language_loss": 0.72563767, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74473977, + "num_input_tokens_seen": 129371155, + "step": 6026, + "time_per_iteration": 2.5267932415008545 + }, + { + "auxiliary_loss_clip": 0.01128783, + "auxiliary_loss_mlp": 0.0104546, + "balance_loss_clip": 1.05154777, + "balance_loss_mlp": 1.02983201, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 2.0215860745139813, + "language_loss": 0.78965998, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81140238, + "num_input_tokens_seen": 129391230, + "step": 6027, + "time_per_iteration": 3.980532646179199 + }, + { + "auxiliary_loss_clip": 0.01116704, + "auxiliary_loss_mlp": 0.01039595, + "balance_loss_clip": 1.04868722, + "balance_loss_mlp": 1.02302456, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 1.710193607645512, + "language_loss": 0.67627615, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.69783914, + "num_input_tokens_seen": 129410065, + "step": 6028, + "time_per_iteration": 2.571261167526245 + }, + { + "auxiliary_loss_clip": 0.01094025, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.04920936, + "balance_loss_mlp": 1.02154565, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 1.7136521384275445, + "language_loss": 0.85163236, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.87293828, + "num_input_tokens_seen": 129428655, + "step": 6029, + "time_per_iteration": 2.5734145641326904 + }, + { + "auxiliary_loss_clip": 0.01095779, + "auxiliary_loss_mlp": 0.01041319, + "balance_loss_clip": 1.04746819, + "balance_loss_mlp": 1.02632213, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.6347458162178703, + "language_loss": 0.72741926, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74879026, + "num_input_tokens_seen": 129447845, + "step": 6030, + "time_per_iteration": 2.5389468669891357 + }, + { + "auxiliary_loss_clip": 0.01108675, + "auxiliary_loss_mlp": 0.01042018, + "balance_loss_clip": 1.04760242, + "balance_loss_mlp": 1.02474427, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.1067507796916742, + "language_loss": 0.74437666, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76588356, + "num_input_tokens_seen": 129463275, + "step": 6031, + "time_per_iteration": 2.4884889125823975 + }, + { + "auxiliary_loss_clip": 0.01088125, + "auxiliary_loss_mlp": 0.0104254, + "balance_loss_clip": 1.04448438, + "balance_loss_mlp": 1.026793, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.9524897222406512, + "language_loss": 0.73075974, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75206637, + "num_input_tokens_seen": 129483205, + "step": 6032, + "time_per_iteration": 2.5806376934051514 + }, + { + "auxiliary_loss_clip": 0.01083001, + "auxiliary_loss_mlp": 0.01044323, + "balance_loss_clip": 1.04523325, + "balance_loss_mlp": 1.02927852, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 1.7831734646689246, + "language_loss": 0.77924532, + "learning_rate": 2.946816107593884e-06, + "loss": 0.80051863, + "num_input_tokens_seen": 129499885, + "step": 6033, + "time_per_iteration": 2.5426104068756104 + }, + { + "auxiliary_loss_clip": 0.01018144, + "auxiliary_loss_mlp": 0.01015884, + "balance_loss_clip": 1.03933644, + "balance_loss_mlp": 1.01364255, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.787252805703643, + "language_loss": 0.64813119, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66847152, + "num_input_tokens_seen": 129561885, + "step": 6034, + "time_per_iteration": 3.234199285507202 + }, + { + "auxiliary_loss_clip": 0.01114631, + "auxiliary_loss_mlp": 0.0103987, + "balance_loss_clip": 1.04740274, + "balance_loss_mlp": 1.02443266, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 1.5021956262301723, + "language_loss": 0.89668798, + "learning_rate": 2.946129926425273e-06, + "loss": 0.91823304, + "num_input_tokens_seen": 129582325, + "step": 6035, + "time_per_iteration": 2.550851821899414 + }, + { + "auxiliary_loss_clip": 0.01106068, + "auxiliary_loss_mlp": 0.01040541, + "balance_loss_clip": 1.04616857, + "balance_loss_mlp": 1.02434087, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 1.7262617285996955, + "language_loss": 0.7406894, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.76215553, + "num_input_tokens_seen": 129600350, + "step": 6036, + "time_per_iteration": 2.52866792678833 + }, + { + "auxiliary_loss_clip": 0.01111441, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.04447031, + "balance_loss_mlp": 1.01912761, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 1.8352612102065866, + "language_loss": 0.75705439, + "learning_rate": 2.945443601747297e-06, + "loss": 0.77851212, + "num_input_tokens_seen": 129618425, + "step": 6037, + "time_per_iteration": 2.4933624267578125 + }, + { + "auxiliary_loss_clip": 0.01111583, + "auxiliary_loss_mlp": 0.01054806, + "balance_loss_clip": 1.04466653, + "balance_loss_mlp": 1.03761649, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.5603854449156658, + "language_loss": 0.78710675, + "learning_rate": 2.945100385624828e-06, + "loss": 0.8087706, + "num_input_tokens_seen": 129636750, + "step": 6038, + "time_per_iteration": 2.4962406158447266 + }, + { + "auxiliary_loss_clip": 0.01040023, + "auxiliary_loss_mlp": 0.01006832, + "balance_loss_clip": 1.02537048, + "balance_loss_mlp": 1.00506783, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8287535598688474, + "language_loss": 0.6344533, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65492183, + "num_input_tokens_seen": 129699030, + "step": 6039, + "time_per_iteration": 3.155261516571045 + }, + { + "auxiliary_loss_clip": 0.01102014, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.04502916, + "balance_loss_mlp": 1.02794278, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 2.474446531606057, + "language_loss": 0.7109338, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73239654, + "num_input_tokens_seen": 129717135, + "step": 6040, + "time_per_iteration": 2.5327258110046387 + }, + { + "auxiliary_loss_clip": 0.0112471, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.04908967, + "balance_loss_mlp": 1.02370894, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 1.7116113193266367, + "language_loss": 0.81228429, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83392203, + "num_input_tokens_seen": 129735940, + "step": 6041, + "time_per_iteration": 2.492807388305664 + }, + { + "auxiliary_loss_clip": 0.01107398, + "auxiliary_loss_mlp": 0.01035805, + "balance_loss_clip": 1.04230332, + "balance_loss_mlp": 1.0197711, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 2.1902970298941433, + "language_loss": 0.83965778, + "learning_rate": 2.943727162882107e-06, + "loss": 0.86108977, + "num_input_tokens_seen": 129752790, + "step": 6042, + "time_per_iteration": 2.5199177265167236 + }, + { + "auxiliary_loss_clip": 0.01113122, + "auxiliary_loss_mlp": 0.01045505, + "balance_loss_clip": 1.04703093, + "balance_loss_mlp": 1.03117049, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.5523653131965562, + "language_loss": 0.78184271, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80342901, + "num_input_tokens_seen": 129773655, + "step": 6043, + "time_per_iteration": 2.53847336769104 + }, + { + "auxiliary_loss_clip": 0.01098039, + "auxiliary_loss_mlp": 0.01038932, + "balance_loss_clip": 1.04671955, + "balance_loss_mlp": 1.0232861, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 2.917747117485441, + "language_loss": 0.65318924, + "learning_rate": 2.943040336741298e-06, + "loss": 0.674559, + "num_input_tokens_seen": 129791605, + "step": 6044, + "time_per_iteration": 2.5502843856811523 + }, + { + "auxiliary_loss_clip": 0.01103004, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.04720974, + "balance_loss_mlp": 1.02076983, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 1.6672479402902334, + "language_loss": 0.81177461, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83315998, + "num_input_tokens_seen": 129811075, + "step": 6045, + "time_per_iteration": 2.543175220489502 + }, + { + "auxiliary_loss_clip": 0.01097079, + "auxiliary_loss_mlp": 0.01039498, + "balance_loss_clip": 1.0464139, + "balance_loss_mlp": 1.02378631, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 13.465663042396093, + "language_loss": 0.64932823, + "learning_rate": 2.942353367559755e-06, + "loss": 0.67069405, + "num_input_tokens_seen": 129833755, + "step": 6046, + "time_per_iteration": 2.6611034870147705 + }, + { + "auxiliary_loss_clip": 0.01094823, + "auxiliary_loss_mlp": 0.01038106, + "balance_loss_clip": 1.04540467, + "balance_loss_mlp": 1.02360988, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 1.5016819739742902, + "language_loss": 0.77839953, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.79972881, + "num_input_tokens_seen": 129854475, + "step": 6047, + "time_per_iteration": 2.5873358249664307 + }, + { + "auxiliary_loss_clip": 0.01124892, + "auxiliary_loss_mlp": 0.01041456, + "balance_loss_clip": 1.04404855, + "balance_loss_mlp": 1.0249691, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.469956359268572, + "language_loss": 0.79191428, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81357777, + "num_input_tokens_seen": 129873530, + "step": 6048, + "time_per_iteration": 2.5188872814178467 + }, + { + "auxiliary_loss_clip": 0.01040358, + "auxiliary_loss_mlp": 0.01006193, + "balance_loss_clip": 1.02565217, + "balance_loss_mlp": 1.00408351, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.7629309241054875, + "language_loss": 0.52642262, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54688811, + "num_input_tokens_seen": 129940400, + "step": 6049, + "time_per_iteration": 3.18770694732666 + }, + { + "auxiliary_loss_clip": 0.01096147, + "auxiliary_loss_mlp": 0.01039238, + "balance_loss_clip": 1.04733598, + "balance_loss_mlp": 1.02349019, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 1.883574866921462, + "language_loss": 0.86228478, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88363862, + "num_input_tokens_seen": 129958635, + "step": 6050, + "time_per_iteration": 2.635986328125 + }, + { + "auxiliary_loss_clip": 0.01110905, + "auxiliary_loss_mlp": 0.00793841, + "balance_loss_clip": 1.04545105, + "balance_loss_mlp": 1.01881254, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 1.7856763143130676, + "language_loss": 0.78172505, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80077255, + "num_input_tokens_seen": 129977685, + "step": 6051, + "time_per_iteration": 2.504589319229126 + }, + { + "auxiliary_loss_clip": 0.01116475, + "auxiliary_loss_mlp": 0.0103862, + "balance_loss_clip": 1.04212189, + "balance_loss_mlp": 1.0237608, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 1.8315673452217278, + "language_loss": 0.82433623, + "learning_rate": 2.940291602812822e-06, + "loss": 0.84588718, + "num_input_tokens_seen": 129997530, + "step": 6052, + "time_per_iteration": 2.520249605178833 + }, + { + "auxiliary_loss_clip": 0.01091499, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.04288995, + "balance_loss_mlp": 1.02184916, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 1.496378404638616, + "language_loss": 0.72262114, + "learning_rate": 2.939947850483145e-06, + "loss": 0.7438969, + "num_input_tokens_seen": 130017955, + "step": 6053, + "time_per_iteration": 2.5686519145965576 + }, + { + "auxiliary_loss_clip": 0.01004423, + "auxiliary_loss_mlp": 0.01005514, + "balance_loss_clip": 1.02512705, + "balance_loss_mlp": 1.00323737, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7717051155076367, + "language_loss": 0.61222243, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63232177, + "num_input_tokens_seen": 130074275, + "step": 6054, + "time_per_iteration": 3.1961095333099365 + }, + { + "auxiliary_loss_clip": 0.0110392, + "auxiliary_loss_mlp": 0.01038181, + "balance_loss_clip": 1.04573643, + "balance_loss_mlp": 1.02225471, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 2.065726572044602, + "language_loss": 0.75770104, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.77912205, + "num_input_tokens_seen": 130091375, + "step": 6055, + "time_per_iteration": 2.5467910766601562 + }, + { + "auxiliary_loss_clip": 0.0112799, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.04525304, + "balance_loss_mlp": 1.02484858, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.9147785554283034, + "language_loss": 0.75472343, + "learning_rate": 2.938916379688765e-06, + "loss": 0.77640343, + "num_input_tokens_seen": 130111595, + "step": 6056, + "time_per_iteration": 2.484001636505127 + }, + { + "auxiliary_loss_clip": 0.01107383, + "auxiliary_loss_mlp": 0.01039791, + "balance_loss_clip": 1.04489088, + "balance_loss_mlp": 1.02419877, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 1.826966567268114, + "language_loss": 0.80148941, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82296109, + "num_input_tokens_seen": 130131440, + "step": 6057, + "time_per_iteration": 2.537064790725708 + }, + { + "auxiliary_loss_clip": 0.01106259, + "auxiliary_loss_mlp": 0.01040987, + "balance_loss_clip": 1.04391456, + "balance_loss_mlp": 1.02494168, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 2.01456091769444, + "language_loss": 0.80121446, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82268697, + "num_input_tokens_seen": 130151375, + "step": 6058, + "time_per_iteration": 2.5711193084716797 + }, + { + "auxiliary_loss_clip": 0.01098697, + "auxiliary_loss_mlp": 0.00793033, + "balance_loss_clip": 1.0430814, + "balance_loss_mlp": 1.01441312, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 1.6483794857566985, + "language_loss": 0.84644485, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.86536217, + "num_input_tokens_seen": 130169960, + "step": 6059, + "time_per_iteration": 3.924856424331665 + }, + { + "auxiliary_loss_clip": 0.01087183, + "auxiliary_loss_mlp": 0.01038061, + "balance_loss_clip": 1.04311466, + "balance_loss_mlp": 1.02189624, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.6150583535284802, + "language_loss": 0.8768636, + "learning_rate": 2.937540586903884e-06, + "loss": 0.89811599, + "num_input_tokens_seen": 130189800, + "step": 6060, + "time_per_iteration": 2.562365770339966 + }, + { + "auxiliary_loss_clip": 0.0111642, + "auxiliary_loss_mlp": 0.01040432, + "balance_loss_clip": 1.04550147, + "balance_loss_mlp": 1.02449405, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 2.564932396099177, + "language_loss": 0.67016852, + "learning_rate": 2.937196549795971e-06, + "loss": 0.69173706, + "num_input_tokens_seen": 130206370, + "step": 6061, + "time_per_iteration": 2.4822640419006348 + }, + { + "auxiliary_loss_clip": 0.01110269, + "auxiliary_loss_mlp": 0.01036598, + "balance_loss_clip": 1.04770267, + "balance_loss_mlp": 1.02091002, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.700831389231872, + "language_loss": 0.75577581, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77724445, + "num_input_tokens_seen": 130224445, + "step": 6062, + "time_per_iteration": 3.891718626022339 + }, + { + "auxiliary_loss_clip": 0.0109996, + "auxiliary_loss_mlp": 0.01032633, + "balance_loss_clip": 1.04464197, + "balance_loss_mlp": 1.01520491, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 1.7063669090131988, + "language_loss": 0.72117579, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74250174, + "num_input_tokens_seen": 130245380, + "step": 6063, + "time_per_iteration": 3.9826512336730957 + }, + { + "auxiliary_loss_clip": 0.01112027, + "auxiliary_loss_mlp": 0.01044211, + "balance_loss_clip": 1.04208755, + "balance_loss_mlp": 1.02848721, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 1.938577346452718, + "language_loss": 0.67782688, + "learning_rate": 2.936164225292901e-06, + "loss": 0.69938922, + "num_input_tokens_seen": 130265575, + "step": 6064, + "time_per_iteration": 2.4968390464782715 + }, + { + "auxiliary_loss_clip": 0.01109061, + "auxiliary_loss_mlp": 0.01049263, + "balance_loss_clip": 1.0457238, + "balance_loss_mlp": 1.03356302, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 1.682788671934112, + "language_loss": 0.74713975, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76872301, + "num_input_tokens_seen": 130286195, + "step": 6065, + "time_per_iteration": 3.950160026550293 + }, + { + "auxiliary_loss_clip": 0.0110889, + "auxiliary_loss_mlp": 0.01042763, + "balance_loss_clip": 1.04672289, + "balance_loss_mlp": 1.02608538, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 2.752269580475105, + "language_loss": 0.75211704, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77363354, + "num_input_tokens_seen": 130306095, + "step": 6066, + "time_per_iteration": 2.6268577575683594 + }, + { + "auxiliary_loss_clip": 0.01117954, + "auxiliary_loss_mlp": 0.01034824, + "balance_loss_clip": 1.04449224, + "balance_loss_mlp": 1.02042317, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.1359486957883456, + "language_loss": 0.76598907, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.78751683, + "num_input_tokens_seen": 130324685, + "step": 6067, + "time_per_iteration": 2.489253520965576 + }, + { + "auxiliary_loss_clip": 0.0112615, + "auxiliary_loss_mlp": 0.01039752, + "balance_loss_clip": 1.04649043, + "balance_loss_mlp": 1.02653766, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 1.9603120328968915, + "language_loss": 0.70918894, + "learning_rate": 2.934787295690886e-06, + "loss": 0.7308479, + "num_input_tokens_seen": 130343855, + "step": 6068, + "time_per_iteration": 2.454944133758545 + }, + { + "auxiliary_loss_clip": 0.01114134, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.04220486, + "balance_loss_mlp": 1.02262259, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 1.9875537767721032, + "language_loss": 0.74069566, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76221216, + "num_input_tokens_seen": 130362320, + "step": 6069, + "time_per_iteration": 2.4577574729919434 + }, + { + "auxiliary_loss_clip": 0.01108907, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.04535913, + "balance_loss_mlp": 1.02629244, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 1.9009548444488127, + "language_loss": 0.66473973, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68624562, + "num_input_tokens_seen": 130383165, + "step": 6070, + "time_per_iteration": 2.5559580326080322 + }, + { + "auxiliary_loss_clip": 0.01108932, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.04378128, + "balance_loss_mlp": 1.02139306, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.872183989302307, + "language_loss": 0.74425459, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76570225, + "num_input_tokens_seen": 130402425, + "step": 6071, + "time_per_iteration": 2.507399559020996 + }, + { + "auxiliary_loss_clip": 0.01113699, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.04160333, + "balance_loss_mlp": 1.02042413, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 2.139390371167385, + "language_loss": 0.88506091, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90655291, + "num_input_tokens_seen": 130419440, + "step": 6072, + "time_per_iteration": 2.466857671737671 + }, + { + "auxiliary_loss_clip": 0.01113469, + "auxiliary_loss_mlp": 0.01035063, + "balance_loss_clip": 1.04693174, + "balance_loss_mlp": 1.0207932, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 2.6466107161755605, + "language_loss": 0.72100651, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.74249184, + "num_input_tokens_seen": 130438495, + "step": 6073, + "time_per_iteration": 2.4888339042663574 + }, + { + "auxiliary_loss_clip": 0.01062767, + "auxiliary_loss_mlp": 0.01039381, + "balance_loss_clip": 1.05234289, + "balance_loss_mlp": 1.02314436, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 1.930718690604759, + "language_loss": 0.67158616, + "learning_rate": 2.932720838132236e-06, + "loss": 0.69260764, + "num_input_tokens_seen": 130455575, + "step": 6074, + "time_per_iteration": 2.617621421813965 + }, + { + "auxiliary_loss_clip": 0.01094481, + "auxiliary_loss_mlp": 0.01033242, + "balance_loss_clip": 1.04447913, + "balance_loss_mlp": 1.01825118, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.6468493455759396, + "language_loss": 0.72885096, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75012827, + "num_input_tokens_seen": 130476385, + "step": 6075, + "time_per_iteration": 2.6079723834991455 + }, + { + "auxiliary_loss_clip": 0.01091713, + "auxiliary_loss_mlp": 0.01044207, + "balance_loss_clip": 1.04318392, + "balance_loss_mlp": 1.027601, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.093723467979191, + "language_loss": 0.89077723, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91213644, + "num_input_tokens_seen": 130493630, + "step": 6076, + "time_per_iteration": 2.5508856773376465 + }, + { + "auxiliary_loss_clip": 0.011144, + "auxiliary_loss_mlp": 0.01038987, + "balance_loss_clip": 1.04395878, + "balance_loss_mlp": 1.02340031, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 1.9196009049641871, + "language_loss": 0.69566894, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71720278, + "num_input_tokens_seen": 130510735, + "step": 6077, + "time_per_iteration": 2.454378604888916 + }, + { + "auxiliary_loss_clip": 0.01043968, + "auxiliary_loss_mlp": 0.01005372, + "balance_loss_clip": 1.01583982, + "balance_loss_mlp": 1.00376272, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7612136279177231, + "language_loss": 0.61825776, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63875115, + "num_input_tokens_seen": 130577050, + "step": 6078, + "time_per_iteration": 3.157278299331665 + }, + { + "auxiliary_loss_clip": 0.01099927, + "auxiliary_loss_mlp": 0.0104753, + "balance_loss_clip": 1.04087925, + "balance_loss_mlp": 1.03096008, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 2.1576000060329537, + "language_loss": 0.78426051, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80573511, + "num_input_tokens_seen": 130593780, + "step": 6079, + "time_per_iteration": 2.5255868434906006 + }, + { + "auxiliary_loss_clip": 0.01121525, + "auxiliary_loss_mlp": 0.01039465, + "balance_loss_clip": 1.04751086, + "balance_loss_mlp": 1.02358651, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 2.0154887763837888, + "language_loss": 0.62897575, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.65058565, + "num_input_tokens_seen": 130615510, + "step": 6080, + "time_per_iteration": 2.7311177253723145 + }, + { + "auxiliary_loss_clip": 0.0110001, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.04397547, + "balance_loss_mlp": 1.02227616, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.2816388599813258, + "language_loss": 0.6710965, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69248307, + "num_input_tokens_seen": 130635410, + "step": 6081, + "time_per_iteration": 2.561708450317383 + }, + { + "auxiliary_loss_clip": 0.01106086, + "auxiliary_loss_mlp": 0.0079743, + "balance_loss_clip": 1.04607272, + "balance_loss_mlp": 1.02096248, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.858536693097638, + "language_loss": 0.74903011, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76806527, + "num_input_tokens_seen": 130657725, + "step": 6082, + "time_per_iteration": 2.6148223876953125 + }, + { + "auxiliary_loss_clip": 0.01075881, + "auxiliary_loss_mlp": 0.00793764, + "balance_loss_clip": 1.05218089, + "balance_loss_mlp": 1.0156951, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 1.7435726883035343, + "language_loss": 0.82583416, + "learning_rate": 2.929618765277987e-06, + "loss": 0.8445307, + "num_input_tokens_seen": 130678360, + "step": 6083, + "time_per_iteration": 2.6629860401153564 + }, + { + "auxiliary_loss_clip": 0.01027303, + "auxiliary_loss_mlp": 0.01006228, + "balance_loss_clip": 1.01864743, + "balance_loss_mlp": 1.00452375, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.9284324644293693, + "language_loss": 0.59299016, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61332548, + "num_input_tokens_seen": 130742110, + "step": 6084, + "time_per_iteration": 3.2181131839752197 + }, + { + "auxiliary_loss_clip": 0.01085114, + "auxiliary_loss_mlp": 0.01040993, + "balance_loss_clip": 1.04278135, + "balance_loss_mlp": 1.0248636, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 3.296435953774852, + "language_loss": 0.72653377, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.74779487, + "num_input_tokens_seen": 130759870, + "step": 6085, + "time_per_iteration": 2.539498805999756 + }, + { + "auxiliary_loss_clip": 0.01094064, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.04931378, + "balance_loss_mlp": 1.01847506, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 1.871657058101363, + "language_loss": 0.78077137, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80204165, + "num_input_tokens_seen": 130778510, + "step": 6086, + "time_per_iteration": 2.590996503829956 + }, + { + "auxiliary_loss_clip": 0.01112933, + "auxiliary_loss_mlp": 0.01038213, + "balance_loss_clip": 1.04846668, + "balance_loss_mlp": 1.02289474, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 1.6766818840205613, + "language_loss": 0.76512969, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.78664112, + "num_input_tokens_seen": 130798535, + "step": 6087, + "time_per_iteration": 2.6015231609344482 + }, + { + "auxiliary_loss_clip": 0.01078051, + "auxiliary_loss_mlp": 0.01039489, + "balance_loss_clip": 1.05248952, + "balance_loss_mlp": 1.02318132, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 1.9384688504283198, + "language_loss": 0.70871842, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72989386, + "num_input_tokens_seen": 130816655, + "step": 6088, + "time_per_iteration": 2.5857160091400146 + }, + { + "auxiliary_loss_clip": 0.01127188, + "auxiliary_loss_mlp": 0.01036327, + "balance_loss_clip": 1.04762185, + "balance_loss_mlp": 1.01955402, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.5557874812146804, + "language_loss": 0.79729784, + "learning_rate": 2.92754912981472e-06, + "loss": 0.81893295, + "num_input_tokens_seen": 130841225, + "step": 6089, + "time_per_iteration": 2.6712942123413086 + }, + { + "auxiliary_loss_clip": 0.01093846, + "auxiliary_loss_mlp": 0.01036043, + "balance_loss_clip": 1.04426503, + "balance_loss_mlp": 1.02152324, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 1.9073078871672935, + "language_loss": 0.71608889, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73738778, + "num_input_tokens_seen": 130861050, + "step": 6090, + "time_per_iteration": 2.5433499813079834 + }, + { + "auxiliary_loss_clip": 0.01096997, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_clip": 1.04760695, + "balance_loss_mlp": 1.03246593, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 1.7369872388773808, + "language_loss": 0.74036288, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76180428, + "num_input_tokens_seen": 130879775, + "step": 6091, + "time_per_iteration": 2.5093183517456055 + }, + { + "auxiliary_loss_clip": 0.01077293, + "auxiliary_loss_mlp": 0.0103592, + "balance_loss_clip": 1.04941869, + "balance_loss_mlp": 1.02019668, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 2.0757921258885523, + "language_loss": 0.72698426, + "learning_rate": 2.926513837074284e-06, + "loss": 0.74811637, + "num_input_tokens_seen": 130898070, + "step": 6092, + "time_per_iteration": 2.6026315689086914 + }, + { + "auxiliary_loss_clip": 0.01118359, + "auxiliary_loss_mlp": 0.01044163, + "balance_loss_clip": 1.04406643, + "balance_loss_mlp": 1.02816463, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 1.9803768439813776, + "language_loss": 0.78518808, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.8068133, + "num_input_tokens_seen": 130915250, + "step": 6093, + "time_per_iteration": 2.506542444229126 + }, + { + "auxiliary_loss_clip": 0.01117346, + "auxiliary_loss_mlp": 0.0104002, + "balance_loss_clip": 1.0448997, + "balance_loss_mlp": 1.02501202, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.6762024235929665, + "language_loss": 0.74190295, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76347661, + "num_input_tokens_seen": 130936995, + "step": 6094, + "time_per_iteration": 2.596832513809204 + }, + { + "auxiliary_loss_clip": 0.01134309, + "auxiliary_loss_mlp": 0.01050623, + "balance_loss_clip": 1.04872465, + "balance_loss_mlp": 1.03435111, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.5466364579683587, + "language_loss": 0.79537266, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.817222, + "num_input_tokens_seen": 130957970, + "step": 6095, + "time_per_iteration": 2.5259475708007812 + }, + { + "auxiliary_loss_clip": 0.01118217, + "auxiliary_loss_mlp": 0.0079321, + "balance_loss_clip": 1.05036068, + "balance_loss_mlp": 1.01070547, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.0793459336970974, + "language_loss": 0.73551685, + "learning_rate": 2.925132954945834e-06, + "loss": 0.7546311, + "num_input_tokens_seen": 130974915, + "step": 6096, + "time_per_iteration": 2.4809913635253906 + }, + { + "auxiliary_loss_clip": 0.01094891, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.04402494, + "balance_loss_mlp": 1.02027345, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 2.624478588341291, + "language_loss": 0.67011607, + "learning_rate": 2.924787646678155e-06, + "loss": 0.6914283, + "num_input_tokens_seen": 130995745, + "step": 6097, + "time_per_iteration": 2.600778579711914 + }, + { + "auxiliary_loss_clip": 0.01076159, + "auxiliary_loss_mlp": 0.01040296, + "balance_loss_clip": 1.04815149, + "balance_loss_mlp": 1.02450037, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.40253038604872, + "language_loss": 0.77797008, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79913461, + "num_input_tokens_seen": 131015545, + "step": 6098, + "time_per_iteration": 4.028882026672363 + }, + { + "auxiliary_loss_clip": 0.01119002, + "auxiliary_loss_mlp": 0.01039086, + "balance_loss_clip": 1.04716396, + "balance_loss_mlp": 1.02362466, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 2.2749600564227563, + "language_loss": 0.73552036, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.7571013, + "num_input_tokens_seen": 131033990, + "step": 6099, + "time_per_iteration": 2.504903554916382 + }, + { + "auxiliary_loss_clip": 0.01106505, + "auxiliary_loss_mlp": 0.01044316, + "balance_loss_clip": 1.04654765, + "balance_loss_mlp": 1.03009462, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.9939212998489233, + "language_loss": 0.84097457, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86248279, + "num_input_tokens_seen": 131050710, + "step": 6100, + "time_per_iteration": 3.865168333053589 + }, + { + "auxiliary_loss_clip": 0.01101098, + "auxiliary_loss_mlp": 0.01034076, + "balance_loss_clip": 1.04634762, + "balance_loss_mlp": 1.01834023, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 1.704657061635493, + "language_loss": 0.70708424, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72843599, + "num_input_tokens_seen": 131071435, + "step": 6101, + "time_per_iteration": 3.9936938285827637 + }, + { + "auxiliary_loss_clip": 0.01105007, + "auxiliary_loss_mlp": 0.01050526, + "balance_loss_clip": 1.04861736, + "balance_loss_mlp": 1.03364635, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.445095618826411, + "language_loss": 0.76653767, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78809297, + "num_input_tokens_seen": 131088775, + "step": 6102, + "time_per_iteration": 2.5468952655792236 + }, + { + "auxiliary_loss_clip": 0.01124421, + "auxiliary_loss_mlp": 0.01039248, + "balance_loss_clip": 1.04738736, + "balance_loss_mlp": 1.02203393, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 2.811326826953952, + "language_loss": 0.70174152, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72337818, + "num_input_tokens_seen": 131112800, + "step": 6103, + "time_per_iteration": 2.7469322681427 + }, + { + "auxiliary_loss_clip": 0.01087449, + "auxiliary_loss_mlp": 0.0103889, + "balance_loss_clip": 1.04638886, + "balance_loss_mlp": 1.02311873, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.7040207470236035, + "language_loss": 0.71408963, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73535299, + "num_input_tokens_seen": 131131150, + "step": 6104, + "time_per_iteration": 4.001452684402466 + }, + { + "auxiliary_loss_clip": 0.0112185, + "auxiliary_loss_mlp": 0.01035898, + "balance_loss_clip": 1.04637861, + "balance_loss_mlp": 1.01961422, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 2.326823931650879, + "language_loss": 0.81627047, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83784795, + "num_input_tokens_seen": 131150365, + "step": 6105, + "time_per_iteration": 2.540252923965454 + }, + { + "auxiliary_loss_clip": 0.01136886, + "auxiliary_loss_mlp": 0.0104132, + "balance_loss_clip": 1.04818118, + "balance_loss_mlp": 1.02484512, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.833168292539064, + "language_loss": 0.808936, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83071804, + "num_input_tokens_seen": 131169310, + "step": 6106, + "time_per_iteration": 2.5152275562286377 + }, + { + "auxiliary_loss_clip": 0.01022407, + "auxiliary_loss_mlp": 0.00776954, + "balance_loss_clip": 1.01989818, + "balance_loss_mlp": 1.00891531, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6889284615548021, + "language_loss": 0.59231114, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.61030471, + "num_input_tokens_seen": 131232900, + "step": 6107, + "time_per_iteration": 3.202441930770874 + }, + { + "auxiliary_loss_clip": 0.01109968, + "auxiliary_loss_mlp": 0.01037545, + "balance_loss_clip": 1.04972935, + "balance_loss_mlp": 1.02178538, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.6126174885392424, + "language_loss": 0.74474585, + "learning_rate": 2.92098694412469e-06, + "loss": 0.76622093, + "num_input_tokens_seen": 131250920, + "step": 6108, + "time_per_iteration": 2.559065341949463 + }, + { + "auxiliary_loss_clip": 0.01126866, + "auxiliary_loss_mlp": 0.01041562, + "balance_loss_clip": 1.05067766, + "balance_loss_mlp": 1.02622032, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 2.9015915564577845, + "language_loss": 0.73496532, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.75664961, + "num_input_tokens_seen": 131267910, + "step": 6109, + "time_per_iteration": 2.4691619873046875 + }, + { + "auxiliary_loss_clip": 0.0106396, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.04740691, + "balance_loss_mlp": 1.02587414, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 1.903176540664286, + "language_loss": 0.5298025, + "learning_rate": 2.920295452774744e-06, + "loss": 0.5508585, + "num_input_tokens_seen": 131287150, + "step": 6110, + "time_per_iteration": 2.6277663707733154 + }, + { + "auxiliary_loss_clip": 0.01122791, + "auxiliary_loss_mlp": 0.01039808, + "balance_loss_clip": 1.04973459, + "balance_loss_mlp": 1.02279735, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.5042007512305848, + "language_loss": 0.80457461, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82620066, + "num_input_tokens_seen": 131308225, + "step": 6111, + "time_per_iteration": 2.507981777191162 + }, + { + "auxiliary_loss_clip": 0.01084468, + "auxiliary_loss_mlp": 0.01045985, + "balance_loss_clip": 1.04653597, + "balance_loss_mlp": 1.0303328, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.5817065969633461, + "language_loss": 0.7239871, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74529165, + "num_input_tokens_seen": 131332115, + "step": 6112, + "time_per_iteration": 2.6650800704956055 + }, + { + "auxiliary_loss_clip": 0.01124357, + "auxiliary_loss_mlp": 0.0104989, + "balance_loss_clip": 1.04994035, + "balance_loss_mlp": 1.03391623, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.5497026240478489, + "language_loss": 0.85167468, + "learning_rate": 2.919257954049892e-06, + "loss": 0.8734172, + "num_input_tokens_seen": 131351885, + "step": 6113, + "time_per_iteration": 2.5089728832244873 + }, + { + "auxiliary_loss_clip": 0.01126687, + "auxiliary_loss_mlp": 0.01040332, + "balance_loss_clip": 1.04856491, + "balance_loss_mlp": 1.02389336, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 1.6326923726119238, + "language_loss": 0.78214633, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80381656, + "num_input_tokens_seen": 131370245, + "step": 6114, + "time_per_iteration": 2.5376169681549072 + }, + { + "auxiliary_loss_clip": 0.01130001, + "auxiliary_loss_mlp": 0.01052057, + "balance_loss_clip": 1.05014563, + "balance_loss_mlp": 1.03324568, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.5675251942520296, + "language_loss": 0.66902125, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69084179, + "num_input_tokens_seen": 131388115, + "step": 6115, + "time_per_iteration": 2.4861996173858643 + }, + { + "auxiliary_loss_clip": 0.01103854, + "auxiliary_loss_mlp": 0.01038856, + "balance_loss_clip": 1.0450604, + "balance_loss_mlp": 1.02335334, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.9648017137251914, + "language_loss": 0.76171589, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78314298, + "num_input_tokens_seen": 131404595, + "step": 6116, + "time_per_iteration": 2.508434772491455 + }, + { + "auxiliary_loss_clip": 0.01089919, + "auxiliary_loss_mlp": 0.01044235, + "balance_loss_clip": 1.05045295, + "balance_loss_mlp": 1.02843976, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 2.1194911971804102, + "language_loss": 0.63123691, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.65257847, + "num_input_tokens_seen": 131423760, + "step": 6117, + "time_per_iteration": 2.6013195514678955 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.01041373, + "balance_loss_clip": 1.0507319, + "balance_loss_mlp": 1.02481472, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 1.6880812959521245, + "language_loss": 0.7319963, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75348896, + "num_input_tokens_seen": 131444955, + "step": 6118, + "time_per_iteration": 2.59920334815979 + }, + { + "auxiliary_loss_clip": 0.01131272, + "auxiliary_loss_mlp": 0.01047082, + "balance_loss_clip": 1.05355799, + "balance_loss_mlp": 1.02978444, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 1.53855741114017, + "language_loss": 0.73045731, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.75224084, + "num_input_tokens_seen": 131465720, + "step": 6119, + "time_per_iteration": 2.5119521617889404 + }, + { + "auxiliary_loss_clip": 0.01110214, + "auxiliary_loss_mlp": 0.01037828, + "balance_loss_clip": 1.05250955, + "balance_loss_mlp": 1.02114522, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.861733247793623, + "language_loss": 0.79959798, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82107842, + "num_input_tokens_seen": 131483080, + "step": 6120, + "time_per_iteration": 2.525136709213257 + }, + { + "auxiliary_loss_clip": 0.0109022, + "auxiliary_loss_mlp": 0.01045494, + "balance_loss_clip": 1.04998565, + "balance_loss_mlp": 1.02923369, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 1.8849143042750758, + "language_loss": 0.64188707, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66324425, + "num_input_tokens_seen": 131502545, + "step": 6121, + "time_per_iteration": 2.57275652885437 + }, + { + "auxiliary_loss_clip": 0.01124923, + "auxiliary_loss_mlp": 0.01039242, + "balance_loss_clip": 1.05453372, + "balance_loss_mlp": 1.02283859, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 2.2295674156198255, + "language_loss": 0.71526647, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73690814, + "num_input_tokens_seen": 131522155, + "step": 6122, + "time_per_iteration": 2.5409178733825684 + }, + { + "auxiliary_loss_clip": 0.01105622, + "auxiliary_loss_mlp": 0.01044102, + "balance_loss_clip": 1.05037045, + "balance_loss_mlp": 1.02697182, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.8540027101138485, + "language_loss": 0.69202554, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71352279, + "num_input_tokens_seen": 131543865, + "step": 6123, + "time_per_iteration": 2.575061082839966 + }, + { + "auxiliary_loss_clip": 0.01126637, + "auxiliary_loss_mlp": 0.01043992, + "balance_loss_clip": 1.0516547, + "balance_loss_mlp": 1.0267899, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 2.849257793413385, + "language_loss": 0.73721933, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.75892562, + "num_input_tokens_seen": 131562155, + "step": 6124, + "time_per_iteration": 2.558530569076538 + }, + { + "auxiliary_loss_clip": 0.0112151, + "auxiliary_loss_mlp": 0.01044038, + "balance_loss_clip": 1.05293846, + "balance_loss_mlp": 1.02678871, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 2.299019876289324, + "language_loss": 0.74069089, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76234639, + "num_input_tokens_seen": 131581695, + "step": 6125, + "time_per_iteration": 2.5590097904205322 + }, + { + "auxiliary_loss_clip": 0.01127158, + "auxiliary_loss_mlp": 0.01049757, + "balance_loss_clip": 1.05137181, + "balance_loss_mlp": 1.0325551, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.7862823218916117, + "language_loss": 0.78219771, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80396688, + "num_input_tokens_seen": 131599465, + "step": 6126, + "time_per_iteration": 2.484788179397583 + }, + { + "auxiliary_loss_clip": 0.01125966, + "auxiliary_loss_mlp": 0.01044548, + "balance_loss_clip": 1.05078197, + "balance_loss_mlp": 1.0266186, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.5300601756454375, + "language_loss": 0.65401423, + "learning_rate": 2.914412150914888e-06, + "loss": 0.67571932, + "num_input_tokens_seen": 131618330, + "step": 6127, + "time_per_iteration": 2.4821014404296875 + }, + { + "auxiliary_loss_clip": 0.01116798, + "auxiliary_loss_mlp": 0.010406, + "balance_loss_clip": 1.05265021, + "balance_loss_mlp": 1.02423251, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.7620483621671204, + "language_loss": 0.70450056, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72607458, + "num_input_tokens_seen": 131638960, + "step": 6128, + "time_per_iteration": 2.6795711517333984 + }, + { + "auxiliary_loss_clip": 0.01117652, + "auxiliary_loss_mlp": 0.01045102, + "balance_loss_clip": 1.05293024, + "balance_loss_mlp": 1.02897286, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 1.7183833518881548, + "language_loss": 0.75252241, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77414995, + "num_input_tokens_seen": 131657440, + "step": 6129, + "time_per_iteration": 2.512605667114258 + }, + { + "auxiliary_loss_clip": 0.01119593, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.05199516, + "balance_loss_mlp": 1.02309012, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.585285122339762, + "language_loss": 0.8442564, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86584687, + "num_input_tokens_seen": 131678035, + "step": 6130, + "time_per_iteration": 2.5723981857299805 + }, + { + "auxiliary_loss_clip": 0.01031183, + "auxiliary_loss_mlp": 0.01007887, + "balance_loss_clip": 1.03440142, + "balance_loss_mlp": 1.00600374, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8178980750687815, + "language_loss": 0.60305786, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62344861, + "num_input_tokens_seen": 131742470, + "step": 6131, + "time_per_iteration": 3.2082784175872803 + }, + { + "auxiliary_loss_clip": 0.01098348, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.05278397, + "balance_loss_mlp": 1.01814628, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.5227882722305346, + "language_loss": 0.73254573, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.75387114, + "num_input_tokens_seen": 131764570, + "step": 6132, + "time_per_iteration": 2.6497342586517334 + }, + { + "auxiliary_loss_clip": 0.01122661, + "auxiliary_loss_mlp": 0.01041207, + "balance_loss_clip": 1.04779208, + "balance_loss_mlp": 1.02413654, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.5974686555646938, + "language_loss": 0.74257159, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76421028, + "num_input_tokens_seen": 131785720, + "step": 6133, + "time_per_iteration": 2.5603535175323486 + }, + { + "auxiliary_loss_clip": 0.01071873, + "auxiliary_loss_mlp": 0.01050835, + "balance_loss_clip": 1.04646969, + "balance_loss_mlp": 1.03269148, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.624825924086567, + "language_loss": 0.71846235, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73968947, + "num_input_tokens_seen": 131804430, + "step": 6134, + "time_per_iteration": 2.6402177810668945 + }, + { + "auxiliary_loss_clip": 0.01099921, + "auxiliary_loss_mlp": 0.01037883, + "balance_loss_clip": 1.04831088, + "balance_loss_mlp": 1.02165902, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.5045917312043766, + "language_loss": 0.75128919, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77266729, + "num_input_tokens_seen": 131822060, + "step": 6135, + "time_per_iteration": 2.577648401260376 + }, + { + "auxiliary_loss_clip": 0.01027316, + "auxiliary_loss_mlp": 0.01004833, + "balance_loss_clip": 1.02852416, + "balance_loss_mlp": 1.00231731, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8104127227940295, + "language_loss": 0.58816999, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60849148, + "num_input_tokens_seen": 131880715, + "step": 6136, + "time_per_iteration": 3.112391233444214 + }, + { + "auxiliary_loss_clip": 0.01107184, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.05224895, + "balance_loss_mlp": 1.02537298, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 1.866682844487848, + "language_loss": 0.78531969, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.80680794, + "num_input_tokens_seen": 131895850, + "step": 6137, + "time_per_iteration": 3.9055001735687256 + }, + { + "auxiliary_loss_clip": 0.01122333, + "auxiliary_loss_mlp": 0.01043156, + "balance_loss_clip": 1.04824209, + "balance_loss_mlp": 1.02701473, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 1.9926850522133692, + "language_loss": 0.74251115, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.764166, + "num_input_tokens_seen": 131915775, + "step": 6138, + "time_per_iteration": 2.524902582168579 + }, + { + "auxiliary_loss_clip": 0.01095208, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.0476284, + "balance_loss_mlp": 1.02580786, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 1.7278775363486276, + "language_loss": 0.65140885, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.67278349, + "num_input_tokens_seen": 131935715, + "step": 6139, + "time_per_iteration": 4.0203869342803955 + }, + { + "auxiliary_loss_clip": 0.01097024, + "auxiliary_loss_mlp": 0.01042575, + "balance_loss_clip": 1.05215979, + "balance_loss_mlp": 1.02669644, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 1.9956853177993548, + "language_loss": 0.71371067, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73510671, + "num_input_tokens_seen": 131954120, + "step": 6140, + "time_per_iteration": 3.9769110679626465 + }, + { + "auxiliary_loss_clip": 0.010275, + "auxiliary_loss_mlp": 0.01001973, + "balance_loss_clip": 1.04225302, + "balance_loss_mlp": 1.00016141, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7505637959514583, + "language_loss": 0.59337437, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61366916, + "num_input_tokens_seen": 132017485, + "step": 6141, + "time_per_iteration": 3.2396280765533447 + }, + { + "auxiliary_loss_clip": 0.01118507, + "auxiliary_loss_mlp": 0.01040653, + "balance_loss_clip": 1.049106, + "balance_loss_mlp": 1.02491713, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 1.6767321890717222, + "language_loss": 0.75071031, + "learning_rate": 2.909212678216192e-06, + "loss": 0.77230191, + "num_input_tokens_seen": 132036760, + "step": 6142, + "time_per_iteration": 2.5542895793914795 + }, + { + "auxiliary_loss_clip": 0.01120082, + "auxiliary_loss_mlp": 0.01037953, + "balance_loss_clip": 1.04938674, + "balance_loss_mlp": 1.02363634, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 1.8153055345014977, + "language_loss": 0.77130187, + "learning_rate": 2.908865770392555e-06, + "loss": 0.7928822, + "num_input_tokens_seen": 132056935, + "step": 6143, + "time_per_iteration": 3.9607644081115723 + }, + { + "auxiliary_loss_clip": 0.01117293, + "auxiliary_loss_mlp": 0.01036106, + "balance_loss_clip": 1.04770017, + "balance_loss_mlp": 1.02210498, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 1.4145135447068977, + "language_loss": 0.81957954, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.84111357, + "num_input_tokens_seen": 132077285, + "step": 6144, + "time_per_iteration": 2.5141496658325195 + }, + { + "auxiliary_loss_clip": 0.01120902, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.04731321, + "balance_loss_mlp": 1.02482557, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 2.3668715566298424, + "language_loss": 0.78081441, + "learning_rate": 2.908171851365593e-06, + "loss": 0.80242002, + "num_input_tokens_seen": 132095520, + "step": 6145, + "time_per_iteration": 2.522660970687866 + }, + { + "auxiliary_loss_clip": 0.01113343, + "auxiliary_loss_mlp": 0.01032684, + "balance_loss_clip": 1.04715729, + "balance_loss_mlp": 1.01737714, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 1.6528734925719124, + "language_loss": 0.76943612, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79089642, + "num_input_tokens_seen": 132112810, + "step": 6146, + "time_per_iteration": 2.481940269470215 + }, + { + "auxiliary_loss_clip": 0.01105181, + "auxiliary_loss_mlp": 0.01040696, + "balance_loss_clip": 1.04799008, + "balance_loss_mlp": 1.02480531, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 4.398420387454917, + "language_loss": 0.80298954, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82444835, + "num_input_tokens_seen": 132131615, + "step": 6147, + "time_per_iteration": 2.5308539867401123 + }, + { + "auxiliary_loss_clip": 0.01098476, + "auxiliary_loss_mlp": 0.00821718, + "balance_loss_clip": 1.04763579, + "balance_loss_mlp": 1.07257986, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.723329282176361, + "language_loss": 0.8359195, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85512143, + "num_input_tokens_seen": 132149585, + "step": 6148, + "time_per_iteration": 2.5553088188171387 + }, + { + "auxiliary_loss_clip": 0.01119, + "auxiliary_loss_mlp": 0.01038145, + "balance_loss_clip": 1.04968667, + "balance_loss_mlp": 1.02298117, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.288795123647486, + "language_loss": 0.7458598, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76743126, + "num_input_tokens_seen": 132165555, + "step": 6149, + "time_per_iteration": 2.531938314437866 + }, + { + "auxiliary_loss_clip": 0.01132322, + "auxiliary_loss_mlp": 0.0103992, + "balance_loss_clip": 1.04774225, + "balance_loss_mlp": 1.02388644, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 2.383777960098795, + "language_loss": 0.71126986, + "learning_rate": 2.906436451364054e-06, + "loss": 0.73299229, + "num_input_tokens_seen": 132185100, + "step": 6150, + "time_per_iteration": 2.524066925048828 + }, + { + "auxiliary_loss_clip": 0.01106454, + "auxiliary_loss_mlp": 0.01040443, + "balance_loss_clip": 1.04753923, + "balance_loss_mlp": 1.02522588, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 1.8117307459688528, + "language_loss": 0.81985712, + "learning_rate": 2.906089268194611e-06, + "loss": 0.84132612, + "num_input_tokens_seen": 132203930, + "step": 6151, + "time_per_iteration": 2.538400888442993 + }, + { + "auxiliary_loss_clip": 0.01029227, + "auxiliary_loss_mlp": 0.01005271, + "balance_loss_clip": 1.02088761, + "balance_loss_mlp": 1.00335181, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.8844077944102461, + "language_loss": 0.63206869, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65241373, + "num_input_tokens_seen": 132263845, + "step": 6152, + "time_per_iteration": 3.2307043075561523 + }, + { + "auxiliary_loss_clip": 0.01084254, + "auxiliary_loss_mlp": 0.0104285, + "balance_loss_clip": 1.04750729, + "balance_loss_mlp": 1.02732944, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 2.015452372122809, + "language_loss": 0.69997603, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72124714, + "num_input_tokens_seen": 132282350, + "step": 6153, + "time_per_iteration": 2.6117208003997803 + }, + { + "auxiliary_loss_clip": 0.01119298, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.04894376, + "balance_loss_mlp": 1.02450967, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 1.7438400191123264, + "language_loss": 0.72407353, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74566662, + "num_input_tokens_seen": 132301930, + "step": 6154, + "time_per_iteration": 2.5523509979248047 + }, + { + "auxiliary_loss_clip": 0.01103094, + "auxiliary_loss_mlp": 0.0103273, + "balance_loss_clip": 1.04836714, + "balance_loss_mlp": 1.01861608, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.6464314973772576, + "language_loss": 0.6855045, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70686269, + "num_input_tokens_seen": 132320915, + "step": 6155, + "time_per_iteration": 2.5585644245147705 + }, + { + "auxiliary_loss_clip": 0.01118698, + "auxiliary_loss_mlp": 0.01031832, + "balance_loss_clip": 1.04801917, + "balance_loss_mlp": 1.01708579, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.6708482155136135, + "language_loss": 0.6744163, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.6959216, + "num_input_tokens_seen": 132340415, + "step": 6156, + "time_per_iteration": 2.5040578842163086 + }, + { + "auxiliary_loss_clip": 0.01110524, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.04700351, + "balance_loss_mlp": 1.02421165, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.9362698794781563, + "language_loss": 0.81848252, + "learning_rate": 2.904005448099916e-06, + "loss": 0.83996809, + "num_input_tokens_seen": 132358600, + "step": 6157, + "time_per_iteration": 2.517601728439331 + }, + { + "auxiliary_loss_clip": 0.01086716, + "auxiliary_loss_mlp": 0.01036938, + "balance_loss_clip": 1.0485661, + "balance_loss_mlp": 1.02059424, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.626825492492475, + "language_loss": 0.76345623, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.7846927, + "num_input_tokens_seen": 132373160, + "step": 6158, + "time_per_iteration": 2.58760142326355 + }, + { + "auxiliary_loss_clip": 0.01131595, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.04755771, + "balance_loss_mlp": 1.0198344, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.2943862564096245, + "language_loss": 0.68775821, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.7094276, + "num_input_tokens_seen": 132392345, + "step": 6159, + "time_per_iteration": 2.468376874923706 + }, + { + "auxiliary_loss_clip": 0.01102178, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.0471673, + "balance_loss_mlp": 1.02436399, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 1.6980922245244061, + "language_loss": 0.70943904, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73084223, + "num_input_tokens_seen": 132412620, + "step": 6160, + "time_per_iteration": 2.5875155925750732 + }, + { + "auxiliary_loss_clip": 0.01102664, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.04775286, + "balance_loss_mlp": 1.01875925, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.722193149307762, + "language_loss": 0.79082012, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81216693, + "num_input_tokens_seen": 132431570, + "step": 6161, + "time_per_iteration": 2.534054756164551 + }, + { + "auxiliary_loss_clip": 0.01130329, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.04814982, + "balance_loss_mlp": 1.02342939, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.85413278882439, + "language_loss": 0.7959826, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81767702, + "num_input_tokens_seen": 132451525, + "step": 6162, + "time_per_iteration": 2.5589780807495117 + }, + { + "auxiliary_loss_clip": 0.01106328, + "auxiliary_loss_mlp": 0.00794973, + "balance_loss_clip": 1.04814398, + "balance_loss_mlp": 1.01938772, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 1.893293286849104, + "language_loss": 0.79322976, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81224275, + "num_input_tokens_seen": 132469875, + "step": 6163, + "time_per_iteration": 2.5130016803741455 + }, + { + "auxiliary_loss_clip": 0.01116236, + "auxiliary_loss_mlp": 0.01037411, + "balance_loss_clip": 1.04762995, + "balance_loss_mlp": 1.02236688, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.5468667580093949, + "language_loss": 0.67891955, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70045602, + "num_input_tokens_seen": 132488360, + "step": 6164, + "time_per_iteration": 2.500903367996216 + }, + { + "auxiliary_loss_clip": 0.01104184, + "auxiliary_loss_mlp": 0.01042532, + "balance_loss_clip": 1.04926932, + "balance_loss_mlp": 1.02642679, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 3.0280898960088987, + "language_loss": 0.83321536, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.85468256, + "num_input_tokens_seen": 132508630, + "step": 6165, + "time_per_iteration": 2.5711727142333984 + }, + { + "auxiliary_loss_clip": 0.01118657, + "auxiliary_loss_mlp": 0.01040988, + "balance_loss_clip": 1.05085874, + "balance_loss_mlp": 1.02406061, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 5.611673791159961, + "language_loss": 0.69024491, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71184134, + "num_input_tokens_seen": 132527465, + "step": 6166, + "time_per_iteration": 2.5433568954467773 + }, + { + "auxiliary_loss_clip": 0.01018998, + "auxiliary_loss_mlp": 0.0100493, + "balance_loss_clip": 1.02217937, + "balance_loss_mlp": 1.00287914, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.7899809670082475, + "language_loss": 0.56883633, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.58907557, + "num_input_tokens_seen": 132579940, + "step": 6167, + "time_per_iteration": 3.0093185901641846 + }, + { + "auxiliary_loss_clip": 0.01104424, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.04536772, + "balance_loss_mlp": 1.02047586, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 1.9196697109395273, + "language_loss": 0.75189364, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77328569, + "num_input_tokens_seen": 132598390, + "step": 6168, + "time_per_iteration": 2.5176985263824463 + }, + { + "auxiliary_loss_clip": 0.01114721, + "auxiliary_loss_mlp": 0.00794314, + "balance_loss_clip": 1.04882908, + "balance_loss_mlp": 1.01863873, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 1.6322026154487526, + "language_loss": 0.7383889, + "learning_rate": 2.899834108519755e-06, + "loss": 0.75747931, + "num_input_tokens_seen": 132616920, + "step": 6169, + "time_per_iteration": 2.5108673572540283 + }, + { + "auxiliary_loss_clip": 0.01128284, + "auxiliary_loss_mlp": 0.01032862, + "balance_loss_clip": 1.04925108, + "balance_loss_mlp": 1.01850915, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.4526614842355259, + "language_loss": 0.79303074, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81464225, + "num_input_tokens_seen": 132637660, + "step": 6170, + "time_per_iteration": 2.5169289112091064 + }, + { + "auxiliary_loss_clip": 0.01117133, + "auxiliary_loss_mlp": 0.01037522, + "balance_loss_clip": 1.05064321, + "balance_loss_mlp": 1.02219725, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.7631131951199839, + "language_loss": 0.76149201, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78303856, + "num_input_tokens_seen": 132657635, + "step": 6171, + "time_per_iteration": 2.530855894088745 + }, + { + "auxiliary_loss_clip": 0.01107081, + "auxiliary_loss_mlp": 0.01034251, + "balance_loss_clip": 1.05115438, + "balance_loss_mlp": 1.01840818, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 1.924524113688885, + "language_loss": 0.80882895, + "learning_rate": 2.898790504994232e-06, + "loss": 0.83024222, + "num_input_tokens_seen": 132674455, + "step": 6172, + "time_per_iteration": 2.507513999938965 + }, + { + "auxiliary_loss_clip": 0.01122076, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.04902732, + "balance_loss_mlp": 1.01852608, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 1.7862264679881996, + "language_loss": 0.59549958, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61706555, + "num_input_tokens_seen": 132695140, + "step": 6173, + "time_per_iteration": 2.606901168823242 + }, + { + "auxiliary_loss_clip": 0.01108468, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.0470264, + "balance_loss_mlp": 1.01905894, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 1.9394642288181363, + "language_loss": 0.80719721, + "learning_rate": 2.898094598877435e-06, + "loss": 0.82863057, + "num_input_tokens_seen": 132712470, + "step": 6174, + "time_per_iteration": 2.5172033309936523 + }, + { + "auxiliary_loss_clip": 0.01126777, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.04781544, + "balance_loss_mlp": 1.020962, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.9898520766266954, + "language_loss": 0.79815769, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.81977487, + "num_input_tokens_seen": 132732945, + "step": 6175, + "time_per_iteration": 3.9395790100097656 + }, + { + "auxiliary_loss_clip": 0.01120831, + "auxiliary_loss_mlp": 0.01043015, + "balance_loss_clip": 1.05058062, + "balance_loss_mlp": 1.02743483, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 1.8075899138327054, + "language_loss": 0.88741648, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90905499, + "num_input_tokens_seen": 132752470, + "step": 6176, + "time_per_iteration": 2.530491828918457 + }, + { + "auxiliary_loss_clip": 0.01120011, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.04930925, + "balance_loss_mlp": 1.02192879, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.5627903295698438, + "language_loss": 0.73341531, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75498319, + "num_input_tokens_seen": 132771485, + "step": 6177, + "time_per_iteration": 2.5354979038238525 + }, + { + "auxiliary_loss_clip": 0.01096408, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.04486036, + "balance_loss_mlp": 1.02364123, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.697243388797151, + "language_loss": 0.75110203, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77245516, + "num_input_tokens_seen": 132791465, + "step": 6178, + "time_per_iteration": 5.406606197357178 + }, + { + "auxiliary_loss_clip": 0.01070445, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.04715729, + "balance_loss_mlp": 1.02678812, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 1.754339233952254, + "language_loss": 0.71785581, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.73900151, + "num_input_tokens_seen": 132810160, + "step": 6179, + "time_per_iteration": 2.6201400756835938 + }, + { + "auxiliary_loss_clip": 0.01132431, + "auxiliary_loss_mlp": 0.01041196, + "balance_loss_clip": 1.0485146, + "balance_loss_mlp": 1.02468598, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 2.7936122887318477, + "language_loss": 0.69970518, + "learning_rate": 2.896006063609283e-06, + "loss": 0.72144145, + "num_input_tokens_seen": 132831265, + "step": 6180, + "time_per_iteration": 2.553495168685913 + }, + { + "auxiliary_loss_clip": 0.01111115, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.04866219, + "balance_loss_mlp": 1.01780581, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.7078475633671397, + "language_loss": 0.7835632, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.80499887, + "num_input_tokens_seen": 132850005, + "step": 6181, + "time_per_iteration": 3.942554473876953 + }, + { + "auxiliary_loss_clip": 0.01120653, + "auxiliary_loss_mlp": 0.01035438, + "balance_loss_clip": 1.05105305, + "balance_loss_mlp": 1.0192256, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 1.7528406209844924, + "language_loss": 0.78256285, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.8041237, + "num_input_tokens_seen": 132865790, + "step": 6182, + "time_per_iteration": 2.506995916366577 + }, + { + "auxiliary_loss_clip": 0.01030202, + "auxiliary_loss_mlp": 0.01004693, + "balance_loss_clip": 1.02361858, + "balance_loss_mlp": 1.00289273, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7850100513570387, + "language_loss": 0.57531583, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59566474, + "num_input_tokens_seen": 132921775, + "step": 6183, + "time_per_iteration": 3.1307172775268555 + }, + { + "auxiliary_loss_clip": 0.01123829, + "auxiliary_loss_mlp": 0.00798885, + "balance_loss_clip": 1.04703832, + "balance_loss_mlp": 1.02290249, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 1.987292468516446, + "language_loss": 0.77030551, + "learning_rate": 2.894613027055066e-06, + "loss": 0.78953254, + "num_input_tokens_seen": 132941060, + "step": 6184, + "time_per_iteration": 2.5137853622436523 + }, + { + "auxiliary_loss_clip": 0.01090764, + "auxiliary_loss_mlp": 0.01039898, + "balance_loss_clip": 1.04840124, + "balance_loss_mlp": 1.02487206, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 2.4799593617328886, + "language_loss": 0.72667837, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74798489, + "num_input_tokens_seen": 132961850, + "step": 6185, + "time_per_iteration": 2.575252056121826 + }, + { + "auxiliary_loss_clip": 0.0108209, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.04630315, + "balance_loss_mlp": 1.01918745, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 1.4290151089854959, + "language_loss": 0.76732284, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.78849465, + "num_input_tokens_seen": 132981625, + "step": 6186, + "time_per_iteration": 2.579238176345825 + }, + { + "auxiliary_loss_clip": 0.01129203, + "auxiliary_loss_mlp": 0.01038104, + "balance_loss_clip": 1.05290973, + "balance_loss_mlp": 1.02204633, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.7477246713944279, + "language_loss": 0.834867, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85654014, + "num_input_tokens_seen": 133001225, + "step": 6187, + "time_per_iteration": 2.5495200157165527 + }, + { + "auxiliary_loss_clip": 0.01118624, + "auxiliary_loss_mlp": 0.0103659, + "balance_loss_clip": 1.0459888, + "balance_loss_mlp": 1.02133131, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 1.6406345570332361, + "language_loss": 0.84661615, + "learning_rate": 2.893219447719824e-06, + "loss": 0.86816829, + "num_input_tokens_seen": 133018820, + "step": 6188, + "time_per_iteration": 2.4949963092803955 + }, + { + "auxiliary_loss_clip": 0.01107787, + "auxiliary_loss_mlp": 0.01036575, + "balance_loss_clip": 1.05156064, + "balance_loss_mlp": 1.02068424, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 1.6342903083261189, + "language_loss": 0.65319955, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67464316, + "num_input_tokens_seen": 133040205, + "step": 6189, + "time_per_iteration": 2.570903778076172 + }, + { + "auxiliary_loss_clip": 0.01108993, + "auxiliary_loss_mlp": 0.01043831, + "balance_loss_clip": 1.0460602, + "balance_loss_mlp": 1.02722478, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 1.6852395240959965, + "language_loss": 0.84431481, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.865843, + "num_input_tokens_seen": 133058095, + "step": 6190, + "time_per_iteration": 2.478821277618408 + }, + { + "auxiliary_loss_clip": 0.01105737, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.04785359, + "balance_loss_mlp": 1.02219498, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 4.988112172548138, + "language_loss": 0.88325226, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90468979, + "num_input_tokens_seen": 133071530, + "step": 6191, + "time_per_iteration": 2.4879090785980225 + }, + { + "auxiliary_loss_clip": 0.01087962, + "auxiliary_loss_mlp": 0.01040563, + "balance_loss_clip": 1.04809713, + "balance_loss_mlp": 1.0217514, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.8694452632865397, + "language_loss": 0.73956245, + "learning_rate": 2.891825326449073e-06, + "loss": 0.76084769, + "num_input_tokens_seen": 133091410, + "step": 6192, + "time_per_iteration": 2.620514154434204 + }, + { + "auxiliary_loss_clip": 0.01131628, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.04907668, + "balance_loss_mlp": 1.0215435, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 1.9808759938291214, + "language_loss": 0.79743671, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.81911731, + "num_input_tokens_seen": 133110365, + "step": 6193, + "time_per_iteration": 2.5447306632995605 + }, + { + "auxiliary_loss_clip": 0.01103783, + "auxiliary_loss_mlp": 0.01037544, + "balance_loss_clip": 1.05198598, + "balance_loss_mlp": 1.02214253, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 1.7724769803597884, + "language_loss": 0.84527779, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86669111, + "num_input_tokens_seen": 133128255, + "step": 6194, + "time_per_iteration": 2.5395092964172363 + }, + { + "auxiliary_loss_clip": 0.01108159, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.04802561, + "balance_loss_mlp": 1.02194512, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.442968745001766, + "language_loss": 0.76835519, + "learning_rate": 2.890779380359646e-06, + "loss": 0.78980434, + "num_input_tokens_seen": 133143975, + "step": 6195, + "time_per_iteration": 2.5289199352264404 + }, + { + "auxiliary_loss_clip": 0.01111618, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.05036402, + "balance_loss_mlp": 1.02086723, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.4854709748001063, + "language_loss": 0.79145384, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81293356, + "num_input_tokens_seen": 133162935, + "step": 6196, + "time_per_iteration": 2.545846939086914 + }, + { + "auxiliary_loss_clip": 0.01122084, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.05172873, + "balance_loss_mlp": 1.0222795, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 2.100293677098559, + "language_loss": 0.83406287, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85565114, + "num_input_tokens_seen": 133181180, + "step": 6197, + "time_per_iteration": 2.4989984035491943 + }, + { + "auxiliary_loss_clip": 0.01129313, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.04827738, + "balance_loss_mlp": 1.02229226, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 1.5257480938126535, + "language_loss": 0.64560902, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66728663, + "num_input_tokens_seen": 133199615, + "step": 6198, + "time_per_iteration": 2.4931671619415283 + }, + { + "auxiliary_loss_clip": 0.01118013, + "auxiliary_loss_mlp": 0.01046048, + "balance_loss_clip": 1.04768467, + "balance_loss_mlp": 1.03093278, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 1.4571669166566112, + "language_loss": 0.73966074, + "learning_rate": 2.889384312737261e-06, + "loss": 0.7613014, + "num_input_tokens_seen": 133219650, + "step": 6199, + "time_per_iteration": 2.520815134048462 + }, + { + "auxiliary_loss_clip": 0.01103371, + "auxiliary_loss_mlp": 0.0103755, + "balance_loss_clip": 1.04834831, + "balance_loss_mlp": 1.02288747, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 1.9782290599250099, + "language_loss": 0.80747348, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82888269, + "num_input_tokens_seen": 133245675, + "step": 6200, + "time_per_iteration": 2.950160503387451 + }, + { + "auxiliary_loss_clip": 0.01095082, + "auxiliary_loss_mlp": 0.01044903, + "balance_loss_clip": 1.05044675, + "balance_loss_mlp": 1.02937007, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 1.8306444564735191, + "language_loss": 0.60503137, + "learning_rate": 2.88868657651991e-06, + "loss": 0.62643123, + "num_input_tokens_seen": 133266905, + "step": 6201, + "time_per_iteration": 2.7049427032470703 + }, + { + "auxiliary_loss_clip": 0.01120597, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.04923463, + "balance_loss_mlp": 1.02306104, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 2.0117258524656547, + "language_loss": 0.731058, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75265127, + "num_input_tokens_seen": 133286865, + "step": 6202, + "time_per_iteration": 2.5042881965637207 + }, + { + "auxiliary_loss_clip": 0.01107577, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.04776192, + "balance_loss_mlp": 1.02189493, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 65.31828046783563, + "language_loss": 0.73965305, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76110429, + "num_input_tokens_seen": 133305295, + "step": 6203, + "time_per_iteration": 2.5027801990509033 + }, + { + "auxiliary_loss_clip": 0.01103091, + "auxiliary_loss_mlp": 0.01036887, + "balance_loss_clip": 1.04606581, + "balance_loss_mlp": 1.0236547, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.7160359928723927, + "language_loss": 0.81656098, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83796072, + "num_input_tokens_seen": 133324625, + "step": 6204, + "time_per_iteration": 2.544421911239624 + }, + { + "auxiliary_loss_clip": 0.01119054, + "auxiliary_loss_mlp": 0.01044697, + "balance_loss_clip": 1.04687309, + "balance_loss_mlp": 1.02857959, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.6084665046265005, + "language_loss": 0.75239861, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77403617, + "num_input_tokens_seen": 133344625, + "step": 6205, + "time_per_iteration": 2.520479440689087 + }, + { + "auxiliary_loss_clip": 0.0111358, + "auxiliary_loss_mlp": 0.01038897, + "balance_loss_clip": 1.04433298, + "balance_loss_mlp": 1.02294719, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.8812788401078289, + "language_loss": 0.7762931, + "learning_rate": 2.886941646474128e-06, + "loss": 0.79781789, + "num_input_tokens_seen": 133363605, + "step": 6206, + "time_per_iteration": 2.489677667617798 + }, + { + "auxiliary_loss_clip": 0.01129714, + "auxiliary_loss_mlp": 0.01041245, + "balance_loss_clip": 1.04675686, + "balance_loss_mlp": 1.02500844, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.054049465341689, + "language_loss": 0.933608, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95531756, + "num_input_tokens_seen": 133379405, + "step": 6207, + "time_per_iteration": 2.445352077484131 + }, + { + "auxiliary_loss_clip": 0.01104449, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.04582262, + "balance_loss_mlp": 1.01866043, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 2.142712149543232, + "language_loss": 0.82495034, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84632277, + "num_input_tokens_seen": 133397585, + "step": 6208, + "time_per_iteration": 2.5405585765838623 + }, + { + "auxiliary_loss_clip": 0.01117668, + "auxiliary_loss_mlp": 0.01040207, + "balance_loss_clip": 1.04415798, + "balance_loss_mlp": 1.02254033, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 1.7871021478688158, + "language_loss": 0.73395199, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75553071, + "num_input_tokens_seen": 133415365, + "step": 6209, + "time_per_iteration": 2.486462116241455 + }, + { + "auxiliary_loss_clip": 0.01091198, + "auxiliary_loss_mlp": 0.01040027, + "balance_loss_clip": 1.04810858, + "balance_loss_mlp": 1.02345657, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.5434100125732917, + "language_loss": 0.70263684, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72394907, + "num_input_tokens_seen": 133435700, + "step": 6210, + "time_per_iteration": 2.5888261795043945 + }, + { + "auxiliary_loss_clip": 0.01076959, + "auxiliary_loss_mlp": 0.01043016, + "balance_loss_clip": 1.04199803, + "balance_loss_mlp": 1.02443099, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.8465963146904167, + "language_loss": 0.7786001, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.79979992, + "num_input_tokens_seen": 133455180, + "step": 6211, + "time_per_iteration": 2.570037364959717 + }, + { + "auxiliary_loss_clip": 0.01119817, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.04642391, + "balance_loss_mlp": 1.02129149, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 2.9647646385718556, + "language_loss": 0.72990268, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75146675, + "num_input_tokens_seen": 133476715, + "step": 6212, + "time_per_iteration": 2.6347734928131104 + }, + { + "auxiliary_loss_clip": 0.01123409, + "auxiliary_loss_mlp": 0.01046533, + "balance_loss_clip": 1.05011392, + "balance_loss_mlp": 1.0297718, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 2.045015955024548, + "language_loss": 0.82440293, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84610236, + "num_input_tokens_seen": 133494550, + "step": 6213, + "time_per_iteration": 2.4902937412261963 + }, + { + "auxiliary_loss_clip": 0.01089828, + "auxiliary_loss_mlp": 0.01049073, + "balance_loss_clip": 1.04805899, + "balance_loss_mlp": 1.03126311, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.0570717061013024, + "language_loss": 0.78883684, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81022584, + "num_input_tokens_seen": 133512640, + "step": 6214, + "time_per_iteration": 3.9945590496063232 + }, + { + "auxiliary_loss_clip": 0.01106074, + "auxiliary_loss_mlp": 0.0104574, + "balance_loss_clip": 1.04477286, + "balance_loss_mlp": 1.03009999, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.6818093051978518, + "language_loss": 0.84807032, + "learning_rate": 2.883798654630296e-06, + "loss": 0.86958849, + "num_input_tokens_seen": 133535540, + "step": 6215, + "time_per_iteration": 2.683483600616455 + }, + { + "auxiliary_loss_clip": 0.01098076, + "auxiliary_loss_mlp": 0.01038863, + "balance_loss_clip": 1.04696012, + "balance_loss_mlp": 1.02167273, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 1.5715619083787566, + "language_loss": 0.68114662, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.70251596, + "num_input_tokens_seen": 133555795, + "step": 6216, + "time_per_iteration": 2.5813605785369873 + }, + { + "auxiliary_loss_clip": 0.0111467, + "auxiliary_loss_mlp": 0.01044231, + "balance_loss_clip": 1.04746246, + "balance_loss_mlp": 1.02737439, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 3.069618728397449, + "language_loss": 0.66158378, + "learning_rate": 2.883099843007303e-06, + "loss": 0.68317282, + "num_input_tokens_seen": 133575905, + "step": 6217, + "time_per_iteration": 5.353607892990112 + }, + { + "auxiliary_loss_clip": 0.01108809, + "auxiliary_loss_mlp": 0.01041678, + "balance_loss_clip": 1.04504848, + "balance_loss_mlp": 1.02515006, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 1.9000723610627124, + "language_loss": 0.80299687, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82450175, + "num_input_tokens_seen": 133592585, + "step": 6218, + "time_per_iteration": 2.5368142127990723 + }, + { + "auxiliary_loss_clip": 0.01115797, + "auxiliary_loss_mlp": 0.0103892, + "balance_loss_clip": 1.04780459, + "balance_loss_mlp": 1.02339911, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.363463130915226, + "language_loss": 0.78892559, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.81047279, + "num_input_tokens_seen": 133615070, + "step": 6219, + "time_per_iteration": 2.5432002544403076 + }, + { + "auxiliary_loss_clip": 0.01106285, + "auxiliary_loss_mlp": 0.01040331, + "balance_loss_clip": 1.04747176, + "balance_loss_mlp": 1.02401137, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 1.6069880871280953, + "language_loss": 0.76771057, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.78917676, + "num_input_tokens_seen": 133633490, + "step": 6220, + "time_per_iteration": 3.923288583755493 + }, + { + "auxiliary_loss_clip": 0.01099475, + "auxiliary_loss_mlp": 0.01042083, + "balance_loss_clip": 1.04518533, + "balance_loss_mlp": 1.02594173, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.626671846030633, + "language_loss": 0.8297711, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85118669, + "num_input_tokens_seen": 133653425, + "step": 6221, + "time_per_iteration": 2.5538079738616943 + }, + { + "auxiliary_loss_clip": 0.01105206, + "auxiliary_loss_mlp": 0.01042685, + "balance_loss_clip": 1.04576468, + "balance_loss_mlp": 1.02659166, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.6678666592752713, + "language_loss": 0.76087523, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.78235412, + "num_input_tokens_seen": 133670220, + "step": 6222, + "time_per_iteration": 2.4975829124450684 + }, + { + "auxiliary_loss_clip": 0.01098838, + "auxiliary_loss_mlp": 0.00796666, + "balance_loss_clip": 1.04681945, + "balance_loss_mlp": 1.01915288, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.772820108538877, + "language_loss": 0.70469058, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72364557, + "num_input_tokens_seen": 133688910, + "step": 6223, + "time_per_iteration": 2.565131902694702 + }, + { + "auxiliary_loss_clip": 0.01100623, + "auxiliary_loss_mlp": 0.01038299, + "balance_loss_clip": 1.05081987, + "balance_loss_mlp": 1.02295661, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 1.7453910489649678, + "language_loss": 0.68595231, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.70734155, + "num_input_tokens_seen": 133708690, + "step": 6224, + "time_per_iteration": 2.6873717308044434 + }, + { + "auxiliary_loss_clip": 0.01087909, + "auxiliary_loss_mlp": 0.01040287, + "balance_loss_clip": 1.05065513, + "balance_loss_mlp": 1.02411032, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.9574342831272435, + "language_loss": 0.70090348, + "learning_rate": 2.880303258086228e-06, + "loss": 0.72218549, + "num_input_tokens_seen": 133728095, + "step": 6225, + "time_per_iteration": 2.600332736968994 + }, + { + "auxiliary_loss_clip": 0.01089229, + "auxiliary_loss_mlp": 0.01044308, + "balance_loss_clip": 1.05053711, + "balance_loss_mlp": 1.02728522, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 2.014343073937966, + "language_loss": 0.79092574, + "learning_rate": 2.879953534616536e-06, + "loss": 0.8122611, + "num_input_tokens_seen": 133745590, + "step": 6226, + "time_per_iteration": 2.6541037559509277 + }, + { + "auxiliary_loss_clip": 0.01102335, + "auxiliary_loss_mlp": 0.01039273, + "balance_loss_clip": 1.04608703, + "balance_loss_mlp": 1.02276278, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 1.7546461621204505, + "language_loss": 0.67802161, + "learning_rate": 2.879603777778917e-06, + "loss": 0.69943774, + "num_input_tokens_seen": 133766155, + "step": 6227, + "time_per_iteration": 2.5752339363098145 + }, + { + "auxiliary_loss_clip": 0.01094612, + "auxiliary_loss_mlp": 0.01037112, + "balance_loss_clip": 1.04520726, + "balance_loss_mlp": 1.02154374, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.7920884735946696, + "language_loss": 0.82864285, + "learning_rate": 2.879253987586635e-06, + "loss": 0.84996015, + "num_input_tokens_seen": 133783185, + "step": 6228, + "time_per_iteration": 2.5617477893829346 + }, + { + "auxiliary_loss_clip": 0.01086653, + "auxiliary_loss_mlp": 0.01043546, + "balance_loss_clip": 1.0460999, + "balance_loss_mlp": 1.02516413, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.583497750276829, + "language_loss": 0.7457462, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76704818, + "num_input_tokens_seen": 133800975, + "step": 6229, + "time_per_iteration": 2.5277609825134277 + }, + { + "auxiliary_loss_clip": 0.01091993, + "auxiliary_loss_mlp": 0.01035731, + "balance_loss_clip": 1.04557741, + "balance_loss_mlp": 1.0190711, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 1.766833369613947, + "language_loss": 0.83750767, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85878491, + "num_input_tokens_seen": 133818020, + "step": 6230, + "time_per_iteration": 2.5546700954437256 + }, + { + "auxiliary_loss_clip": 0.01119678, + "auxiliary_loss_mlp": 0.01039324, + "balance_loss_clip": 1.04836047, + "balance_loss_mlp": 1.02308178, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 1.8295478060335288, + "language_loss": 0.73523349, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75682348, + "num_input_tokens_seen": 133840690, + "step": 6231, + "time_per_iteration": 2.569983959197998 + }, + { + "auxiliary_loss_clip": 0.01117791, + "auxiliary_loss_mlp": 0.0104465, + "balance_loss_clip": 1.04973459, + "balance_loss_mlp": 1.02810335, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 1.9600159468004925, + "language_loss": 0.73595011, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75757444, + "num_input_tokens_seen": 133858350, + "step": 6232, + "time_per_iteration": 2.4946014881134033 + }, + { + "auxiliary_loss_clip": 0.01104005, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.04822969, + "balance_loss_mlp": 1.01997972, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.5906641874612768, + "language_loss": 0.76665324, + "learning_rate": 2.877504536769561e-06, + "loss": 0.78805548, + "num_input_tokens_seen": 133879775, + "step": 6233, + "time_per_iteration": 2.5760295391082764 + }, + { + "auxiliary_loss_clip": 0.01112101, + "auxiliary_loss_mlp": 0.01039156, + "balance_loss_clip": 1.04833937, + "balance_loss_mlp": 1.02334929, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.702172753597285, + "language_loss": 0.6955781, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71709067, + "num_input_tokens_seen": 133898295, + "step": 6234, + "time_per_iteration": 2.503519296646118 + }, + { + "auxiliary_loss_clip": 0.01118139, + "auxiliary_loss_mlp": 0.01041922, + "balance_loss_clip": 1.04639304, + "balance_loss_mlp": 1.02698493, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 2.0696472573382305, + "language_loss": 0.82310772, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84470832, + "num_input_tokens_seen": 133915230, + "step": 6235, + "time_per_iteration": 2.476473331451416 + }, + { + "auxiliary_loss_clip": 0.01135616, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.05104566, + "balance_loss_mlp": 1.01991546, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.8616623950776758, + "language_loss": 0.77790743, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.79961216, + "num_input_tokens_seen": 133934110, + "step": 6236, + "time_per_iteration": 2.4650447368621826 + }, + { + "auxiliary_loss_clip": 0.0111614, + "auxiliary_loss_mlp": 0.01051829, + "balance_loss_clip": 1.04723728, + "balance_loss_mlp": 1.03364944, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 1.9064991559087816, + "language_loss": 0.73634315, + "learning_rate": 2.876104377085234e-06, + "loss": 0.75802284, + "num_input_tokens_seen": 133952395, + "step": 6237, + "time_per_iteration": 2.4832520484924316 + }, + { + "auxiliary_loss_clip": 0.01116667, + "auxiliary_loss_mlp": 0.00793317, + "balance_loss_clip": 1.04690838, + "balance_loss_mlp": 1.01313376, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 2.3171794261531415, + "language_loss": 0.92827779, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.94737768, + "num_input_tokens_seen": 133969635, + "step": 6238, + "time_per_iteration": 2.6572070121765137 + }, + { + "auxiliary_loss_clip": 0.01134451, + "auxiliary_loss_mlp": 0.01037387, + "balance_loss_clip": 1.05003262, + "balance_loss_mlp": 1.02093649, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 1.8686878963825586, + "language_loss": 0.70789278, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.72961116, + "num_input_tokens_seen": 133987215, + "step": 6239, + "time_per_iteration": 2.440011978149414 + }, + { + "auxiliary_loss_clip": 0.01066007, + "auxiliary_loss_mlp": 0.01033924, + "balance_loss_clip": 1.04909396, + "balance_loss_mlp": 1.01823628, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 1.5485465262125777, + "language_loss": 0.65509343, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67609268, + "num_input_tokens_seen": 134009250, + "step": 6240, + "time_per_iteration": 2.7622854709625244 + }, + { + "auxiliary_loss_clip": 0.01100543, + "auxiliary_loss_mlp": 0.00791571, + "balance_loss_clip": 1.0493803, + "balance_loss_mlp": 1.01212049, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 1.6952200620092974, + "language_loss": 0.76041299, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.77933413, + "num_input_tokens_seen": 134026875, + "step": 6241, + "time_per_iteration": 2.5395147800445557 + }, + { + "auxiliary_loss_clip": 0.01103691, + "auxiliary_loss_mlp": 0.01043399, + "balance_loss_clip": 1.04971802, + "balance_loss_mlp": 1.02612603, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 1.9018981925988414, + "language_loss": 0.83181274, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85328364, + "num_input_tokens_seen": 134047185, + "step": 6242, + "time_per_iteration": 2.627531051635742 + }, + { + "auxiliary_loss_clip": 0.01107131, + "auxiliary_loss_mlp": 0.01039731, + "balance_loss_clip": 1.05004978, + "balance_loss_mlp": 1.02490747, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.1023141243268477, + "language_loss": 0.68689179, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70836037, + "num_input_tokens_seen": 134067330, + "step": 6243, + "time_per_iteration": 2.599716901779175 + }, + { + "auxiliary_loss_clip": 0.0106202, + "auxiliary_loss_mlp": 0.00797043, + "balance_loss_clip": 1.04775429, + "balance_loss_mlp": 1.01548696, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 1.744805036867949, + "language_loss": 0.83688414, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.85547471, + "num_input_tokens_seen": 134085525, + "step": 6244, + "time_per_iteration": 2.677353858947754 + }, + { + "auxiliary_loss_clip": 0.0107392, + "auxiliary_loss_mlp": 0.01038159, + "balance_loss_clip": 1.04321206, + "balance_loss_mlp": 1.02231646, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 2.7412831991931315, + "language_loss": 0.83219916, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85331994, + "num_input_tokens_seen": 134101855, + "step": 6245, + "time_per_iteration": 2.5703930854797363 + }, + { + "auxiliary_loss_clip": 0.01097876, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.04652119, + "balance_loss_mlp": 1.02556086, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 2.074386617411249, + "language_loss": 0.64061356, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66201305, + "num_input_tokens_seen": 134119360, + "step": 6246, + "time_per_iteration": 2.5168678760528564 + }, + { + "auxiliary_loss_clip": 0.01104083, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_clip": 1.04617643, + "balance_loss_mlp": 1.02581763, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 1.7884218680268675, + "language_loss": 0.74792361, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.76939416, + "num_input_tokens_seen": 134137475, + "step": 6247, + "time_per_iteration": 2.522047758102417 + }, + { + "auxiliary_loss_clip": 0.01120126, + "auxiliary_loss_mlp": 0.01039532, + "balance_loss_clip": 1.04701865, + "balance_loss_mlp": 1.02402294, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 3.0856778317128897, + "language_loss": 0.54808867, + "learning_rate": 2.872251199697598e-06, + "loss": 0.56968528, + "num_input_tokens_seen": 134154580, + "step": 6248, + "time_per_iteration": 2.485595941543579 + }, + { + "auxiliary_loss_clip": 0.01115229, + "auxiliary_loss_mlp": 0.0104264, + "balance_loss_clip": 1.0488677, + "balance_loss_mlp": 1.02684438, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 1.753143259254582, + "language_loss": 0.84349644, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86507517, + "num_input_tokens_seen": 134174285, + "step": 6249, + "time_per_iteration": 2.55735182762146 + }, + { + "auxiliary_loss_clip": 0.01100947, + "auxiliary_loss_mlp": 0.01033177, + "balance_loss_clip": 1.04589891, + "balance_loss_mlp": 1.01838338, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.580438824779369, + "language_loss": 0.6800071, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.7013483, + "num_input_tokens_seen": 134195940, + "step": 6250, + "time_per_iteration": 2.66099214553833 + }, + { + "auxiliary_loss_clip": 0.01108767, + "auxiliary_loss_mlp": 0.01041538, + "balance_loss_clip": 1.04586327, + "balance_loss_mlp": 1.02639866, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 2.007635406736307, + "language_loss": 0.77889752, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.80040056, + "num_input_tokens_seen": 134212235, + "step": 6251, + "time_per_iteration": 2.528585433959961 + }, + { + "auxiliary_loss_clip": 0.0111404, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.04653811, + "balance_loss_mlp": 1.01793134, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.0716536852235605, + "language_loss": 0.57905209, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60052001, + "num_input_tokens_seen": 134233810, + "step": 6252, + "time_per_iteration": 2.611238479614258 + }, + { + "auxiliary_loss_clip": 0.01111295, + "auxiliary_loss_mlp": 0.0104386, + "balance_loss_clip": 1.04842138, + "balance_loss_mlp": 1.02744532, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 1.7232426246631678, + "language_loss": 0.89264846, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.91420007, + "num_input_tokens_seen": 134252020, + "step": 6253, + "time_per_iteration": 4.090145111083984 + }, + { + "auxiliary_loss_clip": 0.01093515, + "auxiliary_loss_mlp": 0.01033542, + "balance_loss_clip": 1.05030608, + "balance_loss_mlp": 1.01960683, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 3.9400596879147747, + "language_loss": 0.76291358, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78418422, + "num_input_tokens_seen": 134269495, + "step": 6254, + "time_per_iteration": 2.5708200931549072 + }, + { + "auxiliary_loss_clip": 0.01092638, + "auxiliary_loss_mlp": 0.01042772, + "balance_loss_clip": 1.04659951, + "balance_loss_mlp": 1.02662468, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 2.3121607274902067, + "language_loss": 0.62340218, + "learning_rate": 2.869797092829169e-06, + "loss": 0.64475632, + "num_input_tokens_seen": 134287035, + "step": 6255, + "time_per_iteration": 5.3299009799957275 + }, + { + "auxiliary_loss_clip": 0.0112189, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.04637432, + "balance_loss_mlp": 1.02175021, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 2.4392467552534787, + "language_loss": 0.73614573, + "learning_rate": 2.869446374096135e-06, + "loss": 0.75774688, + "num_input_tokens_seen": 134304840, + "step": 6256, + "time_per_iteration": 2.526726484298706 + }, + { + "auxiliary_loss_clip": 0.01122089, + "auxiliary_loss_mlp": 0.01044185, + "balance_loss_clip": 1.04823148, + "balance_loss_mlp": 1.02805591, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 1.7198429314458412, + "language_loss": 0.70364141, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72530425, + "num_input_tokens_seen": 134323180, + "step": 6257, + "time_per_iteration": 2.5043623447418213 + }, + { + "auxiliary_loss_clip": 0.01104503, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.04584253, + "balance_loss_mlp": 1.02045774, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.5685645837260644, + "language_loss": 0.8428368, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86423224, + "num_input_tokens_seen": 134341390, + "step": 6258, + "time_per_iteration": 3.903674840927124 + }, + { + "auxiliary_loss_clip": 0.01089184, + "auxiliary_loss_mlp": 0.01038737, + "balance_loss_clip": 1.04634249, + "balance_loss_mlp": 1.02480149, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.5041725872815703, + "language_loss": 0.80803055, + "learning_rate": 2.868394020133277e-06, + "loss": 0.82930976, + "num_input_tokens_seen": 134360425, + "step": 6259, + "time_per_iteration": 2.6423726081848145 + }, + { + "auxiliary_loss_clip": 0.01090628, + "auxiliary_loss_mlp": 0.01043351, + "balance_loss_clip": 1.04696751, + "balance_loss_mlp": 1.02688813, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 1.9049728775330024, + "language_loss": 0.71512866, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.73646843, + "num_input_tokens_seen": 134379775, + "step": 6260, + "time_per_iteration": 2.652569055557251 + }, + { + "auxiliary_loss_clip": 0.01106546, + "auxiliary_loss_mlp": 0.01041718, + "balance_loss_clip": 1.04931712, + "balance_loss_mlp": 1.02593446, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 2.05590790519657, + "language_loss": 0.78053373, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80201638, + "num_input_tokens_seen": 134400315, + "step": 6261, + "time_per_iteration": 2.556382656097412 + }, + { + "auxiliary_loss_clip": 0.01107691, + "auxiliary_loss_mlp": 0.01047423, + "balance_loss_clip": 1.04621136, + "balance_loss_mlp": 1.03013778, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 2.120875862700909, + "language_loss": 0.80196905, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82352018, + "num_input_tokens_seen": 134422875, + "step": 6262, + "time_per_iteration": 2.695587635040283 + }, + { + "auxiliary_loss_clip": 0.01109861, + "auxiliary_loss_mlp": 0.01031933, + "balance_loss_clip": 1.04424179, + "balance_loss_mlp": 1.01681685, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 1.8047915016743075, + "language_loss": 0.80094469, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82236266, + "num_input_tokens_seen": 134443025, + "step": 6263, + "time_per_iteration": 2.6193580627441406 + }, + { + "auxiliary_loss_clip": 0.01132911, + "auxiliary_loss_mlp": 0.01041241, + "balance_loss_clip": 1.0491643, + "balance_loss_mlp": 1.02625036, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 4.420616055258283, + "language_loss": 0.79662311, + "learning_rate": 2.866639438447501e-06, + "loss": 0.81836468, + "num_input_tokens_seen": 134460945, + "step": 6264, + "time_per_iteration": 2.5011255741119385 + }, + { + "auxiliary_loss_clip": 0.01128449, + "auxiliary_loss_mlp": 0.0104237, + "balance_loss_clip": 1.04519773, + "balance_loss_mlp": 1.02742743, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 1.87233426377101, + "language_loss": 0.73618948, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75789762, + "num_input_tokens_seen": 134480440, + "step": 6265, + "time_per_iteration": 2.5088727474212646 + }, + { + "auxiliary_loss_clip": 0.01118019, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.04867411, + "balance_loss_mlp": 1.02399266, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.633533319749515, + "language_loss": 0.69109797, + "learning_rate": 2.865937375638654e-06, + "loss": 0.71265614, + "num_input_tokens_seen": 134501110, + "step": 6266, + "time_per_iteration": 2.5804193019866943 + }, + { + "auxiliary_loss_clip": 0.01122951, + "auxiliary_loss_mlp": 0.01042701, + "balance_loss_clip": 1.04635, + "balance_loss_mlp": 1.0270251, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 2.777640661537264, + "language_loss": 0.63228559, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65394211, + "num_input_tokens_seen": 134522460, + "step": 6267, + "time_per_iteration": 2.567915678024292 + }, + { + "auxiliary_loss_clip": 0.01038252, + "auxiliary_loss_mlp": 0.01026883, + "balance_loss_clip": 1.01935005, + "balance_loss_mlp": 1.02493978, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7239239406601489, + "language_loss": 0.58891565, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.60956699, + "num_input_tokens_seen": 134589545, + "step": 6268, + "time_per_iteration": 3.197960615158081 + }, + { + "auxiliary_loss_clip": 0.01131356, + "auxiliary_loss_mlp": 0.01042617, + "balance_loss_clip": 1.04701471, + "balance_loss_mlp": 1.02665532, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.5727022559127346, + "language_loss": 0.64881259, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67055225, + "num_input_tokens_seen": 134610550, + "step": 6269, + "time_per_iteration": 2.5015296936035156 + }, + { + "auxiliary_loss_clip": 0.01096529, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.04855013, + "balance_loss_mlp": 1.01753354, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.5389491578568593, + "language_loss": 0.70678443, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72808349, + "num_input_tokens_seen": 134630485, + "step": 6270, + "time_per_iteration": 2.592176914215088 + }, + { + "auxiliary_loss_clip": 0.0104741, + "auxiliary_loss_mlp": 0.01002182, + "balance_loss_clip": 1.01862073, + "balance_loss_mlp": 1.00044107, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7116894402980682, + "language_loss": 0.5617196, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58221549, + "num_input_tokens_seen": 134693510, + "step": 6271, + "time_per_iteration": 3.0768022537231445 + }, + { + "auxiliary_loss_clip": 0.01118368, + "auxiliary_loss_mlp": 0.01035094, + "balance_loss_clip": 1.0470264, + "balance_loss_mlp": 1.01890504, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 1.6966604951606268, + "language_loss": 0.79465914, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.81619382, + "num_input_tokens_seen": 134713115, + "step": 6272, + "time_per_iteration": 2.5183780193328857 + }, + { + "auxiliary_loss_clip": 0.01114201, + "auxiliary_loss_mlp": 0.01034385, + "balance_loss_clip": 1.04363155, + "balance_loss_mlp": 1.01999044, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 1.4949626471250725, + "language_loss": 0.74101067, + "learning_rate": 2.863479122159103e-06, + "loss": 0.76249659, + "num_input_tokens_seen": 134732635, + "step": 6273, + "time_per_iteration": 2.491856813430786 + }, + { + "auxiliary_loss_clip": 0.01113017, + "auxiliary_loss_mlp": 0.01043224, + "balance_loss_clip": 1.04632235, + "balance_loss_mlp": 1.0284183, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.4786248227707917, + "language_loss": 0.71823126, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.73979366, + "num_input_tokens_seen": 134750695, + "step": 6274, + "time_per_iteration": 2.4822094440460205 + }, + { + "auxiliary_loss_clip": 0.01104281, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.04889846, + "balance_loss_mlp": 1.02458, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 1.6257731551170356, + "language_loss": 0.84028113, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.86171454, + "num_input_tokens_seen": 134768935, + "step": 6275, + "time_per_iteration": 2.526548385620117 + }, + { + "auxiliary_loss_clip": 0.01075593, + "auxiliary_loss_mlp": 0.01036423, + "balance_loss_clip": 1.04550493, + "balance_loss_mlp": 1.02298224, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.5078952808177162, + "language_loss": 0.75282598, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77394611, + "num_input_tokens_seen": 134791260, + "step": 6276, + "time_per_iteration": 2.682257890701294 + }, + { + "auxiliary_loss_clip": 0.01106965, + "auxiliary_loss_mlp": 0.0103631, + "balance_loss_clip": 1.04320621, + "balance_loss_mlp": 1.02080655, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 1.8651679243634265, + "language_loss": 0.85492194, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87635469, + "num_input_tokens_seen": 134808350, + "step": 6277, + "time_per_iteration": 2.5136756896972656 + }, + { + "auxiliary_loss_clip": 0.01118851, + "auxiliary_loss_mlp": 0.01036512, + "balance_loss_clip": 1.04914355, + "balance_loss_mlp": 1.02252293, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 1.8113932983803622, + "language_loss": 0.77699912, + "learning_rate": 2.861722244253818e-06, + "loss": 0.79855275, + "num_input_tokens_seen": 134826005, + "step": 6278, + "time_per_iteration": 2.496595859527588 + }, + { + "auxiliary_loss_clip": 0.01103515, + "auxiliary_loss_mlp": 0.01044338, + "balance_loss_clip": 1.05167711, + "balance_loss_mlp": 1.02837563, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.7695666953913747, + "language_loss": 0.83306634, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.85454488, + "num_input_tokens_seen": 134844995, + "step": 6279, + "time_per_iteration": 2.5581116676330566 + }, + { + "auxiliary_loss_clip": 0.01107659, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.04558599, + "balance_loss_mlp": 1.02084792, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 1.837132267407551, + "language_loss": 0.74380904, + "learning_rate": 2.861019264262269e-06, + "loss": 0.76523042, + "num_input_tokens_seen": 134865285, + "step": 6280, + "time_per_iteration": 2.5631895065307617 + }, + { + "auxiliary_loss_clip": 0.01128281, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.04837251, + "balance_loss_mlp": 1.0258379, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.4665349653409734, + "language_loss": 0.76097083, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78265303, + "num_input_tokens_seen": 134886535, + "step": 6281, + "time_per_iteration": 2.47795033454895 + }, + { + "auxiliary_loss_clip": 0.01099909, + "auxiliary_loss_mlp": 0.01036022, + "balance_loss_clip": 1.0458405, + "balance_loss_mlp": 1.02135885, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 1.6489685497170306, + "language_loss": 0.8425836, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86394286, + "num_input_tokens_seen": 134907435, + "step": 6282, + "time_per_iteration": 2.5502302646636963 + }, + { + "auxiliary_loss_clip": 0.01116804, + "auxiliary_loss_mlp": 0.01034062, + "balance_loss_clip": 1.04526806, + "balance_loss_mlp": 1.01939893, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 1.7004434356725147, + "language_loss": 0.69931096, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.72081959, + "num_input_tokens_seen": 134925360, + "step": 6283, + "time_per_iteration": 2.492694616317749 + }, + { + "auxiliary_loss_clip": 0.01073362, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_clip": 1.04827213, + "balance_loss_mlp": 1.02990139, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 1.6976212999345577, + "language_loss": 0.75870323, + "learning_rate": 2.859612912586581e-06, + "loss": 0.77990413, + "num_input_tokens_seen": 134944205, + "step": 6284, + "time_per_iteration": 2.6398279666900635 + }, + { + "auxiliary_loss_clip": 0.01136305, + "auxiliary_loss_mlp": 0.01035205, + "balance_loss_clip": 1.04945874, + "balance_loss_mlp": 1.01935041, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.0634885085973393, + "language_loss": 0.85134172, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.87305689, + "num_input_tokens_seen": 134960255, + "step": 6285, + "time_per_iteration": 2.4248037338256836 + }, + { + "auxiliary_loss_clip": 0.01107277, + "auxiliary_loss_mlp": 0.01036523, + "balance_loss_clip": 1.04817677, + "balance_loss_mlp": 1.02085948, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.7549712303472602, + "language_loss": 0.84327191, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86470997, + "num_input_tokens_seen": 134978605, + "step": 6286, + "time_per_iteration": 2.529057025909424 + }, + { + "auxiliary_loss_clip": 0.01111017, + "auxiliary_loss_mlp": 0.0103964, + "balance_loss_clip": 1.04612172, + "balance_loss_mlp": 1.02522135, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.3949344392273963, + "language_loss": 0.81384224, + "learning_rate": 2.858557806518775e-06, + "loss": 0.83534884, + "num_input_tokens_seen": 134995020, + "step": 6287, + "time_per_iteration": 2.456221342086792 + }, + { + "auxiliary_loss_clip": 0.01113242, + "auxiliary_loss_mlp": 0.01042351, + "balance_loss_clip": 1.04356039, + "balance_loss_mlp": 1.02759862, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 2.234714058809263, + "language_loss": 0.73391706, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75547296, + "num_input_tokens_seen": 135012620, + "step": 6288, + "time_per_iteration": 2.494032859802246 + }, + { + "auxiliary_loss_clip": 0.01120431, + "auxiliary_loss_mlp": 0.0103907, + "balance_loss_clip": 1.05029762, + "balance_loss_mlp": 1.02367377, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 2.666621315894215, + "language_loss": 0.75349474, + "learning_rate": 2.857854239668352e-06, + "loss": 0.77508974, + "num_input_tokens_seen": 135033365, + "step": 6289, + "time_per_iteration": 2.549288034439087 + }, + { + "auxiliary_loss_clip": 0.01119484, + "auxiliary_loss_mlp": 0.01042506, + "balance_loss_clip": 1.04818225, + "balance_loss_mlp": 1.02797472, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.935743094527195, + "language_loss": 0.73256671, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75418663, + "num_input_tokens_seen": 135052185, + "step": 6290, + "time_per_iteration": 2.5163841247558594 + }, + { + "auxiliary_loss_clip": 0.01093363, + "auxiliary_loss_mlp": 0.01042728, + "balance_loss_clip": 1.04573703, + "balance_loss_mlp": 1.02588391, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.1175716798700286, + "language_loss": 0.80137122, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.82273209, + "num_input_tokens_seen": 135070425, + "step": 6291, + "time_per_iteration": 3.9359819889068604 + }, + { + "auxiliary_loss_clip": 0.010959, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.04625881, + "balance_loss_mlp": 1.01812398, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.9378129918333193, + "language_loss": 0.76216114, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78345424, + "num_input_tokens_seen": 135090525, + "step": 6292, + "time_per_iteration": 2.5499916076660156 + }, + { + "auxiliary_loss_clip": 0.01111398, + "auxiliary_loss_mlp": 0.01045723, + "balance_loss_clip": 1.04550838, + "balance_loss_mlp": 1.03002322, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.6865831449560085, + "language_loss": 0.69704771, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71861887, + "num_input_tokens_seen": 135109575, + "step": 6293, + "time_per_iteration": 2.4804694652557373 + }, + { + "auxiliary_loss_clip": 0.01127877, + "auxiliary_loss_mlp": 0.01039685, + "balance_loss_clip": 1.04652095, + "balance_loss_mlp": 1.02432442, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 2.1953425929313033, + "language_loss": 0.71639282, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73806846, + "num_input_tokens_seen": 135127000, + "step": 6294, + "time_per_iteration": 5.316091299057007 + }, + { + "auxiliary_loss_clip": 0.01108763, + "auxiliary_loss_mlp": 0.0103687, + "balance_loss_clip": 1.04504359, + "balance_loss_mlp": 1.02096748, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.0169620915287796, + "language_loss": 0.82669044, + "learning_rate": 2.855742758826011e-06, + "loss": 0.8481468, + "num_input_tokens_seen": 135145285, + "step": 6295, + "time_per_iteration": 2.4912915229797363 + }, + { + "auxiliary_loss_clip": 0.01112008, + "auxiliary_loss_mlp": 0.0103758, + "balance_loss_clip": 1.04554105, + "balance_loss_mlp": 1.0228579, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.7833155462626749, + "language_loss": 0.71388501, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.73538089, + "num_input_tokens_seen": 135165240, + "step": 6296, + "time_per_iteration": 2.5307788848876953 + }, + { + "auxiliary_loss_clip": 0.01129609, + "auxiliary_loss_mlp": 0.01041777, + "balance_loss_clip": 1.049582, + "balance_loss_mlp": 1.02703691, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.7340887818817647, + "language_loss": 0.76949227, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79120612, + "num_input_tokens_seen": 135184045, + "step": 6297, + "time_per_iteration": 3.816436767578125 + }, + { + "auxiliary_loss_clip": 0.01105482, + "auxiliary_loss_mlp": 0.01036799, + "balance_loss_clip": 1.04782891, + "balance_loss_mlp": 1.02238703, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 1.8915861456330683, + "language_loss": 0.79194665, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81336945, + "num_input_tokens_seen": 135202365, + "step": 6298, + "time_per_iteration": 2.49326753616333 + }, + { + "auxiliary_loss_clip": 0.01070566, + "auxiliary_loss_mlp": 0.0104793, + "balance_loss_clip": 1.04150486, + "balance_loss_mlp": 1.0310024, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 1.6405593902856228, + "language_loss": 0.84336615, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86455119, + "num_input_tokens_seen": 135220955, + "step": 6299, + "time_per_iteration": 2.569901943206787 + }, + { + "auxiliary_loss_clip": 0.01093415, + "auxiliary_loss_mlp": 0.01037315, + "balance_loss_clip": 1.0419873, + "balance_loss_mlp": 1.02206767, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 1.9724564310029635, + "language_loss": 0.76330405, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78461134, + "num_input_tokens_seen": 135239715, + "step": 6300, + "time_per_iteration": 2.531797170639038 + }, + { + "auxiliary_loss_clip": 0.01111543, + "auxiliary_loss_mlp": 0.0103827, + "balance_loss_clip": 1.04639316, + "balance_loss_mlp": 1.02133024, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 1.9532683358153005, + "language_loss": 0.82976717, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.85126531, + "num_input_tokens_seen": 135257035, + "step": 6301, + "time_per_iteration": 2.4886343479156494 + }, + { + "auxiliary_loss_clip": 0.01112114, + "auxiliary_loss_mlp": 0.01036237, + "balance_loss_clip": 1.04647493, + "balance_loss_mlp": 1.02222395, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.9949576918825387, + "language_loss": 0.68026334, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.70174682, + "num_input_tokens_seen": 135275720, + "step": 6302, + "time_per_iteration": 2.501490831375122 + }, + { + "auxiliary_loss_clip": 0.01082235, + "auxiliary_loss_mlp": 0.0104158, + "balance_loss_clip": 1.04198229, + "balance_loss_mlp": 1.02677417, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.8379743572737421, + "language_loss": 0.68425649, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.70549464, + "num_input_tokens_seen": 135294140, + "step": 6303, + "time_per_iteration": 2.6166861057281494 + }, + { + "auxiliary_loss_clip": 0.01126037, + "auxiliary_loss_mlp": 0.01033583, + "balance_loss_clip": 1.04558969, + "balance_loss_mlp": 1.01961756, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.5525068516171425, + "language_loss": 0.77438104, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79597723, + "num_input_tokens_seen": 135314845, + "step": 6304, + "time_per_iteration": 2.4690423011779785 + }, + { + "auxiliary_loss_clip": 0.0113629, + "auxiliary_loss_mlp": 0.01036061, + "balance_loss_clip": 1.05091834, + "balance_loss_mlp": 1.02065885, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 1.840956388400523, + "language_loss": 0.80538815, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82711166, + "num_input_tokens_seen": 135333055, + "step": 6305, + "time_per_iteration": 2.4449024200439453 + }, + { + "auxiliary_loss_clip": 0.01042169, + "auxiliary_loss_mlp": 0.01012063, + "balance_loss_clip": 1.02367759, + "balance_loss_mlp": 1.01041794, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9786051169970005, + "language_loss": 0.64501238, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66555476, + "num_input_tokens_seen": 135387865, + "step": 6306, + "time_per_iteration": 3.001331329345703 + }, + { + "auxiliary_loss_clip": 0.01104632, + "auxiliary_loss_mlp": 0.01053294, + "balance_loss_clip": 1.0480094, + "balance_loss_mlp": 1.03654528, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.657933892739309, + "language_loss": 0.73343682, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75501609, + "num_input_tokens_seen": 135409095, + "step": 6307, + "time_per_iteration": 2.5741920471191406 + }, + { + "auxiliary_loss_clip": 0.01106413, + "auxiliary_loss_mlp": 0.01038025, + "balance_loss_clip": 1.04904246, + "balance_loss_mlp": 1.0226233, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.3406060624191884, + "language_loss": 0.78220493, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80364931, + "num_input_tokens_seen": 135429585, + "step": 6308, + "time_per_iteration": 2.5402448177337646 + }, + { + "auxiliary_loss_clip": 0.0109138, + "auxiliary_loss_mlp": 0.01039633, + "balance_loss_clip": 1.04669261, + "balance_loss_mlp": 1.02464843, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 1.9453512664875034, + "language_loss": 0.72968423, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75099444, + "num_input_tokens_seen": 135446320, + "step": 6309, + "time_per_iteration": 2.5721166133880615 + }, + { + "auxiliary_loss_clip": 0.01073639, + "auxiliary_loss_mlp": 0.01043823, + "balance_loss_clip": 1.04769468, + "balance_loss_mlp": 1.02824211, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.3618338533149712, + "language_loss": 0.78296018, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.80413479, + "num_input_tokens_seen": 135465720, + "step": 6310, + "time_per_iteration": 2.585340976715088 + }, + { + "auxiliary_loss_clip": 0.01118459, + "auxiliary_loss_mlp": 0.00794277, + "balance_loss_clip": 1.04610276, + "balance_loss_mlp": 1.01990342, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 1.7035103523654243, + "language_loss": 0.76360106, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.78272837, + "num_input_tokens_seen": 135485155, + "step": 6311, + "time_per_iteration": 2.5155489444732666 + }, + { + "auxiliary_loss_clip": 0.01112682, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.05058599, + "balance_loss_mlp": 1.0173285, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.4954457078019694, + "language_loss": 0.7033689, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.72480905, + "num_input_tokens_seen": 135502675, + "step": 6312, + "time_per_iteration": 2.5423941612243652 + }, + { + "auxiliary_loss_clip": 0.01025078, + "auxiliary_loss_mlp": 0.01003501, + "balance_loss_clip": 1.02749729, + "balance_loss_mlp": 1.00192738, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.8128010179612137, + "language_loss": 0.56089389, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58117974, + "num_input_tokens_seen": 135562005, + "step": 6313, + "time_per_iteration": 3.1380844116210938 + }, + { + "auxiliary_loss_clip": 0.01094232, + "auxiliary_loss_mlp": 0.0104081, + "balance_loss_clip": 1.04618001, + "balance_loss_mlp": 1.02609384, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 1.692945137318909, + "language_loss": 0.71489549, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73624587, + "num_input_tokens_seen": 135582600, + "step": 6314, + "time_per_iteration": 2.6523373126983643 + }, + { + "auxiliary_loss_clip": 0.01124739, + "auxiliary_loss_mlp": 0.01039533, + "balance_loss_clip": 1.05018091, + "balance_loss_mlp": 1.02407193, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 2.013223213361535, + "language_loss": 0.7350834, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75672615, + "num_input_tokens_seen": 135600280, + "step": 6315, + "time_per_iteration": 2.4840304851531982 + }, + { + "auxiliary_loss_clip": 0.01122553, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.04894054, + "balance_loss_mlp": 1.02751589, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 2.070269831835568, + "language_loss": 0.70815194, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.72979856, + "num_input_tokens_seen": 135621560, + "step": 6316, + "time_per_iteration": 2.631497621536255 + }, + { + "auxiliary_loss_clip": 0.01097326, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.04959404, + "balance_loss_mlp": 1.0186224, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 1.7368235689825242, + "language_loss": 0.65033746, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67163259, + "num_input_tokens_seen": 135641745, + "step": 6317, + "time_per_iteration": 2.666734218597412 + }, + { + "auxiliary_loss_clip": 0.01115656, + "auxiliary_loss_mlp": 0.01034935, + "balance_loss_clip": 1.04674602, + "balance_loss_mlp": 1.02156544, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.116810400178657, + "language_loss": 0.85845613, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87996209, + "num_input_tokens_seen": 135660650, + "step": 6318, + "time_per_iteration": 2.5185465812683105 + }, + { + "auxiliary_loss_clip": 0.01107025, + "auxiliary_loss_mlp": 0.01037519, + "balance_loss_clip": 1.04762673, + "balance_loss_mlp": 1.02197981, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 1.8249640820494513, + "language_loss": 0.75830662, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.77975202, + "num_input_tokens_seen": 135679980, + "step": 6319, + "time_per_iteration": 2.5096659660339355 + }, + { + "auxiliary_loss_clip": 0.01130299, + "auxiliary_loss_mlp": 0.01036144, + "balance_loss_clip": 1.04867947, + "balance_loss_mlp": 1.02209544, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.9231228556726114, + "language_loss": 0.63970149, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66136593, + "num_input_tokens_seen": 135699400, + "step": 6320, + "time_per_iteration": 2.4641120433807373 + }, + { + "auxiliary_loss_clip": 0.01088843, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.04508543, + "balance_loss_mlp": 1.02470112, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 2.0538331486748236, + "language_loss": 0.71342206, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73470259, + "num_input_tokens_seen": 135723455, + "step": 6321, + "time_per_iteration": 2.6682469844818115 + }, + { + "auxiliary_loss_clip": 0.01093299, + "auxiliary_loss_mlp": 0.01036557, + "balance_loss_clip": 1.04529858, + "balance_loss_mlp": 1.02173984, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 1.7042915169570947, + "language_loss": 0.74755621, + "learning_rate": 2.846226680280859e-06, + "loss": 0.7688548, + "num_input_tokens_seen": 135744335, + "step": 6322, + "time_per_iteration": 2.606158494949341 + }, + { + "auxiliary_loss_clip": 0.01117782, + "auxiliary_loss_mlp": 0.01036415, + "balance_loss_clip": 1.04635441, + "balance_loss_mlp": 1.0213114, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 1.9292761620526755, + "language_loss": 0.85027391, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87181592, + "num_input_tokens_seen": 135761440, + "step": 6323, + "time_per_iteration": 2.483649969100952 + }, + { + "auxiliary_loss_clip": 0.0110348, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.04330277, + "balance_loss_mlp": 1.01912403, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 2.6638962260827825, + "language_loss": 0.73629892, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75767934, + "num_input_tokens_seen": 135779955, + "step": 6324, + "time_per_iteration": 2.522289514541626 + }, + { + "auxiliary_loss_clip": 0.01100598, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.04796386, + "balance_loss_mlp": 1.02368081, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.6666539298181506, + "language_loss": 0.84270024, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86409491, + "num_input_tokens_seen": 135799840, + "step": 6325, + "time_per_iteration": 2.5397698879241943 + }, + { + "auxiliary_loss_clip": 0.01103476, + "auxiliary_loss_mlp": 0.01031872, + "balance_loss_clip": 1.04868793, + "balance_loss_mlp": 1.01776385, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 1.6222562906545879, + "language_loss": 0.79412544, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81547892, + "num_input_tokens_seen": 135817880, + "step": 6326, + "time_per_iteration": 2.501400947570801 + }, + { + "auxiliary_loss_clip": 0.01117475, + "auxiliary_loss_mlp": 0.01042191, + "balance_loss_clip": 1.04808331, + "balance_loss_mlp": 1.02811193, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.7810879360542151, + "language_loss": 0.72798949, + "learning_rate": 2.844461868547842e-06, + "loss": 0.74958616, + "num_input_tokens_seen": 135838940, + "step": 6327, + "time_per_iteration": 2.6304078102111816 + }, + { + "auxiliary_loss_clip": 0.01127545, + "auxiliary_loss_mlp": 0.00796115, + "balance_loss_clip": 1.04769301, + "balance_loss_mlp": 1.02479756, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.9383777495692633, + "language_loss": 0.82736421, + "learning_rate": 2.844108810081459e-06, + "loss": 0.84660083, + "num_input_tokens_seen": 135858325, + "step": 6328, + "time_per_iteration": 2.491238832473755 + }, + { + "auxiliary_loss_clip": 0.01116363, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.04594612, + "balance_loss_mlp": 1.01584792, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.2947157391298052, + "language_loss": 0.61617184, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63763267, + "num_input_tokens_seen": 135878430, + "step": 6329, + "time_per_iteration": 2.493976354598999 + }, + { + "auxiliary_loss_clip": 0.01110691, + "auxiliary_loss_mlp": 0.01038331, + "balance_loss_clip": 1.05132914, + "balance_loss_mlp": 1.0240314, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 2.0073395572804493, + "language_loss": 0.56228602, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58377624, + "num_input_tokens_seen": 135894755, + "step": 6330, + "time_per_iteration": 3.903026819229126 + }, + { + "auxiliary_loss_clip": 0.01083724, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.05031407, + "balance_loss_mlp": 1.02129245, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.3720746176492677, + "language_loss": 0.65748399, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.67866623, + "num_input_tokens_seen": 135918275, + "step": 6331, + "time_per_iteration": 2.621837615966797 + }, + { + "auxiliary_loss_clip": 0.01117081, + "auxiliary_loss_mlp": 0.01042946, + "balance_loss_clip": 1.05162823, + "balance_loss_mlp": 1.02787232, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 1.5012476474934946, + "language_loss": 0.76031363, + "learning_rate": 2.842696256262919e-06, + "loss": 0.78191388, + "num_input_tokens_seen": 135937430, + "step": 6332, + "time_per_iteration": 3.99216890335083 + }, + { + "auxiliary_loss_clip": 0.01073796, + "auxiliary_loss_mlp": 0.00795066, + "balance_loss_clip": 1.0484314, + "balance_loss_mlp": 1.01916027, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 1.703158449001075, + "language_loss": 0.8230477, + "learning_rate": 2.842343037886987e-06, + "loss": 0.84173632, + "num_input_tokens_seen": 135954210, + "step": 6333, + "time_per_iteration": 4.018657684326172 + }, + { + "auxiliary_loss_clip": 0.0111568, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.04571736, + "balance_loss_mlp": 1.01695096, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.4346241553768697, + "language_loss": 0.86159593, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88305897, + "num_input_tokens_seen": 135974425, + "step": 6334, + "time_per_iteration": 2.5841455459594727 + }, + { + "auxiliary_loss_clip": 0.01119117, + "auxiliary_loss_mlp": 0.01037235, + "balance_loss_clip": 1.04527211, + "balance_loss_mlp": 1.0225842, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 1.728989168179784, + "language_loss": 0.79461873, + "learning_rate": 2.841636505323321e-06, + "loss": 0.81618226, + "num_input_tokens_seen": 135991985, + "step": 6335, + "time_per_iteration": 2.4574644565582275 + }, + { + "auxiliary_loss_clip": 0.01117906, + "auxiliary_loss_mlp": 0.01034915, + "balance_loss_clip": 1.0464102, + "balance_loss_mlp": 1.02032995, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 1.810350578572506, + "language_loss": 0.73560929, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.75713754, + "num_input_tokens_seen": 136010015, + "step": 6336, + "time_per_iteration": 3.863769769668579 + }, + { + "auxiliary_loss_clip": 0.01111745, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.04689753, + "balance_loss_mlp": 1.0190866, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 2.2671224185047714, + "language_loss": 0.68611503, + "learning_rate": 2.840929845099894e-06, + "loss": 0.70756, + "num_input_tokens_seen": 136028440, + "step": 6337, + "time_per_iteration": 2.4967098236083984 + }, + { + "auxiliary_loss_clip": 0.01105162, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.04364085, + "balance_loss_mlp": 1.01772642, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.7471261003992924, + "language_loss": 0.63274467, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.6541239, + "num_input_tokens_seen": 136048360, + "step": 6338, + "time_per_iteration": 2.629152297973633 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01041032, + "balance_loss_clip": 1.04558277, + "balance_loss_mlp": 1.0254519, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.6669164244577088, + "language_loss": 0.68992519, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.7114135, + "num_input_tokens_seen": 136065500, + "step": 6339, + "time_per_iteration": 2.4987664222717285 + }, + { + "auxiliary_loss_clip": 0.01104101, + "auxiliary_loss_mlp": 0.01045232, + "balance_loss_clip": 1.04830754, + "balance_loss_mlp": 1.03062332, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.1906189330174612, + "language_loss": 0.67866683, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70016015, + "num_input_tokens_seen": 136084060, + "step": 6340, + "time_per_iteration": 2.5332086086273193 + }, + { + "auxiliary_loss_clip": 0.0109456, + "auxiliary_loss_mlp": 0.01037279, + "balance_loss_clip": 1.04747534, + "balance_loss_mlp": 1.02233016, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.4035951831324156, + "language_loss": 0.89299297, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91431129, + "num_input_tokens_seen": 136102310, + "step": 6341, + "time_per_iteration": 2.5374531745910645 + }, + { + "auxiliary_loss_clip": 0.01120275, + "auxiliary_loss_mlp": 0.01043923, + "balance_loss_clip": 1.04637682, + "balance_loss_mlp": 1.02852666, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.5714588810780212, + "language_loss": 0.75025702, + "learning_rate": 2.83916263673333e-06, + "loss": 0.77189898, + "num_input_tokens_seen": 136120725, + "step": 6342, + "time_per_iteration": 2.4857327938079834 + }, + { + "auxiliary_loss_clip": 0.01105481, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.044891, + "balance_loss_mlp": 1.02300262, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 1.7305350387479543, + "language_loss": 0.8360371, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85746348, + "num_input_tokens_seen": 136139105, + "step": 6343, + "time_per_iteration": 2.5172109603881836 + }, + { + "auxiliary_loss_clip": 0.01072515, + "auxiliary_loss_mlp": 0.01045337, + "balance_loss_clip": 1.04667091, + "balance_loss_mlp": 1.02997136, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 1.7144557478760796, + "language_loss": 0.77029335, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79147184, + "num_input_tokens_seen": 136158265, + "step": 6344, + "time_per_iteration": 2.604015350341797 + }, + { + "auxiliary_loss_clip": 0.01093917, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_clip": 1.04493988, + "balance_loss_mlp": 1.02862298, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 3.0431561376105067, + "language_loss": 0.7361986, + "learning_rate": 2.838101929752593e-06, + "loss": 0.75759006, + "num_input_tokens_seen": 136176100, + "step": 6345, + "time_per_iteration": 2.5651538372039795 + }, + { + "auxiliary_loss_clip": 0.01088084, + "auxiliary_loss_mlp": 0.00795619, + "balance_loss_clip": 1.04413474, + "balance_loss_mlp": 1.02345788, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.8084450588779348, + "language_loss": 0.69712949, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71596646, + "num_input_tokens_seen": 136195125, + "step": 6346, + "time_per_iteration": 2.5558722019195557 + }, + { + "auxiliary_loss_clip": 0.01118019, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.04623437, + "balance_loss_mlp": 1.01986814, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.8350031715206616, + "language_loss": 0.75714093, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.77865899, + "num_input_tokens_seen": 136213885, + "step": 6347, + "time_per_iteration": 2.4897735118865967 + }, + { + "auxiliary_loss_clip": 0.01115721, + "auxiliary_loss_mlp": 0.01041383, + "balance_loss_clip": 1.04502559, + "balance_loss_mlp": 1.02800131, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.5420293332210755, + "language_loss": 0.74667543, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76824641, + "num_input_tokens_seen": 136232700, + "step": 6348, + "time_per_iteration": 2.489745616912842 + }, + { + "auxiliary_loss_clip": 0.01100603, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.04628968, + "balance_loss_mlp": 1.01766753, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 2.118601453882471, + "language_loss": 0.87073922, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89205962, + "num_input_tokens_seen": 136248975, + "step": 6349, + "time_per_iteration": 2.5078890323638916 + }, + { + "auxiliary_loss_clip": 0.01113396, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.04738009, + "balance_loss_mlp": 1.02531528, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 1.7373272629388106, + "language_loss": 0.76868415, + "learning_rate": 2.836333449345341e-06, + "loss": 0.79021132, + "num_input_tokens_seen": 136266710, + "step": 6350, + "time_per_iteration": 2.4785208702087402 + }, + { + "auxiliary_loss_clip": 0.01092793, + "auxiliary_loss_mlp": 0.0103011, + "balance_loss_clip": 1.04526901, + "balance_loss_mlp": 1.01451707, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.006814262499669, + "language_loss": 0.75801581, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.77924484, + "num_input_tokens_seen": 136284445, + "step": 6351, + "time_per_iteration": 2.5159828662872314 + }, + { + "auxiliary_loss_clip": 0.01114761, + "auxiliary_loss_mlp": 0.01040675, + "balance_loss_clip": 1.04397559, + "balance_loss_mlp": 1.02534437, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.6594008193698917, + "language_loss": 0.74168122, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76323557, + "num_input_tokens_seen": 136305730, + "step": 6352, + "time_per_iteration": 2.568828582763672 + }, + { + "auxiliary_loss_clip": 0.0109161, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.04612172, + "balance_loss_mlp": 1.02064633, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.8408255488969296, + "language_loss": 0.64298224, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66423845, + "num_input_tokens_seen": 136323850, + "step": 6353, + "time_per_iteration": 2.56478214263916 + }, + { + "auxiliary_loss_clip": 0.01125881, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.0450635, + "balance_loss_mlp": 1.02084827, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 1.5482921108665268, + "language_loss": 0.82974893, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85135329, + "num_input_tokens_seen": 136344880, + "step": 6354, + "time_per_iteration": 2.4897360801696777 + }, + { + "auxiliary_loss_clip": 0.01126808, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.04720318, + "balance_loss_mlp": 1.01911259, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.620803795496377, + "language_loss": 0.80179405, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82338935, + "num_input_tokens_seen": 136366060, + "step": 6355, + "time_per_iteration": 2.47265362739563 + }, + { + "auxiliary_loss_clip": 0.01085382, + "auxiliary_loss_mlp": 0.01042097, + "balance_loss_clip": 1.04699123, + "balance_loss_mlp": 1.02747035, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.7256294128417997, + "language_loss": 0.75121212, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77248693, + "num_input_tokens_seen": 136385625, + "step": 6356, + "time_per_iteration": 2.579561471939087 + }, + { + "auxiliary_loss_clip": 0.01117539, + "auxiliary_loss_mlp": 0.00792094, + "balance_loss_clip": 1.0462625, + "balance_loss_mlp": 1.01497245, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 1.8699244707279434, + "language_loss": 0.81232989, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83142626, + "num_input_tokens_seen": 136405750, + "step": 6357, + "time_per_iteration": 2.527331829071045 + }, + { + "auxiliary_loss_clip": 0.01110157, + "auxiliary_loss_mlp": 0.01041609, + "balance_loss_clip": 1.0498569, + "balance_loss_mlp": 1.02583718, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 1.7561602764412787, + "language_loss": 0.77302456, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.79454225, + "num_input_tokens_seen": 136426085, + "step": 6358, + "time_per_iteration": 2.530564546585083 + }, + { + "auxiliary_loss_clip": 0.01107864, + "auxiliary_loss_mlp": 0.01039992, + "balance_loss_clip": 1.04442024, + "balance_loss_mlp": 1.02488852, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.182025269785297, + "language_loss": 0.78510314, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.80658174, + "num_input_tokens_seen": 136442670, + "step": 6359, + "time_per_iteration": 2.4887638092041016 + }, + { + "auxiliary_loss_clip": 0.01063318, + "auxiliary_loss_mlp": 0.01046756, + "balance_loss_clip": 1.0430212, + "balance_loss_mlp": 1.02968562, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 1.7844643331183885, + "language_loss": 0.69828594, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.71938664, + "num_input_tokens_seen": 136465730, + "step": 6360, + "time_per_iteration": 2.8973679542541504 + }, + { + "auxiliary_loss_clip": 0.01097921, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.04556656, + "balance_loss_mlp": 1.0192461, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.5966238664745025, + "language_loss": 0.79015481, + "learning_rate": 2.83244000399261e-06, + "loss": 0.81148088, + "num_input_tokens_seen": 136487215, + "step": 6361, + "time_per_iteration": 2.568021059036255 + }, + { + "auxiliary_loss_clip": 0.01103455, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.04402626, + "balance_loss_mlp": 1.02366877, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.5130039546850633, + "language_loss": 0.65356135, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67497158, + "num_input_tokens_seen": 136510365, + "step": 6362, + "time_per_iteration": 2.700993061065674 + }, + { + "auxiliary_loss_clip": 0.01126461, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.04516792, + "balance_loss_mlp": 1.01991487, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 1.7438183058183454, + "language_loss": 0.81665468, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.83827907, + "num_input_tokens_seen": 136527100, + "step": 6363, + "time_per_iteration": 2.436859607696533 + }, + { + "auxiliary_loss_clip": 0.01073779, + "auxiliary_loss_mlp": 0.01041921, + "balance_loss_clip": 1.04942608, + "balance_loss_mlp": 1.02688277, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.6738996421412173, + "language_loss": 0.58484823, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60600525, + "num_input_tokens_seen": 136550870, + "step": 6364, + "time_per_iteration": 2.7961654663085938 + }, + { + "auxiliary_loss_clip": 0.01110479, + "auxiliary_loss_mlp": 0.01037295, + "balance_loss_clip": 1.04545116, + "balance_loss_mlp": 1.02228069, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 1.7900467123771298, + "language_loss": 0.68722427, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.70870203, + "num_input_tokens_seen": 136569895, + "step": 6365, + "time_per_iteration": 2.5532028675079346 + }, + { + "auxiliary_loss_clip": 0.01113518, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.0451206, + "balance_loss_mlp": 1.02028513, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 1.8114283439105476, + "language_loss": 0.7350021, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75649679, + "num_input_tokens_seen": 136588585, + "step": 6366, + "time_per_iteration": 2.488314628601074 + }, + { + "auxiliary_loss_clip": 0.01107329, + "auxiliary_loss_mlp": 0.01038246, + "balance_loss_clip": 1.04560494, + "balance_loss_mlp": 1.02299929, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.2641980514583686, + "language_loss": 0.6836859, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70514166, + "num_input_tokens_seen": 136606640, + "step": 6367, + "time_per_iteration": 2.571965217590332 + }, + { + "auxiliary_loss_clip": 0.0111666, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.04687929, + "balance_loss_mlp": 1.01847494, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 1.9184713907149546, + "language_loss": 0.64226711, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66377217, + "num_input_tokens_seen": 136624940, + "step": 6368, + "time_per_iteration": 3.9741387367248535 + }, + { + "auxiliary_loss_clip": 0.01128962, + "auxiliary_loss_mlp": 0.0103615, + "balance_loss_clip": 1.04693556, + "balance_loss_mlp": 1.02070045, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.3524652263270613, + "language_loss": 0.68305594, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70470709, + "num_input_tokens_seen": 136645540, + "step": 6369, + "time_per_iteration": 2.531676769256592 + }, + { + "auxiliary_loss_clip": 0.01078835, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_clip": 1.04402494, + "balance_loss_mlp": 1.02827251, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.8410065133850892, + "language_loss": 0.786807, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80803442, + "num_input_tokens_seen": 136664530, + "step": 6370, + "time_per_iteration": 2.574394464492798 + }, + { + "auxiliary_loss_clip": 0.01112429, + "auxiliary_loss_mlp": 0.01046485, + "balance_loss_clip": 1.04536867, + "balance_loss_mlp": 1.03005815, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.6794026258987405, + "language_loss": 0.6500808, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.6716699, + "num_input_tokens_seen": 136682315, + "step": 6371, + "time_per_iteration": 3.9441590309143066 + }, + { + "auxiliary_loss_clip": 0.01100924, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.04605496, + "balance_loss_mlp": 1.0216012, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 2.1885725187002927, + "language_loss": 0.72838646, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.74977171, + "num_input_tokens_seen": 136701185, + "step": 6372, + "time_per_iteration": 3.989328622817993 + }, + { + "auxiliary_loss_clip": 0.01119035, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.04696989, + "balance_loss_mlp": 1.01899862, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 2.0335961211755293, + "language_loss": 0.85167766, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.87320971, + "num_input_tokens_seen": 136721265, + "step": 6373, + "time_per_iteration": 2.516111135482788 + }, + { + "auxiliary_loss_clip": 0.01079705, + "auxiliary_loss_mlp": 0.01045943, + "balance_loss_clip": 1.04332924, + "balance_loss_mlp": 1.02956963, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 1.938711263273911, + "language_loss": 0.75175524, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.77301168, + "num_input_tokens_seen": 136741885, + "step": 6374, + "time_per_iteration": 2.6831583976745605 + }, + { + "auxiliary_loss_clip": 0.01128834, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.05461001, + "balance_loss_mlp": 1.02201617, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 2.2257315899505437, + "language_loss": 0.75724936, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.77890217, + "num_input_tokens_seen": 136760905, + "step": 6375, + "time_per_iteration": 3.9062957763671875 + }, + { + "auxiliary_loss_clip": 0.01117054, + "auxiliary_loss_mlp": 0.01038314, + "balance_loss_clip": 1.04728484, + "balance_loss_mlp": 1.02327561, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 1.8162715410607537, + "language_loss": 0.7216894, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.7432431, + "num_input_tokens_seen": 136777240, + "step": 6376, + "time_per_iteration": 2.446741819381714 + }, + { + "auxiliary_loss_clip": 0.01115881, + "auxiliary_loss_mlp": 0.01037744, + "balance_loss_clip": 1.04697669, + "balance_loss_mlp": 1.022282, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.5371758587663247, + "language_loss": 0.67886591, + "learning_rate": 2.826769997289796e-06, + "loss": 0.70040214, + "num_input_tokens_seen": 136801040, + "step": 6377, + "time_per_iteration": 2.5659170150756836 + }, + { + "auxiliary_loss_clip": 0.01100986, + "auxiliary_loss_mlp": 0.01041064, + "balance_loss_clip": 1.04872131, + "balance_loss_mlp": 1.02530479, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 2.746369077639069, + "language_loss": 0.73632789, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75774837, + "num_input_tokens_seen": 136819495, + "step": 6378, + "time_per_iteration": 2.5435080528259277 + }, + { + "auxiliary_loss_clip": 0.01082783, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.04783702, + "balance_loss_mlp": 1.01997805, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.708181224550276, + "language_loss": 0.69499195, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71616364, + "num_input_tokens_seen": 136838840, + "step": 6379, + "time_per_iteration": 2.649644613265991 + }, + { + "auxiliary_loss_clip": 0.01116496, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.04803908, + "balance_loss_mlp": 1.02224267, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 2.081648052641886, + "language_loss": 0.83040178, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85193801, + "num_input_tokens_seen": 136854425, + "step": 6380, + "time_per_iteration": 2.475816249847412 + }, + { + "auxiliary_loss_clip": 0.01126875, + "auxiliary_loss_mlp": 0.01034491, + "balance_loss_clip": 1.04814756, + "balance_loss_mlp": 1.02027488, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.5878223475344093, + "language_loss": 0.81285727, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83447093, + "num_input_tokens_seen": 136874355, + "step": 6381, + "time_per_iteration": 2.483790636062622 + }, + { + "auxiliary_loss_clip": 0.01058466, + "auxiliary_loss_mlp": 0.01004256, + "balance_loss_clip": 1.02911222, + "balance_loss_mlp": 1.00265872, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.7892914286643401, + "language_loss": 0.60402787, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62465501, + "num_input_tokens_seen": 136937475, + "step": 6382, + "time_per_iteration": 3.074282646179199 + }, + { + "auxiliary_loss_clip": 0.01128838, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.04534221, + "balance_loss_mlp": 1.01952386, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 2.4611504408501124, + "language_loss": 0.67183125, + "learning_rate": 2.824641672639794e-06, + "loss": 0.69346839, + "num_input_tokens_seen": 136955805, + "step": 6383, + "time_per_iteration": 2.5106618404388428 + }, + { + "auxiliary_loss_clip": 0.01093544, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.0441606, + "balance_loss_mlp": 1.02139533, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 1.6752188064510494, + "language_loss": 0.75306666, + "learning_rate": 2.824286842339587e-06, + "loss": 0.77436388, + "num_input_tokens_seen": 136975240, + "step": 6384, + "time_per_iteration": 2.5614356994628906 + }, + { + "auxiliary_loss_clip": 0.01111901, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.04854298, + "balance_loss_mlp": 1.02061248, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.3917112743237527, + "language_loss": 0.76320696, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78467476, + "num_input_tokens_seen": 136994985, + "step": 6385, + "time_per_iteration": 2.4885659217834473 + }, + { + "auxiliary_loss_clip": 0.01044903, + "auxiliary_loss_mlp": 0.01005726, + "balance_loss_clip": 1.02653658, + "balance_loss_mlp": 1.00408077, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9159290374671689, + "language_loss": 0.67059577, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69110203, + "num_input_tokens_seen": 137046290, + "step": 6386, + "time_per_iteration": 2.9929840564727783 + }, + { + "auxiliary_loss_clip": 0.01088404, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.04442811, + "balance_loss_mlp": 1.02351761, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.6690337183295485, + "language_loss": 0.72564745, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74690986, + "num_input_tokens_seen": 137064725, + "step": 6387, + "time_per_iteration": 2.5338666439056396 + }, + { + "auxiliary_loss_clip": 0.0112576, + "auxiliary_loss_mlp": 0.01038222, + "balance_loss_clip": 1.04753447, + "balance_loss_mlp": 1.0242269, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.9919553779210797, + "language_loss": 0.81110233, + "learning_rate": 2.822867208702932e-06, + "loss": 0.83274215, + "num_input_tokens_seen": 137086030, + "step": 6388, + "time_per_iteration": 2.531886100769043 + }, + { + "auxiliary_loss_clip": 0.01092788, + "auxiliary_loss_mlp": 0.01038434, + "balance_loss_clip": 1.04276872, + "balance_loss_mlp": 1.02529097, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.6872158100890784, + "language_loss": 0.75988734, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78119957, + "num_input_tokens_seen": 137105400, + "step": 6389, + "time_per_iteration": 2.5711817741394043 + }, + { + "auxiliary_loss_clip": 0.01106041, + "auxiliary_loss_mlp": 0.01049581, + "balance_loss_clip": 1.04854417, + "balance_loss_mlp": 1.03378606, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.621448520199094, + "language_loss": 0.76619875, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78775495, + "num_input_tokens_seen": 137124985, + "step": 6390, + "time_per_iteration": 2.528108596801758 + }, + { + "auxiliary_loss_clip": 0.01078395, + "auxiliary_loss_mlp": 0.0104493, + "balance_loss_clip": 1.04083443, + "balance_loss_mlp": 1.02973688, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.7095664629482443, + "language_loss": 0.69603896, + "learning_rate": 2.821802155794668e-06, + "loss": 0.71727222, + "num_input_tokens_seen": 137146745, + "step": 6391, + "time_per_iteration": 2.682647466659546 + }, + { + "auxiliary_loss_clip": 0.01115273, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.04442716, + "balance_loss_mlp": 1.02120423, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.7733292157718905, + "language_loss": 0.83981049, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86131859, + "num_input_tokens_seen": 137163195, + "step": 6392, + "time_per_iteration": 2.5066070556640625 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01040523, + "balance_loss_clip": 1.04268062, + "balance_loss_mlp": 1.02668846, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 2.3926563810540564, + "language_loss": 0.60886067, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63038236, + "num_input_tokens_seen": 137179330, + "step": 6393, + "time_per_iteration": 2.463325262069702 + }, + { + "auxiliary_loss_clip": 0.01098975, + "auxiliary_loss_mlp": 0.01033872, + "balance_loss_clip": 1.0439328, + "balance_loss_mlp": 1.01863742, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 1.7679377570466532, + "language_loss": 0.71011388, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73144233, + "num_input_tokens_seen": 137198655, + "step": 6394, + "time_per_iteration": 2.593858003616333 + }, + { + "auxiliary_loss_clip": 0.01116553, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.04613733, + "balance_loss_mlp": 1.01891172, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.85438184062275, + "language_loss": 0.8102147, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83172727, + "num_input_tokens_seen": 137217120, + "step": 6395, + "time_per_iteration": 2.5121679306030273 + }, + { + "auxiliary_loss_clip": 0.01125905, + "auxiliary_loss_mlp": 0.01043866, + "balance_loss_clip": 1.05387044, + "balance_loss_mlp": 1.02995396, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 1.9148805892792504, + "language_loss": 0.70467496, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.72637272, + "num_input_tokens_seen": 137234410, + "step": 6396, + "time_per_iteration": 2.460824966430664 + }, + { + "auxiliary_loss_clip": 0.01043402, + "auxiliary_loss_mlp": 0.01001416, + "balance_loss_clip": 1.02938461, + "balance_loss_mlp": 0.99967539, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8891862899202012, + "language_loss": 0.59627998, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61672819, + "num_input_tokens_seen": 137294940, + "step": 6397, + "time_per_iteration": 3.1644768714904785 + }, + { + "auxiliary_loss_clip": 0.01127763, + "auxiliary_loss_mlp": 0.0102608, + "balance_loss_clip": 1.04755533, + "balance_loss_mlp": 1.01174498, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 1.8033056431073977, + "language_loss": 0.84768987, + "learning_rate": 2.819315942271794e-06, + "loss": 0.86922824, + "num_input_tokens_seen": 137315035, + "step": 6398, + "time_per_iteration": 2.5350446701049805 + }, + { + "auxiliary_loss_clip": 0.0112694, + "auxiliary_loss_mlp": 0.01028067, + "balance_loss_clip": 1.04688048, + "balance_loss_mlp": 1.01450109, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 1.9369153608860867, + "language_loss": 0.79882693, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.82037699, + "num_input_tokens_seen": 137333155, + "step": 6399, + "time_per_iteration": 2.450942277908325 + }, + { + "auxiliary_loss_clip": 0.0112822, + "auxiliary_loss_mlp": 0.0078985, + "balance_loss_clip": 1.04645252, + "balance_loss_mlp": 1.0103178, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.8294283859048401, + "language_loss": 0.67144096, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69062161, + "num_input_tokens_seen": 137351515, + "step": 6400, + "time_per_iteration": 2.456462860107422 + }, + { + "auxiliary_loss_clip": 0.0110828, + "auxiliary_loss_mlp": 0.01045962, + "balance_loss_clip": 1.04651487, + "balance_loss_mlp": 1.03120399, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.6795876721854746, + "language_loss": 0.73175955, + "learning_rate": 2.81824995589303e-06, + "loss": 0.75330198, + "num_input_tokens_seen": 137371255, + "step": 6401, + "time_per_iteration": 2.544548988342285 + }, + { + "auxiliary_loss_clip": 0.01092593, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.04587924, + "balance_loss_mlp": 1.02230573, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 2.447028324220019, + "language_loss": 0.72070253, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74199533, + "num_input_tokens_seen": 137388980, + "step": 6402, + "time_per_iteration": 2.540895462036133 + }, + { + "auxiliary_loss_clip": 0.01122974, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.04454374, + "balance_loss_mlp": 1.01748633, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.9230008282799038, + "language_loss": 0.82806671, + "learning_rate": 2.817539143144128e-06, + "loss": 0.84960818, + "num_input_tokens_seen": 137406885, + "step": 6403, + "time_per_iteration": 2.4402687549591064 + }, + { + "auxiliary_loss_clip": 0.01072141, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.04378819, + "balance_loss_mlp": 1.02001894, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 1.9048316134222274, + "language_loss": 0.82962525, + "learning_rate": 2.817183690261189e-06, + "loss": 0.8506999, + "num_input_tokens_seen": 137425535, + "step": 6404, + "time_per_iteration": 2.5846331119537354 + }, + { + "auxiliary_loss_clip": 0.01106865, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.04806674, + "balance_loss_mlp": 1.01683903, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.6868844228486704, + "language_loss": 0.69556075, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71693677, + "num_input_tokens_seen": 137447700, + "step": 6405, + "time_per_iteration": 2.6025304794311523 + }, + { + "auxiliary_loss_clip": 0.01096874, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.04440904, + "balance_loss_mlp": 1.02181792, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 2.1498616708778346, + "language_loss": 0.79102367, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81234312, + "num_input_tokens_seen": 137462245, + "step": 6406, + "time_per_iteration": 2.505293607711792 + }, + { + "auxiliary_loss_clip": 0.01115212, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.04609013, + "balance_loss_mlp": 1.02141571, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.1188889876951467, + "language_loss": 0.83819681, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.85970819, + "num_input_tokens_seen": 137476455, + "step": 6407, + "time_per_iteration": 3.834929943084717 + }, + { + "auxiliary_loss_clip": 0.01046013, + "auxiliary_loss_mlp": 0.01009963, + "balance_loss_clip": 1.02709234, + "balance_loss_mlp": 1.00821102, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.8448543093117669, + "language_loss": 0.64932764, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66988742, + "num_input_tokens_seen": 137539845, + "step": 6408, + "time_per_iteration": 3.1531708240509033 + }, + { + "auxiliary_loss_clip": 0.01095217, + "auxiliary_loss_mlp": 0.01040754, + "balance_loss_clip": 1.04657722, + "balance_loss_mlp": 1.02563834, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.4713514769925826, + "language_loss": 0.7336688, + "learning_rate": 2.8154059613008e-06, + "loss": 0.75502855, + "num_input_tokens_seen": 137559880, + "step": 6409, + "time_per_iteration": 2.555018663406372 + }, + { + "auxiliary_loss_clip": 0.01084014, + "auxiliary_loss_mlp": 0.01042111, + "balance_loss_clip": 1.04662347, + "balance_loss_mlp": 1.02579093, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 3.1923623249760866, + "language_loss": 0.70293194, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72419322, + "num_input_tokens_seen": 137578225, + "step": 6410, + "time_per_iteration": 3.9310505390167236 + }, + { + "auxiliary_loss_clip": 0.01019263, + "auxiliary_loss_mlp": 0.00826258, + "balance_loss_clip": 1.03001952, + "balance_loss_mlp": 1.11101174, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.697355969802029, + "language_loss": 0.60291171, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62136698, + "num_input_tokens_seen": 137645770, + "step": 6411, + "time_per_iteration": 4.675171136856079 + }, + { + "auxiliary_loss_clip": 0.0108134, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.04053092, + "balance_loss_mlp": 1.01733541, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 2.634265357709159, + "language_loss": 0.77659267, + "learning_rate": 2.814338952773397e-06, + "loss": 0.7977165, + "num_input_tokens_seen": 137664090, + "step": 6412, + "time_per_iteration": 2.5629477500915527 + }, + { + "auxiliary_loss_clip": 0.01090303, + "auxiliary_loss_mlp": 0.01033987, + "balance_loss_clip": 1.04231524, + "balance_loss_mlp": 1.0174526, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.8064533287691518, + "language_loss": 0.78217995, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80342281, + "num_input_tokens_seen": 137683190, + "step": 6413, + "time_per_iteration": 3.9412789344787598 + }, + { + "auxiliary_loss_clip": 0.01054894, + "auxiliary_loss_mlp": 0.01005119, + "balance_loss_clip": 1.02594435, + "balance_loss_mlp": 1.00322402, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8091090617658498, + "language_loss": 0.61364746, + "learning_rate": 2.813627459333576e-06, + "loss": 0.6342476, + "num_input_tokens_seen": 137737315, + "step": 6414, + "time_per_iteration": 2.9110121726989746 + }, + { + "auxiliary_loss_clip": 0.01093371, + "auxiliary_loss_mlp": 0.01041406, + "balance_loss_clip": 1.04798675, + "balance_loss_mlp": 1.02699387, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.050018804744751, + "language_loss": 0.77890933, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.80025709, + "num_input_tokens_seen": 137753535, + "step": 6415, + "time_per_iteration": 2.563208818435669 + }, + { + "auxiliary_loss_clip": 0.01099374, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.04486275, + "balance_loss_mlp": 1.01790822, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.6752205860148979, + "language_loss": 0.79720467, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.8185035, + "num_input_tokens_seen": 137773405, + "step": 6416, + "time_per_iteration": 2.5670573711395264 + }, + { + "auxiliary_loss_clip": 0.01117372, + "auxiliary_loss_mlp": 0.00826688, + "balance_loss_clip": 1.04488778, + "balance_loss_mlp": 1.08185816, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 2.621507438099194, + "language_loss": 0.79025185, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.80969238, + "num_input_tokens_seen": 137790810, + "step": 6417, + "time_per_iteration": 2.495215892791748 + }, + { + "auxiliary_loss_clip": 0.01095364, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.04224634, + "balance_loss_mlp": 1.02094722, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 2.8377962582679923, + "language_loss": 0.80012798, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.8214227, + "num_input_tokens_seen": 137810265, + "step": 6418, + "time_per_iteration": 2.5195884704589844 + }, + { + "auxiliary_loss_clip": 0.01094017, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.04240453, + "balance_loss_mlp": 1.01675534, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 1.891950119925469, + "language_loss": 0.79742724, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81866336, + "num_input_tokens_seen": 137828580, + "step": 6419, + "time_per_iteration": 2.5282838344573975 + }, + { + "auxiliary_loss_clip": 0.01099853, + "auxiliary_loss_mlp": 0.01030153, + "balance_loss_clip": 1.04338276, + "balance_loss_mlp": 1.01533544, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 5.137815322181607, + "language_loss": 0.67619896, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69749904, + "num_input_tokens_seen": 137846145, + "step": 6420, + "time_per_iteration": 2.5504016876220703 + }, + { + "auxiliary_loss_clip": 0.01080617, + "auxiliary_loss_mlp": 0.01040635, + "balance_loss_clip": 1.0442965, + "balance_loss_mlp": 1.02607918, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 1.8462888562357171, + "language_loss": 0.80995136, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83116388, + "num_input_tokens_seen": 137863705, + "step": 6421, + "time_per_iteration": 2.540515422821045 + }, + { + "auxiliary_loss_clip": 0.01097788, + "auxiliary_loss_mlp": 0.01035289, + "balance_loss_clip": 1.04285395, + "balance_loss_mlp": 1.02117455, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.2504239316724295, + "language_loss": 0.72204518, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74337596, + "num_input_tokens_seen": 137880285, + "step": 6422, + "time_per_iteration": 2.4913759231567383 + }, + { + "auxiliary_loss_clip": 0.01097026, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.04500628, + "balance_loss_mlp": 1.02296257, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.8788961206036923, + "language_loss": 0.667135, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.68846631, + "num_input_tokens_seen": 137898335, + "step": 6423, + "time_per_iteration": 2.5088589191436768 + }, + { + "auxiliary_loss_clip": 0.01118285, + "auxiliary_loss_mlp": 0.01039412, + "balance_loss_clip": 1.04683709, + "balance_loss_mlp": 1.02602458, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 2.3321006399855415, + "language_loss": 0.68705982, + "learning_rate": 2.810068143123449e-06, + "loss": 0.70863676, + "num_input_tokens_seen": 137918605, + "step": 6424, + "time_per_iteration": 2.607577085494995 + }, + { + "auxiliary_loss_clip": 0.01091057, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.04500055, + "balance_loss_mlp": 1.02098525, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.3686456970652345, + "language_loss": 0.72535127, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74661058, + "num_input_tokens_seen": 137938245, + "step": 6425, + "time_per_iteration": 2.5879650115966797 + }, + { + "auxiliary_loss_clip": 0.01092373, + "auxiliary_loss_mlp": 0.00843651, + "balance_loss_clip": 1.04210281, + "balance_loss_mlp": 1.10785317, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.3596081162857896, + "language_loss": 0.80769169, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.827052, + "num_input_tokens_seen": 137956770, + "step": 6426, + "time_per_iteration": 2.589916229248047 + }, + { + "auxiliary_loss_clip": 0.01116973, + "auxiliary_loss_mlp": 0.01032919, + "balance_loss_clip": 1.04607701, + "balance_loss_mlp": 1.01857805, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 2.475049776675903, + "language_loss": 0.74423909, + "learning_rate": 2.80899974864781e-06, + "loss": 0.76573795, + "num_input_tokens_seen": 137977040, + "step": 6427, + "time_per_iteration": 2.530529022216797 + }, + { + "auxiliary_loss_clip": 0.01069416, + "auxiliary_loss_mlp": 0.01046206, + "balance_loss_clip": 1.04116869, + "balance_loss_mlp": 1.02990997, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 2.607437988584307, + "language_loss": 0.70625383, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.72741008, + "num_input_tokens_seen": 137993545, + "step": 6428, + "time_per_iteration": 2.5679309368133545 + }, + { + "auxiliary_loss_clip": 0.01106689, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.04696202, + "balance_loss_mlp": 1.03084278, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.0686906074719342, + "language_loss": 0.84260613, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86411899, + "num_input_tokens_seen": 138010140, + "step": 6429, + "time_per_iteration": 2.509223222732544 + }, + { + "auxiliary_loss_clip": 0.01103519, + "auxiliary_loss_mlp": 0.01032536, + "balance_loss_clip": 1.04334426, + "balance_loss_mlp": 1.01857638, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 1.8696135737964616, + "language_loss": 0.80969548, + "learning_rate": 2.807931078076015e-06, + "loss": 0.83105612, + "num_input_tokens_seen": 138028880, + "step": 6430, + "time_per_iteration": 2.526221513748169 + }, + { + "auxiliary_loss_clip": 0.01020528, + "auxiliary_loss_mlp": 0.01006412, + "balance_loss_clip": 1.02100384, + "balance_loss_mlp": 1.00470734, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7272513900087025, + "language_loss": 0.58815855, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60842794, + "num_input_tokens_seen": 138098090, + "step": 6431, + "time_per_iteration": 3.213054895401001 + }, + { + "auxiliary_loss_clip": 0.01074426, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.03980923, + "balance_loss_mlp": 1.02006245, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.8270629578559334, + "language_loss": 0.79114032, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81224471, + "num_input_tokens_seen": 138114735, + "step": 6432, + "time_per_iteration": 2.595115900039673 + }, + { + "auxiliary_loss_clip": 0.01114785, + "auxiliary_loss_mlp": 0.01044955, + "balance_loss_clip": 1.04202831, + "balance_loss_mlp": 1.02947009, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 2.149948489889016, + "language_loss": 0.80768776, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82928514, + "num_input_tokens_seen": 138130480, + "step": 6433, + "time_per_iteration": 2.4791829586029053 + }, + { + "auxiliary_loss_clip": 0.01100618, + "auxiliary_loss_mlp": 0.01034886, + "balance_loss_clip": 1.04633021, + "balance_loss_mlp": 1.01981807, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.5905732435704893, + "language_loss": 0.70676589, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72812092, + "num_input_tokens_seen": 138150640, + "step": 6434, + "time_per_iteration": 2.5497403144836426 + }, + { + "auxiliary_loss_clip": 0.01089502, + "auxiliary_loss_mlp": 0.0104579, + "balance_loss_clip": 1.04444432, + "balance_loss_mlp": 1.02862418, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 1.6536891542224736, + "language_loss": 0.77389848, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79525143, + "num_input_tokens_seen": 138169700, + "step": 6435, + "time_per_iteration": 2.534238815307617 + }, + { + "auxiliary_loss_clip": 0.01110931, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.04329062, + "balance_loss_mlp": 1.01800823, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.7212729875705899, + "language_loss": 0.7963196, + "learning_rate": 2.805792910102915e-06, + "loss": 0.81775248, + "num_input_tokens_seen": 138185835, + "step": 6436, + "time_per_iteration": 2.5122249126434326 + }, + { + "auxiliary_loss_clip": 0.01096522, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.04478908, + "balance_loss_mlp": 1.01764178, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.6479397156938658, + "language_loss": 0.76463914, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.785918, + "num_input_tokens_seen": 138204080, + "step": 6437, + "time_per_iteration": 2.5441629886627197 + }, + { + "auxiliary_loss_clip": 0.01104473, + "auxiliary_loss_mlp": 0.01038849, + "balance_loss_clip": 1.04617572, + "balance_loss_mlp": 1.02572989, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.319382720355043, + "language_loss": 0.81308693, + "learning_rate": 2.805079942855074e-06, + "loss": 0.83452016, + "num_input_tokens_seen": 138220710, + "step": 6438, + "time_per_iteration": 2.500417709350586 + }, + { + "auxiliary_loss_clip": 0.01103677, + "auxiliary_loss_mlp": 0.00825739, + "balance_loss_clip": 1.04211783, + "balance_loss_mlp": 1.07441044, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.3674720515988905, + "language_loss": 0.75226122, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77155542, + "num_input_tokens_seen": 138241720, + "step": 6439, + "time_per_iteration": 2.528979539871216 + }, + { + "auxiliary_loss_clip": 0.01122976, + "auxiliary_loss_mlp": 0.01029407, + "balance_loss_clip": 1.04589605, + "balance_loss_mlp": 1.01575136, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.5085068929948482, + "language_loss": 0.73732185, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.75884569, + "num_input_tokens_seen": 138261885, + "step": 6440, + "time_per_iteration": 2.481387138366699 + }, + { + "auxiliary_loss_clip": 0.01116932, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.04523826, + "balance_loss_mlp": 1.02127779, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 2.008735136208327, + "language_loss": 0.82141852, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84294826, + "num_input_tokens_seen": 138280255, + "step": 6441, + "time_per_iteration": 2.470254898071289 + }, + { + "auxiliary_loss_clip": 0.01127464, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.04701149, + "balance_loss_mlp": 1.0272851, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.4995264736687712, + "language_loss": 0.81375575, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83544505, + "num_input_tokens_seen": 138296675, + "step": 6442, + "time_per_iteration": 2.4487905502319336 + }, + { + "auxiliary_loss_clip": 0.0108562, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.04730308, + "balance_loss_mlp": 1.0205555, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.6953770233927317, + "language_loss": 0.83627522, + "learning_rate": 2.803296990719624e-06, + "loss": 0.85748446, + "num_input_tokens_seen": 138314985, + "step": 6443, + "time_per_iteration": 2.5268115997314453 + }, + { + "auxiliary_loss_clip": 0.01025715, + "auxiliary_loss_mlp": 0.01002132, + "balance_loss_clip": 1.01959956, + "balance_loss_mlp": 1.00057018, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.764860946417262, + "language_loss": 0.50276268, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52304113, + "num_input_tokens_seen": 138373275, + "step": 6444, + "time_per_iteration": 3.137730836868286 + }, + { + "auxiliary_loss_clip": 0.01081012, + "auxiliary_loss_mlp": 0.00815765, + "balance_loss_clip": 1.04089999, + "balance_loss_mlp": 1.05555105, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.7533818152183194, + "language_loss": 0.78514814, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80411595, + "num_input_tokens_seen": 138391145, + "step": 6445, + "time_per_iteration": 2.519932985305786 + }, + { + "auxiliary_loss_clip": 0.01108652, + "auxiliary_loss_mlp": 0.01036438, + "balance_loss_clip": 1.04722869, + "balance_loss_mlp": 1.02196622, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 2.1323036684837424, + "language_loss": 0.81059605, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83204699, + "num_input_tokens_seen": 138409875, + "step": 6446, + "time_per_iteration": 3.885542631149292 + }, + { + "auxiliary_loss_clip": 0.01101309, + "auxiliary_loss_mlp": 0.01037506, + "balance_loss_clip": 1.04401898, + "balance_loss_mlp": 1.02390981, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.7723757773128392, + "language_loss": 0.77116525, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79255342, + "num_input_tokens_seen": 138428965, + "step": 6447, + "time_per_iteration": 2.538090705871582 + }, + { + "auxiliary_loss_clip": 0.01102183, + "auxiliary_loss_mlp": 0.01029247, + "balance_loss_clip": 1.04315066, + "balance_loss_mlp": 1.01612806, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.5046075134640446, + "language_loss": 0.76064706, + "learning_rate": 2.801513277056671e-06, + "loss": 0.78196132, + "num_input_tokens_seen": 138448090, + "step": 6448, + "time_per_iteration": 3.904160499572754 + }, + { + "auxiliary_loss_clip": 0.01095362, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.0449307, + "balance_loss_mlp": 1.02147222, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.903983117026687, + "language_loss": 0.76179063, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.78310078, + "num_input_tokens_seen": 138466105, + "step": 6449, + "time_per_iteration": 3.9452857971191406 + }, + { + "auxiliary_loss_clip": 0.0109104, + "auxiliary_loss_mlp": 0.00800594, + "balance_loss_clip": 1.04140019, + "balance_loss_mlp": 1.02999365, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.7083311531223624, + "language_loss": 0.78086996, + "learning_rate": 2.800799578742542e-06, + "loss": 0.79978633, + "num_input_tokens_seen": 138485160, + "step": 6450, + "time_per_iteration": 2.5815467834472656 + }, + { + "auxiliary_loss_clip": 0.01128808, + "auxiliary_loss_mlp": 0.01035254, + "balance_loss_clip": 1.04447317, + "balance_loss_mlp": 1.02072239, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 3.0040861983745573, + "language_loss": 0.78094685, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.80258751, + "num_input_tokens_seen": 138504135, + "step": 6451, + "time_per_iteration": 2.5011773109436035 + }, + { + "auxiliary_loss_clip": 0.01120951, + "auxiliary_loss_mlp": 0.01026163, + "balance_loss_clip": 1.0443331, + "balance_loss_mlp": 1.01289463, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.6754834816634068, + "language_loss": 0.76596618, + "learning_rate": 2.800085758962812e-06, + "loss": 0.78743738, + "num_input_tokens_seen": 138523955, + "step": 6452, + "time_per_iteration": 3.8513684272766113 + }, + { + "auxiliary_loss_clip": 0.01097488, + "auxiliary_loss_mlp": 0.01042866, + "balance_loss_clip": 1.04607129, + "balance_loss_mlp": 1.02930605, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.5418332183774686, + "language_loss": 0.79779148, + "learning_rate": 2.799728803557182e-06, + "loss": 0.81919497, + "num_input_tokens_seen": 138541655, + "step": 6453, + "time_per_iteration": 2.503101348876953 + }, + { + "auxiliary_loss_clip": 0.01123483, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.04784894, + "balance_loss_mlp": 1.02246666, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 1.8613909243975562, + "language_loss": 0.71524799, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73685241, + "num_input_tokens_seen": 138560860, + "step": 6454, + "time_per_iteration": 2.5138587951660156 + }, + { + "auxiliary_loss_clip": 0.01128282, + "auxiliary_loss_mlp": 0.01035994, + "balance_loss_clip": 1.04551542, + "balance_loss_mlp": 1.0214262, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 1.735057366790028, + "language_loss": 0.77912402, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.80076677, + "num_input_tokens_seen": 138580200, + "step": 6455, + "time_per_iteration": 2.4472837448120117 + }, + { + "auxiliary_loss_clip": 0.01124393, + "auxiliary_loss_mlp": 0.01034312, + "balance_loss_clip": 1.04622817, + "balance_loss_mlp": 1.02020931, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.4889744514211738, + "language_loss": 0.75513232, + "learning_rate": 2.798657755439662e-06, + "loss": 0.77671939, + "num_input_tokens_seen": 138598315, + "step": 6456, + "time_per_iteration": 2.4791674613952637 + }, + { + "auxiliary_loss_clip": 0.01056443, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.04675364, + "balance_loss_mlp": 1.02109766, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.213673571912731, + "language_loss": 0.60642117, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.6273424, + "num_input_tokens_seen": 138615695, + "step": 6457, + "time_per_iteration": 2.609815835952759 + }, + { + "auxiliary_loss_clip": 0.01127587, + "auxiliary_loss_mlp": 0.01036381, + "balance_loss_clip": 1.04558468, + "balance_loss_mlp": 1.0210743, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 2.1542281506287564, + "language_loss": 0.80208737, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82372701, + "num_input_tokens_seen": 138633180, + "step": 6458, + "time_per_iteration": 2.4734554290771484 + }, + { + "auxiliary_loss_clip": 0.01077192, + "auxiliary_loss_mlp": 0.01040871, + "balance_loss_clip": 1.04418659, + "balance_loss_mlp": 1.02476537, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.7597646954227897, + "language_loss": 0.81342506, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83460569, + "num_input_tokens_seen": 138654785, + "step": 6459, + "time_per_iteration": 2.635648727416992 + }, + { + "auxiliary_loss_clip": 0.01100715, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.04548264, + "balance_loss_mlp": 1.02216172, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 2.160878612451965, + "language_loss": 0.61459649, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63595963, + "num_input_tokens_seen": 138673330, + "step": 6460, + "time_per_iteration": 2.5126261711120605 + }, + { + "auxiliary_loss_clip": 0.01112293, + "auxiliary_loss_mlp": 0.01029407, + "balance_loss_clip": 1.04562688, + "balance_loss_mlp": 1.01717639, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.7094844631207189, + "language_loss": 0.85987031, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88128734, + "num_input_tokens_seen": 138694185, + "step": 6461, + "time_per_iteration": 2.5585741996765137 + }, + { + "auxiliary_loss_clip": 0.01111934, + "auxiliary_loss_mlp": 0.0103606, + "balance_loss_clip": 1.04636383, + "balance_loss_mlp": 1.02229702, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 3.1576709434995545, + "language_loss": 0.71633315, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.737813, + "num_input_tokens_seen": 138714625, + "step": 6462, + "time_per_iteration": 2.546323776245117 + }, + { + "auxiliary_loss_clip": 0.01082003, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.04122937, + "balance_loss_mlp": 1.02346683, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 2.3346676739206433, + "language_loss": 0.76121771, + "learning_rate": 2.796157583816052e-06, + "loss": 0.78242898, + "num_input_tokens_seen": 138733585, + "step": 6463, + "time_per_iteration": 2.6059985160827637 + }, + { + "auxiliary_loss_clip": 0.01095339, + "auxiliary_loss_mlp": 0.01040928, + "balance_loss_clip": 1.04681075, + "balance_loss_mlp": 1.02487063, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 1.9682475406331104, + "language_loss": 0.7051025, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72646517, + "num_input_tokens_seen": 138752335, + "step": 6464, + "time_per_iteration": 2.5421345233917236 + }, + { + "auxiliary_loss_clip": 0.01101926, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.04567432, + "balance_loss_mlp": 1.01695502, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 1.9679199729978532, + "language_loss": 0.69853705, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.719872, + "num_input_tokens_seen": 138768450, + "step": 6465, + "time_per_iteration": 2.568559408187866 + }, + { + "auxiliary_loss_clip": 0.01092239, + "auxiliary_loss_mlp": 0.01040036, + "balance_loss_clip": 1.04632998, + "balance_loss_mlp": 1.02490854, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.332718092490034, + "language_loss": 0.77926546, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80058819, + "num_input_tokens_seen": 138786775, + "step": 6466, + "time_per_iteration": 2.560119152069092 + }, + { + "auxiliary_loss_clip": 0.01089494, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.04366446, + "balance_loss_mlp": 1.02291858, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.4453514793539861, + "language_loss": 0.6945613, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71583128, + "num_input_tokens_seen": 138810100, + "step": 6467, + "time_per_iteration": 2.630826711654663 + }, + { + "auxiliary_loss_clip": 0.01090336, + "auxiliary_loss_mlp": 0.01041421, + "balance_loss_clip": 1.04450011, + "balance_loss_mlp": 1.0263114, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.5547947742705253, + "language_loss": 0.83851385, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85983139, + "num_input_tokens_seen": 138825140, + "step": 6468, + "time_per_iteration": 2.523120164871216 + }, + { + "auxiliary_loss_clip": 0.01108043, + "auxiliary_loss_mlp": 0.01033603, + "balance_loss_clip": 1.04663968, + "balance_loss_mlp": 1.02063346, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 1.774128131425595, + "language_loss": 0.8424986, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86391509, + "num_input_tokens_seen": 138844115, + "step": 6469, + "time_per_iteration": 2.6161844730377197 + }, + { + "auxiliary_loss_clip": 0.01087838, + "auxiliary_loss_mlp": 0.01040393, + "balance_loss_clip": 1.04486704, + "balance_loss_mlp": 1.02450871, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.9106429427235614, + "language_loss": 0.74792325, + "learning_rate": 2.793655932864273e-06, + "loss": 0.76920557, + "num_input_tokens_seen": 138860860, + "step": 6470, + "time_per_iteration": 2.5828757286071777 + }, + { + "auxiliary_loss_clip": 0.01088232, + "auxiliary_loss_mlp": 0.00804018, + "balance_loss_clip": 1.04550672, + "balance_loss_mlp": 1.0370028, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.6761743616331546, + "language_loss": 0.74701369, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.7659362, + "num_input_tokens_seen": 138881910, + "step": 6471, + "time_per_iteration": 2.634056806564331 + }, + { + "auxiliary_loss_clip": 0.01078258, + "auxiliary_loss_mlp": 0.01042269, + "balance_loss_clip": 1.04491842, + "balance_loss_mlp": 1.02689099, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 1.6695312146650534, + "language_loss": 0.67775333, + "learning_rate": 2.792940904386562e-06, + "loss": 0.69895864, + "num_input_tokens_seen": 138900975, + "step": 6472, + "time_per_iteration": 2.594273090362549 + }, + { + "auxiliary_loss_clip": 0.01103368, + "auxiliary_loss_mlp": 0.01040574, + "balance_loss_clip": 1.05138922, + "balance_loss_mlp": 1.02698433, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.747807806089283, + "language_loss": 0.7630446, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.78448403, + "num_input_tokens_seen": 138920795, + "step": 6473, + "time_per_iteration": 2.615492343902588 + }, + { + "auxiliary_loss_clip": 0.01109847, + "auxiliary_loss_mlp": 0.01039906, + "balance_loss_clip": 1.05172789, + "balance_loss_mlp": 1.02495754, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 2.426244317201221, + "language_loss": 0.70520478, + "learning_rate": 2.792225755635257e-06, + "loss": 0.72670233, + "num_input_tokens_seen": 138938770, + "step": 6474, + "time_per_iteration": 2.514801025390625 + }, + { + "auxiliary_loss_clip": 0.01128446, + "auxiliary_loss_mlp": 0.01038883, + "balance_loss_clip": 1.04740047, + "balance_loss_mlp": 1.0251137, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.5106461438317134, + "language_loss": 0.68851876, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.71019208, + "num_input_tokens_seen": 138958880, + "step": 6475, + "time_per_iteration": 2.4920084476470947 + }, + { + "auxiliary_loss_clip": 0.01107094, + "auxiliary_loss_mlp": 0.01052668, + "balance_loss_clip": 1.04650784, + "balance_loss_mlp": 1.03551376, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 1.8907976078309567, + "language_loss": 0.75904459, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78064221, + "num_input_tokens_seen": 138977240, + "step": 6476, + "time_per_iteration": 2.5170702934265137 + }, + { + "auxiliary_loss_clip": 0.01043044, + "auxiliary_loss_mlp": 0.01003089, + "balance_loss_clip": 1.0333693, + "balance_loss_mlp": 1.00152743, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.7722506237719168, + "language_loss": 0.58183157, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.6022929, + "num_input_tokens_seen": 139039035, + "step": 6477, + "time_per_iteration": 3.160775661468506 + }, + { + "auxiliary_loss_clip": 0.01089329, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.04946077, + "balance_loss_mlp": 1.0181129, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 1.752639916581999, + "language_loss": 0.78310955, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80434537, + "num_input_tokens_seen": 139055560, + "step": 6478, + "time_per_iteration": 2.5590474605560303 + }, + { + "auxiliary_loss_clip": 0.01113235, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.04565907, + "balance_loss_mlp": 1.01938057, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 2.302267351062905, + "language_loss": 0.82982665, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.85129672, + "num_input_tokens_seen": 139071865, + "step": 6479, + "time_per_iteration": 2.5025928020477295 + }, + { + "auxiliary_loss_clip": 0.01129081, + "auxiliary_loss_mlp": 0.01035035, + "balance_loss_clip": 1.04950202, + "balance_loss_mlp": 1.02003217, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.8531599597910755, + "language_loss": 0.80110466, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82274586, + "num_input_tokens_seen": 139089640, + "step": 6480, + "time_per_iteration": 2.4675283432006836 + }, + { + "auxiliary_loss_clip": 0.01101133, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.0458858, + "balance_loss_mlp": 1.01864314, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.695870955779693, + "language_loss": 0.82991642, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85125566, + "num_input_tokens_seen": 139109365, + "step": 6481, + "time_per_iteration": 2.549489974975586 + }, + { + "auxiliary_loss_clip": 0.01100599, + "auxiliary_loss_mlp": 0.01034277, + "balance_loss_clip": 1.04958677, + "balance_loss_mlp": 1.02066362, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.580801308547055, + "language_loss": 0.75310647, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77445531, + "num_input_tokens_seen": 139128260, + "step": 6482, + "time_per_iteration": 2.5298352241516113 + }, + { + "auxiliary_loss_clip": 0.01098993, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.04976058, + "balance_loss_mlp": 1.0219022, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 1.8238139297719058, + "language_loss": 0.79219306, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81354094, + "num_input_tokens_seen": 139147315, + "step": 6483, + "time_per_iteration": 2.596503973007202 + }, + { + "auxiliary_loss_clip": 0.0111604, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.04799199, + "balance_loss_mlp": 1.01479197, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.4533585735454098, + "language_loss": 0.80041236, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82187569, + "num_input_tokens_seen": 139167270, + "step": 6484, + "time_per_iteration": 2.5955512523651123 + }, + { + "auxiliary_loss_clip": 0.01112282, + "auxiliary_loss_mlp": 0.01057606, + "balance_loss_clip": 1.04828477, + "balance_loss_mlp": 1.04079747, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.6103396017506837, + "language_loss": 0.77548611, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.79718494, + "num_input_tokens_seen": 139185970, + "step": 6485, + "time_per_iteration": 3.8658671379089355 + }, + { + "auxiliary_loss_clip": 0.01086311, + "auxiliary_loss_mlp": 0.01038872, + "balance_loss_clip": 1.0478512, + "balance_loss_mlp": 1.02360737, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 2.2792556806055546, + "language_loss": 0.84804285, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.86929464, + "num_input_tokens_seen": 139203730, + "step": 6486, + "time_per_iteration": 2.63047194480896 + }, + { + "auxiliary_loss_clip": 0.01113583, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.04821718, + "balance_loss_mlp": 1.02112556, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 2.43840120546838, + "language_loss": 0.85338712, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87487757, + "num_input_tokens_seen": 139222560, + "step": 6487, + "time_per_iteration": 3.996201753616333 + }, + { + "auxiliary_loss_clip": 0.01099628, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.04555285, + "balance_loss_mlp": 1.01982474, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.5363464239345481, + "language_loss": 0.72726345, + "learning_rate": 2.787216355829633e-06, + "loss": 0.74860531, + "num_input_tokens_seen": 139242165, + "step": 6488, + "time_per_iteration": 3.9912161827087402 + }, + { + "auxiliary_loss_clip": 0.01095005, + "auxiliary_loss_mlp": 0.01048057, + "balance_loss_clip": 1.05022418, + "balance_loss_mlp": 1.0308907, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 1.946188694621917, + "language_loss": 0.688541, + "learning_rate": 2.786858317231779e-06, + "loss": 0.70997167, + "num_input_tokens_seen": 139262525, + "step": 6489, + "time_per_iteration": 2.5928304195404053 + }, + { + "auxiliary_loss_clip": 0.01103726, + "auxiliary_loss_mlp": 0.01038444, + "balance_loss_clip": 1.04915094, + "balance_loss_mlp": 1.02438283, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.488548938234222, + "language_loss": 0.80586272, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.82728446, + "num_input_tokens_seen": 139282835, + "step": 6490, + "time_per_iteration": 2.5794286727905273 + }, + { + "auxiliary_loss_clip": 0.01120222, + "auxiliary_loss_mlp": 0.01037825, + "balance_loss_clip": 1.04919124, + "balance_loss_mlp": 1.02309656, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 1.9041890391731375, + "language_loss": 0.89497292, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91655338, + "num_input_tokens_seen": 139299490, + "step": 6491, + "time_per_iteration": 3.843534231185913 + }, + { + "auxiliary_loss_clip": 0.01086671, + "auxiliary_loss_mlp": 0.01044441, + "balance_loss_clip": 1.04470718, + "balance_loss_mlp": 1.02898002, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 1.7388168573376321, + "language_loss": 0.78545451, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80676562, + "num_input_tokens_seen": 139317865, + "step": 6492, + "time_per_iteration": 2.593726396560669 + }, + { + "auxiliary_loss_clip": 0.01104122, + "auxiliary_loss_mlp": 0.01039043, + "balance_loss_clip": 1.0449723, + "balance_loss_mlp": 1.02465487, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.771331957781986, + "language_loss": 0.73977494, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76120663, + "num_input_tokens_seen": 139339840, + "step": 6493, + "time_per_iteration": 2.5584890842437744 + }, + { + "auxiliary_loss_clip": 0.01095073, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.04891884, + "balance_loss_mlp": 1.02340651, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 2.2268302183507402, + "language_loss": 0.76042521, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78175688, + "num_input_tokens_seen": 139357555, + "step": 6494, + "time_per_iteration": 2.6278977394104004 + }, + { + "auxiliary_loss_clip": 0.01124257, + "auxiliary_loss_mlp": 0.01044904, + "balance_loss_clip": 1.04912281, + "balance_loss_mlp": 1.029109, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 1.896942616763331, + "language_loss": 0.74498981, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76668143, + "num_input_tokens_seen": 139374455, + "step": 6495, + "time_per_iteration": 2.4797170162200928 + }, + { + "auxiliary_loss_clip": 0.01132298, + "auxiliary_loss_mlp": 0.01038809, + "balance_loss_clip": 1.05101883, + "balance_loss_mlp": 1.02319217, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.6358953334433695, + "language_loss": 0.68050885, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70221996, + "num_input_tokens_seen": 139394770, + "step": 6496, + "time_per_iteration": 2.527200698852539 + }, + { + "auxiliary_loss_clip": 0.01033313, + "auxiliary_loss_mlp": 0.01009611, + "balance_loss_clip": 1.03580642, + "balance_loss_mlp": 1.0081389, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6619433058669779, + "language_loss": 0.53951764, + "learning_rate": 2.783992935430775e-06, + "loss": 0.55994689, + "num_input_tokens_seen": 139454760, + "step": 6497, + "time_per_iteration": 3.2710530757904053 + }, + { + "auxiliary_loss_clip": 0.01094154, + "auxiliary_loss_mlp": 0.00790177, + "balance_loss_clip": 1.05266786, + "balance_loss_mlp": 1.01068962, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 2.196080702422757, + "language_loss": 0.69515872, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71400201, + "num_input_tokens_seen": 139472645, + "step": 6498, + "time_per_iteration": 2.5727479457855225 + }, + { + "auxiliary_loss_clip": 0.01029207, + "auxiliary_loss_mlp": 0.0100566, + "balance_loss_clip": 1.02966237, + "balance_loss_mlp": 1.00408649, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 0.7579735807900574, + "language_loss": 0.51742053, + "learning_rate": 2.783276292417936e-06, + "loss": 0.5377692, + "num_input_tokens_seen": 139536730, + "step": 6499, + "time_per_iteration": 3.239419460296631 + }, + { + "auxiliary_loss_clip": 0.01119521, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.04771924, + "balance_loss_mlp": 1.02535498, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.591167839935375, + "language_loss": 0.73884344, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.76045543, + "num_input_tokens_seen": 139557540, + "step": 6500, + "time_per_iteration": 2.5714402198791504 + }, + { + "auxiliary_loss_clip": 0.01121495, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.0518719, + "balance_loss_mlp": 1.02271867, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 1.7585663925589514, + "language_loss": 0.6875813, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.70916677, + "num_input_tokens_seen": 139576875, + "step": 6501, + "time_per_iteration": 2.5614917278289795 + }, + { + "auxiliary_loss_clip": 0.01116175, + "auxiliary_loss_mlp": 0.01034574, + "balance_loss_clip": 1.04782939, + "balance_loss_mlp": 1.02102637, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 1.8481110564443044, + "language_loss": 0.79107046, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81257802, + "num_input_tokens_seen": 139594295, + "step": 6502, + "time_per_iteration": 2.4810447692871094 + }, + { + "auxiliary_loss_clip": 0.0110636, + "auxiliary_loss_mlp": 0.0103626, + "balance_loss_clip": 1.04994822, + "balance_loss_mlp": 1.02295589, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.2674713555473294, + "language_loss": 0.80372298, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.82514918, + "num_input_tokens_seen": 139614080, + "step": 6503, + "time_per_iteration": 2.615665912628174 + }, + { + "auxiliary_loss_clip": 0.01101036, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.04595375, + "balance_loss_mlp": 1.01796341, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.8548129653679282, + "language_loss": 0.7147249, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73604643, + "num_input_tokens_seen": 139632755, + "step": 6504, + "time_per_iteration": 2.5145716667175293 + }, + { + "auxiliary_loss_clip": 0.01125336, + "auxiliary_loss_mlp": 0.01036441, + "balance_loss_clip": 1.04541564, + "balance_loss_mlp": 1.02180207, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.4214077966931933, + "language_loss": 0.83050662, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85212439, + "num_input_tokens_seen": 139654205, + "step": 6505, + "time_per_iteration": 2.5910468101501465 + }, + { + "auxiliary_loss_clip": 0.01125664, + "auxiliary_loss_mlp": 0.01037472, + "balance_loss_clip": 1.04749238, + "balance_loss_mlp": 1.02264261, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 1.8133913656122165, + "language_loss": 0.70827377, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.72990513, + "num_input_tokens_seen": 139673595, + "step": 6506, + "time_per_iteration": 2.4644665718078613 + }, + { + "auxiliary_loss_clip": 0.01102396, + "auxiliary_loss_mlp": 0.01040558, + "balance_loss_clip": 1.04653418, + "balance_loss_mlp": 1.02668214, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 1.8994859656300183, + "language_loss": 0.74931782, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77074736, + "num_input_tokens_seen": 139690565, + "step": 6507, + "time_per_iteration": 2.524256706237793 + }, + { + "auxiliary_loss_clip": 0.01050121, + "auxiliary_loss_mlp": 0.01000243, + "balance_loss_clip": 1.02134609, + "balance_loss_mlp": 0.99870533, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7678655831466287, + "language_loss": 0.5653314, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58583504, + "num_input_tokens_seen": 139749420, + "step": 6508, + "time_per_iteration": 3.2299723625183105 + }, + { + "auxiliary_loss_clip": 0.01113718, + "auxiliary_loss_mlp": 0.0104039, + "balance_loss_clip": 1.04617667, + "balance_loss_mlp": 1.02669895, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.8487037954179102, + "language_loss": 0.76441288, + "learning_rate": 2.779691297413471e-06, + "loss": 0.785954, + "num_input_tokens_seen": 139766265, + "step": 6509, + "time_per_iteration": 2.5258700847625732 + }, + { + "auxiliary_loss_clip": 0.01094411, + "auxiliary_loss_mlp": 0.01040203, + "balance_loss_clip": 1.04143596, + "balance_loss_mlp": 1.02389479, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 2.449716144988444, + "language_loss": 0.83010173, + "learning_rate": 2.779332635075825e-06, + "loss": 0.85144788, + "num_input_tokens_seen": 139782400, + "step": 6510, + "time_per_iteration": 2.5130233764648438 + }, + { + "auxiliary_loss_clip": 0.01116321, + "auxiliary_loss_mlp": 0.01036231, + "balance_loss_clip": 1.04559207, + "balance_loss_mlp": 1.02180684, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 1.712606812413767, + "language_loss": 0.76766217, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.78918767, + "num_input_tokens_seen": 139801435, + "step": 6511, + "time_per_iteration": 2.5103158950805664 + }, + { + "auxiliary_loss_clip": 0.01032997, + "auxiliary_loss_mlp": 0.01000938, + "balance_loss_clip": 1.0264858, + "balance_loss_mlp": 0.99928111, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7179945269128605, + "language_loss": 0.57777214, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59811151, + "num_input_tokens_seen": 139869700, + "step": 6512, + "time_per_iteration": 3.1969082355499268 + }, + { + "auxiliary_loss_clip": 0.01127783, + "auxiliary_loss_mlp": 0.01033822, + "balance_loss_clip": 1.04636121, + "balance_loss_mlp": 1.01828873, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.5778886847800162, + "language_loss": 0.69012403, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.71174008, + "num_input_tokens_seen": 139890140, + "step": 6513, + "time_per_iteration": 2.5405921936035156 + }, + { + "auxiliary_loss_clip": 0.01088969, + "auxiliary_loss_mlp": 0.01041526, + "balance_loss_clip": 1.04830778, + "balance_loss_mlp": 1.02635098, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.622752036982753, + "language_loss": 0.75833333, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.77963829, + "num_input_tokens_seen": 139908020, + "step": 6514, + "time_per_iteration": 2.5728330612182617 + }, + { + "auxiliary_loss_clip": 0.01097057, + "auxiliary_loss_mlp": 0.01041887, + "balance_loss_clip": 1.04848766, + "balance_loss_mlp": 1.02823758, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 1.9146717257782937, + "language_loss": 0.77287781, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.7942673, + "num_input_tokens_seen": 139926180, + "step": 6515, + "time_per_iteration": 2.565471649169922 + }, + { + "auxiliary_loss_clip": 0.01082277, + "auxiliary_loss_mlp": 0.01047044, + "balance_loss_clip": 1.03938937, + "balance_loss_mlp": 1.03272653, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.3357510502584609, + "language_loss": 0.79996264, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.8212558, + "num_input_tokens_seen": 139947420, + "step": 6516, + "time_per_iteration": 2.5914804935455322 + }, + { + "auxiliary_loss_clip": 0.01089147, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.04702687, + "balance_loss_mlp": 1.02053857, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 1.8963646474710738, + "language_loss": 0.69793546, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.71917909, + "num_input_tokens_seen": 139965800, + "step": 6517, + "time_per_iteration": 2.594360828399658 + }, + { + "auxiliary_loss_clip": 0.01083654, + "auxiliary_loss_mlp": 0.01040988, + "balance_loss_clip": 1.04360485, + "balance_loss_mlp": 1.02627802, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.6753374280600564, + "language_loss": 0.72310746, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74435389, + "num_input_tokens_seen": 139988140, + "step": 6518, + "time_per_iteration": 2.665189743041992 + }, + { + "auxiliary_loss_clip": 0.01112441, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.04684412, + "balance_loss_mlp": 1.02210426, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 2.7049344028561113, + "language_loss": 0.61599064, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63748443, + "num_input_tokens_seen": 140010060, + "step": 6519, + "time_per_iteration": 2.677950620651245 + }, + { + "auxiliary_loss_clip": 0.01134268, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.04913616, + "balance_loss_mlp": 1.02789068, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 1.9278475974219398, + "language_loss": 0.66945064, + "learning_rate": 2.775744388563563e-06, + "loss": 0.69122934, + "num_input_tokens_seen": 140029400, + "step": 6520, + "time_per_iteration": 2.5103533267974854 + }, + { + "auxiliary_loss_clip": 0.01123073, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.04433596, + "balance_loss_mlp": 1.02305925, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 1.960270606631519, + "language_loss": 0.78542721, + "learning_rate": 2.775385401898104e-06, + "loss": 0.80703247, + "num_input_tokens_seen": 140048940, + "step": 6521, + "time_per_iteration": 2.480276346206665 + }, + { + "auxiliary_loss_clip": 0.01118085, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.04567254, + "balance_loss_mlp": 1.01657927, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 3.262720618462274, + "language_loss": 0.70148295, + "learning_rate": 2.775026385829952e-06, + "loss": 0.72300327, + "num_input_tokens_seen": 140066380, + "step": 6522, + "time_per_iteration": 2.486363172531128 + }, + { + "auxiliary_loss_clip": 0.01099447, + "auxiliary_loss_mlp": 0.01032382, + "balance_loss_clip": 1.04256392, + "balance_loss_mlp": 1.01773131, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 2.05340382015813, + "language_loss": 0.76611352, + "learning_rate": 2.774667340372722e-06, + "loss": 0.78743184, + "num_input_tokens_seen": 140085275, + "step": 6523, + "time_per_iteration": 4.015795946121216 + }, + { + "auxiliary_loss_clip": 0.01104173, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.04404521, + "balance_loss_mlp": 1.02618575, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.4797829287964017, + "language_loss": 0.618909, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.64035928, + "num_input_tokens_seen": 140105105, + "step": 6524, + "time_per_iteration": 2.6123106479644775 + }, + { + "auxiliary_loss_clip": 0.01126133, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.04511094, + "balance_loss_mlp": 1.01998138, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 2.343147961510626, + "language_loss": 0.7385394, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76015514, + "num_input_tokens_seen": 140125645, + "step": 6525, + "time_per_iteration": 2.549816370010376 + }, + { + "auxiliary_loss_clip": 0.01101573, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.04183841, + "balance_loss_mlp": 1.02603042, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 2.0447693600668853, + "language_loss": 0.81432021, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83573592, + "num_input_tokens_seen": 140141925, + "step": 6526, + "time_per_iteration": 3.847306966781616 + }, + { + "auxiliary_loss_clip": 0.01112472, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.0431366, + "balance_loss_mlp": 1.02410126, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.5071774783004153, + "language_loss": 0.70080006, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.72230554, + "num_input_tokens_seen": 140160965, + "step": 6527, + "time_per_iteration": 3.9322421550750732 + }, + { + "auxiliary_loss_clip": 0.010868, + "auxiliary_loss_mlp": 0.01033133, + "balance_loss_clip": 1.04493654, + "balance_loss_mlp": 1.0186913, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.4529634122330033, + "language_loss": 0.82066047, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84185982, + "num_input_tokens_seen": 140177780, + "step": 6528, + "time_per_iteration": 2.5281364917755127 + }, + { + "auxiliary_loss_clip": 0.01097169, + "auxiliary_loss_mlp": 0.01037867, + "balance_loss_clip": 1.04245925, + "balance_loss_mlp": 1.02298331, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.6067383373223, + "language_loss": 0.68480587, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70615625, + "num_input_tokens_seen": 140201660, + "step": 6529, + "time_per_iteration": 4.059704065322876 + }, + { + "auxiliary_loss_clip": 0.01102352, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.04040909, + "balance_loss_mlp": 1.02063501, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 2.25770616302755, + "language_loss": 0.79929024, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.82067215, + "num_input_tokens_seen": 140218585, + "step": 6530, + "time_per_iteration": 2.5790302753448486 + }, + { + "auxiliary_loss_clip": 0.01111962, + "auxiliary_loss_mlp": 0.01036436, + "balance_loss_clip": 1.04183507, + "balance_loss_mlp": 1.02202344, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.423643310669421, + "language_loss": 0.7554062, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77689016, + "num_input_tokens_seen": 140239905, + "step": 6531, + "time_per_iteration": 2.5233864784240723 + }, + { + "auxiliary_loss_clip": 0.01050476, + "auxiliary_loss_mlp": 0.01003834, + "balance_loss_clip": 1.02201676, + "balance_loss_mlp": 1.00217712, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8310896388553287, + "language_loss": 0.60358053, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62412363, + "num_input_tokens_seen": 140293820, + "step": 6532, + "time_per_iteration": 2.960930347442627 + }, + { + "auxiliary_loss_clip": 0.01032531, + "auxiliary_loss_mlp": 0.01005284, + "balance_loss_clip": 1.02552342, + "balance_loss_mlp": 1.00356793, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7742295183868704, + "language_loss": 0.55504549, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57542366, + "num_input_tokens_seen": 140360420, + "step": 6533, + "time_per_iteration": 3.2317259311676025 + }, + { + "auxiliary_loss_clip": 0.01111486, + "auxiliary_loss_mlp": 0.01038896, + "balance_loss_clip": 1.04632211, + "balance_loss_mlp": 1.02404261, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 1.983423524287572, + "language_loss": 0.75975168, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78125548, + "num_input_tokens_seen": 140381950, + "step": 6534, + "time_per_iteration": 2.609792470932007 + }, + { + "auxiliary_loss_clip": 0.01113967, + "auxiliary_loss_mlp": 0.01043836, + "balance_loss_clip": 1.04352331, + "balance_loss_mlp": 1.02645552, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.0380364986485735, + "language_loss": 0.78100467, + "learning_rate": 2.770356507494851e-06, + "loss": 0.80258262, + "num_input_tokens_seen": 140399410, + "step": 6535, + "time_per_iteration": 2.4967849254608154 + }, + { + "auxiliary_loss_clip": 0.01088841, + "auxiliary_loss_mlp": 0.01029817, + "balance_loss_clip": 1.04833293, + "balance_loss_mlp": 1.01594114, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 1.8878762722402969, + "language_loss": 0.6833148, + "learning_rate": 2.769997081218978e-06, + "loss": 0.70450139, + "num_input_tokens_seen": 140419055, + "step": 6536, + "time_per_iteration": 2.5910658836364746 + }, + { + "auxiliary_loss_clip": 0.01093825, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.04348433, + "balance_loss_mlp": 1.01940536, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 1.8098784897639015, + "language_loss": 0.69017893, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71144623, + "num_input_tokens_seen": 140438800, + "step": 6537, + "time_per_iteration": 2.598487377166748 + }, + { + "auxiliary_loss_clip": 0.01110906, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.04555035, + "balance_loss_mlp": 1.01828933, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.9237619915085742, + "language_loss": 0.78995526, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81139475, + "num_input_tokens_seen": 140456880, + "step": 6538, + "time_per_iteration": 2.461862325668335 + }, + { + "auxiliary_loss_clip": 0.01018406, + "auxiliary_loss_mlp": 0.01005089, + "balance_loss_clip": 1.03274381, + "balance_loss_mlp": 1.00306249, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8028394820817766, + "language_loss": 0.61897016, + "learning_rate": 2.768918627255683e-06, + "loss": 0.6392051, + "num_input_tokens_seen": 140507510, + "step": 6539, + "time_per_iteration": 2.9905731678009033 + }, + { + "auxiliary_loss_clip": 0.01101918, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.04968524, + "balance_loss_mlp": 1.01885617, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 1.9362877031887438, + "language_loss": 0.67907995, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.7004385, + "num_input_tokens_seen": 140528740, + "step": 6540, + "time_per_iteration": 2.6699414253234863 + }, + { + "auxiliary_loss_clip": 0.01101049, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.04391897, + "balance_loss_mlp": 1.02122891, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.708695819762661, + "language_loss": 0.7223866, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74374968, + "num_input_tokens_seen": 140547560, + "step": 6541, + "time_per_iteration": 2.574753522872925 + }, + { + "auxiliary_loss_clip": 0.01049207, + "auxiliary_loss_mlp": 0.01004506, + "balance_loss_clip": 1.02097559, + "balance_loss_mlp": 1.00269401, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8389220469040073, + "language_loss": 0.6035558, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62409294, + "num_input_tokens_seen": 140601175, + "step": 6542, + "time_per_iteration": 2.9125678539276123 + }, + { + "auxiliary_loss_clip": 0.01113873, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.0432632, + "balance_loss_mlp": 1.02279639, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.4779570516063214, + "language_loss": 0.82212019, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84362596, + "num_input_tokens_seen": 140622200, + "step": 6543, + "time_per_iteration": 2.5446014404296875 + }, + { + "auxiliary_loss_clip": 0.01097907, + "auxiliary_loss_mlp": 0.01037998, + "balance_loss_clip": 1.04045773, + "balance_loss_mlp": 1.02325153, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 1.5730113541721435, + "language_loss": 0.68948078, + "learning_rate": 2.767120621015908e-06, + "loss": 0.71083987, + "num_input_tokens_seen": 140643125, + "step": 6544, + "time_per_iteration": 2.5975804328918457 + }, + { + "auxiliary_loss_clip": 0.01107861, + "auxiliary_loss_mlp": 0.01038419, + "balance_loss_clip": 1.04490149, + "balance_loss_mlp": 1.02310073, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 2.704837534210723, + "language_loss": 0.75244486, + "learning_rate": 2.76676093244553e-06, + "loss": 0.7739076, + "num_input_tokens_seen": 140662500, + "step": 6545, + "time_per_iteration": 2.6029927730560303 + }, + { + "auxiliary_loss_clip": 0.01086334, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.0454241, + "balance_loss_mlp": 1.02014148, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.570697980406247, + "language_loss": 0.74669802, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76788926, + "num_input_tokens_seen": 140681960, + "step": 6546, + "time_per_iteration": 2.563077449798584 + }, + { + "auxiliary_loss_clip": 0.01101233, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.04756069, + "balance_loss_mlp": 1.01876783, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 1.7126700973427667, + "language_loss": 0.8162117, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83756381, + "num_input_tokens_seen": 140699170, + "step": 6547, + "time_per_iteration": 2.530360460281372 + }, + { + "auxiliary_loss_clip": 0.01110186, + "auxiliary_loss_mlp": 0.00795657, + "balance_loss_clip": 1.04266548, + "balance_loss_mlp": 1.01536107, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 1.7096843779625146, + "language_loss": 0.840505, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.85956347, + "num_input_tokens_seen": 140714920, + "step": 6548, + "time_per_iteration": 2.468116521835327 + }, + { + "auxiliary_loss_clip": 0.01113171, + "auxiliary_loss_mlp": 0.00791205, + "balance_loss_clip": 1.04479074, + "balance_loss_mlp": 1.01175976, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.5328264746783393, + "language_loss": 0.72748768, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74653137, + "num_input_tokens_seen": 140734595, + "step": 6549, + "time_per_iteration": 2.5336458683013916 + }, + { + "auxiliary_loss_clip": 0.01067894, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_clip": 1.04581392, + "balance_loss_mlp": 1.02672362, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.728756609781234, + "language_loss": 0.77350819, + "learning_rate": 2.764962053731699e-06, + "loss": 0.7946198, + "num_input_tokens_seen": 140754050, + "step": 6550, + "time_per_iteration": 2.635369300842285 + }, + { + "auxiliary_loss_clip": 0.01103062, + "auxiliary_loss_mlp": 0.01029657, + "balance_loss_clip": 1.06046855, + "balance_loss_mlp": 1.01573312, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.7499595006687143, + "language_loss": 0.81511652, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83644372, + "num_input_tokens_seen": 140771440, + "step": 6551, + "time_per_iteration": 2.572779417037964 + }, + { + "auxiliary_loss_clip": 0.01114745, + "auxiliary_loss_mlp": 0.01041938, + "balance_loss_clip": 1.04414713, + "balance_loss_mlp": 1.02710807, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 2.3453215929744973, + "language_loss": 0.79836428, + "learning_rate": 2.764242299098596e-06, + "loss": 0.81993103, + "num_input_tokens_seen": 140786715, + "step": 6552, + "time_per_iteration": 2.4657928943634033 + }, + { + "auxiliary_loss_clip": 0.01128403, + "auxiliary_loss_mlp": 0.01036938, + "balance_loss_clip": 1.04620075, + "balance_loss_mlp": 1.02182174, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 1.6739822262462836, + "language_loss": 0.7081989, + "learning_rate": 2.763882378305003e-06, + "loss": 0.72985232, + "num_input_tokens_seen": 140804950, + "step": 6553, + "time_per_iteration": 2.4562976360321045 + }, + { + "auxiliary_loss_clip": 0.01113904, + "auxiliary_loss_mlp": 0.00791218, + "balance_loss_clip": 1.04650545, + "balance_loss_mlp": 1.01062727, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.58270591146748, + "language_loss": 0.64257771, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.66162896, + "num_input_tokens_seen": 140822800, + "step": 6554, + "time_per_iteration": 2.562225103378296 + }, + { + "auxiliary_loss_clip": 0.01111934, + "auxiliary_loss_mlp": 0.0103973, + "balance_loss_clip": 1.04912746, + "balance_loss_mlp": 1.02606916, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 2.3897396205959094, + "language_loss": 0.7974636, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81898028, + "num_input_tokens_seen": 140842940, + "step": 6555, + "time_per_iteration": 2.635467052459717 + }, + { + "auxiliary_loss_clip": 0.01101485, + "auxiliary_loss_mlp": 0.01040731, + "balance_loss_clip": 1.04772854, + "balance_loss_mlp": 1.02494729, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.937172891928768, + "language_loss": 0.71547353, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.73689568, + "num_input_tokens_seen": 140863060, + "step": 6556, + "time_per_iteration": 2.553980588912964 + }, + { + "auxiliary_loss_clip": 0.01125938, + "auxiliary_loss_mlp": 0.01033758, + "balance_loss_clip": 1.04393125, + "balance_loss_mlp": 1.01919675, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 1.8511526183447458, + "language_loss": 0.83618891, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.85778588, + "num_input_tokens_seen": 140883795, + "step": 6557, + "time_per_iteration": 2.5707714557647705 + }, + { + "auxiliary_loss_clip": 0.01110106, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.04523647, + "balance_loss_mlp": 1.02130473, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 2.4820357961948867, + "language_loss": 0.80513418, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.8265959, + "num_input_tokens_seen": 140903055, + "step": 6558, + "time_per_iteration": 2.5180160999298096 + }, + { + "auxiliary_loss_clip": 0.01125617, + "auxiliary_loss_mlp": 0.01040556, + "balance_loss_clip": 1.04603815, + "balance_loss_mlp": 1.02644217, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 1.78445479156159, + "language_loss": 0.71064246, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73230422, + "num_input_tokens_seen": 140920685, + "step": 6559, + "time_per_iteration": 2.445587158203125 + }, + { + "auxiliary_loss_clip": 0.01108692, + "auxiliary_loss_mlp": 0.01037462, + "balance_loss_clip": 1.0456531, + "balance_loss_mlp": 1.02108836, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 2.7322800436506496, + "language_loss": 0.80880839, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.83026993, + "num_input_tokens_seen": 140937320, + "step": 6560, + "time_per_iteration": 2.4943389892578125 + }, + { + "auxiliary_loss_clip": 0.01113483, + "auxiliary_loss_mlp": 0.0103632, + "balance_loss_clip": 1.05260634, + "balance_loss_mlp": 1.02072716, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 1.9981557721491578, + "language_loss": 0.83373284, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85523093, + "num_input_tokens_seen": 140954855, + "step": 6561, + "time_per_iteration": 2.5036377906799316 + }, + { + "auxiliary_loss_clip": 0.01116099, + "auxiliary_loss_mlp": 0.0104303, + "balance_loss_clip": 1.04688346, + "balance_loss_mlp": 1.02835572, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 3.024338987553006, + "language_loss": 0.7995165, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.8211078, + "num_input_tokens_seen": 140973250, + "step": 6562, + "time_per_iteration": 3.8899197578430176 + }, + { + "auxiliary_loss_clip": 0.0109792, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.04428542, + "balance_loss_mlp": 1.02216494, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.749635834600201, + "language_loss": 0.81152719, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83287072, + "num_input_tokens_seen": 140993050, + "step": 6563, + "time_per_iteration": 2.570053815841675 + }, + { + "auxiliary_loss_clip": 0.01076978, + "auxiliary_loss_mlp": 0.01047176, + "balance_loss_clip": 1.04270768, + "balance_loss_mlp": 1.02904415, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 2.190937813483821, + "language_loss": 0.70371163, + "learning_rate": 2.759921340790127e-06, + "loss": 0.72495317, + "num_input_tokens_seen": 141010815, + "step": 6564, + "time_per_iteration": 3.956890821456909 + }, + { + "auxiliary_loss_clip": 0.01115144, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.0438118, + "balance_loss_mlp": 1.02277684, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 2.1480147839194457, + "language_loss": 0.83046401, + "learning_rate": 2.759561073299676e-06, + "loss": 0.85199255, + "num_input_tokens_seen": 141028720, + "step": 6565, + "time_per_iteration": 2.472045421600342 + }, + { + "auxiliary_loss_clip": 0.01087954, + "auxiliary_loss_mlp": 0.01044529, + "balance_loss_clip": 1.04133546, + "balance_loss_mlp": 1.02793467, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 1.8136728200934547, + "language_loss": 0.83558649, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.8569113, + "num_input_tokens_seen": 141046025, + "step": 6566, + "time_per_iteration": 3.9070703983306885 + }, + { + "auxiliary_loss_clip": 0.01132152, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.04662907, + "balance_loss_mlp": 1.02329624, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 2.3199407656714714, + "language_loss": 0.77309573, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79479742, + "num_input_tokens_seen": 141066865, + "step": 6567, + "time_per_iteration": 2.5052387714385986 + }, + { + "auxiliary_loss_clip": 0.01111325, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.0444901, + "balance_loss_mlp": 1.01999867, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 1.9993661265184481, + "language_loss": 0.80613661, + "learning_rate": 2.758480098067182e-06, + "loss": 0.8275913, + "num_input_tokens_seen": 141084210, + "step": 6568, + "time_per_iteration": 3.8558616638183594 + }, + { + "auxiliary_loss_clip": 0.01093208, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.04424381, + "balance_loss_mlp": 1.0186733, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 1.699410380454577, + "language_loss": 0.84486055, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.86612642, + "num_input_tokens_seen": 141103895, + "step": 6569, + "time_per_iteration": 2.5607333183288574 + }, + { + "auxiliary_loss_clip": 0.01071034, + "auxiliary_loss_mlp": 0.01036593, + "balance_loss_clip": 1.04681683, + "balance_loss_mlp": 1.02203751, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 1.8535430898870493, + "language_loss": 0.74630904, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.76738536, + "num_input_tokens_seen": 141124000, + "step": 6570, + "time_per_iteration": 2.6543033123016357 + }, + { + "auxiliary_loss_clip": 0.01099383, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.05071199, + "balance_loss_mlp": 1.02040088, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 1.582899525692935, + "language_loss": 0.79721045, + "learning_rate": 2.757398863979922e-06, + "loss": 0.81855381, + "num_input_tokens_seen": 141142535, + "step": 6571, + "time_per_iteration": 2.5591237545013428 + }, + { + "auxiliary_loss_clip": 0.01100578, + "auxiliary_loss_mlp": 0.01040897, + "balance_loss_clip": 1.0495156, + "balance_loss_mlp": 1.02608538, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 2.2687270225249376, + "language_loss": 0.77775174, + "learning_rate": 2.757038395157997e-06, + "loss": 0.7991665, + "num_input_tokens_seen": 141161575, + "step": 6572, + "time_per_iteration": 2.5371758937835693 + }, + { + "auxiliary_loss_clip": 0.01089849, + "auxiliary_loss_mlp": 0.01038993, + "balance_loss_clip": 1.04633033, + "balance_loss_mlp": 1.02412796, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.7983139781578383, + "language_loss": 0.75037402, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77166241, + "num_input_tokens_seen": 141181150, + "step": 6573, + "time_per_iteration": 2.596883773803711 + }, + { + "auxiliary_loss_clip": 0.01113084, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.04378939, + "balance_loss_mlp": 1.01917624, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.572324794709649, + "language_loss": 0.67958832, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70104337, + "num_input_tokens_seen": 141206310, + "step": 6574, + "time_per_iteration": 2.7271339893341064 + }, + { + "auxiliary_loss_clip": 0.01066327, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.04167199, + "balance_loss_mlp": 1.02085173, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.429583157911912, + "language_loss": 0.72135991, + "learning_rate": 2.755956816505072e-06, + "loss": 0.74240375, + "num_input_tokens_seen": 141223925, + "step": 6575, + "time_per_iteration": 2.6008336544036865 + }, + { + "auxiliary_loss_clip": 0.01111418, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.04874873, + "balance_loss_mlp": 1.02611351, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 2.083257067323189, + "language_loss": 0.73768151, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75920701, + "num_input_tokens_seen": 141239010, + "step": 6576, + "time_per_iteration": 2.4867701530456543 + }, + { + "auxiliary_loss_clip": 0.01125924, + "auxiliary_loss_mlp": 0.01037592, + "balance_loss_clip": 1.04497743, + "balance_loss_mlp": 1.02448463, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.4136880488074945, + "language_loss": 0.83418632, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.85582149, + "num_input_tokens_seen": 141252255, + "step": 6577, + "time_per_iteration": 2.4168448448181152 + }, + { + "auxiliary_loss_clip": 0.01103621, + "auxiliary_loss_mlp": 0.01032801, + "balance_loss_clip": 1.04427588, + "balance_loss_mlp": 1.01922894, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 2.7732685660571748, + "language_loss": 0.90222919, + "learning_rate": 2.75487497985853e-06, + "loss": 0.9235934, + "num_input_tokens_seen": 141269325, + "step": 6578, + "time_per_iteration": 2.5963001251220703 + }, + { + "auxiliary_loss_clip": 0.01102837, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.04609728, + "balance_loss_mlp": 1.01905346, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.8165639161378564, + "language_loss": 0.77991134, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80129081, + "num_input_tokens_seen": 141288505, + "step": 6579, + "time_per_iteration": 2.54209303855896 + }, + { + "auxiliary_loss_clip": 0.01076232, + "auxiliary_loss_mlp": 0.01030035, + "balance_loss_clip": 1.0478344, + "balance_loss_mlp": 1.01483595, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.0786960025374723, + "language_loss": 0.68783784, + "learning_rate": 2.754153612280037e-06, + "loss": 0.70890045, + "num_input_tokens_seen": 141303680, + "step": 6580, + "time_per_iteration": 2.6481897830963135 + }, + { + "auxiliary_loss_clip": 0.01113665, + "auxiliary_loss_mlp": 0.01028537, + "balance_loss_clip": 1.0456593, + "balance_loss_mlp": 1.01485729, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.9217436116661628, + "language_loss": 0.58528984, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.60671186, + "num_input_tokens_seen": 141324090, + "step": 6581, + "time_per_iteration": 2.603363513946533 + }, + { + "auxiliary_loss_clip": 0.01101212, + "auxiliary_loss_mlp": 0.01045805, + "balance_loss_clip": 1.04313087, + "balance_loss_mlp": 1.02952147, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 1.8135305314487877, + "language_loss": 0.69298685, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.71445704, + "num_input_tokens_seen": 141342235, + "step": 6582, + "time_per_iteration": 2.5629775524139404 + }, + { + "auxiliary_loss_clip": 0.01127606, + "auxiliary_loss_mlp": 0.00793416, + "balance_loss_clip": 1.04615068, + "balance_loss_mlp": 1.01743054, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 1.976634325652489, + "language_loss": 0.759323, + "learning_rate": 2.753071346464642e-06, + "loss": 0.77853322, + "num_input_tokens_seen": 141361195, + "step": 6583, + "time_per_iteration": 2.524796962738037 + }, + { + "auxiliary_loss_clip": 0.01088575, + "auxiliary_loss_mlp": 0.00797774, + "balance_loss_clip": 1.04538512, + "balance_loss_mlp": 1.02646732, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.75031678415596, + "language_loss": 0.65879476, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.67765826, + "num_input_tokens_seen": 141378275, + "step": 6584, + "time_per_iteration": 2.6191680431365967 + }, + { + "auxiliary_loss_clip": 0.01097822, + "auxiliary_loss_mlp": 0.0104114, + "balance_loss_clip": 1.05322647, + "balance_loss_mlp": 1.02590466, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 1.9838399360125478, + "language_loss": 0.72046393, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74185354, + "num_input_tokens_seen": 141396960, + "step": 6585, + "time_per_iteration": 2.6208224296569824 + }, + { + "auxiliary_loss_clip": 0.01092397, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.04241467, + "balance_loss_mlp": 1.01628435, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.9189240103135463, + "language_loss": 0.73699617, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75822365, + "num_input_tokens_seen": 141417320, + "step": 6586, + "time_per_iteration": 2.6023528575897217 + }, + { + "auxiliary_loss_clip": 0.01107101, + "auxiliary_loss_mlp": 0.01034424, + "balance_loss_clip": 1.04649425, + "balance_loss_mlp": 1.019696, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 2.1662706459529755, + "language_loss": 0.71514916, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73656446, + "num_input_tokens_seen": 141435985, + "step": 6587, + "time_per_iteration": 2.53814697265625 + }, + { + "auxiliary_loss_clip": 0.01014973, + "auxiliary_loss_mlp": 0.01009685, + "balance_loss_clip": 1.02351689, + "balance_loss_mlp": 1.00788546, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9005289410521834, + "language_loss": 0.61182922, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63207579, + "num_input_tokens_seen": 141486075, + "step": 6588, + "time_per_iteration": 3.005932331085205 + }, + { + "auxiliary_loss_clip": 0.01106546, + "auxiliary_loss_mlp": 0.0079161, + "balance_loss_clip": 1.04678178, + "balance_loss_mlp": 1.01358974, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 1.7862090698581379, + "language_loss": 0.81270701, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83168852, + "num_input_tokens_seen": 141505280, + "step": 6589, + "time_per_iteration": 2.531843423843384 + }, + { + "auxiliary_loss_clip": 0.01100859, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.04531682, + "balance_loss_mlp": 1.0152117, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.3163292881341566, + "language_loss": 0.6999135, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72121668, + "num_input_tokens_seen": 141523930, + "step": 6590, + "time_per_iteration": 2.5264840126037598 + }, + { + "auxiliary_loss_clip": 0.01114325, + "auxiliary_loss_mlp": 0.01047741, + "balance_loss_clip": 1.0467031, + "balance_loss_mlp": 1.03229129, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.9498492270278753, + "language_loss": 0.75504047, + "learning_rate": 2.750184048805956e-06, + "loss": 0.7766611, + "num_input_tokens_seen": 141541320, + "step": 6591, + "time_per_iteration": 2.5035622119903564 + }, + { + "auxiliary_loss_clip": 0.01044753, + "auxiliary_loss_mlp": 0.01042906, + "balance_loss_clip": 1.0472331, + "balance_loss_mlp": 1.02826118, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.7904908142546463, + "language_loss": 0.78353232, + "learning_rate": 2.749823008443152e-06, + "loss": 0.80440891, + "num_input_tokens_seen": 141561880, + "step": 6592, + "time_per_iteration": 2.825254201889038 + }, + { + "auxiliary_loss_clip": 0.01062211, + "auxiliary_loss_mlp": 0.01033391, + "balance_loss_clip": 1.04973197, + "balance_loss_mlp": 1.01885378, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.8201657228851904, + "language_loss": 0.69559866, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.7165547, + "num_input_tokens_seen": 141586460, + "step": 6593, + "time_per_iteration": 3.0249547958374023 + }, + { + "auxiliary_loss_clip": 0.0106281, + "auxiliary_loss_mlp": 0.01037727, + "balance_loss_clip": 1.04100943, + "balance_loss_mlp": 1.02254021, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.9540812094946873, + "language_loss": 0.7753067, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.79631209, + "num_input_tokens_seen": 141605955, + "step": 6594, + "time_per_iteration": 2.648566961288452 + }, + { + "auxiliary_loss_clip": 0.01023848, + "auxiliary_loss_mlp": 0.0100475, + "balance_loss_clip": 1.028072, + "balance_loss_mlp": 1.0028547, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9477409190783928, + "language_loss": 0.63049734, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.6507833, + "num_input_tokens_seen": 141673140, + "step": 6595, + "time_per_iteration": 3.2045650482177734 + }, + { + "auxiliary_loss_clip": 0.01094592, + "auxiliary_loss_mlp": 0.01049683, + "balance_loss_clip": 1.04693985, + "balance_loss_mlp": 1.03273129, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 2.814337477310836, + "language_loss": 0.63442439, + "learning_rate": 2.748378562795223e-06, + "loss": 0.6558671, + "num_input_tokens_seen": 141692955, + "step": 6596, + "time_per_iteration": 2.624256134033203 + }, + { + "auxiliary_loss_clip": 0.01114888, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.04511738, + "balance_loss_mlp": 1.02397799, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 1.8247907389215567, + "language_loss": 0.78705901, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80859339, + "num_input_tokens_seen": 141710680, + "step": 6597, + "time_per_iteration": 2.528625249862671 + }, + { + "auxiliary_loss_clip": 0.01097273, + "auxiliary_loss_mlp": 0.00791487, + "balance_loss_clip": 1.04583549, + "balance_loss_mlp": 1.01438141, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 2.020721703987871, + "language_loss": 0.66946077, + "learning_rate": 2.747656169644941e-06, + "loss": 0.68834841, + "num_input_tokens_seen": 141729860, + "step": 6598, + "time_per_iteration": 2.585181713104248 + }, + { + "auxiliary_loss_clip": 0.01129011, + "auxiliary_loss_mlp": 0.0103925, + "balance_loss_clip": 1.04694188, + "balance_loss_mlp": 1.02585089, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 1.811380785757539, + "language_loss": 0.78609788, + "learning_rate": 2.747294930536157e-06, + "loss": 0.80778044, + "num_input_tokens_seen": 141749060, + "step": 6599, + "time_per_iteration": 2.5095973014831543 + }, + { + "auxiliary_loss_clip": 0.0109857, + "auxiliary_loss_mlp": 0.01038643, + "balance_loss_clip": 1.05054116, + "balance_loss_mlp": 1.02253771, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 1.9166971724217612, + "language_loss": 0.729312, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.75068414, + "num_input_tokens_seen": 141769860, + "step": 6600, + "time_per_iteration": 4.004424095153809 + }, + { + "auxiliary_loss_clip": 0.01083259, + "auxiliary_loss_mlp": 0.01035373, + "balance_loss_clip": 1.04217076, + "balance_loss_mlp": 1.02041864, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 2.254581373045382, + "language_loss": 0.85992217, + "learning_rate": 2.746572367319791e-06, + "loss": 0.88110846, + "num_input_tokens_seen": 141788465, + "step": 6601, + "time_per_iteration": 2.5845186710357666 + }, + { + "auxiliary_loss_clip": 0.01092844, + "auxiliary_loss_mlp": 0.01040912, + "balance_loss_clip": 1.04349339, + "balance_loss_mlp": 1.02445543, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.0078053802236484, + "language_loss": 0.70068836, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72202599, + "num_input_tokens_seen": 141804955, + "step": 6602, + "time_per_iteration": 2.5834643840789795 + }, + { + "auxiliary_loss_clip": 0.01128624, + "auxiliary_loss_mlp": 0.01041891, + "balance_loss_clip": 1.04562747, + "balance_loss_mlp": 1.02698421, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.4532961602339887, + "language_loss": 0.83546066, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85716581, + "num_input_tokens_seen": 141820025, + "step": 6603, + "time_per_iteration": 4.248618841171265 + }, + { + "auxiliary_loss_clip": 0.01107549, + "auxiliary_loss_mlp": 0.01033644, + "balance_loss_clip": 1.04529059, + "balance_loss_mlp": 1.01993501, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.6141120448730857, + "language_loss": 0.72810054, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.74951255, + "num_input_tokens_seen": 141838735, + "step": 6604, + "time_per_iteration": 3.9230358600616455 + }, + { + "auxiliary_loss_clip": 0.01102628, + "auxiliary_loss_mlp": 0.01035991, + "balance_loss_clip": 1.04502141, + "balance_loss_mlp": 1.02143514, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.6406569976852292, + "language_loss": 0.82818151, + "learning_rate": 2.745126901275491e-06, + "loss": 0.84956777, + "num_input_tokens_seen": 141858090, + "step": 6605, + "time_per_iteration": 2.593484401702881 + }, + { + "auxiliary_loss_clip": 0.01125267, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.04671991, + "balance_loss_mlp": 1.01719868, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.5940660534516067, + "language_loss": 0.73561233, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.75716448, + "num_input_tokens_seen": 141877540, + "step": 6606, + "time_per_iteration": 3.927874803543091 + }, + { + "auxiliary_loss_clip": 0.01086922, + "auxiliary_loss_mlp": 0.01038659, + "balance_loss_clip": 1.04475546, + "balance_loss_mlp": 1.02347136, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 1.85930687831078, + "language_loss": 0.74577487, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76703072, + "num_input_tokens_seen": 141897315, + "step": 6607, + "time_per_iteration": 2.59767746925354 + }, + { + "auxiliary_loss_clip": 0.01122286, + "auxiliary_loss_mlp": 0.01035585, + "balance_loss_clip": 1.04955935, + "balance_loss_mlp": 1.02135718, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.5498516953767514, + "language_loss": 0.68004096, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70161963, + "num_input_tokens_seen": 141919580, + "step": 6608, + "time_per_iteration": 2.716338634490967 + }, + { + "auxiliary_loss_clip": 0.01091757, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.04283714, + "balance_loss_mlp": 1.02780044, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 2.4867544766959124, + "language_loss": 0.74320549, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76457453, + "num_input_tokens_seen": 141937045, + "step": 6609, + "time_per_iteration": 2.5321836471557617 + }, + { + "auxiliary_loss_clip": 0.01111712, + "auxiliary_loss_mlp": 0.01036801, + "balance_loss_clip": 1.04804707, + "balance_loss_mlp": 1.02160144, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 6.192107592359399, + "language_loss": 0.71242702, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73391217, + "num_input_tokens_seen": 141956695, + "step": 6610, + "time_per_iteration": 2.553844928741455 + }, + { + "auxiliary_loss_clip": 0.01105769, + "auxiliary_loss_mlp": 0.01034005, + "balance_loss_clip": 1.04215288, + "balance_loss_mlp": 1.01962876, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.6029606325720518, + "language_loss": 0.78463888, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80603659, + "num_input_tokens_seen": 141975935, + "step": 6611, + "time_per_iteration": 2.5154647827148438 + }, + { + "auxiliary_loss_clip": 0.01119294, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.04958093, + "balance_loss_mlp": 1.02179372, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.8347262980687606, + "language_loss": 0.79654008, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.81809413, + "num_input_tokens_seen": 141995750, + "step": 6612, + "time_per_iteration": 2.6291420459747314 + }, + { + "auxiliary_loss_clip": 0.01025236, + "auxiliary_loss_mlp": 0.01004546, + "balance_loss_clip": 1.0296247, + "balance_loss_mlp": 1.00285292, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8464413285376963, + "language_loss": 0.6498633, + "learning_rate": 2.742234613810459e-06, + "loss": 0.67016113, + "num_input_tokens_seen": 142057655, + "step": 6613, + "time_per_iteration": 3.0712215900421143 + }, + { + "auxiliary_loss_clip": 0.01099541, + "auxiliary_loss_mlp": 0.01044115, + "balance_loss_clip": 1.04417574, + "balance_loss_mlp": 1.02717519, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 3.30456554549262, + "language_loss": 0.71247774, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73391432, + "num_input_tokens_seen": 142076020, + "step": 6614, + "time_per_iteration": 2.543686866760254 + }, + { + "auxiliary_loss_clip": 0.0111569, + "auxiliary_loss_mlp": 0.01035089, + "balance_loss_clip": 1.04952872, + "balance_loss_mlp": 1.02104056, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.7697501916538, + "language_loss": 0.81817067, + "learning_rate": 2.741511260213862e-06, + "loss": 0.83967847, + "num_input_tokens_seen": 142093790, + "step": 6615, + "time_per_iteration": 2.4864516258239746 + }, + { + "auxiliary_loss_clip": 0.01098813, + "auxiliary_loss_mlp": 0.01029144, + "balance_loss_clip": 1.04990256, + "balance_loss_mlp": 1.01591182, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 2.5078641234160575, + "language_loss": 0.67732096, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69860047, + "num_input_tokens_seen": 142110545, + "step": 6616, + "time_per_iteration": 2.525585412979126 + }, + { + "auxiliary_loss_clip": 0.01131886, + "auxiliary_loss_mlp": 0.01037383, + "balance_loss_clip": 1.04799795, + "balance_loss_mlp": 1.02299464, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.029452203840985, + "language_loss": 0.8374964, + "learning_rate": 2.740787794144541e-06, + "loss": 0.85918915, + "num_input_tokens_seen": 142128695, + "step": 6617, + "time_per_iteration": 2.466381311416626 + }, + { + "auxiliary_loss_clip": 0.01126432, + "auxiliary_loss_mlp": 0.01037081, + "balance_loss_clip": 1.04967666, + "balance_loss_mlp": 1.02425408, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.6111116008547097, + "language_loss": 0.72197276, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74360788, + "num_input_tokens_seen": 142148375, + "step": 6618, + "time_per_iteration": 2.5062689781188965 + }, + { + "auxiliary_loss_clip": 0.01107579, + "auxiliary_loss_mlp": 0.01034935, + "balance_loss_clip": 1.04884338, + "balance_loss_mlp": 1.01967072, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.632886260079407, + "language_loss": 0.65303481, + "learning_rate": 2.740064215712231e-06, + "loss": 0.67445999, + "num_input_tokens_seen": 142169735, + "step": 6619, + "time_per_iteration": 2.6154842376708984 + }, + { + "auxiliary_loss_clip": 0.01054887, + "auxiliary_loss_mlp": 0.01009462, + "balance_loss_clip": 1.02669108, + "balance_loss_mlp": 1.00813866, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7777127318091575, + "language_loss": 0.58234823, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.6029917, + "num_input_tokens_seen": 142229520, + "step": 6620, + "time_per_iteration": 3.0367910861968994 + }, + { + "auxiliary_loss_clip": 0.01107987, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.04715371, + "balance_loss_mlp": 1.0218451, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.6044551552121404, + "language_loss": 0.79170251, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81312567, + "num_input_tokens_seen": 142247660, + "step": 6621, + "time_per_iteration": 2.5394749641418457 + }, + { + "auxiliary_loss_clip": 0.01108624, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.04898262, + "balance_loss_mlp": 1.019171, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 1.802739624070729, + "language_loss": 0.78021365, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80162251, + "num_input_tokens_seen": 142266990, + "step": 6622, + "time_per_iteration": 2.5325136184692383 + }, + { + "auxiliary_loss_clip": 0.01101331, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.04388988, + "balance_loss_mlp": 1.02205133, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.637107829984215, + "language_loss": 0.75154436, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77292323, + "num_input_tokens_seen": 142287170, + "step": 6623, + "time_per_iteration": 2.5320024490356445 + }, + { + "auxiliary_loss_clip": 0.01092659, + "auxiliary_loss_mlp": 0.01040065, + "balance_loss_clip": 1.05171227, + "balance_loss_mlp": 1.02545631, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 2.0211302244649385, + "language_loss": 0.79544938, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81677657, + "num_input_tokens_seen": 142305405, + "step": 6624, + "time_per_iteration": 2.5311384201049805 + }, + { + "auxiliary_loss_clip": 0.0113496, + "auxiliary_loss_mlp": 0.0104084, + "balance_loss_clip": 1.05088842, + "balance_loss_mlp": 1.02547979, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 2.316400697297772, + "language_loss": 0.84092838, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.8626864, + "num_input_tokens_seen": 142322710, + "step": 6625, + "time_per_iteration": 2.499185800552368 + }, + { + "auxiliary_loss_clip": 0.01114114, + "auxiliary_loss_mlp": 0.01039827, + "balance_loss_clip": 1.05009615, + "balance_loss_mlp": 1.02539086, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 1.9888826588300066, + "language_loss": 0.86251783, + "learning_rate": 2.737530807925321e-06, + "loss": 0.88405728, + "num_input_tokens_seen": 142338535, + "step": 6626, + "time_per_iteration": 2.4958102703094482 + }, + { + "auxiliary_loss_clip": 0.01067081, + "auxiliary_loss_mlp": 0.00793199, + "balance_loss_clip": 1.04346299, + "balance_loss_mlp": 1.01055956, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.1274379717369185, + "language_loss": 0.83276486, + "learning_rate": 2.737168780548417e-06, + "loss": 0.85136765, + "num_input_tokens_seen": 142354570, + "step": 6627, + "time_per_iteration": 2.60463547706604 + }, + { + "auxiliary_loss_clip": 0.01095945, + "auxiliary_loss_mlp": 0.00790263, + "balance_loss_clip": 1.04671407, + "balance_loss_mlp": 1.01056337, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.6053460685971201, + "language_loss": 0.82988405, + "learning_rate": 2.736806725217998e-06, + "loss": 0.84874612, + "num_input_tokens_seen": 142374395, + "step": 6628, + "time_per_iteration": 2.5752501487731934 + }, + { + "auxiliary_loss_clip": 0.01090211, + "auxiliary_loss_mlp": 0.01056971, + "balance_loss_clip": 1.04778075, + "balance_loss_mlp": 1.04124713, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.7654908237788431, + "language_loss": 0.710006, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.73147786, + "num_input_tokens_seen": 142396040, + "step": 6629, + "time_per_iteration": 2.587245225906372 + }, + { + "auxiliary_loss_clip": 0.0109509, + "auxiliary_loss_mlp": 0.01034134, + "balance_loss_clip": 1.04905701, + "balance_loss_mlp": 1.02032912, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 1.7780248170246171, + "language_loss": 0.8064906, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82778287, + "num_input_tokens_seen": 142415495, + "step": 6630, + "time_per_iteration": 2.58601713180542 + }, + { + "auxiliary_loss_clip": 0.01072967, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.04997206, + "balance_loss_mlp": 1.01732099, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 1.9129997828801952, + "language_loss": 0.75246465, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.77351195, + "num_input_tokens_seen": 142431865, + "step": 6631, + "time_per_iteration": 2.618156671524048 + }, + { + "auxiliary_loss_clip": 0.01095693, + "auxiliary_loss_mlp": 0.0103848, + "balance_loss_clip": 1.04742503, + "balance_loss_mlp": 1.02382922, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 1.7408786044593307, + "language_loss": 0.71109152, + "learning_rate": 2.735358224635783e-06, + "loss": 0.7324332, + "num_input_tokens_seen": 142450595, + "step": 6632, + "time_per_iteration": 2.5608553886413574 + }, + { + "auxiliary_loss_clip": 0.01063012, + "auxiliary_loss_mlp": 0.00789224, + "balance_loss_clip": 1.04863024, + "balance_loss_mlp": 1.01107264, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 1.9115777860144054, + "language_loss": 0.75293374, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.77145612, + "num_input_tokens_seen": 142466650, + "step": 6633, + "time_per_iteration": 2.6329824924468994 + }, + { + "auxiliary_loss_clip": 0.01104516, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.04890907, + "balance_loss_mlp": 1.01764143, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.8130821436292546, + "language_loss": 0.81309015, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83445168, + "num_input_tokens_seen": 142486165, + "step": 6634, + "time_per_iteration": 2.563260555267334 + }, + { + "auxiliary_loss_clip": 0.01099629, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.04691696, + "balance_loss_mlp": 1.01908624, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 1.930823353288653, + "language_loss": 0.74485391, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.76618475, + "num_input_tokens_seen": 142505035, + "step": 6635, + "time_per_iteration": 2.5654022693634033 + }, + { + "auxiliary_loss_clip": 0.01102222, + "auxiliary_loss_mlp": 0.01045563, + "balance_loss_clip": 1.04787993, + "balance_loss_mlp": 1.03000665, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 2.25627492708286, + "language_loss": 0.66223752, + "learning_rate": 2.733909277895868e-06, + "loss": 0.6837154, + "num_input_tokens_seen": 142521870, + "step": 6636, + "time_per_iteration": 2.5659472942352295 + }, + { + "auxiliary_loss_clip": 0.01117475, + "auxiliary_loss_mlp": 0.01036413, + "balance_loss_clip": 1.04848051, + "balance_loss_mlp": 1.02227485, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.824056196772896, + "language_loss": 0.8126514, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83419025, + "num_input_tokens_seen": 142540455, + "step": 6637, + "time_per_iteration": 2.51161789894104 + }, + { + "auxiliary_loss_clip": 0.01025974, + "auxiliary_loss_mlp": 0.01009067, + "balance_loss_clip": 1.02704072, + "balance_loss_mlp": 1.00719535, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.721428453646231, + "language_loss": 0.53183389, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55218434, + "num_input_tokens_seen": 142599665, + "step": 6638, + "time_per_iteration": 3.2490475177764893 + }, + { + "auxiliary_loss_clip": 0.0110783, + "auxiliary_loss_mlp": 0.00789739, + "balance_loss_clip": 1.04790819, + "balance_loss_mlp": 1.01199317, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.5202002055968988, + "language_loss": 0.75616956, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77514529, + "num_input_tokens_seen": 142618845, + "step": 6639, + "time_per_iteration": 2.542187452316284 + }, + { + "auxiliary_loss_clip": 0.01057393, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.0460577, + "balance_loss_mlp": 1.02069283, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.563687424742328, + "language_loss": 0.75875157, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.77966344, + "num_input_tokens_seen": 142640885, + "step": 6640, + "time_per_iteration": 4.105189323425293 + }, + { + "auxiliary_loss_clip": 0.01096051, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.04523253, + "balance_loss_mlp": 1.01966429, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.390954165778915, + "language_loss": 0.82682025, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84811646, + "num_input_tokens_seen": 142659340, + "step": 6641, + "time_per_iteration": 3.923123359680176 + }, + { + "auxiliary_loss_clip": 0.01129286, + "auxiliary_loss_mlp": 0.01031285, + "balance_loss_clip": 1.04845643, + "balance_loss_mlp": 1.01745081, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 2.17654224323603, + "language_loss": 0.77134776, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.79295349, + "num_input_tokens_seen": 142677085, + "step": 6642, + "time_per_iteration": 2.4789516925811768 + }, + { + "auxiliary_loss_clip": 0.01100885, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.04670358, + "balance_loss_mlp": 1.01794887, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.0657264196938545, + "language_loss": 0.72251928, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74385315, + "num_input_tokens_seen": 142694595, + "step": 6643, + "time_per_iteration": 4.0063183307647705 + }, + { + "auxiliary_loss_clip": 0.01119271, + "auxiliary_loss_mlp": 0.01031487, + "balance_loss_clip": 1.04842222, + "balance_loss_mlp": 1.01768851, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.5925859316634123, + "language_loss": 0.66704696, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68855453, + "num_input_tokens_seen": 142714175, + "step": 6644, + "time_per_iteration": 2.5068705081939697 + }, + { + "auxiliary_loss_clip": 0.01127084, + "auxiliary_loss_mlp": 0.01035394, + "balance_loss_clip": 1.04615211, + "balance_loss_mlp": 1.0211308, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 2.30105663831236, + "language_loss": 0.78360188, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80522662, + "num_input_tokens_seen": 142730955, + "step": 6645, + "time_per_iteration": 3.8158793449401855 + }, + { + "auxiliary_loss_clip": 0.01117545, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.04653192, + "balance_loss_mlp": 1.01823735, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 1.556429180877195, + "language_loss": 0.69869637, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72019124, + "num_input_tokens_seen": 142751200, + "step": 6646, + "time_per_iteration": 2.508810520172119 + }, + { + "auxiliary_loss_clip": 0.01076378, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.04312801, + "balance_loss_mlp": 1.02056074, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.9722980296152277, + "language_loss": 0.71553195, + "learning_rate": 2.729922381038513e-06, + "loss": 0.73664594, + "num_input_tokens_seen": 142770170, + "step": 6647, + "time_per_iteration": 2.5724337100982666 + }, + { + "auxiliary_loss_clip": 0.01085415, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.04447556, + "balance_loss_mlp": 1.0191381, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.5214757912924397, + "language_loss": 0.74538934, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76655781, + "num_input_tokens_seen": 142792680, + "step": 6648, + "time_per_iteration": 2.6204991340637207 + }, + { + "auxiliary_loss_clip": 0.0112774, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.04716873, + "balance_loss_mlp": 1.01410127, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 2.100429261190177, + "language_loss": 0.65788472, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.67944527, + "num_input_tokens_seen": 142810510, + "step": 6649, + "time_per_iteration": 2.4527840614318848 + }, + { + "auxiliary_loss_clip": 0.01101084, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.0485698, + "balance_loss_mlp": 1.0261035, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.6611266173586952, + "language_loss": 0.75649244, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77790391, + "num_input_tokens_seen": 142832455, + "step": 6650, + "time_per_iteration": 2.6117494106292725 + }, + { + "auxiliary_loss_clip": 0.01127696, + "auxiliary_loss_mlp": 0.01042065, + "balance_loss_clip": 1.0472616, + "balance_loss_mlp": 1.02879727, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.7480518174875843, + "language_loss": 0.71841586, + "learning_rate": 2.728471769038975e-06, + "loss": 0.7401135, + "num_input_tokens_seen": 142852590, + "step": 6651, + "time_per_iteration": 2.481882333755493 + }, + { + "auxiliary_loss_clip": 0.01128301, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.0468452, + "balance_loss_mlp": 1.0219605, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.8892781906831801, + "language_loss": 0.73252457, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75416571, + "num_input_tokens_seen": 142870595, + "step": 6652, + "time_per_iteration": 2.4900617599487305 + }, + { + "auxiliary_loss_clip": 0.01026725, + "auxiliary_loss_mlp": 0.01003323, + "balance_loss_clip": 1.02907503, + "balance_loss_mlp": 1.0018332, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8481928237157531, + "language_loss": 0.60631037, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62661088, + "num_input_tokens_seen": 142925805, + "step": 6653, + "time_per_iteration": 3.066547393798828 + }, + { + "auxiliary_loss_clip": 0.01088531, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.04932237, + "balance_loss_mlp": 1.02262449, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 2.465504993373776, + "language_loss": 0.66926008, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.69049895, + "num_input_tokens_seen": 142943145, + "step": 6654, + "time_per_iteration": 2.5468766689300537 + }, + { + "auxiliary_loss_clip": 0.01113883, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.04675174, + "balance_loss_mlp": 1.02359581, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.854343328058018, + "language_loss": 0.89911586, + "learning_rate": 2.7270207150599e-06, + "loss": 0.92060709, + "num_input_tokens_seen": 142956925, + "step": 6655, + "time_per_iteration": 2.486863136291504 + }, + { + "auxiliary_loss_clip": 0.01096654, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.04450977, + "balance_loss_mlp": 1.01835895, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.656922451684095, + "language_loss": 0.7349906, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75626743, + "num_input_tokens_seen": 142978040, + "step": 6656, + "time_per_iteration": 2.6198902130126953 + }, + { + "auxiliary_loss_clip": 0.01126906, + "auxiliary_loss_mlp": 0.01040198, + "balance_loss_clip": 1.04742432, + "balance_loss_mlp": 1.02626848, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.680316023502735, + "language_loss": 0.73478431, + "learning_rate": 2.726295022603144e-06, + "loss": 0.7564553, + "num_input_tokens_seen": 142998390, + "step": 6657, + "time_per_iteration": 2.539036273956299 + }, + { + "auxiliary_loss_clip": 0.01130175, + "auxiliary_loss_mlp": 0.01034316, + "balance_loss_clip": 1.05023384, + "balance_loss_mlp": 1.02029729, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.5045704193256217, + "language_loss": 0.79794097, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81958586, + "num_input_tokens_seen": 143021505, + "step": 6658, + "time_per_iteration": 2.5552303791046143 + }, + { + "auxiliary_loss_clip": 0.01114553, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.04621208, + "balance_loss_mlp": 1.02533007, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 1.9523935863773605, + "language_loss": 0.77504581, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79657519, + "num_input_tokens_seen": 143041375, + "step": 6659, + "time_per_iteration": 2.566817045211792 + }, + { + "auxiliary_loss_clip": 0.0111742, + "auxiliary_loss_mlp": 0.01026734, + "balance_loss_clip": 1.04552305, + "balance_loss_mlp": 1.01546288, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.7763890227605263, + "language_loss": 0.72843397, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.74987555, + "num_input_tokens_seen": 143058725, + "step": 6660, + "time_per_iteration": 2.476580858230591 + }, + { + "auxiliary_loss_clip": 0.01098, + "auxiliary_loss_mlp": 0.01036208, + "balance_loss_clip": 1.04430938, + "balance_loss_mlp": 1.02397108, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.738986696764367, + "language_loss": 0.71276963, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73411167, + "num_input_tokens_seen": 143076995, + "step": 6661, + "time_per_iteration": 2.5720791816711426 + }, + { + "auxiliary_loss_clip": 0.01129518, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.05025542, + "balance_loss_mlp": 1.02677667, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 1.702554894081864, + "language_loss": 0.75780714, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77949607, + "num_input_tokens_seen": 143096780, + "step": 6662, + "time_per_iteration": 2.4837634563446045 + }, + { + "auxiliary_loss_clip": 0.01111596, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.04482651, + "balance_loss_mlp": 1.01884019, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 1.9969985775134622, + "language_loss": 0.66016865, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68161064, + "num_input_tokens_seen": 143112590, + "step": 6663, + "time_per_iteration": 2.484163284301758 + }, + { + "auxiliary_loss_clip": 0.01108828, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.04423165, + "balance_loss_mlp": 1.02710128, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.0221134656796305, + "language_loss": 0.85540855, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.87690759, + "num_input_tokens_seen": 143130220, + "step": 6664, + "time_per_iteration": 2.513077735900879 + }, + { + "auxiliary_loss_clip": 0.01114753, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.04794908, + "balance_loss_mlp": 1.02190399, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 2.124279817821128, + "language_loss": 0.8484726, + "learning_rate": 2.723391152229917e-06, + "loss": 0.86996591, + "num_input_tokens_seen": 143147160, + "step": 6665, + "time_per_iteration": 2.4638185501098633 + }, + { + "auxiliary_loss_clip": 0.01114628, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.04943013, + "balance_loss_mlp": 1.01926184, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 1.9456708311567268, + "language_loss": 0.78443098, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.80590534, + "num_input_tokens_seen": 143164605, + "step": 6666, + "time_per_iteration": 2.4794859886169434 + }, + { + "auxiliary_loss_clip": 0.01116977, + "auxiliary_loss_mlp": 0.01040542, + "balance_loss_clip": 1.04889858, + "balance_loss_mlp": 1.02686846, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 1.7880495967244696, + "language_loss": 0.73444456, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75601977, + "num_input_tokens_seen": 143183965, + "step": 6667, + "time_per_iteration": 2.542250156402588 + }, + { + "auxiliary_loss_clip": 0.01110169, + "auxiliary_loss_mlp": 0.01048553, + "balance_loss_clip": 1.04596782, + "balance_loss_mlp": 1.03387821, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.6555309638965752, + "language_loss": 0.75604868, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.77763587, + "num_input_tokens_seen": 143204965, + "step": 6668, + "time_per_iteration": 2.5267891883850098 + }, + { + "auxiliary_loss_clip": 0.01091894, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.04902732, + "balance_loss_mlp": 1.02160811, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 1.901452942913847, + "language_loss": 0.82127237, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84254199, + "num_input_tokens_seen": 143225015, + "step": 6669, + "time_per_iteration": 2.632352113723755 + }, + { + "auxiliary_loss_clip": 0.01031971, + "auxiliary_loss_mlp": 0.01005789, + "balance_loss_clip": 1.02105236, + "balance_loss_mlp": 1.00416756, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.7111326349691449, + "language_loss": 0.5331794, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55355698, + "num_input_tokens_seen": 143294925, + "step": 6670, + "time_per_iteration": 3.269810914993286 + }, + { + "auxiliary_loss_clip": 0.01079282, + "auxiliary_loss_mlp": 0.01034661, + "balance_loss_clip": 1.04974174, + "balance_loss_mlp": 1.02158356, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.5959015082767698, + "language_loss": 0.8900888, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.9112283, + "num_input_tokens_seen": 143314170, + "step": 6671, + "time_per_iteration": 2.651196002960205 + }, + { + "auxiliary_loss_clip": 0.01114183, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.04528928, + "balance_loss_mlp": 1.01948678, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.9201743495944381, + "language_loss": 0.78964794, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81112522, + "num_input_tokens_seen": 143330050, + "step": 6672, + "time_per_iteration": 2.4958817958831787 + }, + { + "auxiliary_loss_clip": 0.01085274, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.04131913, + "balance_loss_mlp": 1.01896715, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 4.39523162939733, + "language_loss": 0.63255763, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65374607, + "num_input_tokens_seen": 143348650, + "step": 6673, + "time_per_iteration": 2.554178476333618 + }, + { + "auxiliary_loss_clip": 0.01101431, + "auxiliary_loss_mlp": 0.00795764, + "balance_loss_clip": 1.04613745, + "balance_loss_mlp": 1.02548778, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.4396334063754586, + "language_loss": 0.80240446, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.82137644, + "num_input_tokens_seen": 143370275, + "step": 6674, + "time_per_iteration": 2.590885639190674 + }, + { + "auxiliary_loss_clip": 0.01086393, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.05306005, + "balance_loss_mlp": 1.02132273, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 2.152765152486263, + "language_loss": 0.81931967, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84053385, + "num_input_tokens_seen": 143385390, + "step": 6675, + "time_per_iteration": 2.6176342964172363 + }, + { + "auxiliary_loss_clip": 0.01114885, + "auxiliary_loss_mlp": 0.01035039, + "balance_loss_clip": 1.04682112, + "balance_loss_mlp": 1.02055562, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 1.746885843783589, + "language_loss": 0.93411744, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95561671, + "num_input_tokens_seen": 143404215, + "step": 6676, + "time_per_iteration": 2.5524468421936035 + }, + { + "auxiliary_loss_clip": 0.01119761, + "auxiliary_loss_mlp": 0.01041345, + "balance_loss_clip": 1.04762864, + "balance_loss_mlp": 1.02639055, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 1.8537021691188993, + "language_loss": 0.79028338, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81189442, + "num_input_tokens_seen": 143422245, + "step": 6677, + "time_per_iteration": 2.5016286373138428 + }, + { + "auxiliary_loss_clip": 0.01107175, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.05084121, + "balance_loss_mlp": 1.0190413, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 1.8536075663240672, + "language_loss": 0.83936155, + "learning_rate": 2.71866862166691e-06, + "loss": 0.86075592, + "num_input_tokens_seen": 143443130, + "step": 6678, + "time_per_iteration": 3.911740779876709 + }, + { + "auxiliary_loss_clip": 0.0112302, + "auxiliary_loss_mlp": 0.01036531, + "balance_loss_clip": 1.04641283, + "balance_loss_mlp": 1.02282774, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.337987728124336, + "language_loss": 0.63655138, + "learning_rate": 2.718305158935434e-06, + "loss": 0.65814692, + "num_input_tokens_seen": 143461385, + "step": 6679, + "time_per_iteration": 2.447779893875122 + }, + { + "auxiliary_loss_clip": 0.01097038, + "auxiliary_loss_mlp": 0.01029064, + "balance_loss_clip": 1.04537344, + "balance_loss_mlp": 1.01633835, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.4845357228518856, + "language_loss": 0.78829908, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.80956006, + "num_input_tokens_seen": 143481750, + "step": 6680, + "time_per_iteration": 3.912682294845581 + }, + { + "auxiliary_loss_clip": 0.01094787, + "auxiliary_loss_mlp": 0.00793894, + "balance_loss_clip": 1.04841399, + "balance_loss_mlp": 1.01907921, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.5435336930820476, + "language_loss": 0.75819141, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77707827, + "num_input_tokens_seen": 143501540, + "step": 6681, + "time_per_iteration": 4.048239469528198 + }, + { + "auxiliary_loss_clip": 0.01087932, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.04576206, + "balance_loss_mlp": 1.01685858, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 1.7314068070559012, + "language_loss": 0.6397624, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.6609416, + "num_input_tokens_seen": 143520530, + "step": 6682, + "time_per_iteration": 2.590996503829956 + }, + { + "auxiliary_loss_clip": 0.01079986, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.04313564, + "balance_loss_mlp": 1.01877987, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 1.7006235455424963, + "language_loss": 0.72895646, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75007612, + "num_input_tokens_seen": 143540210, + "step": 6683, + "time_per_iteration": 4.073796272277832 + }, + { + "auxiliary_loss_clip": 0.0111102, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.04591417, + "balance_loss_mlp": 1.02593088, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.5875342191165946, + "language_loss": 0.73344898, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75494981, + "num_input_tokens_seen": 143560940, + "step": 6684, + "time_per_iteration": 2.556485176086426 + }, + { + "auxiliary_loss_clip": 0.01043771, + "auxiliary_loss_mlp": 0.01004956, + "balance_loss_clip": 1.02355623, + "balance_loss_mlp": 1.00361466, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8079764037969926, + "language_loss": 0.60413003, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62461734, + "num_input_tokens_seen": 143624015, + "step": 6685, + "time_per_iteration": 3.212909460067749 + }, + { + "auxiliary_loss_clip": 0.01120327, + "auxiliary_loss_mlp": 0.01029935, + "balance_loss_clip": 1.046556, + "balance_loss_mlp": 1.01602316, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 1.6905805280646442, + "language_loss": 0.70302176, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72452444, + "num_input_tokens_seen": 143642750, + "step": 6686, + "time_per_iteration": 2.490316867828369 + }, + { + "auxiliary_loss_clip": 0.0110037, + "auxiliary_loss_mlp": 0.01031434, + "balance_loss_clip": 1.04465389, + "balance_loss_mlp": 1.01880956, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.556826139001412, + "language_loss": 0.74902683, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.77034491, + "num_input_tokens_seen": 143664515, + "step": 6687, + "time_per_iteration": 2.5623998641967773 + }, + { + "auxiliary_loss_clip": 0.01103458, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.0470593, + "balance_loss_mlp": 1.01720834, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 2.1519694075367375, + "language_loss": 0.71244454, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.73378825, + "num_input_tokens_seen": 143683135, + "step": 6688, + "time_per_iteration": 2.584134340286255 + }, + { + "auxiliary_loss_clip": 0.01099979, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.04347336, + "balance_loss_mlp": 1.02859104, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.6790823630555596, + "language_loss": 0.64541447, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.66684234, + "num_input_tokens_seen": 143703985, + "step": 6689, + "time_per_iteration": 2.590778112411499 + }, + { + "auxiliary_loss_clip": 0.01115062, + "auxiliary_loss_mlp": 0.01026854, + "balance_loss_clip": 1.04477024, + "balance_loss_mlp": 1.01346636, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.0939155620714787, + "language_loss": 0.73322725, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.7546463, + "num_input_tokens_seen": 143719245, + "step": 6690, + "time_per_iteration": 2.4778342247009277 + }, + { + "auxiliary_loss_clip": 0.01089535, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.04163349, + "balance_loss_mlp": 1.01574087, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.5115613565244943, + "language_loss": 0.74870718, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.76990044, + "num_input_tokens_seen": 143739575, + "step": 6691, + "time_per_iteration": 2.5884199142456055 + }, + { + "auxiliary_loss_clip": 0.01106243, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.04920673, + "balance_loss_mlp": 1.02095962, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.6671439014002902, + "language_loss": 0.72618699, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74759048, + "num_input_tokens_seen": 143758515, + "step": 6692, + "time_per_iteration": 2.561613082885742 + }, + { + "auxiliary_loss_clip": 0.01074646, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.04401493, + "balance_loss_mlp": 1.01944613, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 2.080452068441124, + "language_loss": 0.83805972, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.85912919, + "num_input_tokens_seen": 143776770, + "step": 6693, + "time_per_iteration": 2.6005442142486572 + }, + { + "auxiliary_loss_clip": 0.01084834, + "auxiliary_loss_mlp": 0.01043643, + "balance_loss_clip": 1.04639482, + "balance_loss_mlp": 1.02909911, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 1.972885657183727, + "language_loss": 0.71157956, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73286438, + "num_input_tokens_seen": 143798450, + "step": 6694, + "time_per_iteration": 2.7122020721435547 + }, + { + "auxiliary_loss_clip": 0.01094248, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.04310942, + "balance_loss_mlp": 1.02008462, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 2.019754183498444, + "language_loss": 0.67794913, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.69922805, + "num_input_tokens_seen": 143816995, + "step": 6695, + "time_per_iteration": 2.52754545211792 + }, + { + "auxiliary_loss_clip": 0.01092388, + "auxiliary_loss_mlp": 0.01035482, + "balance_loss_clip": 1.04045296, + "balance_loss_mlp": 1.02143919, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 2.092151344628605, + "language_loss": 0.79533088, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.8166095, + "num_input_tokens_seen": 143842090, + "step": 6696, + "time_per_iteration": 2.945188045501709 + }, + { + "auxiliary_loss_clip": 0.01098191, + "auxiliary_loss_mlp": 0.01048227, + "balance_loss_clip": 1.0447197, + "balance_loss_mlp": 1.032336, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.912673719783568, + "language_loss": 0.71027279, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73173702, + "num_input_tokens_seen": 143860800, + "step": 6697, + "time_per_iteration": 2.576551675796509 + }, + { + "auxiliary_loss_clip": 0.01112749, + "auxiliary_loss_mlp": 0.01035592, + "balance_loss_clip": 1.04578686, + "balance_loss_mlp": 1.02228236, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.019138799736659, + "language_loss": 0.6187346, + "learning_rate": 2.711394207496984e-06, + "loss": 0.64021802, + "num_input_tokens_seen": 143878950, + "step": 6698, + "time_per_iteration": 2.5355989933013916 + }, + { + "auxiliary_loss_clip": 0.01114457, + "auxiliary_loss_mlp": 0.01029172, + "balance_loss_clip": 1.04649055, + "balance_loss_mlp": 1.01533175, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 2.5363923816351828, + "language_loss": 0.77275556, + "learning_rate": 2.711030202621491e-06, + "loss": 0.79419184, + "num_input_tokens_seen": 143898385, + "step": 6699, + "time_per_iteration": 2.5343313217163086 + }, + { + "auxiliary_loss_clip": 0.01086241, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.04405606, + "balance_loss_mlp": 1.01451564, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 2.126319305041707, + "language_loss": 0.8024683, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82360256, + "num_input_tokens_seen": 143918795, + "step": 6700, + "time_per_iteration": 2.5958404541015625 + }, + { + "auxiliary_loss_clip": 0.01106192, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.04975724, + "balance_loss_mlp": 1.02168322, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 1.8114333728397314, + "language_loss": 0.75060594, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77203798, + "num_input_tokens_seen": 143938245, + "step": 6701, + "time_per_iteration": 2.606935739517212 + }, + { + "auxiliary_loss_clip": 0.01092987, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.04410374, + "balance_loss_mlp": 1.02121449, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.6498034203487644, + "language_loss": 0.66174555, + "learning_rate": 2.709938026276208e-06, + "loss": 0.68301487, + "num_input_tokens_seen": 143960995, + "step": 6702, + "time_per_iteration": 2.600428581237793 + }, + { + "auxiliary_loss_clip": 0.01097379, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.04463136, + "balance_loss_mlp": 1.0202291, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.8696773033957998, + "language_loss": 0.65920657, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68052721, + "num_input_tokens_seen": 143979910, + "step": 6703, + "time_per_iteration": 2.5674004554748535 + }, + { + "auxiliary_loss_clip": 0.01056396, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.04614019, + "balance_loss_mlp": 1.02211559, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 1.933230747477807, + "language_loss": 0.81902027, + "learning_rate": 2.709209774085071e-06, + "loss": 0.83996022, + "num_input_tokens_seen": 144000095, + "step": 6704, + "time_per_iteration": 2.749000072479248 + }, + { + "auxiliary_loss_clip": 0.01111213, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.05053246, + "balance_loss_mlp": 1.01855814, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 1.6333579160463483, + "language_loss": 0.73098695, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75242013, + "num_input_tokens_seen": 144019695, + "step": 6705, + "time_per_iteration": 2.7992630004882812 + }, + { + "auxiliary_loss_clip": 0.01109141, + "auxiliary_loss_mlp": 0.01032208, + "balance_loss_clip": 1.04561698, + "balance_loss_mlp": 1.01936889, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.9997398191913804, + "language_loss": 0.66614842, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68756187, + "num_input_tokens_seen": 144038525, + "step": 6706, + "time_per_iteration": 2.5008106231689453 + }, + { + "auxiliary_loss_clip": 0.01112843, + "auxiliary_loss_mlp": 0.01038025, + "balance_loss_clip": 1.04706216, + "balance_loss_mlp": 1.02420902, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.5240327175239061, + "language_loss": 0.71257877, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73408747, + "num_input_tokens_seen": 144059485, + "step": 6707, + "time_per_iteration": 2.539362668991089 + }, + { + "auxiliary_loss_clip": 0.01096289, + "auxiliary_loss_mlp": 0.01026964, + "balance_loss_clip": 1.04515123, + "balance_loss_mlp": 1.01387453, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.5303743920383786, + "language_loss": 0.79745501, + "learning_rate": 2.707752947093611e-06, + "loss": 0.81868756, + "num_input_tokens_seen": 144080265, + "step": 6708, + "time_per_iteration": 2.5736637115478516 + }, + { + "auxiliary_loss_clip": 0.01074226, + "auxiliary_loss_mlp": 0.01035686, + "balance_loss_clip": 1.04180169, + "balance_loss_mlp": 1.02198887, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.0708209337115684, + "language_loss": 0.82628655, + "learning_rate": 2.70738867321606e-06, + "loss": 0.84738564, + "num_input_tokens_seen": 144098040, + "step": 6709, + "time_per_iteration": 2.5728559494018555 + }, + { + "auxiliary_loss_clip": 0.01115675, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.05128598, + "balance_loss_mlp": 1.02145123, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 1.4957394050170716, + "language_loss": 0.71136451, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73287702, + "num_input_tokens_seen": 144118265, + "step": 6710, + "time_per_iteration": 2.5703036785125732 + }, + { + "auxiliary_loss_clip": 0.01091098, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.04484558, + "balance_loss_mlp": 1.01776814, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 2.477459158824322, + "language_loss": 0.8527509, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87397337, + "num_input_tokens_seen": 144133865, + "step": 6711, + "time_per_iteration": 2.507282257080078 + }, + { + "auxiliary_loss_clip": 0.01116941, + "auxiliary_loss_mlp": 0.01032683, + "balance_loss_clip": 1.04766989, + "balance_loss_mlp": 1.01883113, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 2.0314412210675497, + "language_loss": 0.7657162, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78721249, + "num_input_tokens_seen": 144150125, + "step": 6712, + "time_per_iteration": 2.5194931030273438 + }, + { + "auxiliary_loss_clip": 0.0110318, + "auxiliary_loss_mlp": 0.01039378, + "balance_loss_clip": 1.04748845, + "balance_loss_mlp": 1.02574682, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 1.8465563084290522, + "language_loss": 0.78312701, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.80455267, + "num_input_tokens_seen": 144169295, + "step": 6713, + "time_per_iteration": 2.558682918548584 + }, + { + "auxiliary_loss_clip": 0.01091225, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.04326773, + "balance_loss_mlp": 1.02098858, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 2.149655025184392, + "language_loss": 0.88074416, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90201879, + "num_input_tokens_seen": 144185790, + "step": 6714, + "time_per_iteration": 2.5509650707244873 + }, + { + "auxiliary_loss_clip": 0.01116439, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.04856622, + "balance_loss_mlp": 1.02342355, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 1.7265144693806869, + "language_loss": 0.69112509, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71265602, + "num_input_tokens_seen": 144205190, + "step": 6715, + "time_per_iteration": 2.5454583168029785 + }, + { + "auxiliary_loss_clip": 0.0108189, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.04356003, + "balance_loss_mlp": 1.01787806, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 1.9100919630952946, + "language_loss": 0.77039754, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79153258, + "num_input_tokens_seen": 144222705, + "step": 6716, + "time_per_iteration": 4.118279933929443 + }, + { + "auxiliary_loss_clip": 0.01078749, + "auxiliary_loss_mlp": 0.0103177, + "balance_loss_clip": 1.04607952, + "balance_loss_mlp": 1.01935434, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 2.216346700890461, + "language_loss": 0.7642917, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78539687, + "num_input_tokens_seen": 144239545, + "step": 6717, + "time_per_iteration": 2.5754966735839844 + }, + { + "auxiliary_loss_clip": 0.01037655, + "auxiliary_loss_mlp": 0.01001156, + "balance_loss_clip": 1.02711344, + "balance_loss_mlp": 0.99978542, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.9297642796698974, + "language_loss": 0.60703176, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62741983, + "num_input_tokens_seen": 144288145, + "step": 6718, + "time_per_iteration": 2.9968924522399902 + }, + { + "auxiliary_loss_clip": 0.01129054, + "auxiliary_loss_mlp": 0.01034887, + "balance_loss_clip": 1.04654026, + "balance_loss_mlp": 1.02017641, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 3.5512177776227927, + "language_loss": 0.74935722, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.77099663, + "num_input_tokens_seen": 144302315, + "step": 6719, + "time_per_iteration": 3.9547414779663086 + }, + { + "auxiliary_loss_clip": 0.01118611, + "auxiliary_loss_mlp": 0.0104188, + "balance_loss_clip": 1.0483222, + "balance_loss_mlp": 1.02747345, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 2.1167355610814655, + "language_loss": 0.81324291, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83484781, + "num_input_tokens_seen": 144318990, + "step": 6720, + "time_per_iteration": 3.8440959453582764 + }, + { + "auxiliary_loss_clip": 0.01103371, + "auxiliary_loss_mlp": 0.01029381, + "balance_loss_clip": 1.04797411, + "balance_loss_mlp": 1.0161072, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 1.9567295308614632, + "language_loss": 0.77105397, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79238153, + "num_input_tokens_seen": 144335765, + "step": 6721, + "time_per_iteration": 3.928610324859619 + }, + { + "auxiliary_loss_clip": 0.01091236, + "auxiliary_loss_mlp": 0.01026126, + "balance_loss_clip": 1.05040932, + "balance_loss_mlp": 1.01454473, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 1.742144270304361, + "language_loss": 0.7250129, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74618649, + "num_input_tokens_seen": 144355825, + "step": 6722, + "time_per_iteration": 2.5956079959869385 + }, + { + "auxiliary_loss_clip": 0.01112396, + "auxiliary_loss_mlp": 0.01028526, + "balance_loss_clip": 1.04703403, + "balance_loss_mlp": 1.01606858, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.8434678555712887, + "language_loss": 0.65662491, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.67803419, + "num_input_tokens_seen": 144374320, + "step": 6723, + "time_per_iteration": 2.487516403198242 + }, + { + "auxiliary_loss_clip": 0.01115401, + "auxiliary_loss_mlp": 0.01039994, + "balance_loss_clip": 1.05111742, + "balance_loss_mlp": 1.02564096, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.49806729908896, + "language_loss": 0.73690772, + "learning_rate": 2.701921353880734e-06, + "loss": 0.75846171, + "num_input_tokens_seen": 144394325, + "step": 6724, + "time_per_iteration": 2.512136936187744 + }, + { + "auxiliary_loss_clip": 0.01094446, + "auxiliary_loss_mlp": 0.01034835, + "balance_loss_clip": 1.0481348, + "balance_loss_mlp": 1.02170968, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.7345432133911438, + "language_loss": 0.74707603, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.76836884, + "num_input_tokens_seen": 144412765, + "step": 6725, + "time_per_iteration": 2.580963611602783 + }, + { + "auxiliary_loss_clip": 0.01109255, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.04845643, + "balance_loss_mlp": 1.01257181, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 2.644308346963915, + "language_loss": 0.76618677, + "learning_rate": 2.701191924463126e-06, + "loss": 0.78755069, + "num_input_tokens_seen": 144435400, + "step": 6726, + "time_per_iteration": 2.725804567337036 + }, + { + "auxiliary_loss_clip": 0.01099084, + "auxiliary_loss_mlp": 0.00792736, + "balance_loss_clip": 1.04312611, + "balance_loss_mlp": 1.01369119, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 1.9159821297985327, + "language_loss": 0.8176688, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83658695, + "num_input_tokens_seen": 144452925, + "step": 6727, + "time_per_iteration": 2.5089118480682373 + }, + { + "auxiliary_loss_clip": 0.01122997, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.04507136, + "balance_loss_mlp": 1.01524568, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 1.8981940472379273, + "language_loss": 0.85203177, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87354219, + "num_input_tokens_seen": 144470195, + "step": 6728, + "time_per_iteration": 2.508298635482788 + }, + { + "auxiliary_loss_clip": 0.01090232, + "auxiliary_loss_mlp": 0.01027345, + "balance_loss_clip": 1.04597461, + "balance_loss_mlp": 1.014768, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 2.006748857296638, + "language_loss": 0.81981856, + "learning_rate": 2.700097580951786e-06, + "loss": 0.84099436, + "num_input_tokens_seen": 144490320, + "step": 6729, + "time_per_iteration": 2.551534414291382 + }, + { + "auxiliary_loss_clip": 0.0110229, + "auxiliary_loss_mlp": 0.01035275, + "balance_loss_clip": 1.04726863, + "balance_loss_mlp": 1.02197146, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 2.0931471389866565, + "language_loss": 0.73000288, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.75137854, + "num_input_tokens_seen": 144508990, + "step": 6730, + "time_per_iteration": 2.5615696907043457 + }, + { + "auxiliary_loss_clip": 0.01109254, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.04385555, + "balance_loss_mlp": 1.01908827, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 1.6965617221567244, + "language_loss": 0.67887986, + "learning_rate": 2.699367885848985e-06, + "loss": 0.7002967, + "num_input_tokens_seen": 144529550, + "step": 6731, + "time_per_iteration": 2.6306514739990234 + }, + { + "auxiliary_loss_clip": 0.01121517, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.04449666, + "balance_loss_mlp": 1.01928973, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.5635018053618426, + "language_loss": 0.738428, + "learning_rate": 2.699002998510517e-06, + "loss": 0.75995612, + "num_input_tokens_seen": 144549310, + "step": 6732, + "time_per_iteration": 2.49080753326416 + }, + { + "auxiliary_loss_clip": 0.01093595, + "auxiliary_loss_mlp": 0.00786568, + "balance_loss_clip": 1.04447532, + "balance_loss_mlp": 1.0105648, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.8055656162293465, + "language_loss": 0.77331543, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79211712, + "num_input_tokens_seen": 144567430, + "step": 6733, + "time_per_iteration": 2.5254604816436768 + }, + { + "auxiliary_loss_clip": 0.01097295, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.04166567, + "balance_loss_mlp": 1.02170873, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 1.8409244543292647, + "language_loss": 0.76390648, + "learning_rate": 2.698273144328627e-06, + "loss": 0.78524154, + "num_input_tokens_seen": 144585975, + "step": 6734, + "time_per_iteration": 2.553057909011841 + }, + { + "auxiliary_loss_clip": 0.0110519, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.04897547, + "balance_loss_mlp": 1.02116704, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 1.9313395418600623, + "language_loss": 0.64373738, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.66513449, + "num_input_tokens_seen": 144605225, + "step": 6735, + "time_per_iteration": 2.5432989597320557 + }, + { + "auxiliary_loss_clip": 0.010817, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.03999412, + "balance_loss_mlp": 1.024436, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 2.0363345303392797, + "language_loss": 0.82728237, + "learning_rate": 2.697543184232387e-06, + "loss": 0.84847677, + "num_input_tokens_seen": 144624145, + "step": 6736, + "time_per_iteration": 2.5772979259490967 + }, + { + "auxiliary_loss_clip": 0.01093208, + "auxiliary_loss_mlp": 0.00788069, + "balance_loss_clip": 1.04388738, + "balance_loss_mlp": 1.00959826, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.7328782526291835, + "language_loss": 0.75396949, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77278233, + "num_input_tokens_seen": 144644470, + "step": 6737, + "time_per_iteration": 2.5709688663482666 + }, + { + "auxiliary_loss_clip": 0.01112422, + "auxiliary_loss_mlp": 0.01039035, + "balance_loss_clip": 1.04579854, + "balance_loss_mlp": 1.02604699, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 2.3779003906642813, + "language_loss": 0.71770048, + "learning_rate": 2.696813118332519e-06, + "loss": 0.73921502, + "num_input_tokens_seen": 144661055, + "step": 6738, + "time_per_iteration": 2.510666847229004 + }, + { + "auxiliary_loss_clip": 0.0108557, + "auxiliary_loss_mlp": 0.01033241, + "balance_loss_clip": 1.03960752, + "balance_loss_mlp": 1.02087915, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 2.033141152557743, + "language_loss": 0.74733102, + "learning_rate": 2.696448045740828e-06, + "loss": 0.76851916, + "num_input_tokens_seen": 144677935, + "step": 6739, + "time_per_iteration": 2.5242350101470947 + }, + { + "auxiliary_loss_clip": 0.01088354, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.04525959, + "balance_loss_mlp": 1.01942611, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 1.8869391037022185, + "language_loss": 0.74257535, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76379204, + "num_input_tokens_seen": 144697725, + "step": 6740, + "time_per_iteration": 2.6187238693237305 + }, + { + "auxiliary_loss_clip": 0.0110673, + "auxiliary_loss_mlp": 0.01031822, + "balance_loss_clip": 1.04399526, + "balance_loss_mlp": 1.01899505, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.607266604576023, + "language_loss": 0.76975703, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79114258, + "num_input_tokens_seen": 144718805, + "step": 6741, + "time_per_iteration": 2.55676007270813 + }, + { + "auxiliary_loss_clip": 0.01126838, + "auxiliary_loss_mlp": 0.01038336, + "balance_loss_clip": 1.04665458, + "balance_loss_mlp": 1.0234586, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 2.3309666683725405, + "language_loss": 0.71470547, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.73635715, + "num_input_tokens_seen": 144737105, + "step": 6742, + "time_per_iteration": 2.491229772567749 + }, + { + "auxiliary_loss_clip": 0.01125482, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.04627895, + "balance_loss_mlp": 1.01817489, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.0732017142561268, + "language_loss": 0.71839273, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.73996747, + "num_input_tokens_seen": 144751350, + "step": 6743, + "time_per_iteration": 2.416482448577881 + }, + { + "auxiliary_loss_clip": 0.01101797, + "auxiliary_loss_mlp": 0.01032938, + "balance_loss_clip": 1.04375541, + "balance_loss_mlp": 1.01876962, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 1.962138527337549, + "language_loss": 0.70713949, + "learning_rate": 2.694622286918588e-06, + "loss": 0.7284869, + "num_input_tokens_seen": 144770030, + "step": 6744, + "time_per_iteration": 2.553705930709839 + }, + { + "auxiliary_loss_clip": 0.01109818, + "auxiliary_loss_mlp": 0.01035039, + "balance_loss_clip": 1.04407001, + "balance_loss_mlp": 1.02281964, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 1.6239842500609276, + "language_loss": 0.80172557, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82317412, + "num_input_tokens_seen": 144790965, + "step": 6745, + "time_per_iteration": 2.546596050262451 + }, + { + "auxiliary_loss_clip": 0.01105699, + "auxiliary_loss_mlp": 0.01033649, + "balance_loss_clip": 1.04608583, + "balance_loss_mlp": 1.01986265, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 1.834793896836658, + "language_loss": 0.66692811, + "learning_rate": 2.693891798911731e-06, + "loss": 0.68832159, + "num_input_tokens_seen": 144807755, + "step": 6746, + "time_per_iteration": 2.485659122467041 + }, + { + "auxiliary_loss_clip": 0.01086128, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.04395497, + "balance_loss_mlp": 1.01647139, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.589996203313907, + "language_loss": 0.57112443, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59227812, + "num_input_tokens_seen": 144832405, + "step": 6747, + "time_per_iteration": 2.741436243057251 + }, + { + "auxiliary_loss_clip": 0.01094528, + "auxiliary_loss_mlp": 0.01039651, + "balance_loss_clip": 1.04671216, + "balance_loss_mlp": 1.0270803, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.7276330152873935, + "language_loss": 0.84409076, + "learning_rate": 2.693161205655089e-06, + "loss": 0.86543262, + "num_input_tokens_seen": 144853890, + "step": 6748, + "time_per_iteration": 2.658112049102783 + }, + { + "auxiliary_loss_clip": 0.01098877, + "auxiliary_loss_mlp": 0.01041378, + "balance_loss_clip": 1.04749084, + "balance_loss_mlp": 1.02743602, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 2.1749191179579848, + "language_loss": 0.81296337, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83436596, + "num_input_tokens_seen": 144871395, + "step": 6749, + "time_per_iteration": 2.5143918991088867 + }, + { + "auxiliary_loss_clip": 0.01110281, + "auxiliary_loss_mlp": 0.00789194, + "balance_loss_clip": 1.04438472, + "balance_loss_mlp": 1.01255584, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.8339487941733779, + "language_loss": 0.75448799, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.7734828, + "num_input_tokens_seen": 144890975, + "step": 6750, + "time_per_iteration": 2.5323739051818848 + }, + { + "auxiliary_loss_clip": 0.01104872, + "auxiliary_loss_mlp": 0.01037121, + "balance_loss_clip": 1.04265809, + "balance_loss_mlp": 1.02298927, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 2.03104302510843, + "language_loss": 0.74612653, + "learning_rate": 2.692065118669195e-06, + "loss": 0.76754647, + "num_input_tokens_seen": 144908170, + "step": 6751, + "time_per_iteration": 2.5154404640197754 + }, + { + "auxiliary_loss_clip": 0.01081636, + "auxiliary_loss_mlp": 0.01039283, + "balance_loss_clip": 1.04513478, + "balance_loss_mlp": 1.02341592, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 1.521370816356625, + "language_loss": 0.66573215, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.68694139, + "num_input_tokens_seen": 144928020, + "step": 6752, + "time_per_iteration": 2.6662120819091797 + }, + { + "auxiliary_loss_clip": 0.01078377, + "auxiliary_loss_mlp": 0.01036092, + "balance_loss_clip": 1.04622817, + "balance_loss_mlp": 1.02134633, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 2.1441800518473126, + "language_loss": 0.70883751, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72998226, + "num_input_tokens_seen": 144951240, + "step": 6753, + "time_per_iteration": 2.8603169918060303 + }, + { + "auxiliary_loss_clip": 0.01099811, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.04219079, + "balance_loss_mlp": 1.02060258, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 2.1775197674281714, + "language_loss": 0.71798944, + "learning_rate": 2.690968795494699e-06, + "loss": 0.73933971, + "num_input_tokens_seen": 144969100, + "step": 6754, + "time_per_iteration": 2.513707399368286 + }, + { + "auxiliary_loss_clip": 0.01093379, + "auxiliary_loss_mlp": 0.01038131, + "balance_loss_clip": 1.04248548, + "balance_loss_mlp": 1.02460051, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.7767669729855629, + "language_loss": 0.82769632, + "learning_rate": 2.690603302014844e-06, + "loss": 0.84901142, + "num_input_tokens_seen": 144987065, + "step": 6755, + "time_per_iteration": 3.9503090381622314 + }, + { + "auxiliary_loss_clip": 0.01085653, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.04592371, + "balance_loss_mlp": 1.02338231, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 1.9390775706915246, + "language_loss": 0.70986056, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.73109126, + "num_input_tokens_seen": 145007310, + "step": 6756, + "time_per_iteration": 2.6233458518981934 + }, + { + "auxiliary_loss_clip": 0.01063661, + "auxiliary_loss_mlp": 0.00790099, + "balance_loss_clip": 1.04155552, + "balance_loss_mlp": 1.00966692, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 2.9771929866383897, + "language_loss": 0.7884202, + "learning_rate": 2.689872236505755e-06, + "loss": 0.80695778, + "num_input_tokens_seen": 145026210, + "step": 6757, + "time_per_iteration": 4.1225364208221436 + }, + { + "auxiliary_loss_clip": 0.01103365, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.04675412, + "balance_loss_mlp": 1.01563311, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 1.6309350719740792, + "language_loss": 0.78776717, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.80909258, + "num_input_tokens_seen": 145045475, + "step": 6758, + "time_per_iteration": 2.578040361404419 + }, + { + "auxiliary_loss_clip": 0.0108644, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.04717946, + "balance_loss_mlp": 1.02024055, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 1.9587827870769112, + "language_loss": 0.88788676, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.90908909, + "num_input_tokens_seen": 145062260, + "step": 6759, + "time_per_iteration": 4.040321588516235 + }, + { + "auxiliary_loss_clip": 0.01097142, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.04584825, + "balance_loss_mlp": 1.02097845, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 1.6607598962831327, + "language_loss": 0.64506137, + "learning_rate": 2.688775442076598e-06, + "loss": 0.6663813, + "num_input_tokens_seen": 145082470, + "step": 6760, + "time_per_iteration": 3.9791762828826904 + }, + { + "auxiliary_loss_clip": 0.01111816, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.04261708, + "balance_loss_mlp": 1.01861119, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.9105038597210833, + "language_loss": 0.75233138, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77377683, + "num_input_tokens_seen": 145105685, + "step": 6761, + "time_per_iteration": 2.5754306316375732 + }, + { + "auxiliary_loss_clip": 0.01090834, + "auxiliary_loss_mlp": 0.01035082, + "balance_loss_clip": 1.0474577, + "balance_loss_mlp": 1.02203441, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 1.4494421181115518, + "language_loss": 0.70187092, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72313011, + "num_input_tokens_seen": 145125590, + "step": 6762, + "time_per_iteration": 2.5537681579589844 + }, + { + "auxiliary_loss_clip": 0.0110989, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.04818034, + "balance_loss_mlp": 1.02181256, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 1.5531986126226598, + "language_loss": 0.73318565, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75463182, + "num_input_tokens_seen": 145146810, + "step": 6763, + "time_per_iteration": 2.549959659576416 + }, + { + "auxiliary_loss_clip": 0.01085367, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.04135203, + "balance_loss_mlp": 1.0170269, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 1.62240010954316, + "language_loss": 0.69232035, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71348953, + "num_input_tokens_seen": 145163130, + "step": 6764, + "time_per_iteration": 2.5176656246185303 + }, + { + "auxiliary_loss_clip": 0.01093138, + "auxiliary_loss_mlp": 0.01038313, + "balance_loss_clip": 1.04074407, + "balance_loss_mlp": 1.02297056, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 2.0685141023080473, + "language_loss": 0.91309512, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93440962, + "num_input_tokens_seen": 145181420, + "step": 6765, + "time_per_iteration": 2.5855164527893066 + }, + { + "auxiliary_loss_clip": 0.01111606, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.0433836, + "balance_loss_mlp": 1.02034521, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 2.40438238596153, + "language_loss": 0.78836834, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.80983365, + "num_input_tokens_seen": 145198545, + "step": 6766, + "time_per_iteration": 2.491452217102051 + }, + { + "auxiliary_loss_clip": 0.01124077, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.04334903, + "balance_loss_mlp": 1.01895857, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 1.8861793963221483, + "language_loss": 0.76617962, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78774554, + "num_input_tokens_seen": 145215835, + "step": 6767, + "time_per_iteration": 2.4552407264709473 + }, + { + "auxiliary_loss_clip": 0.0110969, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.04406357, + "balance_loss_mlp": 1.01974261, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 1.8885269377200693, + "language_loss": 0.77444309, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79586411, + "num_input_tokens_seen": 145236555, + "step": 6768, + "time_per_iteration": 2.557558298110962 + }, + { + "auxiliary_loss_clip": 0.01122129, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.04390907, + "balance_loss_mlp": 1.01919091, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 1.9393004801923521, + "language_loss": 0.86812299, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.88966668, + "num_input_tokens_seen": 145254595, + "step": 6769, + "time_per_iteration": 2.469980478286743 + }, + { + "auxiliary_loss_clip": 0.01095441, + "auxiliary_loss_mlp": 0.01039219, + "balance_loss_clip": 1.04509497, + "balance_loss_mlp": 1.02579582, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 1.7534827526793941, + "language_loss": 0.8077876, + "learning_rate": 2.685117765051156e-06, + "loss": 0.82913423, + "num_input_tokens_seen": 145274005, + "step": 6770, + "time_per_iteration": 2.516430139541626 + }, + { + "auxiliary_loss_clip": 0.0112473, + "auxiliary_loss_mlp": 0.0102955, + "balance_loss_clip": 1.04366362, + "balance_loss_mlp": 1.0155375, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.6274716307928103, + "language_loss": 0.8019802, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82352298, + "num_input_tokens_seen": 145294850, + "step": 6771, + "time_per_iteration": 2.5240402221679688 + }, + { + "auxiliary_loss_clip": 0.0108845, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.04183793, + "balance_loss_mlp": 1.02289057, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.4031927181783144, + "language_loss": 0.7609669, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.78221428, + "num_input_tokens_seen": 145317050, + "step": 6772, + "time_per_iteration": 2.6213674545288086 + }, + { + "auxiliary_loss_clip": 0.01100505, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.04332089, + "balance_loss_mlp": 1.02422619, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 1.7235552652366573, + "language_loss": 0.81552935, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83691949, + "num_input_tokens_seen": 145334480, + "step": 6773, + "time_per_iteration": 2.523390531539917 + }, + { + "auxiliary_loss_clip": 0.01039919, + "auxiliary_loss_mlp": 0.01001185, + "balance_loss_clip": 1.0322634, + "balance_loss_mlp": 0.99949229, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.83236105649381, + "language_loss": 0.64414012, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66455114, + "num_input_tokens_seen": 145388695, + "step": 6774, + "time_per_iteration": 3.0575952529907227 + }, + { + "auxiliary_loss_clip": 0.01079444, + "auxiliary_loss_mlp": 0.01034489, + "balance_loss_clip": 1.04376674, + "balance_loss_mlp": 1.02138817, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 1.7652977917510353, + "language_loss": 0.72727162, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74841094, + "num_input_tokens_seen": 145408240, + "step": 6775, + "time_per_iteration": 2.6476058959960938 + }, + { + "auxiliary_loss_clip": 0.01105137, + "auxiliary_loss_mlp": 0.00792784, + "balance_loss_clip": 1.04783607, + "balance_loss_mlp": 1.01551116, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.4580746653257373, + "language_loss": 0.7783798, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.79735899, + "num_input_tokens_seen": 145428395, + "step": 6776, + "time_per_iteration": 2.580406665802002 + }, + { + "auxiliary_loss_clip": 0.01116241, + "auxiliary_loss_mlp": 0.01041366, + "balance_loss_clip": 1.04624629, + "balance_loss_mlp": 1.02708507, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 2.4426421472706163, + "language_loss": 0.79424095, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81581706, + "num_input_tokens_seen": 145448290, + "step": 6777, + "time_per_iteration": 2.5438525676727295 + }, + { + "auxiliary_loss_clip": 0.01056421, + "auxiliary_loss_mlp": 0.01002616, + "balance_loss_clip": 1.02859378, + "balance_loss_mlp": 1.00117993, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6843405677614325, + "language_loss": 0.53131956, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55190992, + "num_input_tokens_seen": 145509785, + "step": 6778, + "time_per_iteration": 3.108289957046509 + }, + { + "auxiliary_loss_clip": 0.01125877, + "auxiliary_loss_mlp": 0.00799633, + "balance_loss_clip": 1.0468564, + "balance_loss_mlp": 1.02725291, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 2.081995608558993, + "language_loss": 0.82343113, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84268618, + "num_input_tokens_seen": 145528620, + "step": 6779, + "time_per_iteration": 2.5115256309509277 + }, + { + "auxiliary_loss_clip": 0.01114675, + "auxiliary_loss_mlp": 0.0103886, + "balance_loss_clip": 1.04526472, + "balance_loss_mlp": 1.02520466, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.5959484804484703, + "language_loss": 0.76289284, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78442812, + "num_input_tokens_seen": 145547775, + "step": 6780, + "time_per_iteration": 2.541353940963745 + }, + { + "auxiliary_loss_clip": 0.01105502, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.04612505, + "balance_loss_mlp": 1.01720142, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 1.9166704183029777, + "language_loss": 0.65866691, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68001366, + "num_input_tokens_seen": 145564465, + "step": 6781, + "time_per_iteration": 2.4772162437438965 + }, + { + "auxiliary_loss_clip": 0.01097315, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.04123855, + "balance_loss_mlp": 1.01980364, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 1.6251405094360731, + "language_loss": 0.71168756, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73299628, + "num_input_tokens_seen": 145585965, + "step": 6782, + "time_per_iteration": 2.6232407093048096 + }, + { + "auxiliary_loss_clip": 0.01114304, + "auxiliary_loss_mlp": 0.01031926, + "balance_loss_clip": 1.04385304, + "balance_loss_mlp": 1.01885426, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 1.842325251144414, + "language_loss": 0.82395387, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84541619, + "num_input_tokens_seen": 145605000, + "step": 6783, + "time_per_iteration": 2.493436098098755 + }, + { + "auxiliary_loss_clip": 0.01109535, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.04659367, + "balance_loss_mlp": 1.02181673, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 1.5243863666915238, + "language_loss": 0.81154555, + "learning_rate": 2.679992655730283e-06, + "loss": 0.83299589, + "num_input_tokens_seen": 145623740, + "step": 6784, + "time_per_iteration": 2.5307202339172363 + }, + { + "auxiliary_loss_clip": 0.01094729, + "auxiliary_loss_mlp": 0.01038907, + "balance_loss_clip": 1.04605031, + "balance_loss_mlp": 1.02436304, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 1.7274893359850365, + "language_loss": 0.65807456, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67941093, + "num_input_tokens_seen": 145643515, + "step": 6785, + "time_per_iteration": 2.581906795501709 + }, + { + "auxiliary_loss_clip": 0.01107994, + "auxiliary_loss_mlp": 0.01034283, + "balance_loss_clip": 1.04655838, + "balance_loss_mlp": 1.02111042, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 2.1681410154160754, + "language_loss": 0.79699361, + "learning_rate": 2.679260083800989e-06, + "loss": 0.81841636, + "num_input_tokens_seen": 145660890, + "step": 6786, + "time_per_iteration": 2.4962656497955322 + }, + { + "auxiliary_loss_clip": 0.01122157, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.04452169, + "balance_loss_mlp": 1.02239275, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.6241500255555212, + "language_loss": 0.81532007, + "learning_rate": 2.678893759192982e-06, + "loss": 0.83688664, + "num_input_tokens_seen": 145680070, + "step": 6787, + "time_per_iteration": 2.4843637943267822 + }, + { + "auxiliary_loss_clip": 0.01112495, + "auxiliary_loss_mlp": 0.01030257, + "balance_loss_clip": 1.04711819, + "balance_loss_mlp": 1.01710236, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.9411784197224986, + "language_loss": 0.67617333, + "learning_rate": 2.678527408841255e-06, + "loss": 0.69760084, + "num_input_tokens_seen": 145698010, + "step": 6788, + "time_per_iteration": 2.500030279159546 + }, + { + "auxiliary_loss_clip": 0.01095813, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.04167747, + "balance_loss_mlp": 1.02696443, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 9.559831749125316, + "language_loss": 0.66149259, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68287408, + "num_input_tokens_seen": 145722215, + "step": 6789, + "time_per_iteration": 2.6981546878814697 + }, + { + "auxiliary_loss_clip": 0.01076427, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.04110336, + "balance_loss_mlp": 1.01825428, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 1.646905535599218, + "language_loss": 0.6095649, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.63064915, + "num_input_tokens_seen": 145741090, + "step": 6790, + "time_per_iteration": 2.578551769256592 + }, + { + "auxiliary_loss_clip": 0.01105838, + "auxiliary_loss_mlp": 0.01040567, + "balance_loss_clip": 1.04481006, + "balance_loss_mlp": 1.02658403, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 2.802200415853678, + "language_loss": 0.69406545, + "learning_rate": 2.677428203462683e-06, + "loss": 0.71552956, + "num_input_tokens_seen": 145754985, + "step": 6791, + "time_per_iteration": 2.456068515777588 + }, + { + "auxiliary_loss_clip": 0.01043774, + "auxiliary_loss_mlp": 0.01001359, + "balance_loss_clip": 1.026263, + "balance_loss_mlp": 1.00005329, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.787686615417677, + "language_loss": 0.59630609, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61675739, + "num_input_tokens_seen": 145815260, + "step": 6792, + "time_per_iteration": 3.10335111618042 + }, + { + "auxiliary_loss_clip": 0.01129547, + "auxiliary_loss_mlp": 0.01036921, + "balance_loss_clip": 1.04915118, + "balance_loss_mlp": 1.02288437, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 2.220334181055106, + "language_loss": 0.8005563, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82222098, + "num_input_tokens_seen": 145832665, + "step": 6793, + "time_per_iteration": 2.499312400817871 + }, + { + "auxiliary_loss_clip": 0.01114841, + "auxiliary_loss_mlp": 0.01033083, + "balance_loss_clip": 1.04497397, + "balance_loss_mlp": 1.01919496, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 2.1006775862584663, + "language_loss": 0.85182106, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87330031, + "num_input_tokens_seen": 145850240, + "step": 6794, + "time_per_iteration": 3.9080233573913574 + }, + { + "auxiliary_loss_clip": 0.01093564, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.0453335, + "balance_loss_mlp": 1.01926661, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.5923176644369232, + "language_loss": 0.7987864, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82004791, + "num_input_tokens_seen": 145869545, + "step": 6795, + "time_per_iteration": 2.5581021308898926 + }, + { + "auxiliary_loss_clip": 0.01112214, + "auxiliary_loss_mlp": 0.01033678, + "balance_loss_clip": 1.04415727, + "balance_loss_mlp": 1.01906276, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 3.1049846236059375, + "language_loss": 0.69796777, + "learning_rate": 2.675595680920792e-06, + "loss": 0.71942663, + "num_input_tokens_seen": 145884025, + "step": 6796, + "time_per_iteration": 2.5323867797851562 + }, + { + "auxiliary_loss_clip": 0.01106914, + "auxiliary_loss_mlp": 0.00795973, + "balance_loss_clip": 1.04184484, + "balance_loss_mlp": 1.014678, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.6221620314906178, + "language_loss": 0.77469456, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.79372346, + "num_input_tokens_seen": 145903210, + "step": 6797, + "time_per_iteration": 5.385161638259888 + }, + { + "auxiliary_loss_clip": 0.01112284, + "auxiliary_loss_mlp": 0.01040024, + "balance_loss_clip": 1.04173279, + "balance_loss_mlp": 1.02648234, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 1.9818854728901645, + "language_loss": 0.85405278, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.8755759, + "num_input_tokens_seen": 145920985, + "step": 6798, + "time_per_iteration": 2.475155830383301 + }, + { + "auxiliary_loss_clip": 0.01121024, + "auxiliary_loss_mlp": 0.01032311, + "balance_loss_clip": 1.04452205, + "balance_loss_mlp": 1.02052712, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.4741682271874887, + "language_loss": 0.84287596, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86440933, + "num_input_tokens_seen": 145940350, + "step": 6799, + "time_per_iteration": 3.906771659851074 + }, + { + "auxiliary_loss_clip": 0.01084742, + "auxiliary_loss_mlp": 0.01043528, + "balance_loss_clip": 1.04388022, + "balance_loss_mlp": 1.02807832, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.073906737188535, + "language_loss": 0.82914329, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85042596, + "num_input_tokens_seen": 145957460, + "step": 6800, + "time_per_iteration": 2.5469377040863037 + }, + { + "auxiliary_loss_clip": 0.01111461, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.04361343, + "balance_loss_mlp": 1.02401137, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 1.9413420059262188, + "language_loss": 0.74830616, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76980174, + "num_input_tokens_seen": 145975285, + "step": 6801, + "time_per_iteration": 2.4857609272003174 + }, + { + "auxiliary_loss_clip": 0.01113465, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.04201245, + "balance_loss_mlp": 1.01857281, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 1.9820038036757794, + "language_loss": 0.80693758, + "learning_rate": 2.673395808607861e-06, + "loss": 0.82839727, + "num_input_tokens_seen": 145989150, + "step": 6802, + "time_per_iteration": 2.435746669769287 + }, + { + "auxiliary_loss_clip": 0.01112456, + "auxiliary_loss_mlp": 0.0103981, + "balance_loss_clip": 1.04710305, + "balance_loss_mlp": 1.02514112, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 3.2101755075667726, + "language_loss": 0.76649463, + "learning_rate": 2.673029073767934e-06, + "loss": 0.78801727, + "num_input_tokens_seen": 146006980, + "step": 6803, + "time_per_iteration": 2.5276782512664795 + }, + { + "auxiliary_loss_clip": 0.01062701, + "auxiliary_loss_mlp": 0.00793563, + "balance_loss_clip": 1.0423826, + "balance_loss_mlp": 1.01671791, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 1.978222648787648, + "language_loss": 0.78181839, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80038106, + "num_input_tokens_seen": 146025125, + "step": 6804, + "time_per_iteration": 2.5832555294036865 + }, + { + "auxiliary_loss_clip": 0.01127294, + "auxiliary_loss_mlp": 0.01039421, + "balance_loss_clip": 1.04380643, + "balance_loss_mlp": 1.02614665, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 1.927705507054624, + "language_loss": 0.75525808, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77692521, + "num_input_tokens_seen": 146044990, + "step": 6805, + "time_per_iteration": 2.5083510875701904 + }, + { + "auxiliary_loss_clip": 0.01082036, + "auxiliary_loss_mlp": 0.0103898, + "balance_loss_clip": 1.04336238, + "balance_loss_mlp": 1.02562213, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 2.616054453999636, + "language_loss": 0.79236662, + "learning_rate": 2.671928716175804e-06, + "loss": 0.81357682, + "num_input_tokens_seen": 146066045, + "step": 6806, + "time_per_iteration": 2.598989248275757 + }, + { + "auxiliary_loss_clip": 0.01113123, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.0419991, + "balance_loss_mlp": 1.01550376, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 2.137445448844716, + "language_loss": 0.71938109, + "learning_rate": 2.671561879334007e-06, + "loss": 0.74080491, + "num_input_tokens_seen": 146086280, + "step": 6807, + "time_per_iteration": 2.54052734375 + }, + { + "auxiliary_loss_clip": 0.01039489, + "auxiliary_loss_mlp": 0.01000769, + "balance_loss_clip": 1.03705525, + "balance_loss_mlp": 0.99904013, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8610202469196753, + "language_loss": 0.58753651, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60793906, + "num_input_tokens_seen": 146148840, + "step": 6808, + "time_per_iteration": 3.219593048095703 + }, + { + "auxiliary_loss_clip": 0.01102554, + "auxiliary_loss_mlp": 0.01037232, + "balance_loss_clip": 1.04235709, + "balance_loss_mlp": 1.02484632, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.5803397206915966, + "language_loss": 0.54771984, + "learning_rate": 2.670828129267242e-06, + "loss": 0.56911767, + "num_input_tokens_seen": 146166195, + "step": 6809, + "time_per_iteration": 2.556096315383911 + }, + { + "auxiliary_loss_clip": 0.01099392, + "auxiliary_loss_mlp": 0.01025969, + "balance_loss_clip": 1.04130435, + "balance_loss_mlp": 1.01311862, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 1.7891714746192517, + "language_loss": 0.83166957, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85292315, + "num_input_tokens_seen": 146185045, + "step": 6810, + "time_per_iteration": 2.551877498626709 + }, + { + "auxiliary_loss_clip": 0.01100583, + "auxiliary_loss_mlp": 0.01049646, + "balance_loss_clip": 1.045609, + "balance_loss_mlp": 1.03315938, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.1529406411515444, + "language_loss": 0.77305675, + "learning_rate": 2.670094277448999e-06, + "loss": 0.794559, + "num_input_tokens_seen": 146204655, + "step": 6811, + "time_per_iteration": 2.5452144145965576 + }, + { + "auxiliary_loss_clip": 0.01123442, + "auxiliary_loss_mlp": 0.01031266, + "balance_loss_clip": 1.04289782, + "balance_loss_mlp": 1.01678181, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.5487855503370782, + "language_loss": 0.70175147, + "learning_rate": 2.669727313417857e-06, + "loss": 0.72329861, + "num_input_tokens_seen": 146222000, + "step": 6812, + "time_per_iteration": 2.4494807720184326 + }, + { + "auxiliary_loss_clip": 0.01121769, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.04259658, + "balance_loss_mlp": 1.02424431, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 1.5196304780112853, + "language_loss": 0.66595006, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68754888, + "num_input_tokens_seen": 146242630, + "step": 6813, + "time_per_iteration": 2.5121610164642334 + }, + { + "auxiliary_loss_clip": 0.0110715, + "auxiliary_loss_mlp": 0.00789541, + "balance_loss_clip": 1.04417062, + "balance_loss_mlp": 1.0089097, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 3.6844110736672557, + "language_loss": 0.7414341, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.76040107, + "num_input_tokens_seen": 146263070, + "step": 6814, + "time_per_iteration": 2.56007981300354 + }, + { + "auxiliary_loss_clip": 0.01075699, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.04142237, + "balance_loss_mlp": 1.01798081, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 2.5041423158762948, + "language_loss": 0.66443545, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68550658, + "num_input_tokens_seen": 146282890, + "step": 6815, + "time_per_iteration": 2.622347831726074 + }, + { + "auxiliary_loss_clip": 0.01111054, + "auxiliary_loss_mlp": 0.01035775, + "balance_loss_clip": 1.04579067, + "balance_loss_mlp": 1.02312696, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.8516095008226958, + "language_loss": 0.77184403, + "learning_rate": 2.668259203471188e-06, + "loss": 0.79331231, + "num_input_tokens_seen": 146301755, + "step": 6816, + "time_per_iteration": 2.5128870010375977 + }, + { + "auxiliary_loss_clip": 0.01106723, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.04451704, + "balance_loss_mlp": 1.02039528, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.2105814782409325, + "language_loss": 0.81357038, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.83497763, + "num_input_tokens_seen": 146316835, + "step": 6817, + "time_per_iteration": 2.534956932067871 + }, + { + "auxiliary_loss_clip": 0.01103895, + "auxiliary_loss_mlp": 0.0103461, + "balance_loss_clip": 1.04257774, + "balance_loss_mlp": 1.01911891, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 1.7061832791571057, + "language_loss": 0.80122232, + "learning_rate": 2.667524996399444e-06, + "loss": 0.8226074, + "num_input_tokens_seen": 146336650, + "step": 6818, + "time_per_iteration": 2.576991081237793 + }, + { + "auxiliary_loss_clip": 0.01095707, + "auxiliary_loss_mlp": 0.01033932, + "balance_loss_clip": 1.0426923, + "balance_loss_mlp": 1.02133763, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.6806689764510547, + "language_loss": 0.66536307, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68665946, + "num_input_tokens_seen": 146357640, + "step": 6819, + "time_per_iteration": 2.6111834049224854 + }, + { + "auxiliary_loss_clip": 0.01104323, + "auxiliary_loss_mlp": 0.0104367, + "balance_loss_clip": 1.04396415, + "balance_loss_mlp": 1.02762437, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 1.6414848870951682, + "language_loss": 0.85289419, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87437403, + "num_input_tokens_seen": 146379325, + "step": 6820, + "time_per_iteration": 2.598809242248535 + }, + { + "auxiliary_loss_clip": 0.01112333, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.04578853, + "balance_loss_mlp": 1.01760483, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.8233389300208154, + "language_loss": 0.71255922, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73398888, + "num_input_tokens_seen": 146398635, + "step": 6821, + "time_per_iteration": 2.5405521392822266 + }, + { + "auxiliary_loss_clip": 0.01110048, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.04651284, + "balance_loss_mlp": 1.01956785, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 1.792796103476639, + "language_loss": 0.74504697, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.76647413, + "num_input_tokens_seen": 146417585, + "step": 6822, + "time_per_iteration": 2.524139642715454 + }, + { + "auxiliary_loss_clip": 0.01108344, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.04827845, + "balance_loss_mlp": 1.01836658, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 1.9860920163740614, + "language_loss": 0.75527006, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.77666831, + "num_input_tokens_seen": 146437035, + "step": 6823, + "time_per_iteration": 2.5477919578552246 + }, + { + "auxiliary_loss_clip": 0.0108411, + "auxiliary_loss_mlp": 0.01037442, + "balance_loss_clip": 1.04988718, + "balance_loss_mlp": 1.02182627, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 1.7612051712744778, + "language_loss": 0.7323904, + "learning_rate": 2.665321768127001e-06, + "loss": 0.7536059, + "num_input_tokens_seen": 146457370, + "step": 6824, + "time_per_iteration": 2.656182289123535 + }, + { + "auxiliary_loss_clip": 0.01094954, + "auxiliary_loss_mlp": 0.01033747, + "balance_loss_clip": 1.04316688, + "balance_loss_mlp": 1.0191915, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 1.7428451422392197, + "language_loss": 0.7192505, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.74053752, + "num_input_tokens_seen": 146478105, + "step": 6825, + "time_per_iteration": 2.613664388656616 + }, + { + "auxiliary_loss_clip": 0.01084844, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.04151332, + "balance_loss_mlp": 1.02491605, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 2.1563217734495486, + "language_loss": 0.85209388, + "learning_rate": 2.664587156721768e-06, + "loss": 0.87332118, + "num_input_tokens_seen": 146497835, + "step": 6826, + "time_per_iteration": 2.5975167751312256 + }, + { + "auxiliary_loss_clip": 0.01099989, + "auxiliary_loss_mlp": 0.00793993, + "balance_loss_clip": 1.04533291, + "balance_loss_mlp": 1.01703238, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.6756107297261662, + "language_loss": 0.66318536, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68212521, + "num_input_tokens_seen": 146517735, + "step": 6827, + "time_per_iteration": 2.5744738578796387 + }, + { + "auxiliary_loss_clip": 0.01100668, + "auxiliary_loss_mlp": 0.01025674, + "balance_loss_clip": 1.04376292, + "balance_loss_mlp": 1.01281118, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 3.924311293852761, + "language_loss": 0.71931303, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74057645, + "num_input_tokens_seen": 146537640, + "step": 6828, + "time_per_iteration": 2.5491526126861572 + }, + { + "auxiliary_loss_clip": 0.01100632, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.04747009, + "balance_loss_mlp": 1.02294672, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 1.7355904681060559, + "language_loss": 0.83521616, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.8565982, + "num_input_tokens_seen": 146554695, + "step": 6829, + "time_per_iteration": 2.5204808712005615 + }, + { + "auxiliary_loss_clip": 0.01110759, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.04394555, + "balance_loss_mlp": 1.01863921, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.7185138385627095, + "language_loss": 0.89843929, + "learning_rate": 2.663117631608206e-06, + "loss": 0.91986281, + "num_input_tokens_seen": 146573740, + "step": 6830, + "time_per_iteration": 2.5198872089385986 + }, + { + "auxiliary_loss_clip": 0.01087263, + "auxiliary_loss_mlp": 0.01026375, + "balance_loss_clip": 1.04622412, + "balance_loss_mlp": 1.01277351, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 1.8207813101767505, + "language_loss": 0.65579855, + "learning_rate": 2.662750187431268e-06, + "loss": 0.67693496, + "num_input_tokens_seen": 146592885, + "step": 6831, + "time_per_iteration": 2.571640729904175 + }, + { + "auxiliary_loss_clip": 0.01123693, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.04618764, + "balance_loss_mlp": 1.02037668, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 1.7369554283845057, + "language_loss": 0.69637525, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71794295, + "num_input_tokens_seen": 146611995, + "step": 6832, + "time_per_iteration": 3.908195972442627 + }, + { + "auxiliary_loss_clip": 0.01082522, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.04936147, + "balance_loss_mlp": 1.01956284, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 2.2055292539818185, + "language_loss": 0.73723376, + "learning_rate": 2.662015223696666e-06, + "loss": 0.75838017, + "num_input_tokens_seen": 146628045, + "step": 6833, + "time_per_iteration": 2.5590522289276123 + }, + { + "auxiliary_loss_clip": 0.01074915, + "auxiliary_loss_mlp": 0.01034486, + "balance_loss_clip": 1.04267335, + "balance_loss_mlp": 1.01935828, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.5932504281740032, + "language_loss": 0.72692162, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.74801564, + "num_input_tokens_seen": 146648355, + "step": 6834, + "time_per_iteration": 2.5925230979919434 + }, + { + "auxiliary_loss_clip": 0.01115767, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.04472208, + "balance_loss_mlp": 1.02424979, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 1.698842359710795, + "language_loss": 0.71202642, + "learning_rate": 2.661280159547329e-06, + "loss": 0.73356652, + "num_input_tokens_seen": 146668370, + "step": 6835, + "time_per_iteration": 3.9460394382476807 + }, + { + "auxiliary_loss_clip": 0.01116768, + "auxiliary_loss_mlp": 0.01033941, + "balance_loss_clip": 1.04589498, + "balance_loss_mlp": 1.01917648, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 2.3589321784292916, + "language_loss": 0.86749721, + "learning_rate": 2.660912589851978e-06, + "loss": 0.88900423, + "num_input_tokens_seen": 146686665, + "step": 6836, + "time_per_iteration": 3.868922710418701 + }, + { + "auxiliary_loss_clip": 0.01111607, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.04650283, + "balance_loss_mlp": 1.01687849, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 2.017286582842602, + "language_loss": 0.69027245, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.71168691, + "num_input_tokens_seen": 146706570, + "step": 6837, + "time_per_iteration": 3.934495687484741 + }, + { + "auxiliary_loss_clip": 0.01127574, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.04653442, + "balance_loss_mlp": 1.01869333, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 2.1667167879120472, + "language_loss": 0.74920022, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77080739, + "num_input_tokens_seen": 146723425, + "step": 6838, + "time_per_iteration": 2.4857592582702637 + }, + { + "auxiliary_loss_clip": 0.01089753, + "auxiliary_loss_mlp": 0.01034597, + "balance_loss_clip": 1.04822397, + "balance_loss_mlp": 1.02061963, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 2.333563716313449, + "language_loss": 0.8221873, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84343088, + "num_input_tokens_seen": 146741640, + "step": 6839, + "time_per_iteration": 2.5463900566101074 + }, + { + "auxiliary_loss_clip": 0.01122073, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.04356182, + "balance_loss_mlp": 1.01705003, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 1.9892596638702686, + "language_loss": 0.80314952, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82467115, + "num_input_tokens_seen": 146759195, + "step": 6840, + "time_per_iteration": 2.4678094387054443 + }, + { + "auxiliary_loss_clip": 0.01110868, + "auxiliary_loss_mlp": 0.01030905, + "balance_loss_clip": 1.0462116, + "balance_loss_mlp": 1.01845336, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 1.9255532363099326, + "language_loss": 0.67520368, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.69662136, + "num_input_tokens_seen": 146774990, + "step": 6841, + "time_per_iteration": 2.4816668033599854 + }, + { + "auxiliary_loss_clip": 0.01047978, + "auxiliary_loss_mlp": 0.01004488, + "balance_loss_clip": 1.02940571, + "balance_loss_mlp": 1.00285482, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.7603981797507843, + "language_loss": 0.59639072, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61691535, + "num_input_tokens_seen": 146839610, + "step": 6842, + "time_per_iteration": 3.179201602935791 + }, + { + "auxiliary_loss_clip": 0.01104935, + "auxiliary_loss_mlp": 0.01028371, + "balance_loss_clip": 1.04531646, + "balance_loss_mlp": 1.01650929, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 2.0712292655551052, + "language_loss": 0.70276594, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.72409892, + "num_input_tokens_seen": 146857360, + "step": 6843, + "time_per_iteration": 2.494269847869873 + }, + { + "auxiliary_loss_clip": 0.01028017, + "auxiliary_loss_mlp": 0.01008377, + "balance_loss_clip": 1.03678036, + "balance_loss_mlp": 1.00676811, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7242240471852321, + "language_loss": 0.53604352, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55640751, + "num_input_tokens_seen": 146917055, + "step": 6844, + "time_per_iteration": 3.162383556365967 + }, + { + "auxiliary_loss_clip": 0.01110926, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.04525852, + "balance_loss_mlp": 1.01849675, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 3.9492768586613938, + "language_loss": 0.66072816, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68214881, + "num_input_tokens_seen": 146935215, + "step": 6845, + "time_per_iteration": 2.485123872756958 + }, + { + "auxiliary_loss_clip": 0.01123201, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.04721308, + "balance_loss_mlp": 1.01996684, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.9912357407064933, + "language_loss": 0.70279056, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72435272, + "num_input_tokens_seen": 146951970, + "step": 6846, + "time_per_iteration": 2.4598653316497803 + }, + { + "auxiliary_loss_clip": 0.01104885, + "auxiliary_loss_mlp": 0.01035329, + "balance_loss_clip": 1.04746401, + "balance_loss_mlp": 1.02154231, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.8455600018088398, + "language_loss": 0.65078241, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67218459, + "num_input_tokens_seen": 146975615, + "step": 6847, + "time_per_iteration": 2.629786491394043 + }, + { + "auxiliary_loss_clip": 0.01104074, + "auxiliary_loss_mlp": 0.01036046, + "balance_loss_clip": 1.04601502, + "balance_loss_mlp": 1.02267075, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.6154477787203712, + "language_loss": 0.7042883, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72568953, + "num_input_tokens_seen": 146998855, + "step": 6848, + "time_per_iteration": 2.682521104812622 + }, + { + "auxiliary_loss_clip": 0.01028835, + "auxiliary_loss_mlp": 0.00848663, + "balance_loss_clip": 1.02096295, + "balance_loss_mlp": 1.14628458, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8921829925355241, + "language_loss": 0.56203073, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58080572, + "num_input_tokens_seen": 147062710, + "step": 6849, + "time_per_iteration": 3.2218313217163086 + }, + { + "auxiliary_loss_clip": 0.01102419, + "auxiliary_loss_mlp": 0.01038409, + "balance_loss_clip": 1.04372036, + "balance_loss_mlp": 1.02481949, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.7566024642783675, + "language_loss": 0.76139331, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78280163, + "num_input_tokens_seen": 147086075, + "step": 6850, + "time_per_iteration": 2.682208776473999 + }, + { + "auxiliary_loss_clip": 0.01073317, + "auxiliary_loss_mlp": 0.01030687, + "balance_loss_clip": 1.04157186, + "balance_loss_mlp": 1.01805067, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.5472619468632312, + "language_loss": 0.67980599, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.70084596, + "num_input_tokens_seen": 147107590, + "step": 6851, + "time_per_iteration": 2.7286689281463623 + }, + { + "auxiliary_loss_clip": 0.01088847, + "auxiliary_loss_mlp": 0.01039864, + "balance_loss_clip": 1.04607749, + "balance_loss_mlp": 1.02432525, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.2495081894546582, + "language_loss": 0.79589081, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81717789, + "num_input_tokens_seen": 147123715, + "step": 6852, + "time_per_iteration": 2.5486955642700195 + }, + { + "auxiliary_loss_clip": 0.01129039, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.04753792, + "balance_loss_mlp": 1.0209657, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 2.1397965442679063, + "language_loss": 0.77571726, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.79736137, + "num_input_tokens_seen": 147144290, + "step": 6853, + "time_per_iteration": 2.551239252090454 + }, + { + "auxiliary_loss_clip": 0.01116806, + "auxiliary_loss_mlp": 0.01044359, + "balance_loss_clip": 1.04492426, + "balance_loss_mlp": 1.02869487, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.6755801738696414, + "language_loss": 0.65665245, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.67826414, + "num_input_tokens_seen": 147166340, + "step": 6854, + "time_per_iteration": 2.6570117473602295 + }, + { + "auxiliary_loss_clip": 0.01095614, + "auxiliary_loss_mlp": 0.0103941, + "balance_loss_clip": 1.04253209, + "balance_loss_mlp": 1.02501535, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.6990811875700573, + "language_loss": 0.83648288, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85783309, + "num_input_tokens_seen": 147184025, + "step": 6855, + "time_per_iteration": 2.5852184295654297 + }, + { + "auxiliary_loss_clip": 0.01105159, + "auxiliary_loss_mlp": 0.01040965, + "balance_loss_clip": 1.0444721, + "balance_loss_mlp": 1.02813804, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.5795301505165584, + "language_loss": 0.79185343, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81331468, + "num_input_tokens_seen": 147202730, + "step": 6856, + "time_per_iteration": 2.5122902393341064 + }, + { + "auxiliary_loss_clip": 0.01088115, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.04366636, + "balance_loss_mlp": 1.0227859, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 2.4052220252179137, + "language_loss": 0.80161184, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.82285219, + "num_input_tokens_seen": 147215315, + "step": 6857, + "time_per_iteration": 2.5169365406036377 + }, + { + "auxiliary_loss_clip": 0.01112506, + "auxiliary_loss_mlp": 0.00836618, + "balance_loss_clip": 1.04361629, + "balance_loss_mlp": 1.09867477, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 1.7040918693998164, + "language_loss": 0.70627534, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.7257666, + "num_input_tokens_seen": 147233330, + "step": 6858, + "time_per_iteration": 2.513631820678711 + }, + { + "auxiliary_loss_clip": 0.01110818, + "auxiliary_loss_mlp": 0.01035779, + "balance_loss_clip": 1.04442716, + "balance_loss_mlp": 1.02176607, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.4561594251293666, + "language_loss": 0.59441072, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61587667, + "num_input_tokens_seen": 147257780, + "step": 6859, + "time_per_iteration": 2.7461447715759277 + }, + { + "auxiliary_loss_clip": 0.0112324, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.04316044, + "balance_loss_mlp": 1.02363849, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.341560675076963, + "language_loss": 0.73484683, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75645065, + "num_input_tokens_seen": 147276055, + "step": 6860, + "time_per_iteration": 2.517282247543335 + }, + { + "auxiliary_loss_clip": 0.01043564, + "auxiliary_loss_mlp": 0.01034663, + "balance_loss_clip": 1.03803754, + "balance_loss_mlp": 1.02111435, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.6912922910633825, + "language_loss": 0.7421335, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76291579, + "num_input_tokens_seen": 147293200, + "step": 6861, + "time_per_iteration": 2.6245903968811035 + }, + { + "auxiliary_loss_clip": 0.01099151, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.04199648, + "balance_loss_mlp": 1.01949382, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 2.567680720458047, + "language_loss": 0.79717612, + "learning_rate": 2.651347021844765e-06, + "loss": 0.8184886, + "num_input_tokens_seen": 147310640, + "step": 6862, + "time_per_iteration": 2.4875423908233643 + }, + { + "auxiliary_loss_clip": 0.01097089, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.0434804, + "balance_loss_mlp": 1.01957035, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.7170241717266996, + "language_loss": 0.76085299, + "learning_rate": 2.650978780374318e-06, + "loss": 0.78215373, + "num_input_tokens_seen": 147329435, + "step": 6863, + "time_per_iteration": 2.5321695804595947 + }, + { + "auxiliary_loss_clip": 0.01043645, + "auxiliary_loss_mlp": 0.01008151, + "balance_loss_clip": 1.02561665, + "balance_loss_mlp": 1.00648177, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.6976001681477471, + "language_loss": 0.5270859, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54760385, + "num_input_tokens_seen": 147385805, + "step": 6864, + "time_per_iteration": 3.0737171173095703 + }, + { + "auxiliary_loss_clip": 0.01126081, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.04352593, + "balance_loss_mlp": 1.01958036, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 1.7658898685534397, + "language_loss": 0.72996247, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.75156379, + "num_input_tokens_seen": 147405160, + "step": 6865, + "time_per_iteration": 2.506314277648926 + }, + { + "auxiliary_loss_clip": 0.01050747, + "auxiliary_loss_mlp": 0.01008981, + "balance_loss_clip": 1.02269411, + "balance_loss_mlp": 1.00744367, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9326349510747677, + "language_loss": 0.66522324, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68582046, + "num_input_tokens_seen": 147460245, + "step": 6866, + "time_per_iteration": 2.9604804515838623 + }, + { + "auxiliary_loss_clip": 0.0111944, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.0420177, + "balance_loss_mlp": 1.01818466, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 2.42395948972015, + "language_loss": 0.81215298, + "learning_rate": 2.649505567780375e-06, + "loss": 0.83365875, + "num_input_tokens_seen": 147476200, + "step": 6867, + "time_per_iteration": 2.4431099891662598 + }, + { + "auxiliary_loss_clip": 0.01105634, + "auxiliary_loss_mlp": 0.01031835, + "balance_loss_clip": 1.0458138, + "balance_loss_mlp": 1.01785743, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.2593208860925635, + "language_loss": 0.7712326, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.79260725, + "num_input_tokens_seen": 147494315, + "step": 6868, + "time_per_iteration": 2.579080820083618 + }, + { + "auxiliary_loss_clip": 0.01034617, + "auxiliary_loss_mlp": 0.01003654, + "balance_loss_clip": 1.01773357, + "balance_loss_mlp": 1.00219965, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8295761864278823, + "language_loss": 0.57795477, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59833741, + "num_input_tokens_seen": 147543665, + "step": 6869, + "time_per_iteration": 2.8623619079589844 + }, + { + "auxiliary_loss_clip": 0.01110171, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.04470897, + "balance_loss_mlp": 1.01681638, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 1.79069044080286, + "language_loss": 0.75388211, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77529013, + "num_input_tokens_seen": 147564870, + "step": 6870, + "time_per_iteration": 2.573387622833252 + }, + { + "auxiliary_loss_clip": 0.01092789, + "auxiliary_loss_mlp": 0.01035969, + "balance_loss_clip": 1.04457641, + "balance_loss_mlp": 1.02137804, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.782596410138529, + "language_loss": 0.83561182, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85689938, + "num_input_tokens_seen": 147584840, + "step": 6871, + "time_per_iteration": 4.014943838119507 + }, + { + "auxiliary_loss_clip": 0.01093322, + "auxiliary_loss_mlp": 0.01038259, + "balance_loss_clip": 1.04518151, + "balance_loss_mlp": 1.0243237, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 1.8799247628132616, + "language_loss": 0.68375415, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.70506996, + "num_input_tokens_seen": 147604635, + "step": 6872, + "time_per_iteration": 2.613799810409546 + }, + { + "auxiliary_loss_clip": 0.01107358, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.04393148, + "balance_loss_mlp": 1.01733088, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 1.930683615824224, + "language_loss": 0.75856042, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.77994347, + "num_input_tokens_seen": 147620700, + "step": 6873, + "time_per_iteration": 2.507756233215332 + }, + { + "auxiliary_loss_clip": 0.01100487, + "auxiliary_loss_mlp": 0.01034354, + "balance_loss_clip": 1.04543352, + "balance_loss_mlp": 1.02023411, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 2.3789708471696915, + "language_loss": 0.83484882, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.85619724, + "num_input_tokens_seen": 147639490, + "step": 6874, + "time_per_iteration": 3.948176145553589 + }, + { + "auxiliary_loss_clip": 0.01087691, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.04063988, + "balance_loss_mlp": 1.01833367, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 1.8888236725794347, + "language_loss": 0.71631092, + "learning_rate": 2.646557961279436e-06, + "loss": 0.73751539, + "num_input_tokens_seen": 147657205, + "step": 6875, + "time_per_iteration": 3.965930223464966 + }, + { + "auxiliary_loss_clip": 0.01091449, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.04151177, + "balance_loss_mlp": 1.02225256, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 1.7486906592704157, + "language_loss": 0.82589626, + "learning_rate": 2.646189399991154e-06, + "loss": 0.84716588, + "num_input_tokens_seen": 147677005, + "step": 6876, + "time_per_iteration": 3.9192960262298584 + }, + { + "auxiliary_loss_clip": 0.01111288, + "auxiliary_loss_mlp": 0.01039671, + "balance_loss_clip": 1.04471803, + "balance_loss_mlp": 1.02372026, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.339931730187285, + "language_loss": 0.65881693, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.68032658, + "num_input_tokens_seen": 147693435, + "step": 6877, + "time_per_iteration": 2.4733798503875732 + }, + { + "auxiliary_loss_clip": 0.01109974, + "auxiliary_loss_mlp": 0.01033887, + "balance_loss_clip": 1.04372263, + "balance_loss_mlp": 1.02010632, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 1.731488260420566, + "language_loss": 0.76934761, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.79078621, + "num_input_tokens_seen": 147714000, + "step": 6878, + "time_per_iteration": 2.5016794204711914 + }, + { + "auxiliary_loss_clip": 0.01113468, + "auxiliary_loss_mlp": 0.00822204, + "balance_loss_clip": 1.04561949, + "balance_loss_mlp": 1.07058918, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.8305258101647182, + "language_loss": 0.80112165, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.82047838, + "num_input_tokens_seen": 147731010, + "step": 6879, + "time_per_iteration": 2.5050699710845947 + }, + { + "auxiliary_loss_clip": 0.01122924, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.04474282, + "balance_loss_mlp": 1.02218223, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 1.7352500546870113, + "language_loss": 0.84546053, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.86705112, + "num_input_tokens_seen": 147750880, + "step": 6880, + "time_per_iteration": 2.511765480041504 + }, + { + "auxiliary_loss_clip": 0.01101355, + "auxiliary_loss_mlp": 0.01027837, + "balance_loss_clip": 1.04330802, + "balance_loss_mlp": 1.01408076, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.5420282936339662, + "language_loss": 0.70583749, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72712934, + "num_input_tokens_seen": 147771360, + "step": 6881, + "time_per_iteration": 2.5307204723358154 + }, + { + "auxiliary_loss_clip": 0.01121507, + "auxiliary_loss_mlp": 0.01033888, + "balance_loss_clip": 1.04558444, + "balance_loss_mlp": 1.02107906, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 1.7579162772761865, + "language_loss": 0.81531429, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83686829, + "num_input_tokens_seen": 147787440, + "step": 6882, + "time_per_iteration": 2.4370205402374268 + }, + { + "auxiliary_loss_clip": 0.0110076, + "auxiliary_loss_mlp": 0.01040632, + "balance_loss_clip": 1.04251206, + "balance_loss_mlp": 1.02432466, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 2.1006013023918166, + "language_loss": 0.69668627, + "learning_rate": 2.643608785656077e-06, + "loss": 0.71810019, + "num_input_tokens_seen": 147805720, + "step": 6883, + "time_per_iteration": 2.527916669845581 + }, + { + "auxiliary_loss_clip": 0.01113236, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.04398823, + "balance_loss_mlp": 1.01675677, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 1.921822025323105, + "language_loss": 0.75513142, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77656651, + "num_input_tokens_seen": 147824605, + "step": 6884, + "time_per_iteration": 2.5019338130950928 + }, + { + "auxiliary_loss_clip": 0.01096601, + "auxiliary_loss_mlp": 0.01035224, + "balance_loss_clip": 1.04711318, + "balance_loss_mlp": 1.02186084, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.3987079184680344, + "language_loss": 0.75740409, + "learning_rate": 2.642871247413523e-06, + "loss": 0.77872241, + "num_input_tokens_seen": 147845445, + "step": 6885, + "time_per_iteration": 2.6034467220306396 + }, + { + "auxiliary_loss_clip": 0.01125499, + "auxiliary_loss_mlp": 0.01038549, + "balance_loss_clip": 1.04464841, + "balance_loss_mlp": 1.02481627, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 1.8016900591452027, + "language_loss": 0.69951677, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.72115725, + "num_input_tokens_seen": 147865580, + "step": 6886, + "time_per_iteration": 2.4895639419555664 + }, + { + "auxiliary_loss_clip": 0.01123917, + "auxiliary_loss_mlp": 0.00805974, + "balance_loss_clip": 1.04437947, + "balance_loss_mlp": 1.03781629, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 1.545613378319779, + "language_loss": 0.75476706, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77406597, + "num_input_tokens_seen": 147885230, + "step": 6887, + "time_per_iteration": 2.457165002822876 + }, + { + "auxiliary_loss_clip": 0.01110131, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.04163003, + "balance_loss_mlp": 1.01616991, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 2.1280914559769504, + "language_loss": 0.70327264, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72467488, + "num_input_tokens_seen": 147903035, + "step": 6888, + "time_per_iteration": 2.474846363067627 + }, + { + "auxiliary_loss_clip": 0.01119953, + "auxiliary_loss_mlp": 0.01035321, + "balance_loss_clip": 1.04201293, + "balance_loss_mlp": 1.02117038, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 2.0661678781746162, + "language_loss": 0.75710249, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.77865517, + "num_input_tokens_seen": 147918745, + "step": 6889, + "time_per_iteration": 2.426698923110962 + }, + { + "auxiliary_loss_clip": 0.01092098, + "auxiliary_loss_mlp": 0.00799513, + "balance_loss_clip": 1.04875755, + "balance_loss_mlp": 1.02828777, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 1.5613438952495111, + "language_loss": 0.80223608, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82115215, + "num_input_tokens_seen": 147938265, + "step": 6890, + "time_per_iteration": 2.603952407836914 + }, + { + "auxiliary_loss_clip": 0.01120235, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.04329252, + "balance_loss_mlp": 1.02314472, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.5619868362425275, + "language_loss": 0.73876506, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.7603389, + "num_input_tokens_seen": 147957320, + "step": 6891, + "time_per_iteration": 2.4755373001098633 + }, + { + "auxiliary_loss_clip": 0.01082813, + "auxiliary_loss_mlp": 0.01040406, + "balance_loss_clip": 1.04322672, + "balance_loss_mlp": 1.02428269, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.6235253417218969, + "language_loss": 0.8433007, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86453283, + "num_input_tokens_seen": 147977045, + "step": 6892, + "time_per_iteration": 2.5881948471069336 + }, + { + "auxiliary_loss_clip": 0.01077008, + "auxiliary_loss_mlp": 0.00799344, + "balance_loss_clip": 1.04013932, + "balance_loss_mlp": 1.02264488, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 4.418312530801412, + "language_loss": 0.70536613, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72412968, + "num_input_tokens_seen": 147996905, + "step": 6893, + "time_per_iteration": 2.696990966796875 + }, + { + "auxiliary_loss_clip": 0.01120416, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.04281068, + "balance_loss_mlp": 1.01587784, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.4134198980284547, + "language_loss": 0.73072726, + "learning_rate": 2.639551120239279e-06, + "loss": 0.75222361, + "num_input_tokens_seen": 148017875, + "step": 6894, + "time_per_iteration": 2.5443406105041504 + }, + { + "auxiliary_loss_clip": 0.0111241, + "auxiliary_loss_mlp": 0.01032425, + "balance_loss_clip": 1.04221773, + "balance_loss_mlp": 1.01915717, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 2.817104185902053, + "language_loss": 0.63498086, + "learning_rate": 2.63918209577416e-06, + "loss": 0.65642923, + "num_input_tokens_seen": 148032300, + "step": 6895, + "time_per_iteration": 2.4575624465942383 + }, + { + "auxiliary_loss_clip": 0.01079516, + "auxiliary_loss_mlp": 0.01040907, + "balance_loss_clip": 1.0429709, + "balance_loss_mlp": 1.02577388, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.4937700037116637, + "language_loss": 0.70880163, + "learning_rate": 2.638813047071192e-06, + "loss": 0.73000586, + "num_input_tokens_seen": 148053260, + "step": 6896, + "time_per_iteration": 2.6094095706939697 + }, + { + "auxiliary_loss_clip": 0.01123476, + "auxiliary_loss_mlp": 0.01040601, + "balance_loss_clip": 1.04251945, + "balance_loss_mlp": 1.02498472, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 1.745138000044116, + "language_loss": 0.72874725, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.75038797, + "num_input_tokens_seen": 148072965, + "step": 6897, + "time_per_iteration": 2.5344982147216797 + }, + { + "auxiliary_loss_clip": 0.01110051, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_clip": 1.04514027, + "balance_loss_mlp": 1.02861714, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 1.8247778571078017, + "language_loss": 0.84545344, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86697894, + "num_input_tokens_seen": 148093240, + "step": 6898, + "time_per_iteration": 2.5377490520477295 + }, + { + "auxiliary_loss_clip": 0.010749, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.03971982, + "balance_loss_mlp": 1.01927233, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 1.657087210421443, + "language_loss": 0.74594152, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76702058, + "num_input_tokens_seen": 148110925, + "step": 6899, + "time_per_iteration": 2.5933306217193604 + }, + { + "auxiliary_loss_clip": 0.01090771, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.04376364, + "balance_loss_mlp": 1.02155066, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.7893925950247982, + "language_loss": 0.75787258, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.7791509, + "num_input_tokens_seen": 148130670, + "step": 6900, + "time_per_iteration": 2.6006691455841064 + }, + { + "auxiliary_loss_clip": 0.011074, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.04391813, + "balance_loss_mlp": 1.02391934, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.1754682086154467, + "language_loss": 0.80052733, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82199138, + "num_input_tokens_seen": 148148350, + "step": 6901, + "time_per_iteration": 2.486302137374878 + }, + { + "auxiliary_loss_clip": 0.01082678, + "auxiliary_loss_mlp": 0.01042254, + "balance_loss_clip": 1.04026413, + "balance_loss_mlp": 1.02690005, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 1.6695877168341542, + "language_loss": 0.70355242, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.72480172, + "num_input_tokens_seen": 148167550, + "step": 6902, + "time_per_iteration": 2.5214319229125977 + }, + { + "auxiliary_loss_clip": 0.01094603, + "auxiliary_loss_mlp": 0.00796509, + "balance_loss_clip": 1.04689479, + "balance_loss_mlp": 1.02437556, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 1.6155122145045744, + "language_loss": 0.83849072, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85740185, + "num_input_tokens_seen": 148184740, + "step": 6903, + "time_per_iteration": 2.515160322189331 + }, + { + "auxiliary_loss_clip": 0.01128297, + "auxiliary_loss_mlp": 0.01040419, + "balance_loss_clip": 1.04547811, + "balance_loss_mlp": 1.02520776, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 1.864358375948661, + "language_loss": 0.6791569, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.70084399, + "num_input_tokens_seen": 148204605, + "step": 6904, + "time_per_iteration": 2.5165607929229736 + }, + { + "auxiliary_loss_clip": 0.0112549, + "auxiliary_loss_mlp": 0.00793184, + "balance_loss_clip": 1.04387987, + "balance_loss_mlp": 1.01789403, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.9199372377495398, + "language_loss": 0.77346617, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79265285, + "num_input_tokens_seen": 148224675, + "step": 6905, + "time_per_iteration": 2.5006649494171143 + }, + { + "auxiliary_loss_clip": 0.01126654, + "auxiliary_loss_mlp": 0.01033965, + "balance_loss_clip": 1.0454253, + "balance_loss_mlp": 1.01958227, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 1.8653766223241826, + "language_loss": 0.68144101, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70304722, + "num_input_tokens_seen": 148243375, + "step": 6906, + "time_per_iteration": 2.4892544746398926 + }, + { + "auxiliary_loss_clip": 0.01102526, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.04413891, + "balance_loss_mlp": 1.02207685, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 2.143764362630354, + "language_loss": 0.67487544, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69625491, + "num_input_tokens_seen": 148261140, + "step": 6907, + "time_per_iteration": 2.526439905166626 + }, + { + "auxiliary_loss_clip": 0.01092811, + "auxiliary_loss_mlp": 0.01035015, + "balance_loss_clip": 1.04461563, + "balance_loss_mlp": 1.02197921, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 1.721520187502034, + "language_loss": 0.77118337, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79246163, + "num_input_tokens_seen": 148279655, + "step": 6908, + "time_per_iteration": 2.561007499694824 + }, + { + "auxiliary_loss_clip": 0.0103432, + "auxiliary_loss_mlp": 0.01012766, + "balance_loss_clip": 1.02551496, + "balance_loss_mlp": 1.01096618, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.9305522876801535, + "language_loss": 0.64805162, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66852248, + "num_input_tokens_seen": 148339005, + "step": 6909, + "time_per_iteration": 3.128500461578369 + }, + { + "auxiliary_loss_clip": 0.010939, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.04510522, + "balance_loss_mlp": 1.02464771, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 1.4744953430327294, + "language_loss": 0.87191617, + "learning_rate": 2.633643828093996e-06, + "loss": 0.89324105, + "num_input_tokens_seen": 148358715, + "step": 6910, + "time_per_iteration": 3.9596359729766846 + }, + { + "auxiliary_loss_clip": 0.01041328, + "auxiliary_loss_mlp": 0.01015311, + "balance_loss_clip": 1.02311611, + "balance_loss_mlp": 1.01382124, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.8049100775817685, + "language_loss": 0.62165642, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64222282, + "num_input_tokens_seen": 148417280, + "step": 6911, + "time_per_iteration": 3.0626280307769775 + }, + { + "auxiliary_loss_clip": 0.01129183, + "auxiliary_loss_mlp": 0.0103915, + "balance_loss_clip": 1.04595721, + "balance_loss_mlp": 1.02473736, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 4.2071813038520425, + "language_loss": 0.87356794, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.89525121, + "num_input_tokens_seen": 148432610, + "step": 6912, + "time_per_iteration": 3.81512451171875 + }, + { + "auxiliary_loss_clip": 0.01111267, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.04873657, + "balance_loss_mlp": 1.01700926, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 2.8867267843255964, + "language_loss": 0.63191372, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65332925, + "num_input_tokens_seen": 148451510, + "step": 6913, + "time_per_iteration": 2.547332525253296 + }, + { + "auxiliary_loss_clip": 0.01097489, + "auxiliary_loss_mlp": 0.00792103, + "balance_loss_clip": 1.0429076, + "balance_loss_mlp": 1.01695967, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 2.6255682707553665, + "language_loss": 0.75034308, + "learning_rate": 2.632166041703586e-06, + "loss": 0.76923895, + "num_input_tokens_seen": 148469945, + "step": 6914, + "time_per_iteration": 3.8904013633728027 + }, + { + "auxiliary_loss_clip": 0.01073452, + "auxiliary_loss_mlp": 0.01041546, + "balance_loss_clip": 1.04242134, + "balance_loss_mlp": 1.02638257, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 3.0273258105101752, + "language_loss": 0.87738669, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89853662, + "num_input_tokens_seen": 148486655, + "step": 6915, + "time_per_iteration": 3.960951089859009 + }, + { + "auxiliary_loss_clip": 0.01101293, + "auxiliary_loss_mlp": 0.01043006, + "balance_loss_clip": 1.04505754, + "balance_loss_mlp": 1.02868938, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 2.1041736126068913, + "language_loss": 0.71252906, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.73397207, + "num_input_tokens_seen": 148505035, + "step": 6916, + "time_per_iteration": 2.519998073577881 + }, + { + "auxiliary_loss_clip": 0.01126722, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.04504514, + "balance_loss_mlp": 1.01842153, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.5079511011784503, + "language_loss": 0.71753323, + "learning_rate": 2.631057450157852e-06, + "loss": 0.73913091, + "num_input_tokens_seen": 148525575, + "step": 6917, + "time_per_iteration": 2.488110303878784 + }, + { + "auxiliary_loss_clip": 0.01103585, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.04292619, + "balance_loss_mlp": 1.0165689, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.5732641281229314, + "language_loss": 0.80952972, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.83087063, + "num_input_tokens_seen": 148547270, + "step": 6918, + "time_per_iteration": 2.552065372467041 + }, + { + "auxiliary_loss_clip": 0.01114178, + "auxiliary_loss_mlp": 0.0103586, + "balance_loss_clip": 1.0491631, + "balance_loss_mlp": 1.02116108, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.5420121770160584, + "language_loss": 0.70426035, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72576076, + "num_input_tokens_seen": 148572100, + "step": 6919, + "time_per_iteration": 2.674450159072876 + }, + { + "auxiliary_loss_clip": 0.0110195, + "auxiliary_loss_mlp": 0.01035283, + "balance_loss_clip": 1.04376757, + "balance_loss_mlp": 1.02039385, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 1.8389864844528119, + "language_loss": 0.81145978, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.8328321, + "num_input_tokens_seen": 148591245, + "step": 6920, + "time_per_iteration": 2.5236778259277344 + }, + { + "auxiliary_loss_clip": 0.01102806, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.0440011, + "balance_loss_mlp": 1.01728725, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 1.9968737089874469, + "language_loss": 0.65816391, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.67951506, + "num_input_tokens_seen": 148607980, + "step": 6921, + "time_per_iteration": 2.484254837036133 + }, + { + "auxiliary_loss_clip": 0.01101402, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.04240227, + "balance_loss_mlp": 1.02294457, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 2.9649648414612098, + "language_loss": 0.80679893, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82818532, + "num_input_tokens_seen": 148624490, + "step": 6922, + "time_per_iteration": 2.4999377727508545 + }, + { + "auxiliary_loss_clip": 0.01099887, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.04362857, + "balance_loss_mlp": 1.02053952, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.703256808317483, + "language_loss": 0.67282653, + "learning_rate": 2.628839621341247e-06, + "loss": 0.6941765, + "num_input_tokens_seen": 148646490, + "step": 6923, + "time_per_iteration": 2.5643982887268066 + }, + { + "auxiliary_loss_clip": 0.01093541, + "auxiliary_loss_mlp": 0.01044319, + "balance_loss_clip": 1.0437429, + "balance_loss_mlp": 1.02900052, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 1.8661812988848512, + "language_loss": 0.76021123, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78158987, + "num_input_tokens_seen": 148668580, + "step": 6924, + "time_per_iteration": 2.5902259349823 + }, + { + "auxiliary_loss_clip": 0.0112437, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.04298413, + "balance_loss_mlp": 1.02054548, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 1.6868022704130128, + "language_loss": 0.73336565, + "learning_rate": 2.62810015415423e-06, + "loss": 0.75495714, + "num_input_tokens_seen": 148688410, + "step": 6925, + "time_per_iteration": 2.4444355964660645 + }, + { + "auxiliary_loss_clip": 0.01098966, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.0402627, + "balance_loss_mlp": 1.01947904, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 2.0324694099439777, + "language_loss": 0.84183449, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.86315292, + "num_input_tokens_seen": 148704855, + "step": 6926, + "time_per_iteration": 2.4940683841705322 + }, + { + "auxiliary_loss_clip": 0.01090265, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.04507065, + "balance_loss_mlp": 1.02111673, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.7802143608836776, + "language_loss": 0.86285698, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88409722, + "num_input_tokens_seen": 148723065, + "step": 6927, + "time_per_iteration": 2.529287815093994 + }, + { + "auxiliary_loss_clip": 0.01111796, + "auxiliary_loss_mlp": 0.01038916, + "balance_loss_clip": 1.04299819, + "balance_loss_mlp": 1.02408063, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 2.1932091706759484, + "language_loss": 0.72163993, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74314708, + "num_input_tokens_seen": 148741780, + "step": 6928, + "time_per_iteration": 2.5097031593322754 + }, + { + "auxiliary_loss_clip": 0.01094496, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.04339266, + "balance_loss_mlp": 1.02000809, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 1.8856219313845626, + "language_loss": 0.78070003, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80198562, + "num_input_tokens_seen": 148759795, + "step": 6929, + "time_per_iteration": 2.5451531410217285 + }, + { + "auxiliary_loss_clip": 0.01121923, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.04285455, + "balance_loss_mlp": 1.01902211, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 1.8243758788810978, + "language_loss": 0.71122926, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73277557, + "num_input_tokens_seen": 148778680, + "step": 6930, + "time_per_iteration": 2.4697351455688477 + }, + { + "auxiliary_loss_clip": 0.0109557, + "auxiliary_loss_mlp": 0.01033431, + "balance_loss_clip": 1.04315734, + "balance_loss_mlp": 1.01977551, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 2.192154911570851, + "language_loss": 0.81146938, + "learning_rate": 2.625881181419007e-06, + "loss": 0.83275938, + "num_input_tokens_seen": 148796470, + "step": 6931, + "time_per_iteration": 2.5105483531951904 + }, + { + "auxiliary_loss_clip": 0.01070057, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.03872061, + "balance_loss_mlp": 1.02387965, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 1.693294371439168, + "language_loss": 0.78985572, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81094033, + "num_input_tokens_seen": 148815300, + "step": 6932, + "time_per_iteration": 2.6561436653137207 + }, + { + "auxiliary_loss_clip": 0.0110327, + "auxiliary_loss_mlp": 0.0079367, + "balance_loss_clip": 1.04468858, + "balance_loss_mlp": 1.01456666, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 1.893006947102337, + "language_loss": 0.81511551, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83408487, + "num_input_tokens_seen": 148834315, + "step": 6933, + "time_per_iteration": 2.6245996952056885 + }, + { + "auxiliary_loss_clip": 0.01124809, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.0422039, + "balance_loss_mlp": 1.01872897, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 1.776484676391278, + "language_loss": 0.76844943, + "learning_rate": 2.624771374460121e-06, + "loss": 0.79003859, + "num_input_tokens_seen": 148852420, + "step": 6934, + "time_per_iteration": 2.4754505157470703 + }, + { + "auxiliary_loss_clip": 0.01112361, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.04315639, + "balance_loss_mlp": 1.01877689, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 1.8115111655812715, + "language_loss": 0.67171121, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69316393, + "num_input_tokens_seen": 148869305, + "step": 6935, + "time_per_iteration": 2.4536144733428955 + }, + { + "auxiliary_loss_clip": 0.01098427, + "auxiliary_loss_mlp": 0.01039249, + "balance_loss_clip": 1.04711938, + "balance_loss_mlp": 1.02536726, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.559498736452913, + "language_loss": 0.73541105, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75678778, + "num_input_tokens_seen": 148886395, + "step": 6936, + "time_per_iteration": 2.4979114532470703 + }, + { + "auxiliary_loss_clip": 0.01104446, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.04136622, + "balance_loss_mlp": 1.02126265, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 1.7716289986615963, + "language_loss": 0.73347461, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.7548697, + "num_input_tokens_seen": 148905235, + "step": 6937, + "time_per_iteration": 2.467724323272705 + }, + { + "auxiliary_loss_clip": 0.01096507, + "auxiliary_loss_mlp": 0.0103428, + "balance_loss_clip": 1.04173493, + "balance_loss_mlp": 1.02079141, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 1.480836627043124, + "language_loss": 0.84399295, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.86530083, + "num_input_tokens_seen": 148928130, + "step": 6938, + "time_per_iteration": 2.6135406494140625 + }, + { + "auxiliary_loss_clip": 0.01105324, + "auxiliary_loss_mlp": 0.0103553, + "balance_loss_clip": 1.04726315, + "balance_loss_mlp": 1.02085543, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 1.8353499495441778, + "language_loss": 0.74366438, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76507294, + "num_input_tokens_seen": 148948790, + "step": 6939, + "time_per_iteration": 2.5790164470672607 + }, + { + "auxiliary_loss_clip": 0.01111685, + "auxiliary_loss_mlp": 0.01038719, + "balance_loss_clip": 1.04192686, + "balance_loss_mlp": 1.02396131, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.5750714725080317, + "language_loss": 0.75045824, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77196223, + "num_input_tokens_seen": 148967690, + "step": 6940, + "time_per_iteration": 2.554197072982788 + }, + { + "auxiliary_loss_clip": 0.011233, + "auxiliary_loss_mlp": 0.01038646, + "balance_loss_clip": 1.04434156, + "balance_loss_mlp": 1.02560425, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 1.6804006228458461, + "language_loss": 0.71419311, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73581254, + "num_input_tokens_seen": 148987150, + "step": 6941, + "time_per_iteration": 2.5204460620880127 + }, + { + "auxiliary_loss_clip": 0.01111186, + "auxiliary_loss_mlp": 0.01044751, + "balance_loss_clip": 1.04549289, + "balance_loss_mlp": 1.03030837, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 1.9546128114816732, + "language_loss": 0.73961538, + "learning_rate": 2.621810847844104e-06, + "loss": 0.76117474, + "num_input_tokens_seen": 149004895, + "step": 6942, + "time_per_iteration": 2.6186671257019043 + }, + { + "auxiliary_loss_clip": 0.01086498, + "auxiliary_loss_mlp": 0.01039954, + "balance_loss_clip": 1.04080176, + "balance_loss_mlp": 1.0241704, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.5041437068636365, + "language_loss": 0.72761106, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74887556, + "num_input_tokens_seen": 149020970, + "step": 6943, + "time_per_iteration": 2.549762725830078 + }, + { + "auxiliary_loss_clip": 0.01090807, + "auxiliary_loss_mlp": 0.00794982, + "balance_loss_clip": 1.04127574, + "balance_loss_mlp": 1.01885557, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 1.6342079768451196, + "language_loss": 0.63768238, + "learning_rate": 2.621070480118111e-06, + "loss": 0.65654033, + "num_input_tokens_seen": 149041795, + "step": 6944, + "time_per_iteration": 2.630528211593628 + }, + { + "auxiliary_loss_clip": 0.01094865, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.04138589, + "balance_loss_mlp": 1.02184343, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.4292875924874588, + "language_loss": 0.69998646, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72129941, + "num_input_tokens_seen": 149063700, + "step": 6945, + "time_per_iteration": 2.614041566848755 + }, + { + "auxiliary_loss_clip": 0.01080485, + "auxiliary_loss_mlp": 0.01050985, + "balance_loss_clip": 1.03772473, + "balance_loss_mlp": 1.03315163, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 2.8055259033223923, + "language_loss": 0.8062948, + "learning_rate": 2.620330018187899e-06, + "loss": 0.82760954, + "num_input_tokens_seen": 149082410, + "step": 6946, + "time_per_iteration": 2.585125207901001 + }, + { + "auxiliary_loss_clip": 0.01116201, + "auxiliary_loss_mlp": 0.01034073, + "balance_loss_clip": 1.04470873, + "balance_loss_mlp": 1.01982772, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.1686195233388657, + "language_loss": 0.77047092, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79197365, + "num_input_tokens_seen": 149098745, + "step": 6947, + "time_per_iteration": 2.4604761600494385 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01035859, + "balance_loss_clip": 1.04382682, + "balance_loss_mlp": 1.02077889, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 1.5121336451723844, + "language_loss": 0.71688318, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73847842, + "num_input_tokens_seen": 149122255, + "step": 6948, + "time_per_iteration": 2.580331802368164 + }, + { + "auxiliary_loss_clip": 0.01107288, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.04142058, + "balance_loss_mlp": 1.0151273, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.47041347683442, + "language_loss": 0.77177, + "learning_rate": 2.619219148905362e-06, + "loss": 0.79313403, + "num_input_tokens_seen": 149142845, + "step": 6949, + "time_per_iteration": 3.9206690788269043 + }, + { + "auxiliary_loss_clip": 0.011001, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.04317236, + "balance_loss_mlp": 1.01909292, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.7206418277445708, + "language_loss": 0.82161498, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84295583, + "num_input_tokens_seen": 149163375, + "step": 6950, + "time_per_iteration": 2.524818181991577 + }, + { + "auxiliary_loss_clip": 0.01093164, + "auxiliary_loss_mlp": 0.00792598, + "balance_loss_clip": 1.04412961, + "balance_loss_mlp": 1.01704764, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.2833805198674966, + "language_loss": 0.75901425, + "learning_rate": 2.618478451956007e-06, + "loss": 0.77787185, + "num_input_tokens_seen": 149185610, + "step": 6951, + "time_per_iteration": 3.996936321258545 + }, + { + "auxiliary_loss_clip": 0.01078356, + "auxiliary_loss_mlp": 0.0102877, + "balance_loss_clip": 1.0445621, + "balance_loss_mlp": 1.01410782, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 3.464949678195468, + "language_loss": 0.7324937, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75356495, + "num_input_tokens_seen": 149203990, + "step": 6952, + "time_per_iteration": 2.561913251876831 + }, + { + "auxiliary_loss_clip": 0.01111087, + "auxiliary_loss_mlp": 0.01033585, + "balance_loss_clip": 1.04787636, + "balance_loss_mlp": 1.01983404, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 2.288798618558008, + "language_loss": 0.71961844, + "learning_rate": 2.617737661195593e-06, + "loss": 0.7410652, + "num_input_tokens_seen": 149221385, + "step": 6953, + "time_per_iteration": 3.8590290546417236 + }, + { + "auxiliary_loss_clip": 0.0110912, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.04273939, + "balance_loss_mlp": 1.01847351, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 1.7641999559919381, + "language_loss": 0.76169837, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78312361, + "num_input_tokens_seen": 149241175, + "step": 6954, + "time_per_iteration": 3.8954226970672607 + }, + { + "auxiliary_loss_clip": 0.01082141, + "auxiliary_loss_mlp": 0.01042126, + "balance_loss_clip": 1.04208469, + "balance_loss_mlp": 1.02605665, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 2.1175379458779084, + "language_loss": 0.84320503, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86444771, + "num_input_tokens_seen": 149259115, + "step": 6955, + "time_per_iteration": 2.5775859355926514 + }, + { + "auxiliary_loss_clip": 0.01111786, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.04369497, + "balance_loss_mlp": 1.01950431, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.5985109985036736, + "language_loss": 0.83042669, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85187662, + "num_input_tokens_seen": 149278705, + "step": 6956, + "time_per_iteration": 2.5282037258148193 + }, + { + "auxiliary_loss_clip": 0.01088765, + "auxiliary_loss_mlp": 0.01037286, + "balance_loss_clip": 1.04579353, + "balance_loss_mlp": 1.02211118, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 4.198351988146465, + "language_loss": 0.71499705, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73625761, + "num_input_tokens_seen": 149294040, + "step": 6957, + "time_per_iteration": 2.5018653869628906 + }, + { + "auxiliary_loss_clip": 0.01090373, + "auxiliary_loss_mlp": 0.01038938, + "balance_loss_clip": 1.04234052, + "balance_loss_mlp": 1.0259558, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 2.782988200386959, + "language_loss": 0.75406015, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77535319, + "num_input_tokens_seen": 149310385, + "step": 6958, + "time_per_iteration": 2.535449981689453 + }, + { + "auxiliary_loss_clip": 0.01074426, + "auxiliary_loss_mlp": 0.00790948, + "balance_loss_clip": 1.03848648, + "balance_loss_mlp": 1.01177847, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 1.6622305411751754, + "language_loss": 0.77178836, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.79044211, + "num_input_tokens_seen": 149328235, + "step": 6959, + "time_per_iteration": 2.5820393562316895 + }, + { + "auxiliary_loss_clip": 0.01082339, + "auxiliary_loss_mlp": 0.00789895, + "balance_loss_clip": 1.04118872, + "balance_loss_mlp": 1.01227415, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 2.001032584738892, + "language_loss": 0.76879418, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.78751653, + "num_input_tokens_seen": 149347465, + "step": 6960, + "time_per_iteration": 2.570355176925659 + }, + { + "auxiliary_loss_clip": 0.01090666, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.04411185, + "balance_loss_mlp": 1.01861954, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 1.6642645253243589, + "language_loss": 0.76047957, + "learning_rate": 2.614773562290835e-06, + "loss": 0.78170037, + "num_input_tokens_seen": 149366685, + "step": 6961, + "time_per_iteration": 2.517451047897339 + }, + { + "auxiliary_loss_clip": 0.01023976, + "auxiliary_loss_mlp": 0.01008544, + "balance_loss_clip": 1.02967107, + "balance_loss_mlp": 1.00674391, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7833144920680323, + "language_loss": 0.54703987, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56736505, + "num_input_tokens_seen": 149422925, + "step": 6962, + "time_per_iteration": 3.095282793045044 + }, + { + "auxiliary_loss_clip": 0.01109419, + "auxiliary_loss_mlp": 0.01036067, + "balance_loss_clip": 1.04438734, + "balance_loss_mlp": 1.02226841, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 2.4165113088594743, + "language_loss": 0.85238779, + "learning_rate": 2.614032304160864e-06, + "loss": 0.8738426, + "num_input_tokens_seen": 149440820, + "step": 6963, + "time_per_iteration": 2.477548837661743 + }, + { + "auxiliary_loss_clip": 0.01096029, + "auxiliary_loss_mlp": 0.01033145, + "balance_loss_clip": 1.04380679, + "balance_loss_mlp": 1.01960909, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.5425291158505081, + "language_loss": 0.70082188, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72211361, + "num_input_tokens_seen": 149461060, + "step": 6964, + "time_per_iteration": 2.5348896980285645 + }, + { + "auxiliary_loss_clip": 0.01120682, + "auxiliary_loss_mlp": 0.01034824, + "balance_loss_clip": 1.044065, + "balance_loss_mlp": 1.0214963, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 1.5660162686770123, + "language_loss": 0.7134198, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73497486, + "num_input_tokens_seen": 149483115, + "step": 6965, + "time_per_iteration": 2.6114962100982666 + }, + { + "auxiliary_loss_clip": 0.01071574, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.0394851, + "balance_loss_mlp": 1.02343011, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.5581665285831399, + "language_loss": 0.72064012, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74172163, + "num_input_tokens_seen": 149501495, + "step": 6966, + "time_per_iteration": 2.5596861839294434 + }, + { + "auxiliary_loss_clip": 0.01116619, + "auxiliary_loss_mlp": 0.01036538, + "balance_loss_clip": 1.04347372, + "balance_loss_mlp": 1.02195883, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 1.9217127421208895, + "language_loss": 0.71411192, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73564351, + "num_input_tokens_seen": 149523170, + "step": 6967, + "time_per_iteration": 2.6552844047546387 + }, + { + "auxiliary_loss_clip": 0.01042726, + "auxiliary_loss_mlp": 0.01001547, + "balance_loss_clip": 1.02402973, + "balance_loss_mlp": 1.00012255, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.6730346752703964, + "language_loss": 0.46249914, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48294187, + "num_input_tokens_seen": 149583955, + "step": 6968, + "time_per_iteration": 3.088515520095825 + }, + { + "auxiliary_loss_clip": 0.01113696, + "auxiliary_loss_mlp": 0.01041323, + "balance_loss_clip": 1.04185236, + "balance_loss_mlp": 1.02567053, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 2.0075571755588446, + "language_loss": 0.75039572, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77194589, + "num_input_tokens_seen": 149604440, + "step": 6969, + "time_per_iteration": 2.578934907913208 + }, + { + "auxiliary_loss_clip": 0.01096513, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.04087162, + "balance_loss_mlp": 1.02324724, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 1.8173549172145091, + "language_loss": 0.80765128, + "learning_rate": 2.611437167992705e-06, + "loss": 0.82897151, + "num_input_tokens_seen": 149623745, + "step": 6970, + "time_per_iteration": 2.5623490810394287 + }, + { + "auxiliary_loss_clip": 0.01109747, + "auxiliary_loss_mlp": 0.01036018, + "balance_loss_clip": 1.04307866, + "balance_loss_mlp": 1.02162969, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 1.7735005611774097, + "language_loss": 0.82968664, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.85114431, + "num_input_tokens_seen": 149643025, + "step": 6971, + "time_per_iteration": 2.489835500717163 + }, + { + "auxiliary_loss_clip": 0.01095255, + "auxiliary_loss_mlp": 0.01035934, + "balance_loss_clip": 1.04397678, + "balance_loss_mlp": 1.02121735, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 1.688488430910946, + "language_loss": 0.75031728, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.77162921, + "num_input_tokens_seen": 149660695, + "step": 6972, + "time_per_iteration": 2.486983299255371 + }, + { + "auxiliary_loss_clip": 0.010942, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.04018617, + "balance_loss_mlp": 1.02001131, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.6919861510620386, + "language_loss": 0.72904158, + "learning_rate": 2.610324618710212e-06, + "loss": 0.75032133, + "num_input_tokens_seen": 149682040, + "step": 6973, + "time_per_iteration": 2.683706283569336 + }, + { + "auxiliary_loss_clip": 0.01093353, + "auxiliary_loss_mlp": 0.01039406, + "balance_loss_clip": 1.04746461, + "balance_loss_mlp": 1.02528596, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 1.6971474998152096, + "language_loss": 0.7453506, + "learning_rate": 2.609953722643489e-06, + "loss": 0.76667821, + "num_input_tokens_seen": 149700855, + "step": 6974, + "time_per_iteration": 2.555488348007202 + }, + { + "auxiliary_loss_clip": 0.01110434, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.04161453, + "balance_loss_mlp": 1.02070522, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 1.7270290497503338, + "language_loss": 0.72742534, + "learning_rate": 2.609582803447259e-06, + "loss": 0.74887204, + "num_input_tokens_seen": 149717360, + "step": 6975, + "time_per_iteration": 2.457200288772583 + }, + { + "auxiliary_loss_clip": 0.01105409, + "auxiliary_loss_mlp": 0.01037353, + "balance_loss_clip": 1.04355121, + "balance_loss_mlp": 1.02308393, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.5846943497259998, + "language_loss": 0.80650526, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.82793289, + "num_input_tokens_seen": 149738975, + "step": 6976, + "time_per_iteration": 2.524115800857544 + }, + { + "auxiliary_loss_clip": 0.01095768, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.03763056, + "balance_loss_mlp": 1.02290535, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 1.8422888926674093, + "language_loss": 0.68120319, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.70254207, + "num_input_tokens_seen": 149757055, + "step": 6977, + "time_per_iteration": 2.4975345134735107 + }, + { + "auxiliary_loss_clip": 0.01111489, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.0434972, + "balance_loss_mlp": 1.02417994, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.338707343760537, + "language_loss": 0.8060019, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.82749367, + "num_input_tokens_seen": 149772885, + "step": 6978, + "time_per_iteration": 2.4364173412323 + }, + { + "auxiliary_loss_clip": 0.01122836, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.04131329, + "balance_loss_mlp": 1.02264643, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 1.8663935066920778, + "language_loss": 0.82596147, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.847561, + "num_input_tokens_seen": 149791515, + "step": 6979, + "time_per_iteration": 2.4946377277374268 + }, + { + "auxiliary_loss_clip": 0.01118851, + "auxiliary_loss_mlp": 0.01035684, + "balance_loss_clip": 1.04060102, + "balance_loss_mlp": 1.02214146, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 3.0591406563818313, + "language_loss": 0.83501476, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85656011, + "num_input_tokens_seen": 149807250, + "step": 6980, + "time_per_iteration": 2.405071258544922 + }, + { + "auxiliary_loss_clip": 0.01123053, + "auxiliary_loss_mlp": 0.01034206, + "balance_loss_clip": 1.04271507, + "balance_loss_mlp": 1.02061033, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 2.4257458328839268, + "language_loss": 0.78875482, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81032741, + "num_input_tokens_seen": 149821640, + "step": 6981, + "time_per_iteration": 2.4353840351104736 + }, + { + "auxiliary_loss_clip": 0.01086199, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.04056776, + "balance_loss_mlp": 1.01869488, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 1.7496004991136724, + "language_loss": 0.84303665, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86422014, + "num_input_tokens_seen": 149840545, + "step": 6982, + "time_per_iteration": 2.5416836738586426 + }, + { + "auxiliary_loss_clip": 0.01117, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.04249036, + "balance_loss_mlp": 1.02158165, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 1.9878958739237256, + "language_loss": 0.56849933, + "learning_rate": 2.606614618903214e-06, + "loss": 0.59002995, + "num_input_tokens_seen": 149860375, + "step": 6983, + "time_per_iteration": 2.5312018394470215 + }, + { + "auxiliary_loss_clip": 0.01109466, + "auxiliary_loss_mlp": 0.01035413, + "balance_loss_clip": 1.04274261, + "balance_loss_mlp": 1.02228236, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 2.056352007734659, + "language_loss": 0.82436645, + "learning_rate": 2.606243492174471e-06, + "loss": 0.8458153, + "num_input_tokens_seen": 149877850, + "step": 6984, + "time_per_iteration": 2.447140693664551 + }, + { + "auxiliary_loss_clip": 0.01101743, + "auxiliary_loss_mlp": 0.01032865, + "balance_loss_clip": 1.03966606, + "balance_loss_mlp": 1.01928067, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 3.6551564368695537, + "language_loss": 0.79312462, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81447071, + "num_input_tokens_seen": 149896110, + "step": 6985, + "time_per_iteration": 2.4955735206604004 + }, + { + "auxiliary_loss_clip": 0.01123808, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.0411166, + "balance_loss_mlp": 1.01993144, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 1.6269455596897593, + "language_loss": 0.78110814, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.80268788, + "num_input_tokens_seen": 149916495, + "step": 6986, + "time_per_iteration": 3.905823230743408 + }, + { + "auxiliary_loss_clip": 0.01097751, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_clip": 1.04143095, + "balance_loss_mlp": 1.01541114, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.971381820425643, + "language_loss": 0.72193265, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74318957, + "num_input_tokens_seen": 149936445, + "step": 6987, + "time_per_iteration": 2.548518657684326 + }, + { + "auxiliary_loss_clip": 0.01098859, + "auxiliary_loss_mlp": 0.00795641, + "balance_loss_clip": 1.04161799, + "balance_loss_mlp": 1.01428938, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.4638129949935732, + "language_loss": 0.75073153, + "learning_rate": 2.604758755512104e-06, + "loss": 0.76967657, + "num_input_tokens_seen": 149959430, + "step": 6988, + "time_per_iteration": 2.615042209625244 + }, + { + "auxiliary_loss_clip": 0.01112238, + "auxiliary_loss_mlp": 0.010375, + "balance_loss_clip": 1.04184413, + "balance_loss_mlp": 1.02329659, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.8540971695433768, + "language_loss": 0.74102914, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76252657, + "num_input_tokens_seen": 149980365, + "step": 6989, + "time_per_iteration": 3.892676591873169 + }, + { + "auxiliary_loss_clip": 0.0110363, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.04406548, + "balance_loss_mlp": 1.0162909, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 1.946141216382325, + "language_loss": 0.70664573, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.72798574, + "num_input_tokens_seen": 149997375, + "step": 6990, + "time_per_iteration": 2.5177974700927734 + }, + { + "auxiliary_loss_clip": 0.01039412, + "auxiliary_loss_mlp": 0.00837294, + "balance_loss_clip": 1.02923322, + "balance_loss_mlp": 1.12527084, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8517478143214952, + "language_loss": 0.60456693, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62333399, + "num_input_tokens_seen": 150051230, + "step": 6991, + "time_per_iteration": 2.9769465923309326 + }, + { + "auxiliary_loss_clip": 0.01126039, + "auxiliary_loss_mlp": 0.01038941, + "balance_loss_clip": 1.04511428, + "balance_loss_mlp": 1.0244925, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.592413610118192, + "language_loss": 0.83142668, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85307652, + "num_input_tokens_seen": 150071135, + "step": 6992, + "time_per_iteration": 3.8524773120880127 + }, + { + "auxiliary_loss_clip": 0.01044119, + "auxiliary_loss_mlp": 0.01006903, + "balance_loss_clip": 1.01636052, + "balance_loss_mlp": 1.00521076, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.8041002601200566, + "language_loss": 0.65496969, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67547989, + "num_input_tokens_seen": 150125220, + "step": 6993, + "time_per_iteration": 4.385503530502319 + }, + { + "auxiliary_loss_clip": 0.01127686, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.04343235, + "balance_loss_mlp": 1.01980615, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 1.8971275360109305, + "language_loss": 0.83324945, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85488021, + "num_input_tokens_seen": 150142300, + "step": 6994, + "time_per_iteration": 2.4313316345214844 + }, + { + "auxiliary_loss_clip": 0.01111691, + "auxiliary_loss_mlp": 0.00795131, + "balance_loss_clip": 1.04390705, + "balance_loss_mlp": 1.02264738, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.5491275975594605, + "language_loss": 0.78267044, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.80173862, + "num_input_tokens_seen": 150161345, + "step": 6995, + "time_per_iteration": 2.4673421382904053 + }, + { + "auxiliary_loss_clip": 0.01086026, + "auxiliary_loss_mlp": 0.01030692, + "balance_loss_clip": 1.04145598, + "balance_loss_mlp": 1.01739419, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.5142758791513486, + "language_loss": 0.80236334, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82353055, + "num_input_tokens_seen": 150182420, + "step": 6996, + "time_per_iteration": 2.5898354053497314 + }, + { + "auxiliary_loss_clip": 0.01102471, + "auxiliary_loss_mlp": 0.00802739, + "balance_loss_clip": 1.04274917, + "balance_loss_mlp": 1.03742981, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 2.1144138920939612, + "language_loss": 0.75341874, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77247083, + "num_input_tokens_seen": 150200175, + "step": 6997, + "time_per_iteration": 2.4930367469787598 + }, + { + "auxiliary_loss_clip": 0.01120627, + "auxiliary_loss_mlp": 0.01038064, + "balance_loss_clip": 1.03995073, + "balance_loss_mlp": 1.02418196, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 1.9964550889584023, + "language_loss": 0.75564432, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.77723128, + "num_input_tokens_seen": 150217100, + "step": 6998, + "time_per_iteration": 2.4391465187072754 + }, + { + "auxiliary_loss_clip": 0.01125954, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.04411089, + "balance_loss_mlp": 1.02167058, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.7664017630549027, + "language_loss": 0.75868535, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78030443, + "num_input_tokens_seen": 150239830, + "step": 6999, + "time_per_iteration": 2.5153234004974365 + }, + { + "auxiliary_loss_clip": 0.01075963, + "auxiliary_loss_mlp": 0.01038823, + "balance_loss_clip": 1.04111719, + "balance_loss_mlp": 1.02420831, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 4.085729936156881, + "language_loss": 0.64003181, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66117972, + "num_input_tokens_seen": 150260690, + "step": 7000, + "time_per_iteration": 2.5757107734680176 + }, + { + "auxiliary_loss_clip": 0.01086597, + "auxiliary_loss_mlp": 0.01036669, + "balance_loss_clip": 1.04553938, + "balance_loss_mlp": 1.02160072, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 1.487582695554992, + "language_loss": 0.7641052, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.78533787, + "num_input_tokens_seen": 150279885, + "step": 7001, + "time_per_iteration": 2.5500075817108154 + }, + { + "auxiliary_loss_clip": 0.01092944, + "auxiliary_loss_mlp": 0.00802124, + "balance_loss_clip": 1.04526448, + "balance_loss_mlp": 1.03571391, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.4567946401476999, + "language_loss": 0.86736441, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88631499, + "num_input_tokens_seen": 150297390, + "step": 7002, + "time_per_iteration": 2.5626943111419678 + }, + { + "auxiliary_loss_clip": 0.01084748, + "auxiliary_loss_mlp": 0.01034753, + "balance_loss_clip": 1.04329085, + "balance_loss_mlp": 1.02162206, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 2.27376547838274, + "language_loss": 0.6807127, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.70190775, + "num_input_tokens_seen": 150317390, + "step": 7003, + "time_per_iteration": 2.543154239654541 + }, + { + "auxiliary_loss_clip": 0.01125428, + "auxiliary_loss_mlp": 0.01040035, + "balance_loss_clip": 1.04376161, + "balance_loss_mlp": 1.02539587, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 3.083412188049608, + "language_loss": 0.77476871, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79642338, + "num_input_tokens_seen": 150337455, + "step": 7004, + "time_per_iteration": 2.5100674629211426 + }, + { + "auxiliary_loss_clip": 0.01118422, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.04269648, + "balance_loss_mlp": 1.02197361, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 1.6799881442169071, + "language_loss": 0.68037724, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70192623, + "num_input_tokens_seen": 150355385, + "step": 7005, + "time_per_iteration": 2.4322383403778076 + }, + { + "auxiliary_loss_clip": 0.01111586, + "auxiliary_loss_mlp": 0.01034687, + "balance_loss_clip": 1.04303396, + "balance_loss_mlp": 1.02103758, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 2.3828499137170533, + "language_loss": 0.71933538, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74079812, + "num_input_tokens_seen": 150371750, + "step": 7006, + "time_per_iteration": 2.4813992977142334 + }, + { + "auxiliary_loss_clip": 0.0112184, + "auxiliary_loss_mlp": 0.01033454, + "balance_loss_clip": 1.04175329, + "balance_loss_mlp": 1.01924467, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 2.9235464114451264, + "language_loss": 0.71095145, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.73250437, + "num_input_tokens_seen": 150389955, + "step": 7007, + "time_per_iteration": 2.4288744926452637 + }, + { + "auxiliary_loss_clip": 0.01096691, + "auxiliary_loss_mlp": 0.00794606, + "balance_loss_clip": 1.03995943, + "balance_loss_mlp": 1.02189875, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.7609492528652881, + "language_loss": 0.82313615, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84204912, + "num_input_tokens_seen": 150405780, + "step": 7008, + "time_per_iteration": 2.5352776050567627 + }, + { + "auxiliary_loss_clip": 0.01081049, + "auxiliary_loss_mlp": 0.01038911, + "balance_loss_clip": 1.04172683, + "balance_loss_mlp": 1.02462411, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 1.6603963031220297, + "language_loss": 0.71658206, + "learning_rate": 2.596957889196831e-06, + "loss": 0.73778164, + "num_input_tokens_seen": 150425615, + "step": 7009, + "time_per_iteration": 2.5837604999542236 + }, + { + "auxiliary_loss_clip": 0.01121318, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.0409776, + "balance_loss_mlp": 1.01802719, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 8.56482520856282, + "language_loss": 0.65433186, + "learning_rate": 2.596586169335243e-06, + "loss": 0.67586732, + "num_input_tokens_seen": 150445765, + "step": 7010, + "time_per_iteration": 2.4966347217559814 + }, + { + "auxiliary_loss_clip": 0.01083258, + "auxiliary_loss_mlp": 0.01027284, + "balance_loss_clip": 1.04075432, + "balance_loss_mlp": 1.01371789, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.9573097546297447, + "language_loss": 0.72714972, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74825513, + "num_input_tokens_seen": 150464405, + "step": 7011, + "time_per_iteration": 2.5421650409698486 + }, + { + "auxiliary_loss_clip": 0.01033526, + "auxiliary_loss_mlp": 0.01001231, + "balance_loss_clip": 1.01489544, + "balance_loss_mlp": 0.99962157, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.7836872857400162, + "language_loss": 0.54328597, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56363356, + "num_input_tokens_seen": 150520430, + "step": 7012, + "time_per_iteration": 2.9943134784698486 + }, + { + "auxiliary_loss_clip": 0.01109142, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.0407716, + "balance_loss_mlp": 1.01798511, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.593901638867922, + "language_loss": 0.78672719, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80813825, + "num_input_tokens_seen": 150542610, + "step": 7013, + "time_per_iteration": 2.5130465030670166 + }, + { + "auxiliary_loss_clip": 0.01120087, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.03979373, + "balance_loss_mlp": 1.02065992, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 1.7054730141053818, + "language_loss": 0.80984294, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83139944, + "num_input_tokens_seen": 150560970, + "step": 7014, + "time_per_iteration": 2.4900002479553223 + }, + { + "auxiliary_loss_clip": 0.01108466, + "auxiliary_loss_mlp": 0.01034277, + "balance_loss_clip": 1.03981566, + "balance_loss_mlp": 1.02055001, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 2.6921600227142477, + "language_loss": 0.77849889, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.79992628, + "num_input_tokens_seen": 150582615, + "step": 7015, + "time_per_iteration": 2.525897741317749 + }, + { + "auxiliary_loss_clip": 0.01123232, + "auxiliary_loss_mlp": 0.01037424, + "balance_loss_clip": 1.04288983, + "balance_loss_mlp": 1.0229404, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.7465189692441223, + "language_loss": 0.82268035, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84428692, + "num_input_tokens_seen": 150603640, + "step": 7016, + "time_per_iteration": 2.508558988571167 + }, + { + "auxiliary_loss_clip": 0.01085932, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.04055715, + "balance_loss_mlp": 1.018677, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 1.924778520899619, + "language_loss": 0.68145865, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70264989, + "num_input_tokens_seen": 150622490, + "step": 7017, + "time_per_iteration": 2.5409436225891113 + }, + { + "auxiliary_loss_clip": 0.01031411, + "auxiliary_loss_mlp": 0.0100418, + "balance_loss_clip": 1.01367784, + "balance_loss_mlp": 1.0024519, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.7462697310055326, + "language_loss": 0.59498364, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61533958, + "num_input_tokens_seen": 150689545, + "step": 7018, + "time_per_iteration": 3.1377434730529785 + }, + { + "auxiliary_loss_clip": 0.01107205, + "auxiliary_loss_mlp": 0.01036991, + "balance_loss_clip": 1.03940046, + "balance_loss_mlp": 1.02260828, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 1.9871860115302116, + "language_loss": 0.75314647, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77458841, + "num_input_tokens_seen": 150707610, + "step": 7019, + "time_per_iteration": 2.4950366020202637 + }, + { + "auxiliary_loss_clip": 0.0110365, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.04106653, + "balance_loss_mlp": 1.01961148, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 1.9303302974837744, + "language_loss": 0.69317132, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71455157, + "num_input_tokens_seen": 150724530, + "step": 7020, + "time_per_iteration": 2.489919662475586 + }, + { + "auxiliary_loss_clip": 0.01096449, + "auxiliary_loss_mlp": 0.00788022, + "balance_loss_clip": 1.0414139, + "balance_loss_mlp": 1.00906694, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.8332652678251198, + "language_loss": 0.81133378, + "learning_rate": 2.592495760867347e-06, + "loss": 0.8301785, + "num_input_tokens_seen": 150742870, + "step": 7021, + "time_per_iteration": 2.52946138381958 + }, + { + "auxiliary_loss_clip": 0.01054692, + "auxiliary_loss_mlp": 0.01041952, + "balance_loss_clip": 1.03792048, + "balance_loss_mlp": 1.02527475, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.5956273483048007, + "language_loss": 0.69894695, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.71991342, + "num_input_tokens_seen": 150765500, + "step": 7022, + "time_per_iteration": 2.689459800720215 + }, + { + "auxiliary_loss_clip": 0.0110581, + "auxiliary_loss_mlp": 0.01027256, + "balance_loss_clip": 1.04167366, + "balance_loss_mlp": 1.01588941, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.6207389432016326, + "language_loss": 0.67661583, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69794655, + "num_input_tokens_seen": 150784945, + "step": 7023, + "time_per_iteration": 2.564887046813965 + }, + { + "auxiliary_loss_clip": 0.01092414, + "auxiliary_loss_mlp": 0.0104398, + "balance_loss_clip": 1.04066443, + "balance_loss_mlp": 1.02780318, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.6756608384821896, + "language_loss": 0.69243115, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71379513, + "num_input_tokens_seen": 150803120, + "step": 7024, + "time_per_iteration": 2.563551187515259 + }, + { + "auxiliary_loss_clip": 0.01119943, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.04140472, + "balance_loss_mlp": 1.02107143, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.6450332544471788, + "language_loss": 0.76804036, + "learning_rate": 2.591007664594147e-06, + "loss": 0.78958338, + "num_input_tokens_seen": 150823135, + "step": 7025, + "time_per_iteration": 3.921006679534912 + }, + { + "auxiliary_loss_clip": 0.01088488, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.04015827, + "balance_loss_mlp": 1.02189565, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.6683542186032743, + "language_loss": 0.79504573, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81628096, + "num_input_tokens_seen": 150842070, + "step": 7026, + "time_per_iteration": 2.5599634647369385 + }, + { + "auxiliary_loss_clip": 0.01042549, + "auxiliary_loss_mlp": 0.0100285, + "balance_loss_clip": 1.01460767, + "balance_loss_mlp": 1.00108552, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7465182596330706, + "language_loss": 0.62032455, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.6407786, + "num_input_tokens_seen": 150907450, + "step": 7027, + "time_per_iteration": 3.1564276218414307 + }, + { + "auxiliary_loss_clip": 0.01116936, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.0409075, + "balance_loss_mlp": 1.02341866, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 2.3872782288028733, + "language_loss": 0.70789182, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.72942615, + "num_input_tokens_seen": 150928040, + "step": 7028, + "time_per_iteration": 3.863915205001831 + }, + { + "auxiliary_loss_clip": 0.01103149, + "auxiliary_loss_mlp": 0.01037605, + "balance_loss_clip": 1.04210436, + "balance_loss_mlp": 1.0236814, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 1.7250168682658376, + "language_loss": 0.82116318, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84257066, + "num_input_tokens_seen": 150945760, + "step": 7029, + "time_per_iteration": 2.496976375579834 + }, + { + "auxiliary_loss_clip": 0.01082398, + "auxiliary_loss_mlp": 0.01039221, + "balance_loss_clip": 1.04415274, + "balance_loss_mlp": 1.02429581, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 1.9299042045927883, + "language_loss": 0.74598867, + "learning_rate": 2.589147040109424e-06, + "loss": 0.76720488, + "num_input_tokens_seen": 150965665, + "step": 7030, + "time_per_iteration": 2.6110968589782715 + }, + { + "auxiliary_loss_clip": 0.01118719, + "auxiliary_loss_mlp": 0.01034838, + "balance_loss_clip": 1.04009414, + "balance_loss_mlp": 1.02017498, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 1.9935267024109524, + "language_loss": 0.86750114, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88903666, + "num_input_tokens_seen": 150982260, + "step": 7031, + "time_per_iteration": 3.907400608062744 + }, + { + "auxiliary_loss_clip": 0.01110462, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.04213285, + "balance_loss_mlp": 1.01978505, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 1.845244664076516, + "language_loss": 0.7320013, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75345528, + "num_input_tokens_seen": 150999990, + "step": 7032, + "time_per_iteration": 3.921236276626587 + }, + { + "auxiliary_loss_clip": 0.0109206, + "auxiliary_loss_mlp": 0.01041356, + "balance_loss_clip": 1.03951287, + "balance_loss_mlp": 1.02677703, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.5829485731842445, + "language_loss": 0.70100796, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72234207, + "num_input_tokens_seen": 151021105, + "step": 7033, + "time_per_iteration": 2.558011293411255 + }, + { + "auxiliary_loss_clip": 0.0109847, + "auxiliary_loss_mlp": 0.00788591, + "balance_loss_clip": 1.04182005, + "balance_loss_mlp": 1.01147199, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 1.7387664094970163, + "language_loss": 0.90344977, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92232037, + "num_input_tokens_seen": 151040665, + "step": 7034, + "time_per_iteration": 2.541513442993164 + }, + { + "auxiliary_loss_clip": 0.01096732, + "auxiliary_loss_mlp": 0.01035206, + "balance_loss_clip": 1.04011202, + "balance_loss_mlp": 1.02219486, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.5088055983426913, + "language_loss": 0.77187264, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79319203, + "num_input_tokens_seen": 151061240, + "step": 7035, + "time_per_iteration": 2.551865339279175 + }, + { + "auxiliary_loss_clip": 0.01107698, + "auxiliary_loss_mlp": 0.01038702, + "balance_loss_clip": 1.04279757, + "balance_loss_mlp": 1.02417004, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.7411051809564735, + "language_loss": 0.82345867, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84492272, + "num_input_tokens_seen": 151076870, + "step": 7036, + "time_per_iteration": 2.4677059650421143 + }, + { + "auxiliary_loss_clip": 0.01097163, + "auxiliary_loss_mlp": 0.01034033, + "balance_loss_clip": 1.04301393, + "balance_loss_mlp": 1.0204674, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.7405278127379054, + "language_loss": 0.7047565, + "learning_rate": 2.58654122792447e-06, + "loss": 0.7260685, + "num_input_tokens_seen": 151095110, + "step": 7037, + "time_per_iteration": 2.5231099128723145 + }, + { + "auxiliary_loss_clip": 0.01081118, + "auxiliary_loss_mlp": 0.00790488, + "balance_loss_clip": 1.04280174, + "balance_loss_mlp": 1.01375854, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.5312869060700665, + "language_loss": 0.77501535, + "learning_rate": 2.586168879961155e-06, + "loss": 0.79373145, + "num_input_tokens_seen": 151114355, + "step": 7038, + "time_per_iteration": 2.545872926712036 + }, + { + "auxiliary_loss_clip": 0.01093844, + "auxiliary_loss_mlp": 0.01042143, + "balance_loss_clip": 1.04719996, + "balance_loss_mlp": 1.02652717, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.685781190410265, + "language_loss": 0.68071032, + "learning_rate": 2.585796509770259e-06, + "loss": 0.70207024, + "num_input_tokens_seen": 151131505, + "step": 7039, + "time_per_iteration": 2.5527939796447754 + }, + { + "auxiliary_loss_clip": 0.01114997, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.04200912, + "balance_loss_mlp": 1.02097189, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 1.58429689968665, + "language_loss": 0.75884426, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78035825, + "num_input_tokens_seen": 151151555, + "step": 7040, + "time_per_iteration": 2.5146079063415527 + }, + { + "auxiliary_loss_clip": 0.01113284, + "auxiliary_loss_mlp": 0.01032714, + "balance_loss_clip": 1.04413497, + "balance_loss_mlp": 1.01855755, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.6026896661231305, + "language_loss": 0.64890045, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.67036039, + "num_input_tokens_seen": 151172385, + "step": 7041, + "time_per_iteration": 2.555338144302368 + }, + { + "auxiliary_loss_clip": 0.01097142, + "auxiliary_loss_mlp": 0.01032789, + "balance_loss_clip": 1.04332829, + "balance_loss_mlp": 1.0177927, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.7860557601866012, + "language_loss": 0.74348658, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.76478589, + "num_input_tokens_seen": 151194930, + "step": 7042, + "time_per_iteration": 2.7212564945220947 + }, + { + "auxiliary_loss_clip": 0.01108187, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.0431149, + "balance_loss_mlp": 1.01862335, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.4381619283044869, + "language_loss": 0.82081324, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84220982, + "num_input_tokens_seen": 151217905, + "step": 7043, + "time_per_iteration": 2.554152727127075 + }, + { + "auxiliary_loss_clip": 0.01100938, + "auxiliary_loss_mlp": 0.01040221, + "balance_loss_clip": 1.04618871, + "balance_loss_mlp": 1.02509904, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 1.9052524483104685, + "language_loss": 0.64750731, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.66891885, + "num_input_tokens_seen": 151234580, + "step": 7044, + "time_per_iteration": 2.5055863857269287 + }, + { + "auxiliary_loss_clip": 0.01111929, + "auxiliary_loss_mlp": 0.01044393, + "balance_loss_clip": 1.04683697, + "balance_loss_mlp": 1.02771533, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 1.7185582339098966, + "language_loss": 0.75184834, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77341163, + "num_input_tokens_seen": 151254765, + "step": 7045, + "time_per_iteration": 2.6097419261932373 + }, + { + "auxiliary_loss_clip": 0.01086896, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_clip": 1.04320002, + "balance_loss_mlp": 1.03184342, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 1.9691778424247808, + "language_loss": 0.80431461, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.82565534, + "num_input_tokens_seen": 151269045, + "step": 7046, + "time_per_iteration": 2.5086309909820557 + }, + { + "auxiliary_loss_clip": 0.01046193, + "auxiliary_loss_mlp": 0.01039601, + "balance_loss_clip": 1.04344606, + "balance_loss_mlp": 1.02489018, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 3.6176522127067003, + "language_loss": 0.77107334, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.79193133, + "num_input_tokens_seen": 151287530, + "step": 7047, + "time_per_iteration": 2.84773588180542 + }, + { + "auxiliary_loss_clip": 0.01121333, + "auxiliary_loss_mlp": 0.01035473, + "balance_loss_clip": 1.04472959, + "balance_loss_mlp": 1.02182388, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 2.2768527205339035, + "language_loss": 0.68171704, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70328516, + "num_input_tokens_seen": 151308905, + "step": 7048, + "time_per_iteration": 2.7422633171081543 + }, + { + "auxiliary_loss_clip": 0.01110229, + "auxiliary_loss_mlp": 0.01039133, + "balance_loss_clip": 1.0422622, + "balance_loss_mlp": 1.02405286, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 1.7410379554693425, + "language_loss": 0.77860463, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.8000983, + "num_input_tokens_seen": 151326525, + "step": 7049, + "time_per_iteration": 2.471752643585205 + }, + { + "auxiliary_loss_clip": 0.01116212, + "auxiliary_loss_mlp": 0.01038661, + "balance_loss_clip": 1.04669595, + "balance_loss_mlp": 1.02486849, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 1.658287175730911, + "language_loss": 0.82416356, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.8457123, + "num_input_tokens_seen": 151344675, + "step": 7050, + "time_per_iteration": 2.5054450035095215 + }, + { + "auxiliary_loss_clip": 0.01121895, + "auxiliary_loss_mlp": 0.01033963, + "balance_loss_clip": 1.04189181, + "balance_loss_mlp": 1.02017617, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 2.090773503576969, + "language_loss": 0.73692894, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75848746, + "num_input_tokens_seen": 151360730, + "step": 7051, + "time_per_iteration": 2.4067749977111816 + }, + { + "auxiliary_loss_clip": 0.01092492, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.04457808, + "balance_loss_mlp": 1.01901722, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.353913504927804, + "language_loss": 0.86105639, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.88230437, + "num_input_tokens_seen": 151380445, + "step": 7052, + "time_per_iteration": 2.5676839351654053 + }, + { + "auxiliary_loss_clip": 0.0110062, + "auxiliary_loss_mlp": 0.01045616, + "balance_loss_clip": 1.04392684, + "balance_loss_mlp": 1.03102481, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.3968717581610424, + "language_loss": 0.72682679, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74828911, + "num_input_tokens_seen": 151399325, + "step": 7053, + "time_per_iteration": 2.5181994438171387 + }, + { + "auxiliary_loss_clip": 0.01091391, + "auxiliary_loss_mlp": 0.00790512, + "balance_loss_clip": 1.04507017, + "balance_loss_mlp": 1.01264977, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 1.6157702854212261, + "language_loss": 0.82432997, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84314901, + "num_input_tokens_seen": 151417240, + "step": 7054, + "time_per_iteration": 2.5625245571136475 + }, + { + "auxiliary_loss_clip": 0.01040327, + "auxiliary_loss_mlp": 0.01007325, + "balance_loss_clip": 1.02367318, + "balance_loss_mlp": 1.00538146, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7859987729284187, + "language_loss": 0.60410035, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62457687, + "num_input_tokens_seen": 151476015, + "step": 7055, + "time_per_iteration": 3.0514087677001953 + }, + { + "auxiliary_loss_clip": 0.01124766, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.04450738, + "balance_loss_mlp": 1.02600074, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.1851719011765454, + "language_loss": 0.7677806, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.78943765, + "num_input_tokens_seen": 151492035, + "step": 7056, + "time_per_iteration": 2.434020757675171 + }, + { + "auxiliary_loss_clip": 0.01115721, + "auxiliary_loss_mlp": 0.01039249, + "balance_loss_clip": 1.0433923, + "balance_loss_mlp": 1.02344799, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 1.8880188697230667, + "language_loss": 0.84405744, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86560714, + "num_input_tokens_seen": 151508970, + "step": 7057, + "time_per_iteration": 2.4808428287506104 + }, + { + "auxiliary_loss_clip": 0.01087008, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.04494536, + "balance_loss_mlp": 1.02133179, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 2.4967374978629633, + "language_loss": 0.82603312, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.84726429, + "num_input_tokens_seen": 151525295, + "step": 7058, + "time_per_iteration": 2.5708515644073486 + }, + { + "auxiliary_loss_clip": 0.01098802, + "auxiliary_loss_mlp": 0.00788449, + "balance_loss_clip": 1.04681826, + "balance_loss_mlp": 1.01315928, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 1.6838609578451464, + "language_loss": 0.80421871, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82309115, + "num_input_tokens_seen": 151544435, + "step": 7059, + "time_per_iteration": 2.5195930004119873 + }, + { + "auxiliary_loss_clip": 0.01124769, + "auxiliary_loss_mlp": 0.01037345, + "balance_loss_clip": 1.04378319, + "balance_loss_mlp": 1.02191305, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 2.0413602233076547, + "language_loss": 0.70270765, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72432876, + "num_input_tokens_seen": 151559520, + "step": 7060, + "time_per_iteration": 2.4492642879486084 + }, + { + "auxiliary_loss_clip": 0.01115279, + "auxiliary_loss_mlp": 0.01038803, + "balance_loss_clip": 1.04511213, + "balance_loss_mlp": 1.02498615, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.6621359157658246, + "language_loss": 0.76451159, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78605241, + "num_input_tokens_seen": 151579790, + "step": 7061, + "time_per_iteration": 2.5197062492370605 + }, + { + "auxiliary_loss_clip": 0.01117767, + "auxiliary_loss_mlp": 0.01040767, + "balance_loss_clip": 1.04680288, + "balance_loss_mlp": 1.02518058, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 2.3499794846841, + "language_loss": 0.72851765, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.750103, + "num_input_tokens_seen": 151598285, + "step": 7062, + "time_per_iteration": 2.4814164638519287 + }, + { + "auxiliary_loss_clip": 0.01098518, + "auxiliary_loss_mlp": 0.01042669, + "balance_loss_clip": 1.04355764, + "balance_loss_mlp": 1.02854824, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 1.696897952798624, + "language_loss": 0.66294885, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68436074, + "num_input_tokens_seen": 151615430, + "step": 7063, + "time_per_iteration": 2.495457649230957 + }, + { + "auxiliary_loss_clip": 0.0108808, + "auxiliary_loss_mlp": 0.00787092, + "balance_loss_clip": 1.04106092, + "balance_loss_mlp": 1.00971353, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 6.0736057190722095, + "language_loss": 0.78385466, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.8026064, + "num_input_tokens_seen": 151637030, + "step": 7064, + "time_per_iteration": 4.02016544342041 + }, + { + "auxiliary_loss_clip": 0.01122132, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.04181337, + "balance_loss_mlp": 1.02221501, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 1.8067226083134649, + "language_loss": 0.74775159, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.76933742, + "num_input_tokens_seen": 151655745, + "step": 7065, + "time_per_iteration": 2.446256637573242 + }, + { + "auxiliary_loss_clip": 0.01113483, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_clip": 1.04508638, + "balance_loss_mlp": 1.02047837, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.305923839861759, + "language_loss": 0.72343969, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74492681, + "num_input_tokens_seen": 151678040, + "step": 7066, + "time_per_iteration": 2.5459933280944824 + }, + { + "auxiliary_loss_clip": 0.01087628, + "auxiliary_loss_mlp": 0.01037353, + "balance_loss_clip": 1.04357171, + "balance_loss_mlp": 1.02205825, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 1.8065899594865165, + "language_loss": 0.79706848, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.81831825, + "num_input_tokens_seen": 151696410, + "step": 7067, + "time_per_iteration": 3.9097342491149902 + }, + { + "auxiliary_loss_clip": 0.01044395, + "auxiliary_loss_mlp": 0.01006543, + "balance_loss_clip": 1.01689756, + "balance_loss_mlp": 1.00465989, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.9090269517716135, + "language_loss": 0.63490307, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65541244, + "num_input_tokens_seen": 151756365, + "step": 7068, + "time_per_iteration": 2.99078369140625 + }, + { + "auxiliary_loss_clip": 0.01123654, + "auxiliary_loss_mlp": 0.01035546, + "balance_loss_clip": 1.04316211, + "balance_loss_mlp": 1.02001858, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.5049126963598385, + "language_loss": 0.7222237, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74381566, + "num_input_tokens_seen": 151775165, + "step": 7069, + "time_per_iteration": 2.4509873390197754 + }, + { + "auxiliary_loss_clip": 0.01123358, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.04268432, + "balance_loss_mlp": 1.01867473, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 1.9546087447706204, + "language_loss": 0.79238623, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81396526, + "num_input_tokens_seen": 151792620, + "step": 7070, + "time_per_iteration": 3.7899694442749023 + }, + { + "auxiliary_loss_clip": 0.01111104, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.04218507, + "balance_loss_mlp": 1.01687217, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 2.055101139780295, + "language_loss": 0.70003778, + "learning_rate": 2.573869012032795e-06, + "loss": 0.72146416, + "num_input_tokens_seen": 151812850, + "step": 7071, + "time_per_iteration": 3.9286630153656006 + }, + { + "auxiliary_loss_clip": 0.0112051, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.04078484, + "balance_loss_mlp": 1.02194452, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 2.429418953200352, + "language_loss": 0.70702225, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.72858274, + "num_input_tokens_seen": 151831785, + "step": 7072, + "time_per_iteration": 2.4975457191467285 + }, + { + "auxiliary_loss_clip": 0.01087551, + "auxiliary_loss_mlp": 0.01036267, + "balance_loss_clip": 1.04201615, + "balance_loss_mlp": 1.02176476, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 1.5782417985140955, + "language_loss": 0.81610107, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83733922, + "num_input_tokens_seen": 151853885, + "step": 7073, + "time_per_iteration": 2.5851187705993652 + }, + { + "auxiliary_loss_clip": 0.01107558, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.04320014, + "balance_loss_mlp": 1.02173221, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.31547168305089, + "language_loss": 0.91184717, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.93327141, + "num_input_tokens_seen": 151871780, + "step": 7074, + "time_per_iteration": 2.455430269241333 + }, + { + "auxiliary_loss_clip": 0.01116257, + "auxiliary_loss_mlp": 0.00790122, + "balance_loss_clip": 1.04280257, + "balance_loss_mlp": 1.01097751, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 1.635618511304507, + "language_loss": 0.64261442, + "learning_rate": 2.572376498508805e-06, + "loss": 0.66167819, + "num_input_tokens_seen": 151891600, + "step": 7075, + "time_per_iteration": 2.5051214694976807 + }, + { + "auxiliary_loss_clip": 0.01083047, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.04065859, + "balance_loss_mlp": 1.01513374, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 1.5719245344824275, + "language_loss": 0.73642933, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75754297, + "num_input_tokens_seen": 151911330, + "step": 7076, + "time_per_iteration": 2.562434673309326 + }, + { + "auxiliary_loss_clip": 0.01097137, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.04017162, + "balance_loss_mlp": 1.02732205, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 2.0252333586120788, + "language_loss": 0.79090643, + "learning_rate": 2.571630111462766e-06, + "loss": 0.8123017, + "num_input_tokens_seen": 151930355, + "step": 7077, + "time_per_iteration": 2.537177562713623 + }, + { + "auxiliary_loss_clip": 0.01095039, + "auxiliary_loss_mlp": 0.01032998, + "balance_loss_clip": 1.04093552, + "balance_loss_mlp": 1.02004039, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 2.145074744117491, + "language_loss": 0.72942531, + "learning_rate": 2.571256885418265e-06, + "loss": 0.75070572, + "num_input_tokens_seen": 151949695, + "step": 7078, + "time_per_iteration": 2.5365304946899414 + }, + { + "auxiliary_loss_clip": 0.01094665, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.04707587, + "balance_loss_mlp": 1.02140093, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.9916311736525718, + "language_loss": 0.80102324, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.8223123, + "num_input_tokens_seen": 151967640, + "step": 7079, + "time_per_iteration": 2.4939804077148438 + }, + { + "auxiliary_loss_clip": 0.01111659, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.04563546, + "balance_loss_mlp": 1.01774049, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.9679222126528748, + "language_loss": 0.71801943, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.73943913, + "num_input_tokens_seen": 151994020, + "step": 7080, + "time_per_iteration": 2.73435378074646 + }, + { + "auxiliary_loss_clip": 0.01119486, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.04110909, + "balance_loss_mlp": 1.0191865, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 3.472488431829096, + "language_loss": 0.80472827, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82624686, + "num_input_tokens_seen": 152013415, + "step": 7081, + "time_per_iteration": 2.4659922122955322 + }, + { + "auxiliary_loss_clip": 0.01097215, + "auxiliary_loss_mlp": 0.01031174, + "balance_loss_clip": 1.04017711, + "balance_loss_mlp": 1.01791191, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.6062833608597828, + "language_loss": 0.81467336, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.83595723, + "num_input_tokens_seen": 152030860, + "step": 7082, + "time_per_iteration": 2.5083959102630615 + }, + { + "auxiliary_loss_clip": 0.01109137, + "auxiliary_loss_mlp": 0.01036303, + "balance_loss_clip": 1.04250383, + "balance_loss_mlp": 1.02276123, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 1.7677048278392415, + "language_loss": 0.69581205, + "learning_rate": 2.569390430547065e-06, + "loss": 0.71726644, + "num_input_tokens_seen": 152050395, + "step": 7083, + "time_per_iteration": 2.520800828933716 + }, + { + "auxiliary_loss_clip": 0.01038089, + "auxiliary_loss_mlp": 0.01001579, + "balance_loss_clip": 1.01978636, + "balance_loss_mlp": 0.999982, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8780290141810734, + "language_loss": 0.67151326, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69190991, + "num_input_tokens_seen": 152113555, + "step": 7084, + "time_per_iteration": 3.195033073425293 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.01040154, + "balance_loss_clip": 1.04236066, + "balance_loss_mlp": 1.02539611, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 1.8699861184028506, + "language_loss": 0.78371215, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80518711, + "num_input_tokens_seen": 152131575, + "step": 7085, + "time_per_iteration": 2.5097341537475586 + }, + { + "auxiliary_loss_clip": 0.0111941, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_clip": 1.04464507, + "balance_loss_mlp": 1.02864587, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 2.1973511056593322, + "language_loss": 0.76256859, + "learning_rate": 2.568270298414995e-06, + "loss": 0.78420436, + "num_input_tokens_seen": 152149435, + "step": 7086, + "time_per_iteration": 2.465534210205078 + }, + { + "auxiliary_loss_clip": 0.01096453, + "auxiliary_loss_mlp": 0.01035312, + "balance_loss_clip": 1.03982604, + "balance_loss_mlp": 1.02123976, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 1.8729320279838206, + "language_loss": 0.80228901, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82360667, + "num_input_tokens_seen": 152166860, + "step": 7087, + "time_per_iteration": 2.4819304943084717 + }, + { + "auxiliary_loss_clip": 0.01099782, + "auxiliary_loss_mlp": 0.01031052, + "balance_loss_clip": 1.04197216, + "balance_loss_mlp": 1.01657355, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.6155266434852105, + "language_loss": 0.66055441, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68186271, + "num_input_tokens_seen": 152187475, + "step": 7088, + "time_per_iteration": 2.53523850440979 + }, + { + "auxiliary_loss_clip": 0.0107733, + "auxiliary_loss_mlp": 0.0103779, + "balance_loss_clip": 1.04281831, + "balance_loss_mlp": 1.02316344, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 2.159230052434351, + "language_loss": 0.68153608, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70268738, + "num_input_tokens_seen": 152207235, + "step": 7089, + "time_per_iteration": 2.6273391246795654 + }, + { + "auxiliary_loss_clip": 0.01073972, + "auxiliary_loss_mlp": 0.01034763, + "balance_loss_clip": 1.03900361, + "balance_loss_mlp": 1.02070177, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 2.0846298702878987, + "language_loss": 0.72946239, + "learning_rate": 2.566776487287525e-06, + "loss": 0.75054967, + "num_input_tokens_seen": 152224240, + "step": 7090, + "time_per_iteration": 2.541851282119751 + }, + { + "auxiliary_loss_clip": 0.01102356, + "auxiliary_loss_mlp": 0.01039731, + "balance_loss_clip": 1.04137409, + "balance_loss_mlp": 1.0257479, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.9309960364079948, + "language_loss": 0.75740469, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77882564, + "num_input_tokens_seen": 152242595, + "step": 7091, + "time_per_iteration": 2.581172227859497 + }, + { + "auxiliary_loss_clip": 0.01067988, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.04044139, + "balance_loss_mlp": 1.01644027, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 1.5947324425687468, + "language_loss": 0.82147139, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84243816, + "num_input_tokens_seen": 152260840, + "step": 7092, + "time_per_iteration": 2.55586314201355 + }, + { + "auxiliary_loss_clip": 0.0110479, + "auxiliary_loss_mlp": 0.01039932, + "balance_loss_clip": 1.04292607, + "balance_loss_mlp": 1.02590156, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.9357148956136556, + "language_loss": 0.73959994, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76104718, + "num_input_tokens_seen": 152280580, + "step": 7093, + "time_per_iteration": 2.565652847290039 + }, + { + "auxiliary_loss_clip": 0.01111192, + "auxiliary_loss_mlp": 0.01032411, + "balance_loss_clip": 1.04122472, + "balance_loss_mlp": 1.01816523, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 3.5015335829004868, + "language_loss": 0.6971246, + "learning_rate": 2.565282332284532e-06, + "loss": 0.71856064, + "num_input_tokens_seen": 152298455, + "step": 7094, + "time_per_iteration": 2.4964139461517334 + }, + { + "auxiliary_loss_clip": 0.01087608, + "auxiliary_loss_mlp": 0.01034851, + "balance_loss_clip": 1.04272807, + "balance_loss_mlp": 1.02082586, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.5966982751996088, + "language_loss": 0.81757629, + "learning_rate": 2.564908739909464e-06, + "loss": 0.83880091, + "num_input_tokens_seen": 152316995, + "step": 7095, + "time_per_iteration": 2.546546697616577 + }, + { + "auxiliary_loss_clip": 0.01122847, + "auxiliary_loss_mlp": 0.01042299, + "balance_loss_clip": 1.04306161, + "balance_loss_mlp": 1.02817321, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 2.324569006395003, + "language_loss": 0.80066139, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82231283, + "num_input_tokens_seen": 152334800, + "step": 7096, + "time_per_iteration": 2.4624435901641846 + }, + { + "auxiliary_loss_clip": 0.01116475, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.04503489, + "balance_loss_mlp": 1.02145362, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 1.9758945759903384, + "language_loss": 0.65559256, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67711484, + "num_input_tokens_seen": 152355175, + "step": 7097, + "time_per_iteration": 2.5658419132232666 + }, + { + "auxiliary_loss_clip": 0.01097219, + "auxiliary_loss_mlp": 0.0103032, + "balance_loss_clip": 1.04368246, + "balance_loss_mlp": 1.01668847, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.8914528928860685, + "language_loss": 0.74134886, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76262432, + "num_input_tokens_seen": 152377245, + "step": 7098, + "time_per_iteration": 2.5605335235595703 + }, + { + "auxiliary_loss_clip": 0.01108957, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.04150581, + "balance_loss_mlp": 1.01761043, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.7248997144902938, + "language_loss": 0.75221795, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77361625, + "num_input_tokens_seen": 152396985, + "step": 7099, + "time_per_iteration": 2.5321266651153564 + }, + { + "auxiliary_loss_clip": 0.0110132, + "auxiliary_loss_mlp": 0.0103901, + "balance_loss_clip": 1.0418067, + "balance_loss_mlp": 1.0245862, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 2.099813860441557, + "language_loss": 0.82643461, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.84783792, + "num_input_tokens_seen": 152415590, + "step": 7100, + "time_per_iteration": 2.5286149978637695 + }, + { + "auxiliary_loss_clip": 0.01101945, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.04224038, + "balance_loss_mlp": 1.01757765, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.7806678964552969, + "language_loss": 0.8222096, + "learning_rate": 2.562666736305627e-06, + "loss": 0.8435387, + "num_input_tokens_seen": 152436735, + "step": 7101, + "time_per_iteration": 2.5608038902282715 + }, + { + "auxiliary_loss_clip": 0.01126285, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.04487562, + "balance_loss_mlp": 1.01935351, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 1.8543926593252629, + "language_loss": 0.7234143, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.74501568, + "num_input_tokens_seen": 152455685, + "step": 7102, + "time_per_iteration": 3.832385778427124 + }, + { + "auxiliary_loss_clip": 0.01110354, + "auxiliary_loss_mlp": 0.01032876, + "balance_loss_clip": 1.043136, + "balance_loss_mlp": 1.01948881, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 1.7767134499951431, + "language_loss": 0.82566017, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.84709251, + "num_input_tokens_seen": 152473500, + "step": 7103, + "time_per_iteration": 2.4593327045440674 + }, + { + "auxiliary_loss_clip": 0.01097537, + "auxiliary_loss_mlp": 0.01045843, + "balance_loss_clip": 1.03955388, + "balance_loss_mlp": 1.02911782, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 1.9885747918755825, + "language_loss": 0.73549271, + "learning_rate": 2.561545446271294e-06, + "loss": 0.75692654, + "num_input_tokens_seen": 152491320, + "step": 7104, + "time_per_iteration": 2.4987401962280273 + }, + { + "auxiliary_loss_clip": 0.01107817, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.04230475, + "balance_loss_mlp": 1.02261162, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 2.246278937984321, + "language_loss": 0.75245345, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77389878, + "num_input_tokens_seen": 152511970, + "step": 7105, + "time_per_iteration": 3.954803466796875 + }, + { + "auxiliary_loss_clip": 0.01126078, + "auxiliary_loss_mlp": 0.01031517, + "balance_loss_clip": 1.04572642, + "balance_loss_mlp": 1.01855326, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 2.1524027328449438, + "language_loss": 0.76927686, + "learning_rate": 2.560797813088819e-06, + "loss": 0.79085284, + "num_input_tokens_seen": 152530515, + "step": 7106, + "time_per_iteration": 2.4423468112945557 + }, + { + "auxiliary_loss_clip": 0.01100343, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.04261899, + "balance_loss_mlp": 1.01882124, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.8901180011416496, + "language_loss": 0.80246234, + "learning_rate": 2.560423964592229e-06, + "loss": 0.82378769, + "num_input_tokens_seen": 152549295, + "step": 7107, + "time_per_iteration": 2.5143795013427734 + }, + { + "auxiliary_loss_clip": 0.01076685, + "auxiliary_loss_mlp": 0.01037212, + "balance_loss_clip": 1.04397619, + "balance_loss_mlp": 1.02301979, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.3439445640469168, + "language_loss": 0.68042636, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70156533, + "num_input_tokens_seen": 152570725, + "step": 7108, + "time_per_iteration": 2.6578423976898193 + }, + { + "auxiliary_loss_clip": 0.01103648, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.04338205, + "balance_loss_mlp": 1.02371001, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.7154359516174509, + "language_loss": 0.712309, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.7337147, + "num_input_tokens_seen": 152588950, + "step": 7109, + "time_per_iteration": 3.881704568862915 + }, + { + "auxiliary_loss_clip": 0.01110112, + "auxiliary_loss_mlp": 0.01035438, + "balance_loss_clip": 1.04170704, + "balance_loss_mlp": 1.01920152, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 6.044346281657514, + "language_loss": 0.647493, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66894847, + "num_input_tokens_seen": 152608965, + "step": 7110, + "time_per_iteration": 3.9058444499969482 + }, + { + "auxiliary_loss_clip": 0.01124197, + "auxiliary_loss_mlp": 0.00789898, + "balance_loss_clip": 1.04377913, + "balance_loss_mlp": 1.01349866, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 2.4068797476022916, + "language_loss": 0.76462936, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78377032, + "num_input_tokens_seen": 152630220, + "step": 7111, + "time_per_iteration": 2.502591609954834 + }, + { + "auxiliary_loss_clip": 0.01092769, + "auxiliary_loss_mlp": 0.01029836, + "balance_loss_clip": 1.04361737, + "balance_loss_mlp": 1.01548958, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 1.6874259178605695, + "language_loss": 0.72963029, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75085628, + "num_input_tokens_seen": 152648835, + "step": 7112, + "time_per_iteration": 2.524606943130493 + }, + { + "auxiliary_loss_clip": 0.01095108, + "auxiliary_loss_mlp": 0.01039567, + "balance_loss_clip": 1.04049313, + "balance_loss_mlp": 1.02612591, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.815185376515291, + "language_loss": 0.71475184, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.73609853, + "num_input_tokens_seen": 152668375, + "step": 7113, + "time_per_iteration": 2.5288755893707275 + }, + { + "auxiliary_loss_clip": 0.01117542, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.04764462, + "balance_loss_mlp": 1.02440584, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 1.7222820990261507, + "language_loss": 0.62053466, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.64209676, + "num_input_tokens_seen": 152689725, + "step": 7114, + "time_per_iteration": 2.5182688236236572 + }, + { + "auxiliary_loss_clip": 0.01118845, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.04659581, + "balance_loss_mlp": 1.02494121, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 2.5162375368993906, + "language_loss": 0.64796162, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.66956222, + "num_input_tokens_seen": 152709375, + "step": 7115, + "time_per_iteration": 2.5169191360473633 + }, + { + "auxiliary_loss_clip": 0.01097487, + "auxiliary_loss_mlp": 0.01035211, + "balance_loss_clip": 1.04078865, + "balance_loss_mlp": 1.02191877, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.4806147530669242, + "language_loss": 0.73880363, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.76013064, + "num_input_tokens_seen": 152727510, + "step": 7116, + "time_per_iteration": 2.502995729446411 + }, + { + "auxiliary_loss_clip": 0.01094051, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.03930879, + "balance_loss_mlp": 1.02468276, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 1.599206316013892, + "language_loss": 0.69298828, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.71430469, + "num_input_tokens_seen": 152746670, + "step": 7117, + "time_per_iteration": 2.566161870956421 + }, + { + "auxiliary_loss_clip": 0.01099804, + "auxiliary_loss_mlp": 0.01038877, + "balance_loss_clip": 1.04556727, + "balance_loss_mlp": 1.02510238, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 2.1870373064494184, + "language_loss": 0.70131552, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.72270238, + "num_input_tokens_seen": 152760545, + "step": 7118, + "time_per_iteration": 2.5052826404571533 + }, + { + "auxiliary_loss_clip": 0.0108544, + "auxiliary_loss_mlp": 0.01045679, + "balance_loss_clip": 1.04267478, + "balance_loss_mlp": 1.03042006, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 1.7854188117569785, + "language_loss": 0.74760103, + "learning_rate": 2.55593612908444e-06, + "loss": 0.7689122, + "num_input_tokens_seen": 152780970, + "step": 7119, + "time_per_iteration": 2.657177448272705 + }, + { + "auxiliary_loss_clip": 0.01060564, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.04095244, + "balance_loss_mlp": 1.01851881, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 2.205306109147953, + "language_loss": 0.7465024, + "learning_rate": 2.555562005426573e-06, + "loss": 0.76743305, + "num_input_tokens_seen": 152798475, + "step": 7120, + "time_per_iteration": 2.577143669128418 + }, + { + "auxiliary_loss_clip": 0.01100396, + "auxiliary_loss_mlp": 0.00786896, + "balance_loss_clip": 1.04440749, + "balance_loss_mlp": 1.00939274, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 2.4938477814880002, + "language_loss": 0.77051002, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.78938293, + "num_input_tokens_seen": 152817555, + "step": 7121, + "time_per_iteration": 2.542776584625244 + }, + { + "auxiliary_loss_clip": 0.01101544, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.04453766, + "balance_loss_mlp": 1.02216172, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 1.9211332540588129, + "language_loss": 0.85617977, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87754351, + "num_input_tokens_seen": 152836295, + "step": 7122, + "time_per_iteration": 2.508695363998413 + }, + { + "auxiliary_loss_clip": 0.01070032, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.03951693, + "balance_loss_mlp": 1.01971269, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 1.6602051492569085, + "language_loss": 0.80760944, + "learning_rate": 2.554439508107921e-06, + "loss": 0.82864887, + "num_input_tokens_seen": 152854950, + "step": 7123, + "time_per_iteration": 2.601229190826416 + }, + { + "auxiliary_loss_clip": 0.01083572, + "auxiliary_loss_mlp": 0.0103561, + "balance_loss_clip": 1.04765856, + "balance_loss_mlp": 1.02213979, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 1.626677010525872, + "language_loss": 0.80615515, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.82734692, + "num_input_tokens_seen": 152873995, + "step": 7124, + "time_per_iteration": 2.5472359657287598 + }, + { + "auxiliary_loss_clip": 0.01113052, + "auxiliary_loss_mlp": 0.01038273, + "balance_loss_clip": 1.04505038, + "balance_loss_mlp": 1.02438545, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 1.659274243374459, + "language_loss": 0.8032515, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82476479, + "num_input_tokens_seen": 152892925, + "step": 7125, + "time_per_iteration": 2.475978374481201 + }, + { + "auxiliary_loss_clip": 0.01121633, + "auxiliary_loss_mlp": 0.00787718, + "balance_loss_clip": 1.04536796, + "balance_loss_mlp": 1.01077163, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 1.8285167548858474, + "language_loss": 0.74959713, + "learning_rate": 2.553316821569659e-06, + "loss": 0.76869065, + "num_input_tokens_seen": 152910935, + "step": 7126, + "time_per_iteration": 2.442854881286621 + }, + { + "auxiliary_loss_clip": 0.0111154, + "auxiliary_loss_mlp": 0.0103346, + "balance_loss_clip": 1.04463339, + "balance_loss_mlp": 1.01941705, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 1.7013304390131359, + "language_loss": 0.81430924, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83575916, + "num_input_tokens_seen": 152931030, + "step": 7127, + "time_per_iteration": 2.4904518127441406 + }, + { + "auxiliary_loss_clip": 0.0107768, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.04424167, + "balance_loss_mlp": 1.02104712, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 1.8052341028308765, + "language_loss": 0.76152551, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.7826522, + "num_input_tokens_seen": 152948085, + "step": 7128, + "time_per_iteration": 2.560262680053711 + }, + { + "auxiliary_loss_clip": 0.0107855, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.04392362, + "balance_loss_mlp": 1.01606679, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 1.8723746445999567, + "language_loss": 0.74218053, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76326919, + "num_input_tokens_seen": 152966265, + "step": 7129, + "time_per_iteration": 2.5935497283935547 + }, + { + "auxiliary_loss_clip": 0.01116525, + "auxiliary_loss_mlp": 0.00788173, + "balance_loss_clip": 1.04926801, + "balance_loss_mlp": 1.01015127, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 1.8165673603095516, + "language_loss": 0.78034747, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79939449, + "num_input_tokens_seen": 152986775, + "step": 7130, + "time_per_iteration": 2.5303232669830322 + }, + { + "auxiliary_loss_clip": 0.01105335, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.04727924, + "balance_loss_mlp": 1.02055192, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 2.1265040698941378, + "language_loss": 0.73839617, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75979769, + "num_input_tokens_seen": 153003595, + "step": 7131, + "time_per_iteration": 2.4886181354522705 + }, + { + "auxiliary_loss_clip": 0.01107674, + "auxiliary_loss_mlp": 0.01036924, + "balance_loss_clip": 1.04755306, + "balance_loss_mlp": 1.02246952, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 2.2220991670556933, + "language_loss": 0.77415675, + "learning_rate": 2.551070882366973e-06, + "loss": 0.79560274, + "num_input_tokens_seen": 153021960, + "step": 7132, + "time_per_iteration": 2.495535373687744 + }, + { + "auxiliary_loss_clip": 0.01087886, + "auxiliary_loss_mlp": 0.00789746, + "balance_loss_clip": 1.04757309, + "balance_loss_mlp": 1.01407492, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 1.7216307097934347, + "language_loss": 0.78607982, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80485618, + "num_input_tokens_seen": 153042110, + "step": 7133, + "time_per_iteration": 2.6045432090759277 + }, + { + "auxiliary_loss_clip": 0.01099091, + "auxiliary_loss_mlp": 0.0103573, + "balance_loss_clip": 1.04480028, + "balance_loss_mlp": 1.0220921, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 1.7616881695001934, + "language_loss": 0.7514056, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77275383, + "num_input_tokens_seen": 153058925, + "step": 7134, + "time_per_iteration": 2.497849702835083 + }, + { + "auxiliary_loss_clip": 0.01104864, + "auxiliary_loss_mlp": 0.01040039, + "balance_loss_clip": 1.04079866, + "balance_loss_mlp": 1.0261873, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 1.8838177514168155, + "language_loss": 0.83804458, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.85949361, + "num_input_tokens_seen": 153078070, + "step": 7135, + "time_per_iteration": 2.457148313522339 + }, + { + "auxiliary_loss_clip": 0.01054817, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.04291189, + "balance_loss_mlp": 1.02332091, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 1.956002903236043, + "language_loss": 0.75080013, + "learning_rate": 2.549573171442666e-06, + "loss": 0.77173638, + "num_input_tokens_seen": 153096680, + "step": 7136, + "time_per_iteration": 2.6529407501220703 + }, + { + "auxiliary_loss_clip": 0.01108915, + "auxiliary_loss_mlp": 0.01032643, + "balance_loss_clip": 1.04487419, + "balance_loss_mlp": 1.01918483, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 1.8879144840989626, + "language_loss": 0.78816801, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.80958354, + "num_input_tokens_seen": 153113305, + "step": 7137, + "time_per_iteration": 2.485599994659424 + }, + { + "auxiliary_loss_clip": 0.01128096, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.04730165, + "balance_loss_mlp": 1.02031302, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 1.858696512575758, + "language_loss": 0.76639199, + "learning_rate": 2.548824190884499e-06, + "loss": 0.788019, + "num_input_tokens_seen": 153132735, + "step": 7138, + "time_per_iteration": 2.4957005977630615 + }, + { + "auxiliary_loss_clip": 0.0103868, + "auxiliary_loss_mlp": 0.01007225, + "balance_loss_clip": 1.03116024, + "balance_loss_mlp": 1.00537705, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 1.0765855541557439, + "language_loss": 0.56234348, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58280253, + "num_input_tokens_seen": 153187925, + "step": 7139, + "time_per_iteration": 3.0154480934143066 + }, + { + "auxiliary_loss_clip": 0.01120193, + "auxiliary_loss_mlp": 0.0078996, + "balance_loss_clip": 1.04496658, + "balance_loss_mlp": 1.01529121, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 1.7105802154765224, + "language_loss": 0.8082459, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.82734746, + "num_input_tokens_seen": 153206990, + "step": 7140, + "time_per_iteration": 2.4906117916107178 + }, + { + "auxiliary_loss_clip": 0.01112576, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_clip": 1.0448724, + "balance_loss_mlp": 1.01877081, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 1.9571774577489969, + "language_loss": 0.82073557, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84219128, + "num_input_tokens_seen": 153222345, + "step": 7141, + "time_per_iteration": 3.8308210372924805 + }, + { + "auxiliary_loss_clip": 0.01122228, + "auxiliary_loss_mlp": 0.01038097, + "balance_loss_clip": 1.04784489, + "balance_loss_mlp": 1.02315974, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 1.6991495671669412, + "language_loss": 0.86448872, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88609195, + "num_input_tokens_seen": 153240570, + "step": 7142, + "time_per_iteration": 2.520124912261963 + }, + { + "auxiliary_loss_clip": 0.01100385, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.04874921, + "balance_loss_mlp": 1.01753068, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 1.8577434483713888, + "language_loss": 0.78532672, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80663884, + "num_input_tokens_seen": 153259575, + "step": 7143, + "time_per_iteration": 3.9784836769104004 + }, + { + "auxiliary_loss_clip": 0.01074613, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.0416522, + "balance_loss_mlp": 1.02425051, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 1.9266998326249292, + "language_loss": 0.7714259, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.79256046, + "num_input_tokens_seen": 153276650, + "step": 7144, + "time_per_iteration": 2.5507609844207764 + }, + { + "auxiliary_loss_clip": 0.01090722, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.04061556, + "balance_loss_mlp": 1.01506734, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 1.7250274801367815, + "language_loss": 0.73375738, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75495833, + "num_input_tokens_seen": 153298025, + "step": 7145, + "time_per_iteration": 2.593904495239258 + }, + { + "auxiliary_loss_clip": 0.0111334, + "auxiliary_loss_mlp": 0.01035864, + "balance_loss_clip": 1.04478025, + "balance_loss_mlp": 1.02149951, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 2.0973330237123204, + "language_loss": 0.7935003, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81499237, + "num_input_tokens_seen": 153315775, + "step": 7146, + "time_per_iteration": 2.4782676696777344 + }, + { + "auxiliary_loss_clip": 0.01108198, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.0430212, + "balance_loss_mlp": 1.01970065, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 3.7010135025747695, + "language_loss": 0.82933509, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85073906, + "num_input_tokens_seen": 153332765, + "step": 7147, + "time_per_iteration": 3.8305106163024902 + }, + { + "auxiliary_loss_clip": 0.01111357, + "auxiliary_loss_mlp": 0.01033832, + "balance_loss_clip": 1.04784048, + "balance_loss_mlp": 1.01848936, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 2.998023342546849, + "language_loss": 0.87718254, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89863443, + "num_input_tokens_seen": 153350760, + "step": 7148, + "time_per_iteration": 3.873980760574341 + }, + { + "auxiliary_loss_clip": 0.01104989, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.04500723, + "balance_loss_mlp": 1.02057207, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.5739212943043868, + "language_loss": 0.77799404, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.79938769, + "num_input_tokens_seen": 153370765, + "step": 7149, + "time_per_iteration": 2.567481756210327 + }, + { + "auxiliary_loss_clip": 0.01080543, + "auxiliary_loss_mlp": 0.01033198, + "balance_loss_clip": 1.04116702, + "balance_loss_mlp": 1.01875019, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 1.687368369001774, + "language_loss": 0.79722023, + "learning_rate": 2.544328563349256e-06, + "loss": 0.81835765, + "num_input_tokens_seen": 153390725, + "step": 7150, + "time_per_iteration": 2.577098846435547 + }, + { + "auxiliary_loss_clip": 0.0111714, + "auxiliary_loss_mlp": 0.0103892, + "balance_loss_clip": 1.04702568, + "balance_loss_mlp": 1.02308869, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.5599140326004195, + "language_loss": 0.74846035, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.77002096, + "num_input_tokens_seen": 153408010, + "step": 7151, + "time_per_iteration": 2.4604878425598145 + }, + { + "auxiliary_loss_clip": 0.01078867, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.04111302, + "balance_loss_mlp": 1.01970434, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 1.9132899478263063, + "language_loss": 0.70264828, + "learning_rate": 2.543579002456406e-06, + "loss": 0.7237817, + "num_input_tokens_seen": 153426865, + "step": 7152, + "time_per_iteration": 2.5325629711151123 + }, + { + "auxiliary_loss_clip": 0.01101138, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.03932214, + "balance_loss_mlp": 1.01885068, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 1.836918484590717, + "language_loss": 0.71012735, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73146158, + "num_input_tokens_seen": 153449410, + "step": 7153, + "time_per_iteration": 2.6170499324798584 + }, + { + "auxiliary_loss_clip": 0.01106755, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.04092097, + "balance_loss_mlp": 1.01784635, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 3.304614650439234, + "language_loss": 0.78413135, + "learning_rate": 2.542829359113276e-06, + "loss": 0.80552375, + "num_input_tokens_seen": 153467910, + "step": 7154, + "time_per_iteration": 2.4738783836364746 + }, + { + "auxiliary_loss_clip": 0.01087922, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.03980148, + "balance_loss_mlp": 1.02077317, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.5850342352773876, + "language_loss": 0.7869277, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80815303, + "num_input_tokens_seen": 153487100, + "step": 7155, + "time_per_iteration": 2.511211395263672 + }, + { + "auxiliary_loss_clip": 0.01094572, + "auxiliary_loss_mlp": 0.01027039, + "balance_loss_clip": 1.04318476, + "balance_loss_mlp": 1.01397943, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 1.8682137433677013, + "language_loss": 0.88685691, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.90807307, + "num_input_tokens_seen": 153505565, + "step": 7156, + "time_per_iteration": 2.547166347503662 + }, + { + "auxiliary_loss_clip": 0.01124095, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.04390931, + "balance_loss_mlp": 1.01780868, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 2.1345554806324767, + "language_loss": 0.83028519, + "learning_rate": 2.541704739753042e-06, + "loss": 0.85184836, + "num_input_tokens_seen": 153526130, + "step": 7157, + "time_per_iteration": 2.491027593612671 + }, + { + "auxiliary_loss_clip": 0.01127411, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.04642749, + "balance_loss_mlp": 1.01760471, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 1.87283904148046, + "language_loss": 0.71698451, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.73857665, + "num_input_tokens_seen": 153546370, + "step": 7158, + "time_per_iteration": 2.4837417602539062 + }, + { + "auxiliary_loss_clip": 0.01108912, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.0419035, + "balance_loss_mlp": 1.0183804, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 1.8570494064800376, + "language_loss": 0.82598031, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.847386, + "num_input_tokens_seen": 153562800, + "step": 7159, + "time_per_iteration": 2.4435482025146484 + }, + { + "auxiliary_loss_clip": 0.01094612, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.04147649, + "balance_loss_mlp": 1.02003527, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.250356178031571, + "language_loss": 0.82969081, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85098016, + "num_input_tokens_seen": 153578395, + "step": 7160, + "time_per_iteration": 2.5094876289367676 + }, + { + "auxiliary_loss_clip": 0.01117857, + "auxiliary_loss_mlp": 0.01034785, + "balance_loss_clip": 1.0469799, + "balance_loss_mlp": 1.01918089, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 2.120379927808996, + "language_loss": 0.7697947, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79132116, + "num_input_tokens_seen": 153596880, + "step": 7161, + "time_per_iteration": 2.4977798461914062 + }, + { + "auxiliary_loss_clip": 0.01109185, + "auxiliary_loss_mlp": 0.01036649, + "balance_loss_clip": 1.04250979, + "balance_loss_mlp": 1.0226717, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 1.9482475378157686, + "language_loss": 0.7312752, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75273353, + "num_input_tokens_seen": 153616570, + "step": 7162, + "time_per_iteration": 2.5267441272735596 + }, + { + "auxiliary_loss_clip": 0.01014414, + "auxiliary_loss_mlp": 0.00795148, + "balance_loss_clip": 1.02664149, + "balance_loss_mlp": 1.05112648, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.8003441778444421, + "language_loss": 0.59015006, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.60824573, + "num_input_tokens_seen": 153671450, + "step": 7163, + "time_per_iteration": 3.0540506839752197 + }, + { + "auxiliary_loss_clip": 0.01093211, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.04007876, + "balance_loss_mlp": 1.02022862, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.6421500291672633, + "language_loss": 0.7923162, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81359208, + "num_input_tokens_seen": 153691405, + "step": 7164, + "time_per_iteration": 2.5857553482055664 + }, + { + "auxiliary_loss_clip": 0.01124144, + "auxiliary_loss_mlp": 0.01038097, + "balance_loss_clip": 1.04244149, + "balance_loss_mlp": 1.02400661, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 2.0515474659848483, + "language_loss": 0.67857838, + "learning_rate": 2.538704852009177e-06, + "loss": 0.7002008, + "num_input_tokens_seen": 153711555, + "step": 7165, + "time_per_iteration": 2.5039310455322266 + }, + { + "auxiliary_loss_clip": 0.01096591, + "auxiliary_loss_mlp": 0.00798202, + "balance_loss_clip": 1.04564536, + "balance_loss_mlp": 1.02970409, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 2.0743995676077884, + "language_loss": 0.75120312, + "learning_rate": 2.538329773967034e-06, + "loss": 0.77015102, + "num_input_tokens_seen": 153730095, + "step": 7166, + "time_per_iteration": 2.5098583698272705 + }, + { + "auxiliary_loss_clip": 0.01110317, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.04494476, + "balance_loss_mlp": 1.02305102, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.6546633075900048, + "language_loss": 0.71741199, + "learning_rate": 2.537954675511372e-06, + "loss": 0.73887068, + "num_input_tokens_seen": 153749320, + "step": 7167, + "time_per_iteration": 2.5271265506744385 + }, + { + "auxiliary_loss_clip": 0.01092451, + "auxiliary_loss_mlp": 0.00805582, + "balance_loss_clip": 1.04476988, + "balance_loss_mlp": 1.04600835, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.4862017344561447, + "language_loss": 0.78267533, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80165565, + "num_input_tokens_seen": 153767825, + "step": 7168, + "time_per_iteration": 2.527155637741089 + }, + { + "auxiliary_loss_clip": 0.01101806, + "auxiliary_loss_mlp": 0.01038104, + "balance_loss_clip": 1.04676914, + "balance_loss_mlp": 1.02418673, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 2.595748750562116, + "language_loss": 0.82557547, + "learning_rate": 2.537204417416387e-06, + "loss": 0.84697461, + "num_input_tokens_seen": 153785350, + "step": 7169, + "time_per_iteration": 2.5173392295837402 + }, + { + "auxiliary_loss_clip": 0.01033771, + "auxiliary_loss_mlp": 0.01003736, + "balance_loss_clip": 1.02829099, + "balance_loss_mlp": 1.001912, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.6715431690984631, + "language_loss": 0.6075722, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.62794733, + "num_input_tokens_seen": 153856400, + "step": 7170, + "time_per_iteration": 3.2644782066345215 + }, + { + "auxiliary_loss_clip": 0.01122283, + "auxiliary_loss_mlp": 0.01031623, + "balance_loss_clip": 1.04439783, + "balance_loss_mlp": 1.01880193, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 1.7640497630991898, + "language_loss": 0.75683016, + "learning_rate": 2.536454077838021e-06, + "loss": 0.77836919, + "num_input_tokens_seen": 153875230, + "step": 7171, + "time_per_iteration": 2.470296859741211 + }, + { + "auxiliary_loss_clip": 0.01111597, + "auxiliary_loss_mlp": 0.01034876, + "balance_loss_clip": 1.04552102, + "balance_loss_mlp": 1.02180505, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.6202151455132376, + "language_loss": 0.77508837, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.79655302, + "num_input_tokens_seen": 153894740, + "step": 7172, + "time_per_iteration": 2.5180022716522217 + }, + { + "auxiliary_loss_clip": 0.01097751, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.04301763, + "balance_loss_mlp": 1.02635837, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 1.5766275580741786, + "language_loss": 0.76361495, + "learning_rate": 2.535703656890086e-06, + "loss": 0.78502107, + "num_input_tokens_seen": 153913230, + "step": 7173, + "time_per_iteration": 2.517632484436035 + }, + { + "auxiliary_loss_clip": 0.0112241, + "auxiliary_loss_mlp": 0.00806646, + "balance_loss_clip": 1.04486108, + "balance_loss_mlp": 1.04437518, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.4424011181760175, + "language_loss": 0.77093816, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79022872, + "num_input_tokens_seen": 153933250, + "step": 7174, + "time_per_iteration": 2.4716413021087646 + }, + { + "auxiliary_loss_clip": 0.01124439, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.04388833, + "balance_loss_mlp": 1.01875937, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.5706056318743462, + "language_loss": 0.8214879, + "learning_rate": 2.534953154686407e-06, + "loss": 0.84306836, + "num_input_tokens_seen": 153951325, + "step": 7175, + "time_per_iteration": 2.469491958618164 + }, + { + "auxiliary_loss_clip": 0.01081358, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_clip": 1.04169416, + "balance_loss_mlp": 1.02723098, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.067525967973552, + "language_loss": 0.7475543, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.76880938, + "num_input_tokens_seen": 153966975, + "step": 7176, + "time_per_iteration": 2.5579540729522705 + }, + { + "auxiliary_loss_clip": 0.0111476, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.04381537, + "balance_loss_mlp": 1.01969481, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.5425117933515733, + "language_loss": 0.73601735, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75749886, + "num_input_tokens_seen": 153986695, + "step": 7177, + "time_per_iteration": 2.525585651397705 + }, + { + "auxiliary_loss_clip": 0.01110452, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.04464245, + "balance_loss_mlp": 1.02060223, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 1.7804819539848253, + "language_loss": 0.81335664, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83482939, + "num_input_tokens_seen": 154004710, + "step": 7178, + "time_per_iteration": 2.5620601177215576 + }, + { + "auxiliary_loss_clip": 0.01098424, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.04353714, + "balance_loss_mlp": 1.01758277, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.4989036552721977, + "language_loss": 0.84284705, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86413819, + "num_input_tokens_seen": 154024320, + "step": 7179, + "time_per_iteration": 2.591660976409912 + }, + { + "auxiliary_loss_clip": 0.01099284, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.0425458, + "balance_loss_mlp": 1.01597476, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 1.7395509246632255, + "language_loss": 0.75356442, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77485156, + "num_input_tokens_seen": 154041755, + "step": 7180, + "time_per_iteration": 3.897235870361328 + }, + { + "auxiliary_loss_clip": 0.01097874, + "auxiliary_loss_mlp": 0.00793993, + "balance_loss_clip": 1.04047084, + "balance_loss_mlp": 1.01483476, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 1.8740179454287824, + "language_loss": 0.81834078, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.83725941, + "num_input_tokens_seen": 154056775, + "step": 7181, + "time_per_iteration": 2.4960196018218994 + }, + { + "auxiliary_loss_clip": 0.01111081, + "auxiliary_loss_mlp": 0.01036639, + "balance_loss_clip": 1.04785466, + "balance_loss_mlp": 1.0210166, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.7964108726793857, + "language_loss": 0.88705325, + "learning_rate": 2.532325758728165e-06, + "loss": 0.90853047, + "num_input_tokens_seen": 154075015, + "step": 7182, + "time_per_iteration": 3.9089596271514893 + }, + { + "auxiliary_loss_clip": 0.01111837, + "auxiliary_loss_mlp": 0.00791525, + "balance_loss_clip": 1.04494095, + "balance_loss_mlp": 1.01766801, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.819828211794466, + "language_loss": 0.75964844, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77868205, + "num_input_tokens_seen": 154095170, + "step": 7183, + "time_per_iteration": 2.503681182861328 + }, + { + "auxiliary_loss_clip": 0.0111132, + "auxiliary_loss_mlp": 0.01032347, + "balance_loss_clip": 1.04361022, + "balance_loss_mlp": 1.01864958, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 1.5505146324222656, + "language_loss": 0.7751857, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.79662228, + "num_input_tokens_seen": 154116895, + "step": 7184, + "time_per_iteration": 2.5433454513549805 + }, + { + "auxiliary_loss_clip": 0.01096725, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.04481542, + "balance_loss_mlp": 1.01968527, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 2.090514327419112, + "language_loss": 0.73300678, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75430316, + "num_input_tokens_seen": 154138395, + "step": 7185, + "time_per_iteration": 2.5992045402526855 + }, + { + "auxiliary_loss_clip": 0.01108157, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.04424, + "balance_loss_mlp": 1.02132654, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.5693468084136244, + "language_loss": 0.75209564, + "learning_rate": 2.530823945207421e-06, + "loss": 0.77352923, + "num_input_tokens_seen": 154156775, + "step": 7186, + "time_per_iteration": 3.9048914909362793 + }, + { + "auxiliary_loss_clip": 0.01090026, + "auxiliary_loss_mlp": 0.01034374, + "balance_loss_clip": 1.04436207, + "balance_loss_mlp": 1.02065897, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 2.7389551437482753, + "language_loss": 0.76167178, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78291583, + "num_input_tokens_seen": 154177500, + "step": 7187, + "time_per_iteration": 4.000559091567993 + }, + { + "auxiliary_loss_clip": 0.01019829, + "auxiliary_loss_mlp": 0.01000351, + "balance_loss_clip": 1.02937591, + "balance_loss_mlp": 0.99861068, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8579279674889612, + "language_loss": 0.68133634, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70153815, + "num_input_tokens_seen": 154237110, + "step": 7188, + "time_per_iteration": 3.1861555576324463 + }, + { + "auxiliary_loss_clip": 0.01092769, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.04440522, + "balance_loss_mlp": 1.02030468, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 1.9335668276854072, + "language_loss": 0.77981639, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80107963, + "num_input_tokens_seen": 154253910, + "step": 7189, + "time_per_iteration": 2.502847671508789 + }, + { + "auxiliary_loss_clip": 0.01079544, + "auxiliary_loss_mlp": 0.01040915, + "balance_loss_clip": 1.04201579, + "balance_loss_mlp": 1.02616882, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 1.9130008486644403, + "language_loss": 0.71499145, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73619598, + "num_input_tokens_seen": 154274770, + "step": 7190, + "time_per_iteration": 2.6052281856536865 + }, + { + "auxiliary_loss_clip": 0.01097351, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.04134679, + "balance_loss_mlp": 1.01852703, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.4172437169133458, + "language_loss": 0.79497623, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81626809, + "num_input_tokens_seen": 154295035, + "step": 7191, + "time_per_iteration": 2.586731433868408 + }, + { + "auxiliary_loss_clip": 0.01076263, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.043823, + "balance_loss_mlp": 1.01934791, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.5758543933735882, + "language_loss": 0.74813598, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.76922071, + "num_input_tokens_seen": 154314905, + "step": 7192, + "time_per_iteration": 2.6035068035125732 + }, + { + "auxiliary_loss_clip": 0.01079055, + "auxiliary_loss_mlp": 0.01037027, + "balance_loss_clip": 1.04351163, + "balance_loss_mlp": 1.02223349, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 1.7628846021016014, + "language_loss": 0.78801572, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.80917656, + "num_input_tokens_seen": 154331740, + "step": 7193, + "time_per_iteration": 2.574690580368042 + }, + { + "auxiliary_loss_clip": 0.01102748, + "auxiliary_loss_mlp": 0.01039485, + "balance_loss_clip": 1.04245758, + "balance_loss_mlp": 1.02568698, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.8062278229436828, + "language_loss": 0.75939, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.78081238, + "num_input_tokens_seen": 154348740, + "step": 7194, + "time_per_iteration": 2.5526301860809326 + }, + { + "auxiliary_loss_clip": 0.0112613, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.04784608, + "balance_loss_mlp": 1.02234197, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 2.0603285712147366, + "language_loss": 0.59653878, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.61815989, + "num_input_tokens_seen": 154368835, + "step": 7195, + "time_per_iteration": 2.497948408126831 + }, + { + "auxiliary_loss_clip": 0.01102037, + "auxiliary_loss_mlp": 0.01038858, + "balance_loss_clip": 1.04349875, + "balance_loss_mlp": 1.02360535, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 1.8536712300888893, + "language_loss": 0.64990151, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67131042, + "num_input_tokens_seen": 154384620, + "step": 7196, + "time_per_iteration": 2.5159857273101807 + }, + { + "auxiliary_loss_clip": 0.01127053, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.04549551, + "balance_loss_mlp": 1.02337718, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 2.434843124549809, + "language_loss": 0.72349226, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74513781, + "num_input_tokens_seen": 154402865, + "step": 7197, + "time_per_iteration": 2.466332197189331 + }, + { + "auxiliary_loss_clip": 0.0111073, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.04610145, + "balance_loss_mlp": 1.02523851, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.4863479526756327, + "language_loss": 0.72633946, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.74783838, + "num_input_tokens_seen": 154423625, + "step": 7198, + "time_per_iteration": 2.5538575649261475 + }, + { + "auxiliary_loss_clip": 0.01085396, + "auxiliary_loss_mlp": 0.01030339, + "balance_loss_clip": 1.04314554, + "balance_loss_mlp": 1.01715422, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.3634782753867378, + "language_loss": 0.80892956, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83008695, + "num_input_tokens_seen": 154444775, + "step": 7199, + "time_per_iteration": 2.6172633171081543 + }, + { + "auxiliary_loss_clip": 0.01104637, + "auxiliary_loss_mlp": 0.01033394, + "balance_loss_clip": 1.04460204, + "balance_loss_mlp": 1.02031112, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 2.1799005691921174, + "language_loss": 0.69033992, + "learning_rate": 2.525565067625286e-06, + "loss": 0.71172023, + "num_input_tokens_seen": 154460815, + "step": 7200, + "time_per_iteration": 2.5478813648223877 + }, + { + "auxiliary_loss_clip": 0.01101394, + "auxiliary_loss_mlp": 0.00787796, + "balance_loss_clip": 1.04485786, + "balance_loss_mlp": 1.01040733, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 1.820368454828695, + "language_loss": 0.86980492, + "learning_rate": 2.525189283578157e-06, + "loss": 0.88869685, + "num_input_tokens_seen": 154479145, + "step": 7201, + "time_per_iteration": 2.6181232929229736 + }, + { + "auxiliary_loss_clip": 0.01076141, + "auxiliary_loss_mlp": 0.01036538, + "balance_loss_clip": 1.04813993, + "balance_loss_mlp": 1.02094519, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 1.808363318183575, + "language_loss": 0.64586151, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66698831, + "num_input_tokens_seen": 154498905, + "step": 7202, + "time_per_iteration": 2.6463847160339355 + }, + { + "auxiliary_loss_clip": 0.01072861, + "auxiliary_loss_mlp": 0.01027471, + "balance_loss_clip": 1.04454947, + "balance_loss_mlp": 1.0147754, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 1.7417500350078539, + "language_loss": 0.81969815, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.84070152, + "num_input_tokens_seen": 154517270, + "step": 7203, + "time_per_iteration": 2.6132962703704834 + }, + { + "auxiliary_loss_clip": 0.01094669, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.04503334, + "balance_loss_mlp": 1.0296762, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.7763302869295055, + "language_loss": 0.81001306, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83139825, + "num_input_tokens_seen": 154535945, + "step": 7204, + "time_per_iteration": 2.588550567626953 + }, + { + "auxiliary_loss_clip": 0.01100733, + "auxiliary_loss_mlp": 0.01037333, + "balance_loss_clip": 1.04404449, + "balance_loss_mlp": 1.02445257, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 1.9267616720113059, + "language_loss": 0.73687071, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.75825137, + "num_input_tokens_seen": 154554935, + "step": 7205, + "time_per_iteration": 2.506317138671875 + }, + { + "auxiliary_loss_clip": 0.01122926, + "auxiliary_loss_mlp": 0.00789484, + "balance_loss_clip": 1.04672194, + "balance_loss_mlp": 1.01415563, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 1.6254481429361376, + "language_loss": 0.75428653, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.77341056, + "num_input_tokens_seen": 154576065, + "step": 7206, + "time_per_iteration": 2.5586225986480713 + }, + { + "auxiliary_loss_clip": 0.01076861, + "auxiliary_loss_mlp": 0.01033394, + "balance_loss_clip": 1.0437839, + "balance_loss_mlp": 1.01990592, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 2.674889081898101, + "language_loss": 0.79286206, + "learning_rate": 2.522934161574342e-06, + "loss": 0.81396461, + "num_input_tokens_seen": 154595110, + "step": 7207, + "time_per_iteration": 2.60380482673645 + }, + { + "auxiliary_loss_clip": 0.01093676, + "auxiliary_loss_mlp": 0.01032425, + "balance_loss_clip": 1.04373813, + "balance_loss_mlp": 1.01760149, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 1.8908976746113093, + "language_loss": 0.80900091, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83026195, + "num_input_tokens_seen": 154612255, + "step": 7208, + "time_per_iteration": 2.5419259071350098 + }, + { + "auxiliary_loss_clip": 0.01104064, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.04598832, + "balance_loss_mlp": 1.01900959, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.0452482392867113, + "language_loss": 0.69658399, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.71794772, + "num_input_tokens_seen": 154630440, + "step": 7209, + "time_per_iteration": 2.519158124923706 + }, + { + "auxiliary_loss_clip": 0.01110224, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.0434972, + "balance_loss_mlp": 1.02066052, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.3886188799347143, + "language_loss": 0.81422794, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83567768, + "num_input_tokens_seen": 154652515, + "step": 7210, + "time_per_iteration": 2.5654988288879395 + }, + { + "auxiliary_loss_clip": 0.01097538, + "auxiliary_loss_mlp": 0.01032789, + "balance_loss_clip": 1.04303992, + "balance_loss_mlp": 1.02002788, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 1.7261502580393158, + "language_loss": 0.81857741, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.8398807, + "num_input_tokens_seen": 154670965, + "step": 7211, + "time_per_iteration": 2.519228935241699 + }, + { + "auxiliary_loss_clip": 0.01110212, + "auxiliary_loss_mlp": 0.01035225, + "balance_loss_clip": 1.0418365, + "balance_loss_mlp": 1.02340531, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 1.9472703230425426, + "language_loss": 0.74891973, + "learning_rate": 2.521054347790029e-06, + "loss": 0.77037406, + "num_input_tokens_seen": 154689980, + "step": 7212, + "time_per_iteration": 2.508733034133911 + }, + { + "auxiliary_loss_clip": 0.01103768, + "auxiliary_loss_mlp": 0.01032382, + "balance_loss_clip": 1.04431581, + "balance_loss_mlp": 1.0197463, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 2.0520229497917013, + "language_loss": 0.763677, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.78503847, + "num_input_tokens_seen": 154706570, + "step": 7213, + "time_per_iteration": 2.519376516342163 + }, + { + "auxiliary_loss_clip": 0.01112631, + "auxiliary_loss_mlp": 0.01037261, + "balance_loss_clip": 1.04516959, + "balance_loss_mlp": 1.02482164, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.4635139485900466, + "language_loss": 0.65155429, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67305326, + "num_input_tokens_seen": 154725210, + "step": 7214, + "time_per_iteration": 2.507765531539917 + }, + { + "auxiliary_loss_clip": 0.01095595, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.04115176, + "balance_loss_mlp": 1.01994717, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.5856144381217636, + "language_loss": 0.71537769, + "learning_rate": 2.519926222304191e-06, + "loss": 0.73665565, + "num_input_tokens_seen": 154745945, + "step": 7215, + "time_per_iteration": 2.57932448387146 + }, + { + "auxiliary_loss_clip": 0.01094165, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.04285359, + "balance_loss_mlp": 1.02259004, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 1.7084298565828318, + "language_loss": 0.7501989, + "learning_rate": 2.519550141025255e-06, + "loss": 0.77150607, + "num_input_tokens_seen": 154763580, + "step": 7216, + "time_per_iteration": 2.5101757049560547 + }, + { + "auxiliary_loss_clip": 0.01103174, + "auxiliary_loss_mlp": 0.0104131, + "balance_loss_clip": 1.04415715, + "balance_loss_mlp": 1.02611709, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 2.384456371980344, + "language_loss": 0.76107067, + "learning_rate": 2.519174040044927e-06, + "loss": 0.78251553, + "num_input_tokens_seen": 154776825, + "step": 7217, + "time_per_iteration": 2.507002830505371 + }, + { + "auxiliary_loss_clip": 0.0108587, + "auxiliary_loss_mlp": 0.01038506, + "balance_loss_clip": 1.04082775, + "balance_loss_mlp": 1.02486801, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 1.9949445084847832, + "language_loss": 0.7404508, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.76169449, + "num_input_tokens_seen": 154794025, + "step": 7218, + "time_per_iteration": 3.8940656185150146 + }, + { + "auxiliary_loss_clip": 0.0110471, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.04744363, + "balance_loss_mlp": 1.01634991, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 1.5753300192395339, + "language_loss": 0.68584305, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.70719016, + "num_input_tokens_seen": 154813105, + "step": 7219, + "time_per_iteration": 2.53041934967041 + }, + { + "auxiliary_loss_clip": 0.01093286, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.04342222, + "balance_loss_mlp": 1.01999974, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 1.858604914496032, + "language_loss": 0.77559412, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79686302, + "num_input_tokens_seen": 154833525, + "step": 7220, + "time_per_iteration": 3.98419189453125 + }, + { + "auxiliary_loss_clip": 0.01058319, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.04139566, + "balance_loss_mlp": 1.01694202, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 2.18701612705658, + "language_loss": 0.69821835, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71910816, + "num_input_tokens_seen": 154853090, + "step": 7221, + "time_per_iteration": 2.6250741481781006 + }, + { + "auxiliary_loss_clip": 0.01113913, + "auxiliary_loss_mlp": 0.010325, + "balance_loss_clip": 1.04329634, + "balance_loss_mlp": 1.01972103, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 3.208549470630045, + "language_loss": 0.64773929, + "learning_rate": 2.51729324012157e-06, + "loss": 0.6692034, + "num_input_tokens_seen": 154872055, + "step": 7222, + "time_per_iteration": 2.532081365585327 + }, + { + "auxiliary_loss_clip": 0.01088214, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.04194415, + "balance_loss_mlp": 1.01697838, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 5.003645368780293, + "language_loss": 0.72874624, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.74993962, + "num_input_tokens_seen": 154886645, + "step": 7223, + "time_per_iteration": 2.5219576358795166 + }, + { + "auxiliary_loss_clip": 0.01123806, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.04263806, + "balance_loss_mlp": 1.01699591, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 10.947192979562725, + "language_loss": 0.94228798, + "learning_rate": 2.516540782741694e-06, + "loss": 0.96383667, + "num_input_tokens_seen": 154906775, + "step": 7224, + "time_per_iteration": 2.539252519607544 + }, + { + "auxiliary_loss_clip": 0.01086542, + "auxiliary_loss_mlp": 0.01040426, + "balance_loss_clip": 1.04372263, + "balance_loss_mlp": 1.02651477, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 2.4043241583048585, + "language_loss": 0.61016935, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63143909, + "num_input_tokens_seen": 154926990, + "step": 7225, + "time_per_iteration": 4.064651966094971 + }, + { + "auxiliary_loss_clip": 0.0109511, + "auxiliary_loss_mlp": 0.00790348, + "balance_loss_clip": 1.04293156, + "balance_loss_mlp": 1.01318264, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 2.187514532586146, + "language_loss": 0.78010499, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79895961, + "num_input_tokens_seen": 154946210, + "step": 7226, + "time_per_iteration": 2.5720386505126953 + }, + { + "auxiliary_loss_clip": 0.01109896, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.04391313, + "balance_loss_mlp": 1.01678312, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.54698783174401, + "language_loss": 0.84462744, + "learning_rate": 2.515411949802964e-06, + "loss": 0.86602294, + "num_input_tokens_seen": 154964995, + "step": 7227, + "time_per_iteration": 2.4900476932525635 + }, + { + "auxiliary_loss_clip": 0.01107763, + "auxiliary_loss_mlp": 0.01034924, + "balance_loss_clip": 1.04310393, + "balance_loss_mlp": 1.02083981, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 1.8800967372303998, + "language_loss": 0.76712102, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.78854787, + "num_input_tokens_seen": 154984775, + "step": 7228, + "time_per_iteration": 2.604343891143799 + }, + { + "auxiliary_loss_clip": 0.01081143, + "auxiliary_loss_mlp": 0.01036882, + "balance_loss_clip": 1.04985726, + "balance_loss_mlp": 1.02293456, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.5915338970076094, + "language_loss": 0.80441368, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82559395, + "num_input_tokens_seen": 155008125, + "step": 7229, + "time_per_iteration": 2.774876356124878 + }, + { + "auxiliary_loss_clip": 0.01109515, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.04201961, + "balance_loss_mlp": 1.02433515, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 1.8046146559866976, + "language_loss": 0.82312679, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.84460032, + "num_input_tokens_seen": 155027885, + "step": 7230, + "time_per_iteration": 2.612349033355713 + }, + { + "auxiliary_loss_clip": 0.01110515, + "auxiliary_loss_mlp": 0.01042211, + "balance_loss_clip": 1.04521632, + "balance_loss_mlp": 1.02783442, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.205667708744853, + "language_loss": 0.77092731, + "learning_rate": 2.513906565661973e-06, + "loss": 0.7924546, + "num_input_tokens_seen": 155043375, + "step": 7231, + "time_per_iteration": 2.54496693611145 + }, + { + "auxiliary_loss_clip": 0.01081378, + "auxiliary_loss_mlp": 0.01031175, + "balance_loss_clip": 1.04426992, + "balance_loss_mlp": 1.01901007, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.4569014854656854, + "language_loss": 0.68888581, + "learning_rate": 2.513530170872575e-06, + "loss": 0.71001136, + "num_input_tokens_seen": 155062930, + "step": 7232, + "time_per_iteration": 2.66925311088562 + }, + { + "auxiliary_loss_clip": 0.0108838, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.04230499, + "balance_loss_mlp": 1.01840377, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.6207125911339715, + "language_loss": 0.72164738, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74285853, + "num_input_tokens_seen": 155084980, + "step": 7233, + "time_per_iteration": 2.7525479793548584 + }, + { + "auxiliary_loss_clip": 0.01063675, + "auxiliary_loss_mlp": 0.01036797, + "balance_loss_clip": 1.04094768, + "balance_loss_mlp": 1.02156186, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 1.7189075509451857, + "language_loss": 0.74246156, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.76346624, + "num_input_tokens_seen": 155107260, + "step": 7234, + "time_per_iteration": 2.7633466720581055 + }, + { + "auxiliary_loss_clip": 0.0110055, + "auxiliary_loss_mlp": 0.01035951, + "balance_loss_clip": 1.04332554, + "balance_loss_mlp": 1.02116871, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 5.848003252674072, + "language_loss": 0.59482384, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61618888, + "num_input_tokens_seen": 155126720, + "step": 7235, + "time_per_iteration": 2.5894975662231445 + }, + { + "auxiliary_loss_clip": 0.01060253, + "auxiliary_loss_mlp": 0.01039881, + "balance_loss_clip": 1.04084921, + "balance_loss_mlp": 1.023633, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.6714004043355728, + "language_loss": 0.77538133, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79638267, + "num_input_tokens_seen": 155148640, + "step": 7236, + "time_per_iteration": 2.6797313690185547 + }, + { + "auxiliary_loss_clip": 0.01119456, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.04346967, + "balance_loss_mlp": 1.01773715, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.7825589073536603, + "language_loss": 0.81316006, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83467054, + "num_input_tokens_seen": 155165870, + "step": 7237, + "time_per_iteration": 2.468858003616333 + }, + { + "auxiliary_loss_clip": 0.01108725, + "auxiliary_loss_mlp": 0.01035763, + "balance_loss_clip": 1.04201567, + "balance_loss_mlp": 1.02188659, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 1.4130004653838943, + "language_loss": 0.63141465, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65285957, + "num_input_tokens_seen": 155185315, + "step": 7238, + "time_per_iteration": 2.512594223022461 + }, + { + "auxiliary_loss_clip": 0.01085322, + "auxiliary_loss_mlp": 0.00786771, + "balance_loss_clip": 1.0417273, + "balance_loss_mlp": 1.00752139, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.5779036020295623, + "language_loss": 0.85861874, + "learning_rate": 2.510894862898928e-06, + "loss": 0.87733972, + "num_input_tokens_seen": 155205790, + "step": 7239, + "time_per_iteration": 2.6338257789611816 + }, + { + "auxiliary_loss_clip": 0.0110299, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.04554939, + "balance_loss_mlp": 1.0164113, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.9302159132422705, + "language_loss": 0.7256124, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74694169, + "num_input_tokens_seen": 155226475, + "step": 7240, + "time_per_iteration": 2.606358766555786 + }, + { + "auxiliary_loss_clip": 0.01090677, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.04515338, + "balance_loss_mlp": 1.0163933, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 1.8026667005756591, + "language_loss": 0.8211199, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.84233689, + "num_input_tokens_seen": 155247110, + "step": 7241, + "time_per_iteration": 2.6029632091522217 + }, + { + "auxiliary_loss_clip": 0.0109408, + "auxiliary_loss_mlp": 0.00786645, + "balance_loss_clip": 1.04373944, + "balance_loss_mlp": 1.00742781, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 2.433334628088347, + "language_loss": 0.7939961, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81280339, + "num_input_tokens_seen": 155261335, + "step": 7242, + "time_per_iteration": 2.5214080810546875 + }, + { + "auxiliary_loss_clip": 0.01097931, + "auxiliary_loss_mlp": 0.01032762, + "balance_loss_clip": 1.04104292, + "balance_loss_mlp": 1.01800418, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 5.902734645839141, + "language_loss": 0.68018556, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70149249, + "num_input_tokens_seen": 155278510, + "step": 7243, + "time_per_iteration": 2.519796133041382 + }, + { + "auxiliary_loss_clip": 0.01067391, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.04196358, + "balance_loss_mlp": 1.0154078, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.8763201142544046, + "language_loss": 0.81636095, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83731741, + "num_input_tokens_seen": 155296450, + "step": 7244, + "time_per_iteration": 2.5679445266723633 + }, + { + "auxiliary_loss_clip": 0.01063875, + "auxiliary_loss_mlp": 0.01028406, + "balance_loss_clip": 1.0415442, + "balance_loss_mlp": 1.0154475, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.8546071150116974, + "language_loss": 0.73555183, + "learning_rate": 2.508635271753234e-06, + "loss": 0.75647461, + "num_input_tokens_seen": 155316080, + "step": 7245, + "time_per_iteration": 2.6432852745056152 + }, + { + "auxiliary_loss_clip": 0.01066872, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.0455848, + "balance_loss_mlp": 1.02277589, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.6613046005343854, + "language_loss": 0.76942348, + "learning_rate": 2.508258605639389e-06, + "loss": 0.79045397, + "num_input_tokens_seen": 155336765, + "step": 7246, + "time_per_iteration": 2.62353777885437 + }, + { + "auxiliary_loss_clip": 0.01110325, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.04263926, + "balance_loss_mlp": 1.02215207, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 1.7381373929755897, + "language_loss": 0.85437596, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87583983, + "num_input_tokens_seen": 155356440, + "step": 7247, + "time_per_iteration": 2.5292601585388184 + }, + { + "auxiliary_loss_clip": 0.01122932, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.04459679, + "balance_loss_mlp": 1.02145314, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.5458919130198876, + "language_loss": 0.72231913, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74388933, + "num_input_tokens_seen": 155377070, + "step": 7248, + "time_per_iteration": 2.497880697250366 + }, + { + "auxiliary_loss_clip": 0.01110932, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.04502988, + "balance_loss_mlp": 1.0160116, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 1.520602260819967, + "language_loss": 0.87565982, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.89706528, + "num_input_tokens_seen": 155398415, + "step": 7249, + "time_per_iteration": 2.5495471954345703 + }, + { + "auxiliary_loss_clip": 0.01104734, + "auxiliary_loss_mlp": 0.01036851, + "balance_loss_clip": 1.04439354, + "balance_loss_mlp": 1.02391672, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.685638736407713, + "language_loss": 0.816212, + "learning_rate": 2.506751748594683e-06, + "loss": 0.83762789, + "num_input_tokens_seen": 155415625, + "step": 7250, + "time_per_iteration": 2.550302505493164 + }, + { + "auxiliary_loss_clip": 0.01115696, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.0479089, + "balance_loss_mlp": 1.01919651, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 1.887137288889742, + "language_loss": 0.84538352, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.86686569, + "num_input_tokens_seen": 155435505, + "step": 7251, + "time_per_iteration": 2.5762550830841064 + }, + { + "auxiliary_loss_clip": 0.01100795, + "auxiliary_loss_mlp": 0.01042239, + "balance_loss_clip": 1.03984594, + "balance_loss_mlp": 1.02789164, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 1.6712002684322527, + "language_loss": 0.69168866, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71311903, + "num_input_tokens_seen": 155455425, + "step": 7252, + "time_per_iteration": 2.5006253719329834 + }, + { + "auxiliary_loss_clip": 0.01097068, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.04482543, + "balance_loss_mlp": 1.01962554, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.6734478385039597, + "language_loss": 0.83452332, + "learning_rate": 2.505621403992348e-06, + "loss": 0.8558414, + "num_input_tokens_seen": 155474250, + "step": 7253, + "time_per_iteration": 2.5249991416931152 + }, + { + "auxiliary_loss_clip": 0.0111379, + "auxiliary_loss_mlp": 0.01037383, + "balance_loss_clip": 1.0484376, + "balance_loss_mlp": 1.02347755, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.4768437310906288, + "language_loss": 0.70236647, + "learning_rate": 2.505244584092757e-06, + "loss": 0.7238782, + "num_input_tokens_seen": 155494685, + "step": 7254, + "time_per_iteration": 2.5210423469543457 + }, + { + "auxiliary_loss_clip": 0.01100443, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.04544401, + "balance_loss_mlp": 1.01984358, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 1.77377815934272, + "language_loss": 0.81275082, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.83408368, + "num_input_tokens_seen": 155513040, + "step": 7255, + "time_per_iteration": 2.5297908782958984 + }, + { + "auxiliary_loss_clip": 0.01123111, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.04455829, + "balance_loss_mlp": 1.02286911, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.687830498793561, + "language_loss": 0.77567458, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79726505, + "num_input_tokens_seen": 155530100, + "step": 7256, + "time_per_iteration": 2.4536232948303223 + }, + { + "auxiliary_loss_clip": 0.01121378, + "auxiliary_loss_mlp": 0.01032939, + "balance_loss_clip": 1.045259, + "balance_loss_mlp": 1.02013636, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.5985138794526068, + "language_loss": 0.75924516, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.7807883, + "num_input_tokens_seen": 155549375, + "step": 7257, + "time_per_iteration": 3.9023940563201904 + }, + { + "auxiliary_loss_clip": 0.01111447, + "auxiliary_loss_mlp": 0.0103793, + "balance_loss_clip": 1.04384232, + "balance_loss_mlp": 1.02321959, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 2.312139632338265, + "language_loss": 0.72776413, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.74925798, + "num_input_tokens_seen": 155569395, + "step": 7258, + "time_per_iteration": 3.927535057067871 + }, + { + "auxiliary_loss_clip": 0.01100172, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.0451256, + "balance_loss_mlp": 1.02033019, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 1.8346638906606565, + "language_loss": 0.76694906, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.7882874, + "num_input_tokens_seen": 155589090, + "step": 7259, + "time_per_iteration": 2.578845500946045 + }, + { + "auxiliary_loss_clip": 0.01031867, + "auxiliary_loss_mlp": 0.01003387, + "balance_loss_clip": 1.03248405, + "balance_loss_mlp": 1.00170612, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7456643669606448, + "language_loss": 0.57006007, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59041262, + "num_input_tokens_seen": 155648660, + "step": 7260, + "time_per_iteration": 3.0990757942199707 + }, + { + "auxiliary_loss_clip": 0.01100509, + "auxiliary_loss_mlp": 0.01041276, + "balance_loss_clip": 1.04117405, + "balance_loss_mlp": 1.02651823, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 1.817178369668107, + "language_loss": 0.71150047, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.73291832, + "num_input_tokens_seen": 155669945, + "step": 7261, + "time_per_iteration": 2.6088192462921143 + }, + { + "auxiliary_loss_clip": 0.01073701, + "auxiliary_loss_mlp": 0.01050368, + "balance_loss_clip": 1.04024625, + "balance_loss_mlp": 1.03521013, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 2.0033602905366945, + "language_loss": 0.69336605, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71460676, + "num_input_tokens_seen": 155688555, + "step": 7262, + "time_per_iteration": 2.5762970447540283 + }, + { + "auxiliary_loss_clip": 0.01066765, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.0453409, + "balance_loss_mlp": 1.01886189, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.6913406143307854, + "language_loss": 0.79506409, + "learning_rate": 2.501852344559726e-06, + "loss": 0.81603837, + "num_input_tokens_seen": 155705370, + "step": 7263, + "time_per_iteration": 4.011101007461548 + }, + { + "auxiliary_loss_clip": 0.01085772, + "auxiliary_loss_mlp": 0.01045001, + "balance_loss_clip": 1.04642701, + "balance_loss_mlp": 1.03154778, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 1.7617497799418937, + "language_loss": 0.7511878, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77249551, + "num_input_tokens_seen": 155721890, + "step": 7264, + "time_per_iteration": 3.9633805751800537 + }, + { + "auxiliary_loss_clip": 0.0106874, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.04427171, + "balance_loss_mlp": 1.01911759, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 2.400409641338185, + "language_loss": 0.61648768, + "learning_rate": 2.501098303852298e-06, + "loss": 0.63750654, + "num_input_tokens_seen": 155743970, + "step": 7265, + "time_per_iteration": 2.7330641746520996 + }, + { + "auxiliary_loss_clip": 0.01096577, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.04157257, + "balance_loss_mlp": 1.01507807, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 1.9545913321836685, + "language_loss": 0.72570801, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.74695319, + "num_input_tokens_seen": 155761830, + "step": 7266, + "time_per_iteration": 2.523548126220703 + }, + { + "auxiliary_loss_clip": 0.01098852, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.04542255, + "balance_loss_mlp": 1.02061224, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 2.558901498717704, + "language_loss": 0.81844378, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.83976638, + "num_input_tokens_seen": 155779610, + "step": 7267, + "time_per_iteration": 2.5329091548919678 + }, + { + "auxiliary_loss_clip": 0.01117704, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.04261303, + "balance_loss_mlp": 1.01867557, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 2.0973947894285203, + "language_loss": 0.74858427, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.77007073, + "num_input_tokens_seen": 155798765, + "step": 7268, + "time_per_iteration": 2.490900754928589 + }, + { + "auxiliary_loss_clip": 0.01126043, + "auxiliary_loss_mlp": 0.01037754, + "balance_loss_clip": 1.04504108, + "balance_loss_mlp": 1.02358043, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 2.107106413817305, + "language_loss": 0.80177653, + "learning_rate": 2.499589994531454e-06, + "loss": 0.82341444, + "num_input_tokens_seen": 155817750, + "step": 7269, + "time_per_iteration": 2.4495410919189453 + }, + { + "auxiliary_loss_clip": 0.01101904, + "auxiliary_loss_mlp": 0.01035499, + "balance_loss_clip": 1.04506707, + "balance_loss_mlp": 1.02247596, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 1.9840089485260395, + "language_loss": 0.75407505, + "learning_rate": 2.499212869804237e-06, + "loss": 0.7754491, + "num_input_tokens_seen": 155836490, + "step": 7270, + "time_per_iteration": 2.537155866622925 + }, + { + "auxiliary_loss_clip": 0.01062421, + "auxiliary_loss_mlp": 0.01040168, + "balance_loss_clip": 1.03981733, + "balance_loss_mlp": 1.02467108, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 1.887500388953251, + "language_loss": 0.79860747, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81963336, + "num_input_tokens_seen": 155856225, + "step": 7271, + "time_per_iteration": 2.6116034984588623 + }, + { + "auxiliary_loss_clip": 0.010383, + "auxiliary_loss_mlp": 0.01005072, + "balance_loss_clip": 1.02248502, + "balance_loss_mlp": 1.00331962, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.7037976774725125, + "language_loss": 0.54939276, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56982648, + "num_input_tokens_seen": 155916770, + "step": 7272, + "time_per_iteration": 3.1886825561523438 + }, + { + "auxiliary_loss_clip": 0.01124651, + "auxiliary_loss_mlp": 0.01039253, + "balance_loss_clip": 1.04469633, + "balance_loss_mlp": 1.02481043, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 1.657729723776728, + "language_loss": 0.6988585, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72049749, + "num_input_tokens_seen": 155936490, + "step": 7273, + "time_per_iteration": 2.4779486656188965 + }, + { + "auxiliary_loss_clip": 0.01098901, + "auxiliary_loss_mlp": 0.01043537, + "balance_loss_clip": 1.04207301, + "balance_loss_mlp": 1.0280515, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 1.7662562886006012, + "language_loss": 0.75557721, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77700162, + "num_input_tokens_seen": 155957595, + "step": 7274, + "time_per_iteration": 2.672356367111206 + }, + { + "auxiliary_loss_clip": 0.01103444, + "auxiliary_loss_mlp": 0.01025325, + "balance_loss_clip": 1.04186785, + "balance_loss_mlp": 1.0135653, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.838885308291702, + "language_loss": 0.80172539, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82301307, + "num_input_tokens_seen": 155975710, + "step": 7275, + "time_per_iteration": 2.493558883666992 + }, + { + "auxiliary_loss_clip": 0.01100411, + "auxiliary_loss_mlp": 0.01034172, + "balance_loss_clip": 1.04695892, + "balance_loss_mlp": 1.02090383, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 2.2259481091365645, + "language_loss": 0.80802965, + "learning_rate": 2.496949724407266e-06, + "loss": 0.82937551, + "num_input_tokens_seen": 155993090, + "step": 7276, + "time_per_iteration": 2.5278773307800293 + }, + { + "auxiliary_loss_clip": 0.01110603, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.04522681, + "balance_loss_mlp": 1.01557779, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 2.479621374122403, + "language_loss": 0.73125517, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75265765, + "num_input_tokens_seen": 156013685, + "step": 7277, + "time_per_iteration": 2.598536491394043 + }, + { + "auxiliary_loss_clip": 0.01101481, + "auxiliary_loss_mlp": 0.00787412, + "balance_loss_clip": 1.04628766, + "balance_loss_mlp": 1.00917768, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 1.8589230004581698, + "language_loss": 0.73223746, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.75112641, + "num_input_tokens_seen": 156034300, + "step": 7278, + "time_per_iteration": 2.6280500888824463 + }, + { + "auxiliary_loss_clip": 0.01090068, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.04243207, + "balance_loss_mlp": 1.0203706, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.6253055856092011, + "language_loss": 0.66339689, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68462336, + "num_input_tokens_seen": 156053805, + "step": 7279, + "time_per_iteration": 2.550201892852783 + }, + { + "auxiliary_loss_clip": 0.0112503, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.04413152, + "balance_loss_mlp": 1.01925349, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.7962008852571945, + "language_loss": 0.82139683, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.84297723, + "num_input_tokens_seen": 156073295, + "step": 7280, + "time_per_iteration": 2.480160713195801 + }, + { + "auxiliary_loss_clip": 0.01096027, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.04140377, + "balance_loss_mlp": 1.01856279, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.5435880847155574, + "language_loss": 0.76742607, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.78869939, + "num_input_tokens_seen": 156094540, + "step": 7281, + "time_per_iteration": 2.5410802364349365 + }, + { + "auxiliary_loss_clip": 0.01094955, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.04076898, + "balance_loss_mlp": 1.0259521, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 2.085014865884542, + "language_loss": 0.7566514, + "learning_rate": 2.494685900612569e-06, + "loss": 0.77799469, + "num_input_tokens_seen": 156114070, + "step": 7282, + "time_per_iteration": 2.529700517654419 + }, + { + "auxiliary_loss_clip": 0.01085106, + "auxiliary_loss_mlp": 0.01041089, + "balance_loss_clip": 1.04415417, + "balance_loss_mlp": 1.02702856, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 1.6748802268441414, + "language_loss": 0.84705281, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.86831474, + "num_input_tokens_seen": 156132130, + "step": 7283, + "time_per_iteration": 2.5963010787963867 + }, + { + "auxiliary_loss_clip": 0.0110727, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.04401755, + "balance_loss_mlp": 1.0214963, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 1.7193257834672488, + "language_loss": 0.8056078, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82703716, + "num_input_tokens_seen": 156150820, + "step": 7284, + "time_per_iteration": 2.5541956424713135 + }, + { + "auxiliary_loss_clip": 0.01111387, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_clip": 1.04373908, + "balance_loss_mlp": 1.02995777, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 1.4682978882081847, + "language_loss": 0.79967725, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82121819, + "num_input_tokens_seen": 156170125, + "step": 7285, + "time_per_iteration": 2.4881415367126465 + }, + { + "auxiliary_loss_clip": 0.01108772, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.04188323, + "balance_loss_mlp": 1.0158, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 2.039534793323649, + "language_loss": 0.7510916, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77246988, + "num_input_tokens_seen": 156187320, + "step": 7286, + "time_per_iteration": 2.495570659637451 + }, + { + "auxiliary_loss_clip": 0.01086601, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.0411942, + "balance_loss_mlp": 1.01488447, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.5211932527350813, + "language_loss": 0.73524094, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75639659, + "num_input_tokens_seen": 156207455, + "step": 7287, + "time_per_iteration": 2.6239006519317627 + }, + { + "auxiliary_loss_clip": 0.01100832, + "auxiliary_loss_mlp": 0.01037939, + "balance_loss_clip": 1.04391932, + "balance_loss_mlp": 1.02434969, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 1.7396482701404024, + "language_loss": 0.8220396, + "learning_rate": 2.492421401510545e-06, + "loss": 0.8434273, + "num_input_tokens_seen": 156226560, + "step": 7288, + "time_per_iteration": 2.5081512928009033 + }, + { + "auxiliary_loss_clip": 0.01085863, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.04114449, + "balance_loss_mlp": 1.01656342, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.3528146914506565, + "language_loss": 0.83936065, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86052108, + "num_input_tokens_seen": 156246740, + "step": 7289, + "time_per_iteration": 2.578768730163574 + }, + { + "auxiliary_loss_clip": 0.0109165, + "auxiliary_loss_mlp": 0.01049747, + "balance_loss_clip": 1.03985119, + "balance_loss_mlp": 1.03379631, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.5559799418764293, + "language_loss": 0.7812624, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80267644, + "num_input_tokens_seen": 156266440, + "step": 7290, + "time_per_iteration": 2.595935583114624 + }, + { + "auxiliary_loss_clip": 0.01122612, + "auxiliary_loss_mlp": 0.0103598, + "balance_loss_clip": 1.04517508, + "balance_loss_mlp": 1.02328432, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 1.719626289184289, + "language_loss": 0.78072757, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80231345, + "num_input_tokens_seen": 156286900, + "step": 7291, + "time_per_iteration": 2.5112600326538086 + }, + { + "auxiliary_loss_clip": 0.01083989, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.03930771, + "balance_loss_mlp": 1.0186131, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.6120568289254253, + "language_loss": 0.64865696, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.669819, + "num_input_tokens_seen": 156307690, + "step": 7292, + "time_per_iteration": 2.6372756958007812 + }, + { + "auxiliary_loss_clip": 0.01106036, + "auxiliary_loss_mlp": 0.01040141, + "balance_loss_clip": 1.04044747, + "balance_loss_mlp": 1.02589548, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.6676056730363993, + "language_loss": 0.74325472, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76471657, + "num_input_tokens_seen": 156326620, + "step": 7293, + "time_per_iteration": 2.521942615509033 + }, + { + "auxiliary_loss_clip": 0.01093037, + "auxiliary_loss_mlp": 0.01041417, + "balance_loss_clip": 1.04162443, + "balance_loss_mlp": 1.02697444, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 2.0123886627793897, + "language_loss": 0.78312457, + "learning_rate": 2.490156230192516e-06, + "loss": 0.80446905, + "num_input_tokens_seen": 156345495, + "step": 7294, + "time_per_iteration": 2.524009943008423 + }, + { + "auxiliary_loss_clip": 0.01082955, + "auxiliary_loss_mlp": 0.01046811, + "balance_loss_clip": 1.04227614, + "balance_loss_mlp": 1.03347743, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.634113811414013, + "language_loss": 0.73092985, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.75222754, + "num_input_tokens_seen": 156363155, + "step": 7295, + "time_per_iteration": 2.560290813446045 + }, + { + "auxiliary_loss_clip": 0.01084671, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_clip": 1.04488838, + "balance_loss_mlp": 1.02947187, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 1.9323782473501545, + "language_loss": 0.75108927, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.7723949, + "num_input_tokens_seen": 156380940, + "step": 7296, + "time_per_iteration": 4.475850343704224 + }, + { + "auxiliary_loss_clip": 0.01109721, + "auxiliary_loss_mlp": 0.01034392, + "balance_loss_clip": 1.04168224, + "balance_loss_mlp": 1.02099895, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 1.5040344378435904, + "language_loss": 0.69268531, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71412647, + "num_input_tokens_seen": 156400415, + "step": 7297, + "time_per_iteration": 3.9088637828826904 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01030831, + "balance_loss_clip": 1.0407443, + "balance_loss_mlp": 1.01750314, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 1.612399057893791, + "language_loss": 0.70181173, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72315037, + "num_input_tokens_seen": 156421120, + "step": 7298, + "time_per_iteration": 2.5521249771118164 + }, + { + "auxiliary_loss_clip": 0.01111534, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.04622579, + "balance_loss_mlp": 1.01725507, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 2.9047359455227775, + "language_loss": 0.72251672, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74394, + "num_input_tokens_seen": 156441535, + "step": 7299, + "time_per_iteration": 2.542653799057007 + }, + { + "auxiliary_loss_clip": 0.01098557, + "auxiliary_loss_mlp": 0.00793016, + "balance_loss_clip": 1.04198027, + "balance_loss_mlp": 1.0152688, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.74336847452392, + "language_loss": 0.76984966, + "learning_rate": 2.487890389750719e-06, + "loss": 0.78876543, + "num_input_tokens_seen": 156462015, + "step": 7300, + "time_per_iteration": 2.5758426189422607 + }, + { + "auxiliary_loss_clip": 0.01100555, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.04314923, + "balance_loss_mlp": 1.02014923, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.7160029741199156, + "language_loss": 0.70551604, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.72686374, + "num_input_tokens_seen": 156482165, + "step": 7301, + "time_per_iteration": 3.9254062175750732 + }, + { + "auxiliary_loss_clip": 0.01076496, + "auxiliary_loss_mlp": 0.01042117, + "balance_loss_clip": 1.04203081, + "balance_loss_mlp": 1.02551103, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 1.9129672109297557, + "language_loss": 0.70615268, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72733879, + "num_input_tokens_seen": 156503170, + "step": 7302, + "time_per_iteration": 2.633246660232544 + }, + { + "auxiliary_loss_clip": 0.01099731, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.04430282, + "balance_loss_mlp": 1.0210278, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 1.6276782129548113, + "language_loss": 0.82104516, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84238183, + "num_input_tokens_seen": 156523005, + "step": 7303, + "time_per_iteration": 3.968296766281128 + }, + { + "auxiliary_loss_clip": 0.01110485, + "auxiliary_loss_mlp": 0.01040303, + "balance_loss_clip": 1.04388142, + "balance_loss_mlp": 1.02490163, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 1.832046484916962, + "language_loss": 0.68183291, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.70334077, + "num_input_tokens_seen": 156544440, + "step": 7304, + "time_per_iteration": 2.58320951461792 + }, + { + "auxiliary_loss_clip": 0.01101855, + "auxiliary_loss_mlp": 0.00790715, + "balance_loss_clip": 1.04354262, + "balance_loss_mlp": 1.01553547, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.562163311094618, + "language_loss": 0.78185368, + "learning_rate": 2.486001680477873e-06, + "loss": 0.80077946, + "num_input_tokens_seen": 156565410, + "step": 7305, + "time_per_iteration": 2.6304421424865723 + }, + { + "auxiliary_loss_clip": 0.01098786, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.04474175, + "balance_loss_mlp": 1.0177002, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.9824592696198204, + "language_loss": 0.68690014, + "learning_rate": 2.485623883278308e-06, + "loss": 0.70820343, + "num_input_tokens_seen": 156584210, + "step": 7306, + "time_per_iteration": 2.5036439895629883 + }, + { + "auxiliary_loss_clip": 0.01082146, + "auxiliary_loss_mlp": 0.01028948, + "balance_loss_clip": 1.04121089, + "balance_loss_mlp": 1.01461899, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.5215665927938415, + "language_loss": 0.62322366, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64433455, + "num_input_tokens_seen": 156602730, + "step": 7307, + "time_per_iteration": 2.539475202560425 + }, + { + "auxiliary_loss_clip": 0.01125624, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.04432118, + "balance_loss_mlp": 1.01953042, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 2.0718164594003605, + "language_loss": 0.71858948, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.7401768, + "num_input_tokens_seen": 156619405, + "step": 7308, + "time_per_iteration": 2.4014334678649902 + }, + { + "auxiliary_loss_clip": 0.01106304, + "auxiliary_loss_mlp": 0.01032865, + "balance_loss_clip": 1.04377794, + "balance_loss_mlp": 1.01843476, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.6903482489473085, + "language_loss": 0.76779723, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.78918892, + "num_input_tokens_seen": 156638165, + "step": 7309, + "time_per_iteration": 2.52130126953125 + }, + { + "auxiliary_loss_clip": 0.01107136, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.04275966, + "balance_loss_mlp": 1.01618564, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.7338094526054357, + "language_loss": 0.70763177, + "learning_rate": 2.484112510474251e-06, + "loss": 0.72899556, + "num_input_tokens_seen": 156658845, + "step": 7310, + "time_per_iteration": 2.500941038131714 + }, + { + "auxiliary_loss_clip": 0.01097353, + "auxiliary_loss_mlp": 0.00791813, + "balance_loss_clip": 1.04431343, + "balance_loss_mlp": 1.01585042, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 3.1050892008620625, + "language_loss": 0.76220489, + "learning_rate": 2.483734621343429e-06, + "loss": 0.78109652, + "num_input_tokens_seen": 156677275, + "step": 7311, + "time_per_iteration": 2.5199697017669678 + }, + { + "auxiliary_loss_clip": 0.01113482, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.04403877, + "balance_loss_mlp": 1.02007318, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 2.750716085253334, + "language_loss": 0.81371069, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83517921, + "num_input_tokens_seen": 156695815, + "step": 7312, + "time_per_iteration": 2.481703281402588 + }, + { + "auxiliary_loss_clip": 0.01089997, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.04064202, + "balance_loss_mlp": 1.01725221, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 3.5717380847507654, + "language_loss": 0.85412443, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87532926, + "num_input_tokens_seen": 156714385, + "step": 7313, + "time_per_iteration": 2.5297653675079346 + }, + { + "auxiliary_loss_clip": 0.01099571, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.04185343, + "balance_loss_mlp": 1.01567745, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 1.8014453103123205, + "language_loss": 0.67811906, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69940972, + "num_input_tokens_seen": 156732615, + "step": 7314, + "time_per_iteration": 2.5030927658081055 + }, + { + "auxiliary_loss_clip": 0.01104159, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.04398704, + "balance_loss_mlp": 1.01644814, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 2.3348147639444816, + "language_loss": 0.76656681, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.7879135, + "num_input_tokens_seen": 156750920, + "step": 7315, + "time_per_iteration": 2.5152525901794434 + }, + { + "auxiliary_loss_clip": 0.0109834, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.04424989, + "balance_loss_mlp": 1.01914954, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.086122811746675, + "language_loss": 0.74020731, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76151764, + "num_input_tokens_seen": 156768520, + "step": 7316, + "time_per_iteration": 2.5329952239990234 + }, + { + "auxiliary_loss_clip": 0.01087393, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.04759812, + "balance_loss_mlp": 1.02284336, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 2.6561423476946335, + "language_loss": 0.64965689, + "learning_rate": 2.481466901851506e-06, + "loss": 0.67089009, + "num_input_tokens_seen": 156788700, + "step": 7317, + "time_per_iteration": 2.5947585105895996 + }, + { + "auxiliary_loss_clip": 0.01097145, + "auxiliary_loss_mlp": 0.01037867, + "balance_loss_clip": 1.04348707, + "balance_loss_mlp": 1.02453303, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 2.0040467979956107, + "language_loss": 0.7995286, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.82087874, + "num_input_tokens_seen": 156806470, + "step": 7318, + "time_per_iteration": 2.4920053482055664 + }, + { + "auxiliary_loss_clip": 0.01081911, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.04068971, + "balance_loss_mlp": 1.02555692, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 1.7542271556516045, + "language_loss": 0.79349953, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81472343, + "num_input_tokens_seen": 156825895, + "step": 7319, + "time_per_iteration": 2.568981170654297 + }, + { + "auxiliary_loss_clip": 0.011073, + "auxiliary_loss_mlp": 0.01038426, + "balance_loss_clip": 1.04273808, + "balance_loss_mlp": 1.02455008, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 2.2495257086929734, + "language_loss": 0.79824746, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.81970465, + "num_input_tokens_seen": 156845990, + "step": 7320, + "time_per_iteration": 2.5480270385742188 + }, + { + "auxiliary_loss_clip": 0.01084848, + "auxiliary_loss_mlp": 0.01042859, + "balance_loss_clip": 1.04328895, + "balance_loss_mlp": 1.02965653, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.7401219076481353, + "language_loss": 0.69371641, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.71499348, + "num_input_tokens_seen": 156866685, + "step": 7321, + "time_per_iteration": 2.5548219680786133 + }, + { + "auxiliary_loss_clip": 0.01014886, + "auxiliary_loss_mlp": 0.01018539, + "balance_loss_clip": 1.02822495, + "balance_loss_mlp": 1.01687038, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.883113647100991, + "language_loss": 0.56907403, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.58940828, + "num_input_tokens_seen": 156923450, + "step": 7322, + "time_per_iteration": 3.324855089187622 + }, + { + "auxiliary_loss_clip": 0.0107117, + "auxiliary_loss_mlp": 0.01040506, + "balance_loss_clip": 1.03775072, + "balance_loss_mlp": 1.02678514, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.4123092128131713, + "language_loss": 0.76160324, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78272003, + "num_input_tokens_seen": 156944795, + "step": 7323, + "time_per_iteration": 2.5896964073181152 + }, + { + "auxiliary_loss_clip": 0.01116577, + "auxiliary_loss_mlp": 0.01042789, + "balance_loss_clip": 1.045555, + "balance_loss_mlp": 1.02899647, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 1.5210203307415013, + "language_loss": 0.80623448, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82782817, + "num_input_tokens_seen": 156962755, + "step": 7324, + "time_per_iteration": 2.4788978099823 + }, + { + "auxiliary_loss_clip": 0.01023382, + "auxiliary_loss_mlp": 0.01005248, + "balance_loss_clip": 1.02391076, + "balance_loss_mlp": 1.00338793, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.663576768204068, + "language_loss": 0.54558647, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56587279, + "num_input_tokens_seen": 157028095, + "step": 7325, + "time_per_iteration": 3.1552305221557617 + }, + { + "auxiliary_loss_clip": 0.01121343, + "auxiliary_loss_mlp": 0.01028078, + "balance_loss_clip": 1.04523802, + "balance_loss_mlp": 1.01594245, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.3848740969027262, + "language_loss": 0.69718587, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.71868008, + "num_input_tokens_seen": 157048365, + "step": 7326, + "time_per_iteration": 2.495586633682251 + }, + { + "auxiliary_loss_clip": 0.01081628, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.04111886, + "balance_loss_mlp": 1.01750553, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.6772765867748605, + "language_loss": 0.76465464, + "learning_rate": 2.477685910312432e-06, + "loss": 0.7857784, + "num_input_tokens_seen": 157069130, + "step": 7327, + "time_per_iteration": 2.584575653076172 + }, + { + "auxiliary_loss_clip": 0.0109579, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.04126573, + "balance_loss_mlp": 1.01946712, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 1.8962918938933897, + "language_loss": 0.8415978, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86288345, + "num_input_tokens_seen": 157084940, + "step": 7328, + "time_per_iteration": 2.489612579345703 + }, + { + "auxiliary_loss_clip": 0.01097188, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.0418303, + "balance_loss_mlp": 1.01605964, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 2.4240167781141895, + "language_loss": 0.77349484, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.79475796, + "num_input_tokens_seen": 157102770, + "step": 7329, + "time_per_iteration": 2.5101242065429688 + }, + { + "auxiliary_loss_clip": 0.01109307, + "auxiliary_loss_mlp": 0.01038674, + "balance_loss_clip": 1.04293323, + "balance_loss_mlp": 1.02445197, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.5900670990498873, + "language_loss": 0.73263991, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75411963, + "num_input_tokens_seen": 157122035, + "step": 7330, + "time_per_iteration": 2.503915309906006 + }, + { + "auxiliary_loss_clip": 0.01103224, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.04657865, + "balance_loss_mlp": 1.02726364, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.9775166347986528, + "language_loss": 0.74469554, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.76612568, + "num_input_tokens_seen": 157142800, + "step": 7331, + "time_per_iteration": 2.5767111778259277 + }, + { + "auxiliary_loss_clip": 0.01069181, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.04067159, + "balance_loss_mlp": 1.01786768, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.4761327043926402, + "language_loss": 0.76282358, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78382528, + "num_input_tokens_seen": 157163295, + "step": 7332, + "time_per_iteration": 2.611268997192383 + }, + { + "auxiliary_loss_clip": 0.01094951, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.04512632, + "balance_loss_mlp": 1.02548027, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 1.7762109728756612, + "language_loss": 0.73912895, + "learning_rate": 2.475416445004285e-06, + "loss": 0.76045829, + "num_input_tokens_seen": 157180890, + "step": 7333, + "time_per_iteration": 2.4792332649230957 + }, + { + "auxiliary_loss_clip": 0.01084746, + "auxiliary_loss_mlp": 0.01032567, + "balance_loss_clip": 1.04504991, + "balance_loss_mlp": 1.01950216, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 1.6294356654849593, + "language_loss": 0.79731101, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81848413, + "num_input_tokens_seen": 157200580, + "step": 7334, + "time_per_iteration": 3.951969861984253 + }, + { + "auxiliary_loss_clip": 0.01094874, + "auxiliary_loss_mlp": 0.01038546, + "balance_loss_clip": 1.0414257, + "balance_loss_mlp": 1.02246523, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 2.0672676216740014, + "language_loss": 0.75795656, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77929074, + "num_input_tokens_seen": 157218345, + "step": 7335, + "time_per_iteration": 4.149829626083374 + }, + { + "auxiliary_loss_clip": 0.01105685, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.04501927, + "balance_loss_mlp": 1.02372479, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 1.879587574450716, + "language_loss": 0.72824997, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74967778, + "num_input_tokens_seen": 157234395, + "step": 7336, + "time_per_iteration": 2.513986110687256 + }, + { + "auxiliary_loss_clip": 0.01113739, + "auxiliary_loss_mlp": 0.01042333, + "balance_loss_clip": 1.04164624, + "balance_loss_mlp": 1.02847493, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 2.0406308707030254, + "language_loss": 0.63156742, + "learning_rate": 2.473903107384165e-06, + "loss": 0.65312815, + "num_input_tokens_seen": 157254805, + "step": 7337, + "time_per_iteration": 2.5007779598236084 + }, + { + "auxiliary_loss_clip": 0.01029244, + "auxiliary_loss_mlp": 0.00771692, + "balance_loss_clip": 1.020679, + "balance_loss_mlp": 1.0097698, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7489733799119582, + "language_loss": 0.52717757, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54518694, + "num_input_tokens_seen": 157317870, + "step": 7338, + "time_per_iteration": 3.194927215576172 + }, + { + "auxiliary_loss_clip": 0.01102979, + "auxiliary_loss_mlp": 0.01042682, + "balance_loss_clip": 1.04181957, + "balance_loss_mlp": 1.02750707, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 1.9488433477430696, + "language_loss": 0.70537174, + "learning_rate": 2.473146330693997e-06, + "loss": 0.7268284, + "num_input_tokens_seen": 157336505, + "step": 7339, + "time_per_iteration": 3.983253240585327 + }, + { + "auxiliary_loss_clip": 0.01054717, + "auxiliary_loss_mlp": 0.0104524, + "balance_loss_clip": 1.04300976, + "balance_loss_mlp": 1.03150153, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.3853448061024611, + "language_loss": 0.69350052, + "learning_rate": 2.472767915429105e-06, + "loss": 0.71450013, + "num_input_tokens_seen": 157354995, + "step": 7340, + "time_per_iteration": 2.5850021839141846 + }, + { + "auxiliary_loss_clip": 0.01027026, + "auxiliary_loss_mlp": 0.01007957, + "balance_loss_clip": 1.02148509, + "balance_loss_mlp": 1.00637174, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.903915288173658, + "language_loss": 0.6401903, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.6605401, + "num_input_tokens_seen": 157404260, + "step": 7341, + "time_per_iteration": 4.26634669303894 + }, + { + "auxiliary_loss_clip": 0.01085196, + "auxiliary_loss_mlp": 0.01035646, + "balance_loss_clip": 1.0395925, + "balance_loss_mlp": 1.02185905, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 1.987478911152901, + "language_loss": 0.73558474, + "learning_rate": 2.47201103113145e-06, + "loss": 0.75679308, + "num_input_tokens_seen": 157423045, + "step": 7342, + "time_per_iteration": 2.596327543258667 + }, + { + "auxiliary_loss_clip": 0.01118343, + "auxiliary_loss_mlp": 0.0103652, + "balance_loss_clip": 1.0397495, + "balance_loss_mlp": 1.02245355, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 1.6995392946371923, + "language_loss": 0.79435456, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.81590319, + "num_input_tokens_seen": 157441815, + "step": 7343, + "time_per_iteration": 2.4992613792419434 + }, + { + "auxiliary_loss_clip": 0.01084078, + "auxiliary_loss_mlp": 0.01032114, + "balance_loss_clip": 1.04259515, + "balance_loss_mlp": 1.0183332, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 1.7548972450543137, + "language_loss": 0.76559412, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.78675604, + "num_input_tokens_seen": 157460470, + "step": 7344, + "time_per_iteration": 2.5495591163635254 + }, + { + "auxiliary_loss_clip": 0.01026551, + "auxiliary_loss_mlp": 0.01014582, + "balance_loss_clip": 1.01919246, + "balance_loss_mlp": 1.0128535, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.7910025494403222, + "language_loss": 0.63803893, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65845031, + "num_input_tokens_seen": 157512655, + "step": 7345, + "time_per_iteration": 2.879918336868286 + }, + { + "auxiliary_loss_clip": 0.01123826, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.04340053, + "balance_loss_mlp": 1.01925039, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 1.9458304046262362, + "language_loss": 0.85892463, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88049078, + "num_input_tokens_seen": 157533700, + "step": 7346, + "time_per_iteration": 2.512547016143799 + }, + { + "auxiliary_loss_clip": 0.01111426, + "auxiliary_loss_mlp": 0.01038301, + "balance_loss_clip": 1.04135537, + "balance_loss_mlp": 1.02380466, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 1.7197144635998982, + "language_loss": 0.80781126, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82930857, + "num_input_tokens_seen": 157551105, + "step": 7347, + "time_per_iteration": 2.4806082248687744 + }, + { + "auxiliary_loss_clip": 0.01096949, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.04523873, + "balance_loss_mlp": 1.01729655, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 1.6675034864414466, + "language_loss": 0.82817107, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.84945583, + "num_input_tokens_seen": 157568285, + "step": 7348, + "time_per_iteration": 2.493452548980713 + }, + { + "auxiliary_loss_clip": 0.01113347, + "auxiliary_loss_mlp": 0.01036483, + "balance_loss_clip": 1.04406619, + "balance_loss_mlp": 1.02273178, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 1.516526505188605, + "language_loss": 0.69955873, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72105706, + "num_input_tokens_seen": 157590405, + "step": 7349, + "time_per_iteration": 2.5466208457946777 + }, + { + "auxiliary_loss_clip": 0.01099646, + "auxiliary_loss_mlp": 0.01035143, + "balance_loss_clip": 1.0402987, + "balance_loss_mlp": 1.02074218, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.7730075595979071, + "language_loss": 0.7432934, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76464134, + "num_input_tokens_seen": 157607420, + "step": 7350, + "time_per_iteration": 2.5074541568756104 + }, + { + "auxiliary_loss_clip": 0.01122072, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.04226518, + "balance_loss_mlp": 1.02001178, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 2.1891389855363657, + "language_loss": 0.81140149, + "learning_rate": 2.468604167463827e-06, + "loss": 0.83296001, + "num_input_tokens_seen": 157624990, + "step": 7351, + "time_per_iteration": 2.4371485710144043 + }, + { + "auxiliary_loss_clip": 0.01080397, + "auxiliary_loss_mlp": 0.00794901, + "balance_loss_clip": 1.03796649, + "balance_loss_mlp": 1.02464104, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.4691185488904712, + "language_loss": 0.7304672, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.74922013, + "num_input_tokens_seen": 157645300, + "step": 7352, + "time_per_iteration": 2.598890542984009 + }, + { + "auxiliary_loss_clip": 0.01094131, + "auxiliary_loss_mlp": 0.01027219, + "balance_loss_clip": 1.04283381, + "balance_loss_mlp": 1.01365304, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 2.027823749704245, + "language_loss": 0.87232047, + "learning_rate": 2.467846890815649e-06, + "loss": 0.893534, + "num_input_tokens_seen": 157664060, + "step": 7353, + "time_per_iteration": 2.5594730377197266 + }, + { + "auxiliary_loss_clip": 0.01122639, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.04311347, + "balance_loss_mlp": 1.02297163, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 1.8691485799444854, + "language_loss": 0.76387608, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.78546089, + "num_input_tokens_seen": 157680905, + "step": 7354, + "time_per_iteration": 2.4346916675567627 + }, + { + "auxiliary_loss_clip": 0.01081284, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.04235959, + "balance_loss_mlp": 1.01915681, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 2.7448450973547067, + "language_loss": 0.6451782, + "learning_rate": 2.467089543204268e-06, + "loss": 0.66630709, + "num_input_tokens_seen": 157701980, + "step": 7355, + "time_per_iteration": 2.7877674102783203 + }, + { + "auxiliary_loss_clip": 0.01123837, + "auxiliary_loss_mlp": 0.01037196, + "balance_loss_clip": 1.04200029, + "balance_loss_mlp": 1.02293825, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.7989927853345915, + "language_loss": 0.77945948, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80106974, + "num_input_tokens_seen": 157720555, + "step": 7356, + "time_per_iteration": 2.4474599361419678 + }, + { + "auxiliary_loss_clip": 0.0110226, + "auxiliary_loss_mlp": 0.00788608, + "balance_loss_clip": 1.04463077, + "balance_loss_mlp": 1.01282287, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.5763897914923495, + "language_loss": 0.7725215, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79143018, + "num_input_tokens_seen": 157739160, + "step": 7357, + "time_per_iteration": 2.499589443206787 + }, + { + "auxiliary_loss_clip": 0.0109905, + "auxiliary_loss_mlp": 0.01038792, + "balance_loss_clip": 1.04408932, + "balance_loss_mlp": 1.02424264, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.6799675589712149, + "language_loss": 0.73468256, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75606096, + "num_input_tokens_seen": 157760020, + "step": 7358, + "time_per_iteration": 2.605456829071045 + }, + { + "auxiliary_loss_clip": 0.01099128, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.04754293, + "balance_loss_mlp": 1.01899552, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.701132461754456, + "language_loss": 0.75652349, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77783561, + "num_input_tokens_seen": 157780435, + "step": 7359, + "time_per_iteration": 2.58886981010437 + }, + { + "auxiliary_loss_clip": 0.01097926, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.04530144, + "balance_loss_mlp": 1.01766336, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 1.650021209983342, + "language_loss": 0.70074397, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.7220428, + "num_input_tokens_seen": 157799420, + "step": 7360, + "time_per_iteration": 2.5300168991088867 + }, + { + "auxiliary_loss_clip": 0.01095398, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.04298425, + "balance_loss_mlp": 1.01715541, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.295600326958118, + "language_loss": 0.70199239, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.72325552, + "num_input_tokens_seen": 157817025, + "step": 7361, + "time_per_iteration": 2.5028905868530273 + }, + { + "auxiliary_loss_clip": 0.01098743, + "auxiliary_loss_mlp": 0.0103586, + "balance_loss_clip": 1.0415225, + "balance_loss_mlp": 1.02126312, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 1.9013795098914195, + "language_loss": 0.8290031, + "learning_rate": 2.464438269387809e-06, + "loss": 0.85034913, + "num_input_tokens_seen": 157834345, + "step": 7362, + "time_per_iteration": 2.49328875541687 + }, + { + "auxiliary_loss_clip": 0.01096704, + "auxiliary_loss_mlp": 0.01039972, + "balance_loss_clip": 1.04851174, + "balance_loss_mlp": 1.02478456, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.6067379598975573, + "language_loss": 0.74538147, + "learning_rate": 2.464059445424366e-06, + "loss": 0.76674819, + "num_input_tokens_seen": 157852290, + "step": 7363, + "time_per_iteration": 2.523386240005493 + }, + { + "auxiliary_loss_clip": 0.01013358, + "auxiliary_loss_mlp": 0.01007973, + "balance_loss_clip": 1.03492022, + "balance_loss_mlp": 1.00628042, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.7180701085624108, + "language_loss": 0.55647713, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57669044, + "num_input_tokens_seen": 157923060, + "step": 7364, + "time_per_iteration": 3.2705013751983643 + }, + { + "auxiliary_loss_clip": 0.01098958, + "auxiliary_loss_mlp": 0.01035673, + "balance_loss_clip": 1.04047644, + "balance_loss_mlp": 1.02224994, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 1.6661870989897638, + "language_loss": 0.74262297, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76396936, + "num_input_tokens_seen": 157944110, + "step": 7365, + "time_per_iteration": 2.543498992919922 + }, + { + "auxiliary_loss_clip": 0.01091738, + "auxiliary_loss_mlp": 0.0103745, + "balance_loss_clip": 1.04053426, + "balance_loss_mlp": 1.02364588, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 1.5999579630264356, + "language_loss": 0.74163628, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76292813, + "num_input_tokens_seen": 157964295, + "step": 7366, + "time_per_iteration": 2.5248794555664062 + }, + { + "auxiliary_loss_clip": 0.01101432, + "auxiliary_loss_mlp": 0.010357, + "balance_loss_clip": 1.0447402, + "balance_loss_mlp": 1.02165687, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 2.343299600941484, + "language_loss": 0.73612726, + "learning_rate": 2.46254397374245e-06, + "loss": 0.7574985, + "num_input_tokens_seen": 157983970, + "step": 7367, + "time_per_iteration": 2.5513699054718018 + }, + { + "auxiliary_loss_clip": 0.01120895, + "auxiliary_loss_mlp": 0.01036631, + "balance_loss_clip": 1.04259396, + "balance_loss_mlp": 1.0236311, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.3953483112588263, + "language_loss": 0.73822284, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.75979805, + "num_input_tokens_seen": 158006515, + "step": 7368, + "time_per_iteration": 2.545046806335449 + }, + { + "auxiliary_loss_clip": 0.01092595, + "auxiliary_loss_mlp": 0.01034556, + "balance_loss_clip": 1.03986049, + "balance_loss_mlp": 1.02146721, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 4.349084119073061, + "language_loss": 0.79945552, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82072699, + "num_input_tokens_seen": 158025565, + "step": 7369, + "time_per_iteration": 2.5254852771759033 + }, + { + "auxiliary_loss_clip": 0.01083584, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.04050994, + "balance_loss_mlp": 1.0200628, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 1.9071217829039973, + "language_loss": 0.72184193, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74300712, + "num_input_tokens_seen": 158045620, + "step": 7370, + "time_per_iteration": 2.589053153991699 + }, + { + "auxiliary_loss_clip": 0.01119854, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.04219234, + "balance_loss_mlp": 1.01880765, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.9595335993761063, + "language_loss": 0.70515865, + "learning_rate": 2.461028221425126e-06, + "loss": 0.72667789, + "num_input_tokens_seen": 158063505, + "step": 7371, + "time_per_iteration": 2.456566572189331 + }, + { + "auxiliary_loss_clip": 0.01107272, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.04057467, + "balance_loss_mlp": 1.01831877, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.149548079615021, + "language_loss": 0.68605781, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70743614, + "num_input_tokens_seen": 158080335, + "step": 7372, + "time_per_iteration": 2.503039598464966 + }, + { + "auxiliary_loss_clip": 0.01087745, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.03900802, + "balance_loss_mlp": 1.01823437, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 2.4681760221047218, + "language_loss": 0.8366735, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.85787463, + "num_input_tokens_seen": 158098955, + "step": 7373, + "time_per_iteration": 3.9084439277648926 + }, + { + "auxiliary_loss_clip": 0.0103581, + "auxiliary_loss_mlp": 0.01002274, + "balance_loss_clip": 1.02017951, + "balance_loss_mlp": 1.00048578, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.7534798885330452, + "language_loss": 0.55265146, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57303226, + "num_input_tokens_seen": 158164110, + "step": 7374, + "time_per_iteration": 4.526760101318359 + }, + { + "auxiliary_loss_clip": 0.01071479, + "auxiliary_loss_mlp": 0.01041405, + "balance_loss_clip": 1.04439151, + "balance_loss_mlp": 1.02739179, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 2.3243722632459125, + "language_loss": 0.82543015, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.84655905, + "num_input_tokens_seen": 158179850, + "step": 7375, + "time_per_iteration": 2.538583517074585 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01032179, + "balance_loss_clip": 1.04297042, + "balance_loss_mlp": 1.01898253, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 1.86450824693346, + "language_loss": 0.83944428, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.8609876, + "num_input_tokens_seen": 158196590, + "step": 7376, + "time_per_iteration": 2.4468259811401367 + }, + { + "auxiliary_loss_clip": 0.0109885, + "auxiliary_loss_mlp": 0.01032809, + "balance_loss_clip": 1.04338098, + "balance_loss_mlp": 1.01997614, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.8186672537306472, + "language_loss": 0.77104115, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79235774, + "num_input_tokens_seen": 158216355, + "step": 7377, + "time_per_iteration": 2.525489091873169 + }, + { + "auxiliary_loss_clip": 0.01102739, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.04418039, + "balance_loss_mlp": 1.01714051, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 1.7920790697773181, + "language_loss": 0.75225317, + "learning_rate": 2.458374982357057e-06, + "loss": 0.77357876, + "num_input_tokens_seen": 158235825, + "step": 7378, + "time_per_iteration": 3.890754222869873 + }, + { + "auxiliary_loss_clip": 0.01095166, + "auxiliary_loss_mlp": 0.01052613, + "balance_loss_clip": 1.04030919, + "balance_loss_mlp": 1.03775954, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 1.7870222650248118, + "language_loss": 0.68731141, + "learning_rate": 2.457995878562982e-06, + "loss": 0.70878923, + "num_input_tokens_seen": 158254230, + "step": 7379, + "time_per_iteration": 3.9493415355682373 + }, + { + "auxiliary_loss_clip": 0.01058305, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.03815198, + "balance_loss_mlp": 1.02335763, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 1.656922489830327, + "language_loss": 0.73111248, + "learning_rate": 2.457616757401656e-06, + "loss": 0.75208533, + "num_input_tokens_seen": 158273400, + "step": 7380, + "time_per_iteration": 2.602708101272583 + }, + { + "auxiliary_loss_clip": 0.01100841, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.04307294, + "balance_loss_mlp": 1.02113712, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.5046368819454756, + "language_loss": 0.64296812, + "learning_rate": 2.457237618887458e-06, + "loss": 0.66431952, + "num_input_tokens_seen": 158296840, + "step": 7381, + "time_per_iteration": 2.64060640335083 + }, + { + "auxiliary_loss_clip": 0.01110219, + "auxiliary_loss_mlp": 0.01034429, + "balance_loss_clip": 1.04323483, + "balance_loss_mlp": 1.02090418, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.073694259336502, + "language_loss": 0.7970888, + "learning_rate": 2.456858463034763e-06, + "loss": 0.81853527, + "num_input_tokens_seen": 158314935, + "step": 7382, + "time_per_iteration": 2.4656431674957275 + }, + { + "auxiliary_loss_clip": 0.01115497, + "auxiliary_loss_mlp": 0.01040916, + "balance_loss_clip": 1.04515433, + "balance_loss_mlp": 1.0277555, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 1.6471541169655348, + "language_loss": 0.65385628, + "learning_rate": 2.456479289857949e-06, + "loss": 0.6754204, + "num_input_tokens_seen": 158334620, + "step": 7383, + "time_per_iteration": 2.5567376613616943 + }, + { + "auxiliary_loss_clip": 0.01100968, + "auxiliary_loss_mlp": 0.01035326, + "balance_loss_clip": 1.04497123, + "balance_loss_mlp": 1.02118218, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 2.0833132001745103, + "language_loss": 0.76047927, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.78184223, + "num_input_tokens_seen": 158350550, + "step": 7384, + "time_per_iteration": 2.4870405197143555 + }, + { + "auxiliary_loss_clip": 0.01124874, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.0448823, + "balance_loss_mlp": 1.01741743, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.8453910164114724, + "language_loss": 0.81044638, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83200121, + "num_input_tokens_seen": 158369555, + "step": 7385, + "time_per_iteration": 2.4588024616241455 + }, + { + "auxiliary_loss_clip": 0.0107375, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.038908, + "balance_loss_mlp": 1.02027917, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.7568600555033005, + "language_loss": 0.81664991, + "learning_rate": 2.455341666526582e-06, + "loss": 0.83773488, + "num_input_tokens_seen": 158388045, + "step": 7386, + "time_per_iteration": 2.565333366394043 + }, + { + "auxiliary_loss_clip": 0.01081995, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.04043019, + "balance_loss_mlp": 1.02460265, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.7789878058468669, + "language_loss": 0.69741619, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.71862924, + "num_input_tokens_seen": 158410115, + "step": 7387, + "time_per_iteration": 2.702714443206787 + }, + { + "auxiliary_loss_clip": 0.01054077, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.04028678, + "balance_loss_mlp": 1.0243628, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 1.993022994028245, + "language_loss": 0.72460485, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.7455281, + "num_input_tokens_seen": 158427765, + "step": 7388, + "time_per_iteration": 2.575754404067993 + }, + { + "auxiliary_loss_clip": 0.01110102, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.04152155, + "balance_loss_mlp": 1.01860857, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.6065197189641502, + "language_loss": 0.69025224, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.71167612, + "num_input_tokens_seen": 158446375, + "step": 7389, + "time_per_iteration": 2.5158185958862305 + }, + { + "auxiliary_loss_clip": 0.01110352, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.0422411, + "balance_loss_mlp": 1.01907194, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 1.6510287068028187, + "language_loss": 0.74583483, + "learning_rate": 2.453824593752788e-06, + "loss": 0.76725668, + "num_input_tokens_seen": 158467260, + "step": 7390, + "time_per_iteration": 2.651491403579712 + }, + { + "auxiliary_loss_clip": 0.01102637, + "auxiliary_loss_mlp": 0.01038086, + "balance_loss_clip": 1.03918755, + "balance_loss_mlp": 1.02349496, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 2.0094964345929798, + "language_loss": 0.81470346, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83611077, + "num_input_tokens_seen": 158486720, + "step": 7391, + "time_per_iteration": 2.5151851177215576 + }, + { + "auxiliary_loss_clip": 0.01092809, + "auxiliary_loss_mlp": 0.01038497, + "balance_loss_clip": 1.04597723, + "balance_loss_mlp": 1.02473402, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.6491910526200175, + "language_loss": 0.73424315, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.75555629, + "num_input_tokens_seen": 158502530, + "step": 7392, + "time_per_iteration": 2.5465571880340576 + }, + { + "auxiliary_loss_clip": 0.01107437, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.04082692, + "balance_loss_mlp": 1.01664364, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 1.5639215556325483, + "language_loss": 0.79372394, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81509268, + "num_input_tokens_seen": 158522715, + "step": 7393, + "time_per_iteration": 2.5820066928863525 + }, + { + "auxiliary_loss_clip": 0.01113114, + "auxiliary_loss_mlp": 0.01034856, + "balance_loss_clip": 1.04216862, + "balance_loss_mlp": 1.02108729, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 1.909140796152854, + "language_loss": 0.80475283, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.82623255, + "num_input_tokens_seen": 158543615, + "step": 7394, + "time_per_iteration": 2.564201831817627 + }, + { + "auxiliary_loss_clip": 0.01093837, + "auxiliary_loss_mlp": 0.01039095, + "balance_loss_clip": 1.03972316, + "balance_loss_mlp": 1.02661431, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 4.343345611093114, + "language_loss": 0.7948432, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.8161726, + "num_input_tokens_seen": 158560330, + "step": 7395, + "time_per_iteration": 2.490449905395508 + }, + { + "auxiliary_loss_clip": 0.01094983, + "auxiliary_loss_mlp": 0.01036902, + "balance_loss_clip": 1.04043508, + "balance_loss_mlp": 1.02419996, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 1.7031303663128772, + "language_loss": 0.68336976, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70468867, + "num_input_tokens_seen": 158579735, + "step": 7396, + "time_per_iteration": 2.559892416000366 + }, + { + "auxiliary_loss_clip": 0.01106334, + "auxiliary_loss_mlp": 0.0079152, + "balance_loss_clip": 1.04057491, + "balance_loss_mlp": 1.01857686, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 1.7570315564215924, + "language_loss": 0.80708921, + "learning_rate": 2.451169054403126e-06, + "loss": 0.82606769, + "num_input_tokens_seen": 158597075, + "step": 7397, + "time_per_iteration": 2.50197172164917 + }, + { + "auxiliary_loss_clip": 0.01107463, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.04048896, + "balance_loss_mlp": 1.02211142, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.633641888073597, + "language_loss": 0.67625737, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69768071, + "num_input_tokens_seen": 158616650, + "step": 7398, + "time_per_iteration": 2.494442939758301 + }, + { + "auxiliary_loss_clip": 0.01087317, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.04102325, + "balance_loss_mlp": 1.02603602, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 2.3262350985889872, + "language_loss": 0.69622934, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71748817, + "num_input_tokens_seen": 158634515, + "step": 7399, + "time_per_iteration": 2.5390021800994873 + }, + { + "auxiliary_loss_clip": 0.01086216, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.03936291, + "balance_loss_mlp": 1.02239609, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 2.099410451625582, + "language_loss": 0.72376847, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.74497837, + "num_input_tokens_seen": 158653760, + "step": 7400, + "time_per_iteration": 2.543809413909912 + }, + { + "auxiliary_loss_clip": 0.0107277, + "auxiliary_loss_mlp": 0.0078921, + "balance_loss_clip": 1.04336214, + "balance_loss_mlp": 1.01648426, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 2.3392910319087727, + "language_loss": 0.84828854, + "learning_rate": 2.449651226645422e-06, + "loss": 0.86690831, + "num_input_tokens_seen": 158672190, + "step": 7401, + "time_per_iteration": 2.604703664779663 + }, + { + "auxiliary_loss_clip": 0.01091967, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.04000759, + "balance_loss_mlp": 1.02338433, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 1.545166587646256, + "language_loss": 0.83198184, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85325354, + "num_input_tokens_seen": 158694115, + "step": 7402, + "time_per_iteration": 2.5712177753448486 + }, + { + "auxiliary_loss_clip": 0.01099046, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.04404616, + "balance_loss_mlp": 1.01934016, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 1.708147841356983, + "language_loss": 0.76956093, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.790874, + "num_input_tokens_seen": 158711000, + "step": 7403, + "time_per_iteration": 2.5259180068969727 + }, + { + "auxiliary_loss_clip": 0.01028835, + "auxiliary_loss_mlp": 0.01007005, + "balance_loss_clip": 1.02806973, + "balance_loss_mlp": 1.00540733, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7431518282666976, + "language_loss": 0.60000384, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62036222, + "num_input_tokens_seen": 158769675, + "step": 7404, + "time_per_iteration": 3.1163673400878906 + }, + { + "auxiliary_loss_clip": 0.01095807, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.03983843, + "balance_loss_mlp": 1.02235627, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 1.7871058358776286, + "language_loss": 0.81944239, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.84077501, + "num_input_tokens_seen": 158788215, + "step": 7405, + "time_per_iteration": 2.5334198474884033 + }, + { + "auxiliary_loss_clip": 0.01093631, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.03947926, + "balance_loss_mlp": 1.01472187, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 2.2874055860558355, + "language_loss": 0.74988556, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77109754, + "num_input_tokens_seen": 158809090, + "step": 7406, + "time_per_iteration": 2.5324010848999023 + }, + { + "auxiliary_loss_clip": 0.01079687, + "auxiliary_loss_mlp": 0.01030029, + "balance_loss_clip": 1.03673828, + "balance_loss_mlp": 1.0174706, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.6667225714028913, + "language_loss": 0.6522187, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67331588, + "num_input_tokens_seen": 158828320, + "step": 7407, + "time_per_iteration": 2.629051685333252 + }, + { + "auxiliary_loss_clip": 0.01095012, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_clip": 1.04608774, + "balance_loss_mlp": 1.01952934, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.7194354529599825, + "language_loss": 0.68226975, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.70354635, + "num_input_tokens_seen": 158847040, + "step": 7408, + "time_per_iteration": 2.5417163372039795 + }, + { + "auxiliary_loss_clip": 0.01117665, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.03917861, + "balance_loss_mlp": 1.02018929, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 1.4811674914507897, + "language_loss": 0.72102529, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74254096, + "num_input_tokens_seen": 158870490, + "step": 7409, + "time_per_iteration": 2.6263303756713867 + }, + { + "auxiliary_loss_clip": 0.01096067, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.03983068, + "balance_loss_mlp": 1.02003908, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 1.8005081749006628, + "language_loss": 0.65371382, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67501509, + "num_input_tokens_seen": 158889920, + "step": 7410, + "time_per_iteration": 2.515620470046997 + }, + { + "auxiliary_loss_clip": 0.01096941, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.04130936, + "balance_loss_mlp": 1.01728225, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 1.9208992854065592, + "language_loss": 0.73997593, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76125824, + "num_input_tokens_seen": 158909580, + "step": 7411, + "time_per_iteration": 2.577432632446289 + }, + { + "auxiliary_loss_clip": 0.01053717, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.04172099, + "balance_loss_mlp": 1.01707292, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 1.7759344421222005, + "language_loss": 0.79032755, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81116229, + "num_input_tokens_seen": 158924600, + "step": 7412, + "time_per_iteration": 3.9864614009857178 + }, + { + "auxiliary_loss_clip": 0.01099523, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.03955472, + "balance_loss_mlp": 1.01945972, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 1.8479450670135789, + "language_loss": 0.80014396, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82146335, + "num_input_tokens_seen": 158939345, + "step": 7413, + "time_per_iteration": 3.8547914028167725 + }, + { + "auxiliary_loss_clip": 0.01105613, + "auxiliary_loss_mlp": 0.0102702, + "balance_loss_clip": 1.0403136, + "balance_loss_mlp": 1.0140022, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 1.8078941267235116, + "language_loss": 0.76546514, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.7867915, + "num_input_tokens_seen": 158955855, + "step": 7414, + "time_per_iteration": 2.4533703327178955 + }, + { + "auxiliary_loss_clip": 0.01089424, + "auxiliary_loss_mlp": 0.01038166, + "balance_loss_clip": 1.03976583, + "balance_loss_mlp": 1.02405143, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.726835541145688, + "language_loss": 0.83579743, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85707337, + "num_input_tokens_seen": 158976315, + "step": 7415, + "time_per_iteration": 2.5485339164733887 + }, + { + "auxiliary_loss_clip": 0.01115156, + "auxiliary_loss_mlp": 0.01038434, + "balance_loss_clip": 1.03852189, + "balance_loss_mlp": 1.0252254, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1.6724523862360705, + "language_loss": 0.84123683, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86277276, + "num_input_tokens_seen": 158996725, + "step": 7416, + "time_per_iteration": 2.4750187397003174 + }, + { + "auxiliary_loss_clip": 0.01084669, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.04118276, + "balance_loss_mlp": 1.01701546, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 4.632257721662155, + "language_loss": 0.81347209, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83461994, + "num_input_tokens_seen": 159017255, + "step": 7417, + "time_per_iteration": 3.9501397609710693 + }, + { + "auxiliary_loss_clip": 0.01098307, + "auxiliary_loss_mlp": 0.01040538, + "balance_loss_clip": 1.03979218, + "balance_loss_mlp": 1.02697766, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 1.9340637062076786, + "language_loss": 0.81122351, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83261192, + "num_input_tokens_seen": 159035010, + "step": 7418, + "time_per_iteration": 3.8855233192443848 + }, + { + "auxiliary_loss_clip": 0.01107706, + "auxiliary_loss_mlp": 0.00787007, + "balance_loss_clip": 1.03898573, + "balance_loss_mlp": 1.01169086, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 1.5359206590581067, + "language_loss": 0.77144349, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79039067, + "num_input_tokens_seen": 159055345, + "step": 7419, + "time_per_iteration": 2.51133131980896 + }, + { + "auxiliary_loss_clip": 0.01085732, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.03909302, + "balance_loss_mlp": 1.02230108, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.5523187550904443, + "language_loss": 0.72370851, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74491441, + "num_input_tokens_seen": 159074225, + "step": 7420, + "time_per_iteration": 2.5188379287719727 + }, + { + "auxiliary_loss_clip": 0.01095908, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.03897285, + "balance_loss_mlp": 1.01828218, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.6653675043305245, + "language_loss": 0.7459991, + "learning_rate": 2.442058014084156e-06, + "loss": 0.76728141, + "num_input_tokens_seen": 159095415, + "step": 7421, + "time_per_iteration": 2.5413825511932373 + }, + { + "auxiliary_loss_clip": 0.01053929, + "auxiliary_loss_mlp": 0.01035933, + "balance_loss_clip": 1.03758562, + "balance_loss_mlp": 1.0224508, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 1.8123598304337882, + "language_loss": 0.75934547, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78024411, + "num_input_tokens_seen": 159114615, + "step": 7422, + "time_per_iteration": 2.586775541305542 + }, + { + "auxiliary_loss_clip": 0.01118812, + "auxiliary_loss_mlp": 0.01030441, + "balance_loss_clip": 1.04219091, + "balance_loss_mlp": 1.01778126, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.5470905565911495, + "language_loss": 0.65211606, + "learning_rate": 2.441298322143784e-06, + "loss": 0.67360866, + "num_input_tokens_seen": 159134370, + "step": 7423, + "time_per_iteration": 2.4573192596435547 + }, + { + "auxiliary_loss_clip": 0.01092495, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.04254985, + "balance_loss_mlp": 1.01957726, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.422198181864022, + "language_loss": 0.79056275, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.81180066, + "num_input_tokens_seen": 159152540, + "step": 7424, + "time_per_iteration": 2.504376173019409 + }, + { + "auxiliary_loss_clip": 0.01103048, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.04210019, + "balance_loss_mlp": 1.01786685, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.42210487840812, + "language_loss": 0.80121565, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.8225469, + "num_input_tokens_seen": 159173425, + "step": 7425, + "time_per_iteration": 2.537569046020508 + }, + { + "auxiliary_loss_clip": 0.01105222, + "auxiliary_loss_mlp": 0.01028491, + "balance_loss_clip": 1.04013073, + "balance_loss_mlp": 1.0164212, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 1.4848317511871891, + "language_loss": 0.7725504, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.7938875, + "num_input_tokens_seen": 159191210, + "step": 7426, + "time_per_iteration": 2.4929213523864746 + }, + { + "auxiliary_loss_clip": 0.01089791, + "auxiliary_loss_mlp": 0.00786154, + "balance_loss_clip": 1.0381844, + "balance_loss_mlp": 1.00958741, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.5682584008667182, + "language_loss": 0.64415956, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66291904, + "num_input_tokens_seen": 159211755, + "step": 7427, + "time_per_iteration": 2.613039016723633 + }, + { + "auxiliary_loss_clip": 0.0111285, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.04468906, + "balance_loss_mlp": 1.01978159, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 2.008674076730728, + "language_loss": 0.75627744, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77773333, + "num_input_tokens_seen": 159230315, + "step": 7428, + "time_per_iteration": 2.490018129348755 + }, + { + "auxiliary_loss_clip": 0.0108462, + "auxiliary_loss_mlp": 0.01032958, + "balance_loss_clip": 1.0385859, + "balance_loss_mlp": 1.02010155, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 1.7709711046132655, + "language_loss": 0.77910864, + "learning_rate": 2.439018845165806e-06, + "loss": 0.8002845, + "num_input_tokens_seen": 159249810, + "step": 7429, + "time_per_iteration": 2.5291969776153564 + }, + { + "auxiliary_loss_clip": 0.01108108, + "auxiliary_loss_mlp": 0.01031513, + "balance_loss_clip": 1.04038763, + "balance_loss_mlp": 1.01760101, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 2.514145522006151, + "language_loss": 0.91172755, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93312371, + "num_input_tokens_seen": 159271715, + "step": 7430, + "time_per_iteration": 2.537958860397339 + }, + { + "auxiliary_loss_clip": 0.01097615, + "auxiliary_loss_mlp": 0.00789099, + "balance_loss_clip": 1.0413326, + "balance_loss_mlp": 1.01172304, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 1.597068142248159, + "language_loss": 0.79465067, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.81351781, + "num_input_tokens_seen": 159290690, + "step": 7431, + "time_per_iteration": 2.534229278564453 + }, + { + "auxiliary_loss_clip": 0.01099554, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.04109001, + "balance_loss_mlp": 1.02141738, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 1.7903861281663358, + "language_loss": 0.79553199, + "learning_rate": 2.437878881739204e-06, + "loss": 0.81687647, + "num_input_tokens_seen": 159309400, + "step": 7432, + "time_per_iteration": 2.509443759918213 + }, + { + "auxiliary_loss_clip": 0.01083633, + "auxiliary_loss_mlp": 0.01034475, + "balance_loss_clip": 1.03936327, + "balance_loss_mlp": 1.022089, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 1.7725576508021001, + "language_loss": 0.76376545, + "learning_rate": 2.437498860702301e-06, + "loss": 0.78494656, + "num_input_tokens_seen": 159327425, + "step": 7433, + "time_per_iteration": 2.5415995121002197 + }, + { + "auxiliary_loss_clip": 0.01098757, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.04148149, + "balance_loss_mlp": 1.01919317, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.6943480538579203, + "language_loss": 0.77176708, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79305393, + "num_input_tokens_seen": 159345805, + "step": 7434, + "time_per_iteration": 2.5465259552001953 + }, + { + "auxiliary_loss_clip": 0.01110062, + "auxiliary_loss_mlp": 0.01028553, + "balance_loss_clip": 1.04183161, + "balance_loss_mlp": 1.01555908, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 2.0620106160321874, + "language_loss": 0.64582145, + "learning_rate": 2.436738768872905e-06, + "loss": 0.6672076, + "num_input_tokens_seen": 159364595, + "step": 7435, + "time_per_iteration": 2.493406295776367 + }, + { + "auxiliary_loss_clip": 0.01100004, + "auxiliary_loss_mlp": 0.01029407, + "balance_loss_clip": 1.04089046, + "balance_loss_mlp": 1.0159843, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.7160011317363102, + "language_loss": 0.83686835, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.8581624, + "num_input_tokens_seen": 159385265, + "step": 7436, + "time_per_iteration": 2.5529539585113525 + }, + { + "auxiliary_loss_clip": 0.01069151, + "auxiliary_loss_mlp": 0.01049936, + "balance_loss_clip": 1.03913295, + "balance_loss_mlp": 1.03291273, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 1.7592174959421958, + "language_loss": 0.79477149, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81596237, + "num_input_tokens_seen": 159405080, + "step": 7437, + "time_per_iteration": 2.5765185356140137 + }, + { + "auxiliary_loss_clip": 0.01074174, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.04423571, + "balance_loss_mlp": 1.02106929, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.5765958624701573, + "language_loss": 0.71908504, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74016678, + "num_input_tokens_seen": 159424595, + "step": 7438, + "time_per_iteration": 2.614311695098877 + }, + { + "auxiliary_loss_clip": 0.01083953, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.04163742, + "balance_loss_mlp": 1.02293086, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 1.6042391542388204, + "language_loss": 0.6713571, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.69257253, + "num_input_tokens_seen": 159443865, + "step": 7439, + "time_per_iteration": 2.63220477104187 + }, + { + "auxiliary_loss_clip": 0.01095266, + "auxiliary_loss_mlp": 0.01039255, + "balance_loss_clip": 1.03786087, + "balance_loss_mlp": 1.02393019, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.8831113160536335, + "language_loss": 0.73817587, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.75952113, + "num_input_tokens_seen": 159464525, + "step": 7440, + "time_per_iteration": 2.548387050628662 + }, + { + "auxiliary_loss_clip": 0.01066938, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.03527236, + "balance_loss_mlp": 1.03047156, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 1.680109608425036, + "language_loss": 0.74019319, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76131868, + "num_input_tokens_seen": 159486385, + "step": 7441, + "time_per_iteration": 2.622896909713745 + }, + { + "auxiliary_loss_clip": 0.0108788, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.04468584, + "balance_loss_mlp": 1.02056956, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 1.8077807839840991, + "language_loss": 0.74847817, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.76970214, + "num_input_tokens_seen": 159503880, + "step": 7442, + "time_per_iteration": 2.565925121307373 + }, + { + "auxiliary_loss_clip": 0.01122346, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.04035389, + "balance_loss_mlp": 1.02097213, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 1.7341627429769624, + "language_loss": 0.74522817, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76680958, + "num_input_tokens_seen": 159522980, + "step": 7443, + "time_per_iteration": 2.5312509536743164 + }, + { + "auxiliary_loss_clip": 0.01094322, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.03604805, + "balance_loss_mlp": 1.01800478, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 1.6553949656962152, + "language_loss": 0.77939546, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.80066943, + "num_input_tokens_seen": 159543340, + "step": 7444, + "time_per_iteration": 2.5193114280700684 + }, + { + "auxiliary_loss_clip": 0.01100731, + "auxiliary_loss_mlp": 0.01034158, + "balance_loss_clip": 1.04064274, + "balance_loss_mlp": 1.02062154, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 2.273572040242331, + "language_loss": 0.84874004, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87008893, + "num_input_tokens_seen": 159558210, + "step": 7445, + "time_per_iteration": 2.458216667175293 + }, + { + "auxiliary_loss_clip": 0.01083787, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.03954816, + "balance_loss_mlp": 1.02600765, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 2.1808979765518313, + "language_loss": 0.64138019, + "learning_rate": 2.432557082778765e-06, + "loss": 0.66265297, + "num_input_tokens_seen": 159577920, + "step": 7446, + "time_per_iteration": 2.550922155380249 + }, + { + "auxiliary_loss_clip": 0.01036656, + "auxiliary_loss_mlp": 0.01000138, + "balance_loss_clip": 1.02048635, + "balance_loss_mlp": 0.99792045, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.7396427118422974, + "language_loss": 0.50242054, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52278841, + "num_input_tokens_seen": 159632295, + "step": 7447, + "time_per_iteration": 2.962052583694458 + }, + { + "auxiliary_loss_clip": 0.01044446, + "auxiliary_loss_mlp": 0.01002953, + "balance_loss_clip": 1.01748574, + "balance_loss_mlp": 1.00084329, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7769494042661208, + "language_loss": 0.593665, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61413902, + "num_input_tokens_seen": 159698435, + "step": 7448, + "time_per_iteration": 3.1229686737060547 + }, + { + "auxiliary_loss_clip": 0.01086581, + "auxiliary_loss_mlp": 0.01033774, + "balance_loss_clip": 1.03974962, + "balance_loss_mlp": 1.02071476, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.646677942202416, + "language_loss": 0.58944619, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61064976, + "num_input_tokens_seen": 159722150, + "step": 7449, + "time_per_iteration": 2.7624213695526123 + }, + { + "auxiliary_loss_clip": 0.01090197, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.04050636, + "balance_loss_mlp": 1.01684451, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 1.7449380054488262, + "language_loss": 0.79866958, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.81987023, + "num_input_tokens_seen": 159740550, + "step": 7450, + "time_per_iteration": 3.9778456687927246 + }, + { + "auxiliary_loss_clip": 0.01119223, + "auxiliary_loss_mlp": 0.01042666, + "balance_loss_clip": 1.04125738, + "balance_loss_mlp": 1.02927303, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 2.3212304750461743, + "language_loss": 0.7958827, + "learning_rate": 2.430655659114697e-06, + "loss": 0.8175016, + "num_input_tokens_seen": 159758245, + "step": 7451, + "time_per_iteration": 3.8403098583221436 + }, + { + "auxiliary_loss_clip": 0.01010712, + "auxiliary_loss_mlp": 0.01002181, + "balance_loss_clip": 1.03050435, + "balance_loss_mlp": 1.00046396, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 0.8378029958482914, + "language_loss": 0.62868547, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64881438, + "num_input_tokens_seen": 159826790, + "step": 7452, + "time_per_iteration": 3.281829833984375 + }, + { + "auxiliary_loss_clip": 0.01122056, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.04267943, + "balance_loss_mlp": 1.01933825, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 1.8934888103971135, + "language_loss": 0.62768275, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64924002, + "num_input_tokens_seen": 159845805, + "step": 7453, + "time_per_iteration": 2.458674192428589 + }, + { + "auxiliary_loss_clip": 0.01036726, + "auxiliary_loss_mlp": 0.01002061, + "balance_loss_clip": 1.01874316, + "balance_loss_mlp": 1.00009441, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7915424986533482, + "language_loss": 0.57079083, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59117872, + "num_input_tokens_seen": 159898860, + "step": 7454, + "time_per_iteration": 2.9776217937469482 + }, + { + "auxiliary_loss_clip": 0.01092665, + "auxiliary_loss_mlp": 0.01040223, + "balance_loss_clip": 1.03712249, + "balance_loss_mlp": 1.02592945, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.1182682221340015, + "language_loss": 0.74760842, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.76893723, + "num_input_tokens_seen": 159911555, + "step": 7455, + "time_per_iteration": 2.4634969234466553 + }, + { + "auxiliary_loss_clip": 0.01096517, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.04014063, + "balance_loss_mlp": 1.02204418, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.7187053018502862, + "language_loss": 0.75731248, + "learning_rate": 2.428753827188016e-06, + "loss": 0.77862501, + "num_input_tokens_seen": 159931470, + "step": 7456, + "time_per_iteration": 5.374706506729126 + }, + { + "auxiliary_loss_clip": 0.01122152, + "auxiliary_loss_mlp": 0.0103624, + "balance_loss_clip": 1.04475796, + "balance_loss_mlp": 1.02317476, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 2.451813000481833, + "language_loss": 0.76448894, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78607297, + "num_input_tokens_seen": 159946115, + "step": 7457, + "time_per_iteration": 2.45881724357605 + }, + { + "auxiliary_loss_clip": 0.01107207, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.0411675, + "balance_loss_mlp": 1.02086377, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 1.9948400991335697, + "language_loss": 0.68029451, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.70172322, + "num_input_tokens_seen": 159963915, + "step": 7458, + "time_per_iteration": 2.464815139770508 + }, + { + "auxiliary_loss_clip": 0.01083854, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.04222274, + "balance_loss_mlp": 1.02030063, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.9062169185049425, + "language_loss": 0.71811426, + "learning_rate": 2.427612532815961e-06, + "loss": 0.73929942, + "num_input_tokens_seen": 159982140, + "step": 7459, + "time_per_iteration": 2.5825932025909424 + }, + { + "auxiliary_loss_clip": 0.01101382, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.03849745, + "balance_loss_mlp": 1.02139306, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 1.5429270203410197, + "language_loss": 0.695104, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71647686, + "num_input_tokens_seen": 160002280, + "step": 7460, + "time_per_iteration": 2.4890499114990234 + }, + { + "auxiliary_loss_clip": 0.01119193, + "auxiliary_loss_mlp": 0.01035831, + "balance_loss_clip": 1.04026842, + "balance_loss_mlp": 1.02229536, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 1.7863404508744487, + "language_loss": 0.76794553, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.78949577, + "num_input_tokens_seen": 160020260, + "step": 7461, + "time_per_iteration": 2.470203399658203 + }, + { + "auxiliary_loss_clip": 0.01119589, + "auxiliary_loss_mlp": 0.01039295, + "balance_loss_clip": 1.039621, + "balance_loss_mlp": 1.026003, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.7809043995170295, + "language_loss": 0.67769396, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.69928282, + "num_input_tokens_seen": 160040240, + "step": 7462, + "time_per_iteration": 2.5207231044769287 + }, + { + "auxiliary_loss_clip": 0.01040115, + "auxiliary_loss_mlp": 0.01008634, + "balance_loss_clip": 1.0139997, + "balance_loss_mlp": 1.00696516, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7478405465364303, + "language_loss": 0.54416436, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56465185, + "num_input_tokens_seen": 160093865, + "step": 7463, + "time_per_iteration": 3.1067306995391846 + }, + { + "auxiliary_loss_clip": 0.01109545, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.04103279, + "balance_loss_mlp": 1.0174005, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 2.7229290716490095, + "language_loss": 0.75747603, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.77887952, + "num_input_tokens_seen": 160113590, + "step": 7464, + "time_per_iteration": 2.5568926334381104 + }, + { + "auxiliary_loss_clip": 0.01106957, + "auxiliary_loss_mlp": 0.01033391, + "balance_loss_clip": 1.04170692, + "balance_loss_mlp": 1.02176845, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.7728280577331545, + "language_loss": 0.74107951, + "learning_rate": 2.425329506653441e-06, + "loss": 0.762483, + "num_input_tokens_seen": 160131795, + "step": 7465, + "time_per_iteration": 2.4462995529174805 + }, + { + "auxiliary_loss_clip": 0.011089, + "auxiliary_loss_mlp": 0.01042363, + "balance_loss_clip": 1.0481019, + "balance_loss_mlp": 1.02694917, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 2.0001391552874077, + "language_loss": 0.8027159, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82422853, + "num_input_tokens_seen": 160150635, + "step": 7466, + "time_per_iteration": 2.549793243408203 + }, + { + "auxiliary_loss_clip": 0.01098831, + "auxiliary_loss_mlp": 0.01034173, + "balance_loss_clip": 1.0408392, + "balance_loss_mlp": 1.02033865, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 2.4774837534927885, + "language_loss": 0.80069566, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82202572, + "num_input_tokens_seen": 160168615, + "step": 7467, + "time_per_iteration": 2.4681432247161865 + }, + { + "auxiliary_loss_clip": 0.01068258, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.04233718, + "balance_loss_mlp": 1.02067196, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 1.7044031511042466, + "language_loss": 0.75294113, + "learning_rate": 2.424187775642129e-06, + "loss": 0.77395451, + "num_input_tokens_seen": 160187295, + "step": 7468, + "time_per_iteration": 2.571585178375244 + }, + { + "auxiliary_loss_clip": 0.01088927, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.03822994, + "balance_loss_mlp": 1.0192616, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 6.109489524242912, + "language_loss": 0.70895469, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.73015857, + "num_input_tokens_seen": 160205115, + "step": 7469, + "time_per_iteration": 2.5202560424804688 + }, + { + "auxiliary_loss_clip": 0.01106402, + "auxiliary_loss_mlp": 0.01035682, + "balance_loss_clip": 1.04301965, + "balance_loss_mlp": 1.02222979, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 1.7288823165940888, + "language_loss": 0.72220266, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.7436235, + "num_input_tokens_seen": 160222580, + "step": 7470, + "time_per_iteration": 2.478724479675293 + }, + { + "auxiliary_loss_clip": 0.01078572, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.03886759, + "balance_loss_mlp": 1.02201843, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 2.128938025560052, + "language_loss": 0.7691859, + "learning_rate": 2.423045899863634e-06, + "loss": 0.79033595, + "num_input_tokens_seen": 160241520, + "step": 7471, + "time_per_iteration": 2.5516316890716553 + }, + { + "auxiliary_loss_clip": 0.01119139, + "auxiliary_loss_mlp": 0.01036289, + "balance_loss_clip": 1.04134977, + "balance_loss_mlp": 1.02337885, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.7221060419447058, + "language_loss": 0.70381987, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72537416, + "num_input_tokens_seen": 160261815, + "step": 7472, + "time_per_iteration": 2.491719961166382 + }, + { + "auxiliary_loss_clip": 0.01033797, + "auxiliary_loss_mlp": 0.01006363, + "balance_loss_clip": 1.01732326, + "balance_loss_mlp": 1.00486088, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7395344164586624, + "language_loss": 0.61709046, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63749206, + "num_input_tokens_seen": 160317070, + "step": 7473, + "time_per_iteration": 3.0657989978790283 + }, + { + "auxiliary_loss_clip": 0.01119872, + "auxiliary_loss_mlp": 0.00788652, + "balance_loss_clip": 1.04126847, + "balance_loss_mlp": 1.01353598, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 4.308204085421883, + "language_loss": 0.77862155, + "learning_rate": 2.421903879707657e-06, + "loss": 0.79770678, + "num_input_tokens_seen": 160334980, + "step": 7474, + "time_per_iteration": 2.442902088165283 + }, + { + "auxiliary_loss_clip": 0.01076731, + "auxiliary_loss_mlp": 0.01040008, + "balance_loss_clip": 1.04098356, + "balance_loss_mlp": 1.02591181, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.6535241718371663, + "language_loss": 0.72148156, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74264896, + "num_input_tokens_seen": 160354500, + "step": 7475, + "time_per_iteration": 2.5746469497680664 + }, + { + "auxiliary_loss_clip": 0.01071706, + "auxiliary_loss_mlp": 0.0103666, + "balance_loss_clip": 1.03923965, + "balance_loss_mlp": 1.02244401, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 1.8705457211832148, + "language_loss": 0.77214348, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.79322708, + "num_input_tokens_seen": 160373650, + "step": 7476, + "time_per_iteration": 2.6316397190093994 + }, + { + "auxiliary_loss_clip": 0.0111338, + "auxiliary_loss_mlp": 0.00791896, + "balance_loss_clip": 1.04155481, + "balance_loss_mlp": 1.01589036, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 2.229506384022198, + "language_loss": 0.72138512, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.74043787, + "num_input_tokens_seen": 160393430, + "step": 7477, + "time_per_iteration": 2.4935190677642822 + }, + { + "auxiliary_loss_clip": 0.01096664, + "auxiliary_loss_mlp": 0.01040295, + "balance_loss_clip": 1.03879094, + "balance_loss_mlp": 1.02526259, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 2.4739339032451433, + "language_loss": 0.68029106, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.70166063, + "num_input_tokens_seen": 160410545, + "step": 7478, + "time_per_iteration": 2.488309621810913 + }, + { + "auxiliary_loss_clip": 0.01092842, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.04334641, + "balance_loss_mlp": 1.02194726, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 1.8749234175382625, + "language_loss": 0.89341116, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91468227, + "num_input_tokens_seen": 160428105, + "step": 7479, + "time_per_iteration": 2.5018234252929688 + }, + { + "auxiliary_loss_clip": 0.01067555, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_clip": 1.04417574, + "balance_loss_mlp": 1.02324021, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 1.7576030506875309, + "language_loss": 0.75431848, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77537167, + "num_input_tokens_seen": 160448815, + "step": 7480, + "time_per_iteration": 2.608046531677246 + }, + { + "auxiliary_loss_clip": 0.01087534, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.04003787, + "balance_loss_mlp": 1.02046037, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 2.1611678168009516, + "language_loss": 0.79406643, + "learning_rate": 2.419238606731815e-06, + "loss": 0.8152889, + "num_input_tokens_seen": 160465940, + "step": 7481, + "time_per_iteration": 2.5168232917785645 + }, + { + "auxiliary_loss_clip": 0.01094823, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_clip": 1.04051697, + "balance_loss_mlp": 1.01870728, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.6457344749497105, + "language_loss": 0.68493736, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70621353, + "num_input_tokens_seen": 160486710, + "step": 7482, + "time_per_iteration": 2.613288164138794 + }, + { + "auxiliary_loss_clip": 0.01112768, + "auxiliary_loss_mlp": 0.01041946, + "balance_loss_clip": 1.04410338, + "balance_loss_mlp": 1.02814198, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.328342859930525, + "language_loss": 0.84898472, + "learning_rate": 2.418476956872571e-06, + "loss": 0.87053186, + "num_input_tokens_seen": 160503405, + "step": 7483, + "time_per_iteration": 2.4353723526000977 + }, + { + "auxiliary_loss_clip": 0.010922, + "auxiliary_loss_mlp": 0.01039036, + "balance_loss_clip": 1.0427084, + "balance_loss_mlp": 1.02442718, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.6508699740926254, + "language_loss": 0.80455148, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.8258639, + "num_input_tokens_seen": 160525080, + "step": 7484, + "time_per_iteration": 2.5738296508789062 + }, + { + "auxiliary_loss_clip": 0.01071435, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.04156017, + "balance_loss_mlp": 1.01557827, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 3.067625360631521, + "language_loss": 0.7501753, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.77119648, + "num_input_tokens_seen": 160540895, + "step": 7485, + "time_per_iteration": 2.5476644039154053 + }, + { + "auxiliary_loss_clip": 0.01029676, + "auxiliary_loss_mlp": 0.01003487, + "balance_loss_clip": 1.02202952, + "balance_loss_mlp": 1.00182998, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.8104522579117598, + "language_loss": 0.5866369, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60696852, + "num_input_tokens_seen": 160598270, + "step": 7486, + "time_per_iteration": 3.140064239501953 + }, + { + "auxiliary_loss_clip": 0.01104297, + "auxiliary_loss_mlp": 0.01038423, + "balance_loss_clip": 1.03971446, + "balance_loss_mlp": 1.02370667, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 1.9863082933087652, + "language_loss": 0.83193159, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85335875, + "num_input_tokens_seen": 160614720, + "step": 7487, + "time_per_iteration": 2.433023691177368 + }, + { + "auxiliary_loss_clip": 0.0111887, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.04187942, + "balance_loss_mlp": 1.01881707, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.5213904540863725, + "language_loss": 0.77486223, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79637623, + "num_input_tokens_seen": 160635170, + "step": 7488, + "time_per_iteration": 2.5049641132354736 + }, + { + "auxiliary_loss_clip": 0.01119658, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.04513001, + "balance_loss_mlp": 1.02030993, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 2.060698080381875, + "language_loss": 0.71967602, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.74122036, + "num_input_tokens_seen": 160654490, + "step": 7489, + "time_per_iteration": 3.967219114303589 + }, + { + "auxiliary_loss_clip": 0.01103419, + "auxiliary_loss_mlp": 0.01035264, + "balance_loss_clip": 1.04631448, + "balance_loss_mlp": 1.01985013, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.1485720870561438, + "language_loss": 0.69158375, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71297061, + "num_input_tokens_seen": 160669400, + "step": 7490, + "time_per_iteration": 3.9282257556915283 + }, + { + "auxiliary_loss_clip": 0.01025426, + "auxiliary_loss_mlp": 0.01009842, + "balance_loss_clip": 1.02566445, + "balance_loss_mlp": 1.00826859, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 0.7702191598155552, + "language_loss": 0.56740868, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58776134, + "num_input_tokens_seen": 160733820, + "step": 7491, + "time_per_iteration": 3.0926125049591064 + }, + { + "auxiliary_loss_clip": 0.01104112, + "auxiliary_loss_mlp": 0.01028262, + "balance_loss_clip": 1.044554, + "balance_loss_mlp": 1.01510119, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 1.7178820345370165, + "language_loss": 0.79525363, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81657743, + "num_input_tokens_seen": 160753175, + "step": 7492, + "time_per_iteration": 2.52463436126709 + }, + { + "auxiliary_loss_clip": 0.01089897, + "auxiliary_loss_mlp": 0.00792965, + "balance_loss_clip": 1.04204929, + "balance_loss_mlp": 1.01445484, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.3949937946844173, + "language_loss": 0.9296087, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.94843733, + "num_input_tokens_seen": 160768310, + "step": 7493, + "time_per_iteration": 2.514392852783203 + }, + { + "auxiliary_loss_clip": 0.01034166, + "auxiliary_loss_mlp": 0.01002672, + "balance_loss_clip": 1.01782846, + "balance_loss_mlp": 1.00103903, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.7967236880125421, + "language_loss": 0.62852103, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64888936, + "num_input_tokens_seen": 160827370, + "step": 7494, + "time_per_iteration": 4.554858207702637 + }, + { + "auxiliary_loss_clip": 0.01119423, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.04248643, + "balance_loss_mlp": 1.01628661, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.9790247069796794, + "language_loss": 0.82277077, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.84425724, + "num_input_tokens_seen": 160849140, + "step": 7495, + "time_per_iteration": 3.8982043266296387 + }, + { + "auxiliary_loss_clip": 0.01104936, + "auxiliary_loss_mlp": 0.0103463, + "balance_loss_clip": 1.04019189, + "balance_loss_mlp": 1.01922822, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.8698243006733988, + "language_loss": 0.85662889, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87802452, + "num_input_tokens_seen": 160871280, + "step": 7496, + "time_per_iteration": 2.6323671340942383 + }, + { + "auxiliary_loss_clip": 0.01121892, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.04297256, + "balance_loss_mlp": 1.0170238, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 2.222316084680107, + "language_loss": 0.76726997, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78879213, + "num_input_tokens_seen": 160888625, + "step": 7497, + "time_per_iteration": 2.4628498554229736 + }, + { + "auxiliary_loss_clip": 0.01088457, + "auxiliary_loss_mlp": 0.01032269, + "balance_loss_clip": 1.03957665, + "balance_loss_mlp": 1.01873875, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 1.9030259542056154, + "language_loss": 0.74904305, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77025032, + "num_input_tokens_seen": 160907040, + "step": 7498, + "time_per_iteration": 2.506969690322876 + }, + { + "auxiliary_loss_clip": 0.01120582, + "auxiliary_loss_mlp": 0.01040867, + "balance_loss_clip": 1.04171968, + "balance_loss_mlp": 1.02703834, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 1.9345944622319866, + "language_loss": 0.70223612, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72385061, + "num_input_tokens_seen": 160927115, + "step": 7499, + "time_per_iteration": 2.4582290649414062 + }, + { + "auxiliary_loss_clip": 0.01078381, + "auxiliary_loss_mlp": 0.01033924, + "balance_loss_clip": 1.04080057, + "balance_loss_mlp": 1.02008402, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 1.8952890024017854, + "language_loss": 0.77068257, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79180562, + "num_input_tokens_seen": 160944405, + "step": 7500, + "time_per_iteration": 2.5586631298065186 + }, + { + "auxiliary_loss_clip": 0.01076981, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.04256213, + "balance_loss_mlp": 1.01890874, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 1.7559589938866471, + "language_loss": 0.62458837, + "learning_rate": 2.411619265641992e-06, + "loss": 0.64567947, + "num_input_tokens_seen": 160961345, + "step": 7501, + "time_per_iteration": 2.566627264022827 + }, + { + "auxiliary_loss_clip": 0.01123118, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.04224491, + "balance_loss_mlp": 1.018031, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 2.019327563735736, + "language_loss": 0.84837198, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86992854, + "num_input_tokens_seen": 160977330, + "step": 7502, + "time_per_iteration": 2.4307401180267334 + }, + { + "auxiliary_loss_clip": 0.01094173, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.04083872, + "balance_loss_mlp": 1.02245927, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.4200880764343613, + "language_loss": 0.79928786, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.82058513, + "num_input_tokens_seen": 160997280, + "step": 7503, + "time_per_iteration": 2.52477765083313 + }, + { + "auxiliary_loss_clip": 0.01101403, + "auxiliary_loss_mlp": 0.01033581, + "balance_loss_clip": 1.04441881, + "balance_loss_mlp": 1.01978815, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 1.789525204578375, + "language_loss": 0.81026232, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83161217, + "num_input_tokens_seen": 161014235, + "step": 7504, + "time_per_iteration": 2.4716033935546875 + }, + { + "auxiliary_loss_clip": 0.01086658, + "auxiliary_loss_mlp": 0.01033315, + "balance_loss_clip": 1.03950596, + "balance_loss_mlp": 1.0203985, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 1.7462433425114166, + "language_loss": 0.63452345, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.65572315, + "num_input_tokens_seen": 161032360, + "step": 7505, + "time_per_iteration": 2.5522115230560303 + }, + { + "auxiliary_loss_clip": 0.0101216, + "auxiliary_loss_mlp": 0.01004, + "balance_loss_clip": 1.02224612, + "balance_loss_mlp": 1.00242662, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8389431473825758, + "language_loss": 0.588947, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60910857, + "num_input_tokens_seen": 161091360, + "step": 7506, + "time_per_iteration": 3.214512586593628 + }, + { + "auxiliary_loss_clip": 0.01070155, + "auxiliary_loss_mlp": 0.01038292, + "balance_loss_clip": 1.04112959, + "balance_loss_mlp": 1.02437425, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 1.5936984939734624, + "language_loss": 0.79402089, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81510544, + "num_input_tokens_seen": 161110825, + "step": 7507, + "time_per_iteration": 2.5835394859313965 + }, + { + "auxiliary_loss_clip": 0.0108201, + "auxiliary_loss_mlp": 0.01031067, + "balance_loss_clip": 1.04009628, + "balance_loss_mlp": 1.01608253, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.4778827396728555, + "language_loss": 0.73775184, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.75888264, + "num_input_tokens_seen": 161130685, + "step": 7508, + "time_per_iteration": 2.5691139698028564 + }, + { + "auxiliary_loss_clip": 0.011081, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.04330087, + "balance_loss_mlp": 1.02167964, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 1.9257403990498172, + "language_loss": 0.79665279, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81807691, + "num_input_tokens_seen": 161147555, + "step": 7509, + "time_per_iteration": 2.4637935161590576 + }, + { + "auxiliary_loss_clip": 0.01119243, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.04215002, + "balance_loss_mlp": 1.02162504, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.8399599248982386, + "language_loss": 0.73125845, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75279868, + "num_input_tokens_seen": 161166255, + "step": 7510, + "time_per_iteration": 2.4808263778686523 + }, + { + "auxiliary_loss_clip": 0.01120064, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.04056096, + "balance_loss_mlp": 1.01789093, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 1.7858241992223973, + "language_loss": 0.76907313, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79059434, + "num_input_tokens_seen": 161184720, + "step": 7511, + "time_per_iteration": 2.4483795166015625 + }, + { + "auxiliary_loss_clip": 0.01109253, + "auxiliary_loss_mlp": 0.01034227, + "balance_loss_clip": 1.04092622, + "balance_loss_mlp": 1.02057779, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.4893286749867192, + "language_loss": 0.78675687, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.80819166, + "num_input_tokens_seen": 161204360, + "step": 7512, + "time_per_iteration": 2.4978139400482178 + }, + { + "auxiliary_loss_clip": 0.01088876, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.04173017, + "balance_loss_mlp": 1.02194786, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 2.54205141765288, + "language_loss": 0.87497962, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89623153, + "num_input_tokens_seen": 161223575, + "step": 7513, + "time_per_iteration": 2.5627903938293457 + }, + { + "auxiliary_loss_clip": 0.01103741, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.04105401, + "balance_loss_mlp": 1.01830077, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 1.5867246989468144, + "language_loss": 0.67499959, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69634336, + "num_input_tokens_seen": 161243805, + "step": 7514, + "time_per_iteration": 2.513930082321167 + }, + { + "auxiliary_loss_clip": 0.01106967, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.04376233, + "balance_loss_mlp": 1.01517105, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 1.74672358084882, + "language_loss": 0.69302464, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71439916, + "num_input_tokens_seen": 161261450, + "step": 7515, + "time_per_iteration": 2.512010097503662 + }, + { + "auxiliary_loss_clip": 0.01109011, + "auxiliary_loss_mlp": 0.01040219, + "balance_loss_clip": 1.04158652, + "balance_loss_mlp": 1.02571118, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 2.245857008777817, + "language_loss": 0.81849134, + "learning_rate": 2.405900656236963e-06, + "loss": 0.8399837, + "num_input_tokens_seen": 161276965, + "step": 7516, + "time_per_iteration": 2.4500176906585693 + }, + { + "auxiliary_loss_clip": 0.01118166, + "auxiliary_loss_mlp": 0.01035031, + "balance_loss_clip": 1.04253376, + "balance_loss_mlp": 1.02116108, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.5550919372812402, + "language_loss": 0.65483129, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.67636323, + "num_input_tokens_seen": 161295375, + "step": 7517, + "time_per_iteration": 2.4442341327667236 + }, + { + "auxiliary_loss_clip": 0.01084421, + "auxiliary_loss_mlp": 0.01025559, + "balance_loss_clip": 1.04027069, + "balance_loss_mlp": 1.0133636, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 1.9210714409731768, + "language_loss": 0.62749964, + "learning_rate": 2.405137912257333e-06, + "loss": 0.64859939, + "num_input_tokens_seen": 161313010, + "step": 7518, + "time_per_iteration": 2.535388231277466 + }, + { + "auxiliary_loss_clip": 0.01107336, + "auxiliary_loss_mlp": 0.01036308, + "balance_loss_clip": 1.0416199, + "balance_loss_mlp": 1.02332616, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.4327375991822382, + "language_loss": 0.59508288, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61651933, + "num_input_tokens_seen": 161336690, + "step": 7519, + "time_per_iteration": 2.71445369720459 + }, + { + "auxiliary_loss_clip": 0.01112967, + "auxiliary_loss_mlp": 0.01039751, + "balance_loss_clip": 1.04477262, + "balance_loss_mlp": 1.02606535, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.3323757611159335, + "language_loss": 0.72354436, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74507159, + "num_input_tokens_seen": 161357845, + "step": 7520, + "time_per_iteration": 2.5167789459228516 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01036055, + "balance_loss_clip": 1.04142475, + "balance_loss_mlp": 1.02310896, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 1.810369498044126, + "language_loss": 0.75555933, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77690923, + "num_input_tokens_seen": 161375160, + "step": 7521, + "time_per_iteration": 2.468827724456787 + }, + { + "auxiliary_loss_clip": 0.0110631, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.04486918, + "balance_loss_mlp": 1.024616, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 1.667657404984478, + "language_loss": 0.675982, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.6974355, + "num_input_tokens_seen": 161393690, + "step": 7522, + "time_per_iteration": 2.5115177631378174 + }, + { + "auxiliary_loss_clip": 0.01106628, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.0411787, + "balance_loss_mlp": 1.01934421, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.4492898570671542, + "language_loss": 0.60874474, + "learning_rate": 2.403230783711134e-06, + "loss": 0.63013971, + "num_input_tokens_seen": 161415015, + "step": 7523, + "time_per_iteration": 2.5834474563598633 + }, + { + "auxiliary_loss_clip": 0.01113231, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.0421139, + "balance_loss_mlp": 1.02142024, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 1.9161846680095935, + "language_loss": 0.78269398, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80418557, + "num_input_tokens_seen": 161432940, + "step": 7524, + "time_per_iteration": 2.4899485111236572 + }, + { + "auxiliary_loss_clip": 0.0107283, + "auxiliary_loss_mlp": 0.01034809, + "balance_loss_clip": 1.038746, + "balance_loss_mlp": 1.02120709, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.7066201955640377, + "language_loss": 0.63744146, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65851784, + "num_input_tokens_seen": 161452215, + "step": 7525, + "time_per_iteration": 2.57053804397583 + }, + { + "auxiliary_loss_clip": 0.01108061, + "auxiliary_loss_mlp": 0.01034486, + "balance_loss_clip": 1.04107881, + "balance_loss_mlp": 1.02183247, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.5582623761363177, + "language_loss": 0.79091644, + "learning_rate": 2.402086322981083e-06, + "loss": 0.81234193, + "num_input_tokens_seen": 161469520, + "step": 7526, + "time_per_iteration": 2.458824396133423 + }, + { + "auxiliary_loss_clip": 0.01092123, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.03953981, + "balance_loss_mlp": 1.01807523, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 1.5863929153520704, + "language_loss": 0.80946767, + "learning_rate": 2.40170480555747e-06, + "loss": 0.83070791, + "num_input_tokens_seen": 161487335, + "step": 7527, + "time_per_iteration": 2.587440252304077 + }, + { + "auxiliary_loss_clip": 0.01091987, + "auxiliary_loss_mlp": 0.01029398, + "balance_loss_clip": 1.04374051, + "balance_loss_mlp": 1.01555192, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.4320675628203965, + "language_loss": 0.65507007, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67628396, + "num_input_tokens_seen": 161510095, + "step": 7528, + "time_per_iteration": 3.992781162261963 + }, + { + "auxiliary_loss_clip": 0.01096222, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.04113615, + "balance_loss_mlp": 1.01856399, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 1.52337438696075, + "language_loss": 0.75711334, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.77839625, + "num_input_tokens_seen": 161528725, + "step": 7529, + "time_per_iteration": 3.9235610961914062 + }, + { + "auxiliary_loss_clip": 0.01117994, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.04041648, + "balance_loss_mlp": 1.01945496, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 1.857760500987767, + "language_loss": 0.72947562, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75098395, + "num_input_tokens_seen": 161547195, + "step": 7530, + "time_per_iteration": 2.4253456592559814 + }, + { + "auxiliary_loss_clip": 0.01084435, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.04124618, + "balance_loss_mlp": 1.01854873, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.6697923639223333, + "language_loss": 0.75818348, + "learning_rate": 2.400178583680834e-06, + "loss": 0.77934349, + "num_input_tokens_seen": 161565565, + "step": 7531, + "time_per_iteration": 2.556476354598999 + }, + { + "auxiliary_loss_clip": 0.01115238, + "auxiliary_loss_mlp": 0.01034153, + "balance_loss_clip": 1.04102397, + "balance_loss_mlp": 1.02039027, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.4244616415112288, + "language_loss": 0.66768533, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.68917924, + "num_input_tokens_seen": 161586630, + "step": 7532, + "time_per_iteration": 2.491196393966675 + }, + { + "auxiliary_loss_clip": 0.01105093, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.04056692, + "balance_loss_mlp": 1.02231169, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 1.9266768190834322, + "language_loss": 0.78542423, + "learning_rate": 2.399415381635768e-06, + "loss": 0.80683208, + "num_input_tokens_seen": 161603815, + "step": 7533, + "time_per_iteration": 3.8915374279022217 + }, + { + "auxiliary_loss_clip": 0.01091352, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.03835964, + "balance_loss_mlp": 1.02066791, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 1.725536360241159, + "language_loss": 0.83097637, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85224253, + "num_input_tokens_seen": 161622900, + "step": 7534, + "time_per_iteration": 4.0086798667907715 + }, + { + "auxiliary_loss_clip": 0.0109097, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.04400849, + "balance_loss_mlp": 1.01937866, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 1.5220336312267768, + "language_loss": 0.76409733, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78534406, + "num_input_tokens_seen": 161641700, + "step": 7535, + "time_per_iteration": 2.516554832458496 + }, + { + "auxiliary_loss_clip": 0.01078579, + "auxiliary_loss_mlp": 0.01034381, + "balance_loss_clip": 1.04196, + "balance_loss_mlp": 1.02160788, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.5872489556624223, + "language_loss": 0.80409974, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82522935, + "num_input_tokens_seen": 161661955, + "step": 7536, + "time_per_iteration": 2.572300434112549 + }, + { + "auxiliary_loss_clip": 0.01087391, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.03667331, + "balance_loss_mlp": 1.02067506, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 1.7209322910412848, + "language_loss": 0.75941736, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.78063339, + "num_input_tokens_seen": 161679245, + "step": 7537, + "time_per_iteration": 2.4924445152282715 + }, + { + "auxiliary_loss_clip": 0.01108078, + "auxiliary_loss_mlp": 0.0103189, + "balance_loss_clip": 1.04024982, + "balance_loss_mlp": 1.01939058, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 1.8136449024325656, + "language_loss": 0.75979364, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.78119332, + "num_input_tokens_seen": 161698795, + "step": 7538, + "time_per_iteration": 2.495335817337036 + }, + { + "auxiliary_loss_clip": 0.01034954, + "auxiliary_loss_mlp": 0.01009332, + "balance_loss_clip": 1.01863205, + "balance_loss_mlp": 1.00791359, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7896755772819237, + "language_loss": 0.62403715, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64447999, + "num_input_tokens_seen": 161761980, + "step": 7539, + "time_per_iteration": 3.1319046020507812 + }, + { + "auxiliary_loss_clip": 0.01119866, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_clip": 1.04283237, + "balance_loss_mlp": 1.02815986, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 1.8917120101010776, + "language_loss": 0.66026628, + "learning_rate": 2.396743698142872e-06, + "loss": 0.68187726, + "num_input_tokens_seen": 161779455, + "step": 7540, + "time_per_iteration": 2.4548158645629883 + }, + { + "auxiliary_loss_clip": 0.01100981, + "auxiliary_loss_mlp": 0.01040739, + "balance_loss_clip": 1.04298651, + "balance_loss_mlp": 1.02508664, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 1.9469440732431267, + "language_loss": 0.8499248, + "learning_rate": 2.396361968778424e-06, + "loss": 0.87134194, + "num_input_tokens_seen": 161798980, + "step": 7541, + "time_per_iteration": 2.5491039752960205 + }, + { + "auxiliary_loss_clip": 0.01096254, + "auxiliary_loss_mlp": 0.01029768, + "balance_loss_clip": 1.04077625, + "balance_loss_mlp": 1.01650608, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.7927916555094816, + "language_loss": 0.76346445, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78472465, + "num_input_tokens_seen": 161819745, + "step": 7542, + "time_per_iteration": 2.636831283569336 + }, + { + "auxiliary_loss_clip": 0.01091845, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_clip": 1.04007053, + "balance_loss_mlp": 1.01291108, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 1.6580692177561902, + "language_loss": 0.80160815, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82279348, + "num_input_tokens_seen": 161838575, + "step": 7543, + "time_per_iteration": 2.544360637664795 + }, + { + "auxiliary_loss_clip": 0.01106536, + "auxiliary_loss_mlp": 0.00789603, + "balance_loss_clip": 1.03935826, + "balance_loss_mlp": 1.01374149, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 1.610403814559713, + "language_loss": 0.76227498, + "learning_rate": 2.395216690562469e-06, + "loss": 0.78123641, + "num_input_tokens_seen": 161858590, + "step": 7544, + "time_per_iteration": 2.543915271759033 + }, + { + "auxiliary_loss_clip": 0.01090836, + "auxiliary_loss_mlp": 0.01035968, + "balance_loss_clip": 1.04275, + "balance_loss_mlp": 1.02243745, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 1.6467215558897073, + "language_loss": 0.75508571, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77635378, + "num_input_tokens_seen": 161878390, + "step": 7545, + "time_per_iteration": 2.5673301219940186 + }, + { + "auxiliary_loss_clip": 0.01099725, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.04171968, + "balance_loss_mlp": 1.01932669, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.8938380501715006, + "language_loss": 0.72214293, + "learning_rate": 2.394453096794423e-06, + "loss": 0.7434715, + "num_input_tokens_seen": 161898610, + "step": 7546, + "time_per_iteration": 2.5985326766967773 + }, + { + "auxiliary_loss_clip": 0.01100686, + "auxiliary_loss_mlp": 0.01032289, + "balance_loss_clip": 1.041049, + "balance_loss_mlp": 1.01729858, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.5453574207572947, + "language_loss": 0.7541256, + "learning_rate": 2.394071277466609e-06, + "loss": 0.77545536, + "num_input_tokens_seen": 161918210, + "step": 7547, + "time_per_iteration": 2.5293996334075928 + }, + { + "auxiliary_loss_clip": 0.01111515, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.04169774, + "balance_loss_mlp": 1.01745534, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 1.9113224280566248, + "language_loss": 0.69135523, + "learning_rate": 2.393689443195573e-06, + "loss": 0.71278918, + "num_input_tokens_seen": 161936950, + "step": 7548, + "time_per_iteration": 2.471928358078003 + }, + { + "auxiliary_loss_clip": 0.01119041, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.04054976, + "balance_loss_mlp": 1.02512455, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 1.9528313110622653, + "language_loss": 0.72441006, + "learning_rate": 2.393307593995794e-06, + "loss": 0.74598873, + "num_input_tokens_seen": 161955550, + "step": 7549, + "time_per_iteration": 2.5065805912017822 + }, + { + "auxiliary_loss_clip": 0.01085023, + "auxiliary_loss_mlp": 0.01027706, + "balance_loss_clip": 1.0383997, + "balance_loss_mlp": 1.01484942, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.5510735845768886, + "language_loss": 0.65360427, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67473155, + "num_input_tokens_seen": 161976760, + "step": 7550, + "time_per_iteration": 2.594890594482422 + }, + { + "auxiliary_loss_clip": 0.01101428, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.04198194, + "balance_loss_mlp": 1.01880217, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.6528691851977826, + "language_loss": 0.68808752, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.70941591, + "num_input_tokens_seen": 161996120, + "step": 7551, + "time_per_iteration": 2.502305269241333 + }, + { + "auxiliary_loss_clip": 0.01106462, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.03812587, + "balance_loss_mlp": 1.01996541, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.9869528637006668, + "language_loss": 0.79115903, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81257224, + "num_input_tokens_seen": 162011125, + "step": 7552, + "time_per_iteration": 2.4385323524475098 + }, + { + "auxiliary_loss_clip": 0.01032311, + "auxiliary_loss_mlp": 0.01004622, + "balance_loss_clip": 1.01695275, + "balance_loss_mlp": 1.0032568, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8122332999290258, + "language_loss": 0.57808006, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59844947, + "num_input_tokens_seen": 162068705, + "step": 7553, + "time_per_iteration": 3.0522093772888184 + }, + { + "auxiliary_loss_clip": 0.01065168, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.04344988, + "balance_loss_mlp": 1.02260399, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.4331164712803282, + "language_loss": 0.76836181, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78937399, + "num_input_tokens_seen": 162089655, + "step": 7554, + "time_per_iteration": 2.6535513401031494 + }, + { + "auxiliary_loss_clip": 0.01099922, + "auxiliary_loss_mlp": 0.01033105, + "balance_loss_clip": 1.04087281, + "balance_loss_mlp": 1.01816773, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 2.17260200060156, + "language_loss": 0.76381415, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.78514445, + "num_input_tokens_seen": 162108465, + "step": 7555, + "time_per_iteration": 2.5028486251831055 + }, + { + "auxiliary_loss_clip": 0.01052193, + "auxiliary_loss_mlp": 0.01033872, + "balance_loss_clip": 1.03846061, + "balance_loss_mlp": 1.0204612, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.3031368983599358, + "language_loss": 0.72527254, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74613315, + "num_input_tokens_seen": 162129910, + "step": 7556, + "time_per_iteration": 2.6869678497314453 + }, + { + "auxiliary_loss_clip": 0.01123979, + "auxiliary_loss_mlp": 0.01036481, + "balance_loss_clip": 1.04254627, + "balance_loss_mlp": 1.02270591, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 1.975003485957552, + "language_loss": 0.62864166, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65024632, + "num_input_tokens_seen": 162148840, + "step": 7557, + "time_per_iteration": 2.472999334335327 + }, + { + "auxiliary_loss_clip": 0.01024851, + "auxiliary_loss_mlp": 0.01002178, + "balance_loss_clip": 1.01815712, + "balance_loss_mlp": 1.0007472, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6817168858425913, + "language_loss": 0.57655728, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59682751, + "num_input_tokens_seen": 162208500, + "step": 7558, + "time_per_iteration": 3.0351898670196533 + }, + { + "auxiliary_loss_clip": 0.01110365, + "auxiliary_loss_mlp": 0.01039909, + "balance_loss_clip": 1.04053986, + "balance_loss_mlp": 1.02473354, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 2.6652305120113065, + "language_loss": 0.56044316, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.5819459, + "num_input_tokens_seen": 162224650, + "step": 7559, + "time_per_iteration": 2.47643780708313 + }, + { + "auxiliary_loss_clip": 0.01105864, + "auxiliary_loss_mlp": 0.00789043, + "balance_loss_clip": 1.04166746, + "balance_loss_mlp": 1.01187468, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 1.77060312511399, + "language_loss": 0.72154319, + "learning_rate": 2.389106271642792e-06, + "loss": 0.74049222, + "num_input_tokens_seen": 162242930, + "step": 7560, + "time_per_iteration": 2.4859554767608643 + }, + { + "auxiliary_loss_clip": 0.0105071, + "auxiliary_loss_mlp": 0.01039745, + "balance_loss_clip": 1.03976691, + "balance_loss_mlp": 1.02437341, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 2.689862472272109, + "language_loss": 0.68986869, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.71077323, + "num_input_tokens_seen": 162261455, + "step": 7561, + "time_per_iteration": 2.6903345584869385 + }, + { + "auxiliary_loss_clip": 0.01095665, + "auxiliary_loss_mlp": 0.01034862, + "balance_loss_clip": 1.0406028, + "balance_loss_mlp": 1.02202296, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.7692013309673016, + "language_loss": 0.84869254, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.86999774, + "num_input_tokens_seen": 162279725, + "step": 7562, + "time_per_iteration": 2.7375617027282715 + }, + { + "auxiliary_loss_clip": 0.011047, + "auxiliary_loss_mlp": 0.01034114, + "balance_loss_clip": 1.03860712, + "balance_loss_mlp": 1.02082253, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.7870307629486721, + "language_loss": 0.89485151, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91623962, + "num_input_tokens_seen": 162297865, + "step": 7563, + "time_per_iteration": 2.4810233116149902 + }, + { + "auxiliary_loss_clip": 0.01118039, + "auxiliary_loss_mlp": 0.00791942, + "balance_loss_clip": 1.0392766, + "balance_loss_mlp": 1.0157696, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.9067591967194177, + "language_loss": 0.7153365, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73443633, + "num_input_tokens_seen": 162316010, + "step": 7564, + "time_per_iteration": 2.448718786239624 + }, + { + "auxiliary_loss_clip": 0.01109414, + "auxiliary_loss_mlp": 0.01038265, + "balance_loss_clip": 1.03885674, + "balance_loss_mlp": 1.02438927, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 2.0431797494485853, + "language_loss": 0.68389094, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70536768, + "num_input_tokens_seen": 162336115, + "step": 7565, + "time_per_iteration": 2.5072708129882812 + }, + { + "auxiliary_loss_clip": 0.01075108, + "auxiliary_loss_mlp": 0.01034795, + "balance_loss_clip": 1.03542805, + "balance_loss_mlp": 1.02078152, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 1.681564106811362, + "language_loss": 0.80448914, + "learning_rate": 2.386813887534922e-06, + "loss": 0.82558817, + "num_input_tokens_seen": 162355705, + "step": 7566, + "time_per_iteration": 2.5969293117523193 + }, + { + "auxiliary_loss_clip": 0.01083796, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.04015112, + "balance_loss_mlp": 1.01960874, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.606633612457842, + "language_loss": 0.73541552, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.75660038, + "num_input_tokens_seen": 162374055, + "step": 7567, + "time_per_iteration": 5.267664432525635 + }, + { + "auxiliary_loss_clip": 0.01084496, + "auxiliary_loss_mlp": 0.0103801, + "balance_loss_clip": 1.0390799, + "balance_loss_mlp": 1.02320433, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 2.71120563044189, + "language_loss": 0.80976224, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83098733, + "num_input_tokens_seen": 162393560, + "step": 7568, + "time_per_iteration": 2.5954670906066895 + }, + { + "auxiliary_loss_clip": 0.01113344, + "auxiliary_loss_mlp": 0.01047838, + "balance_loss_clip": 1.04025161, + "balance_loss_mlp": 1.03226352, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 1.8364789366893806, + "language_loss": 0.80004013, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.82165194, + "num_input_tokens_seen": 162413170, + "step": 7569, + "time_per_iteration": 2.4610133171081543 + }, + { + "auxiliary_loss_clip": 0.01111416, + "auxiliary_loss_mlp": 0.01033977, + "balance_loss_clip": 1.04104984, + "balance_loss_mlp": 1.01871848, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.5790860316190194, + "language_loss": 0.75027299, + "learning_rate": 2.385285337909412e-06, + "loss": 0.77172697, + "num_input_tokens_seen": 162434080, + "step": 7570, + "time_per_iteration": 2.520038604736328 + }, + { + "auxiliary_loss_clip": 0.01097931, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.0410794, + "balance_loss_mlp": 1.02412224, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 1.8056831411998504, + "language_loss": 0.74491507, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.7662766, + "num_input_tokens_seen": 162455445, + "step": 7571, + "time_per_iteration": 4.003863573074341 + }, + { + "auxiliary_loss_clip": 0.01103382, + "auxiliary_loss_mlp": 0.01029787, + "balance_loss_clip": 1.03892112, + "balance_loss_mlp": 1.01604199, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.4742333690324358, + "language_loss": 0.81200504, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83333671, + "num_input_tokens_seen": 162474940, + "step": 7572, + "time_per_iteration": 2.4658901691436768 + }, + { + "auxiliary_loss_clip": 0.01103799, + "auxiliary_loss_mlp": 0.01038026, + "balance_loss_clip": 1.04047716, + "balance_loss_mlp": 1.02215898, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 1.9441153247358252, + "language_loss": 0.72756255, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.74898076, + "num_input_tokens_seen": 162493340, + "step": 7573, + "time_per_iteration": 3.9208662509918213 + }, + { + "auxiliary_loss_clip": 0.01112158, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.04175496, + "balance_loss_mlp": 1.02211702, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 1.8539720424639634, + "language_loss": 0.74764109, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76915085, + "num_input_tokens_seen": 162514360, + "step": 7574, + "time_per_iteration": 2.5490190982818604 + }, + { + "auxiliary_loss_clip": 0.01108407, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.03923297, + "balance_loss_mlp": 1.01915097, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.434467350911415, + "language_loss": 0.71333903, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73475873, + "num_input_tokens_seen": 162535240, + "step": 7575, + "time_per_iteration": 2.4957494735717773 + }, + { + "auxiliary_loss_clip": 0.01094681, + "auxiliary_loss_mlp": 0.01032105, + "balance_loss_clip": 1.0382303, + "balance_loss_mlp": 1.01800251, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.9324751736784862, + "language_loss": 0.7347014, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75596929, + "num_input_tokens_seen": 162553880, + "step": 7576, + "time_per_iteration": 2.5234806537628174 + }, + { + "auxiliary_loss_clip": 0.01116268, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.03972268, + "balance_loss_mlp": 1.02153158, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.6693662114601133, + "language_loss": 0.66307771, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68459892, + "num_input_tokens_seen": 162574485, + "step": 7577, + "time_per_iteration": 2.4518446922302246 + }, + { + "auxiliary_loss_clip": 0.01094324, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_clip": 1.04298246, + "balance_loss_mlp": 1.02862883, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 1.8762253678718777, + "language_loss": 0.74463916, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76603246, + "num_input_tokens_seen": 162595130, + "step": 7578, + "time_per_iteration": 2.538808822631836 + }, + { + "auxiliary_loss_clip": 0.0106986, + "auxiliary_loss_mlp": 0.00791618, + "balance_loss_clip": 1.03989303, + "balance_loss_mlp": 1.01096559, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.8338235316490974, + "language_loss": 0.70507818, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72369295, + "num_input_tokens_seen": 162615720, + "step": 7579, + "time_per_iteration": 2.6062605381011963 + }, + { + "auxiliary_loss_clip": 0.01104777, + "auxiliary_loss_mlp": 0.01033422, + "balance_loss_clip": 1.03821778, + "balance_loss_mlp": 1.01955795, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.697454214547009, + "language_loss": 0.78625524, + "learning_rate": 2.381462943170627e-06, + "loss": 0.80763721, + "num_input_tokens_seen": 162635825, + "step": 7580, + "time_per_iteration": 2.5063295364379883 + }, + { + "auxiliary_loss_clip": 0.01118488, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.04131413, + "balance_loss_mlp": 1.01711786, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 2.222231053429561, + "language_loss": 0.68710983, + "learning_rate": 2.381080623899444e-06, + "loss": 0.70861191, + "num_input_tokens_seen": 162659130, + "step": 7581, + "time_per_iteration": 2.623645544052124 + }, + { + "auxiliary_loss_clip": 0.0110064, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.0360198, + "balance_loss_mlp": 1.01610899, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.5937889860080712, + "language_loss": 0.7317872, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75309873, + "num_input_tokens_seen": 162681665, + "step": 7582, + "time_per_iteration": 2.5898470878601074 + }, + { + "auxiliary_loss_clip": 0.01123027, + "auxiliary_loss_mlp": 0.01047086, + "balance_loss_clip": 1.04235196, + "balance_loss_mlp": 1.03145146, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.9367076076637972, + "language_loss": 0.72423142, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74593258, + "num_input_tokens_seen": 162702040, + "step": 7583, + "time_per_iteration": 2.450486660003662 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01033757, + "balance_loss_clip": 1.04023194, + "balance_loss_mlp": 1.0194217, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 1.766572313546338, + "language_loss": 0.72733414, + "learning_rate": 2.379933579440195e-06, + "loss": 0.74872655, + "num_input_tokens_seen": 162722375, + "step": 7584, + "time_per_iteration": 2.554657459259033 + }, + { + "auxiliary_loss_clip": 0.01080455, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.03864956, + "balance_loss_mlp": 1.02105045, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.9903713561869234, + "language_loss": 0.68426853, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70543247, + "num_input_tokens_seen": 162746095, + "step": 7585, + "time_per_iteration": 2.6137399673461914 + }, + { + "auxiliary_loss_clip": 0.01117344, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.03969336, + "balance_loss_mlp": 1.01803565, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.4517747517778161, + "language_loss": 0.76071852, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78220487, + "num_input_tokens_seen": 162766330, + "step": 7586, + "time_per_iteration": 2.4805328845977783 + }, + { + "auxiliary_loss_clip": 0.01098014, + "auxiliary_loss_mlp": 0.01027486, + "balance_loss_clip": 1.03953004, + "balance_loss_mlp": 1.01462364, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1.6487776957163107, + "language_loss": 0.78086066, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80211568, + "num_input_tokens_seen": 162784755, + "step": 7587, + "time_per_iteration": 2.5229883193969727 + }, + { + "auxiliary_loss_clip": 0.01096809, + "auxiliary_loss_mlp": 0.01040283, + "balance_loss_clip": 1.03808415, + "balance_loss_mlp": 1.02556038, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 2.3895287283229423, + "language_loss": 0.6903345, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71170545, + "num_input_tokens_seen": 162803850, + "step": 7588, + "time_per_iteration": 2.485286235809326 + }, + { + "auxiliary_loss_clip": 0.0110323, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.0430268, + "balance_loss_mlp": 1.01713979, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.7465814289427044, + "language_loss": 0.79119766, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81252885, + "num_input_tokens_seen": 162820775, + "step": 7589, + "time_per_iteration": 2.4393413066864014 + }, + { + "auxiliary_loss_clip": 0.0110711, + "auxiliary_loss_mlp": 0.01036011, + "balance_loss_clip": 1.03968978, + "balance_loss_mlp": 1.02164078, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 2.095357071551132, + "language_loss": 0.61417788, + "learning_rate": 2.377639101920992e-06, + "loss": 0.63560903, + "num_input_tokens_seen": 162839695, + "step": 7590, + "time_per_iteration": 2.563778877258301 + }, + { + "auxiliary_loss_clip": 0.01094144, + "auxiliary_loss_mlp": 0.01041747, + "balance_loss_clip": 1.04019403, + "balance_loss_mlp": 1.02776372, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 2.24968527411082, + "language_loss": 0.72622401, + "learning_rate": 2.377256638796135e-06, + "loss": 0.74758291, + "num_input_tokens_seen": 162856095, + "step": 7591, + "time_per_iteration": 2.4966702461242676 + }, + { + "auxiliary_loss_clip": 0.01102012, + "auxiliary_loss_mlp": 0.0103959, + "balance_loss_clip": 1.04251075, + "balance_loss_mlp": 1.02492738, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 1.7903218773313025, + "language_loss": 0.76429343, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.78570938, + "num_input_tokens_seen": 162874070, + "step": 7592, + "time_per_iteration": 2.526815414428711 + }, + { + "auxiliary_loss_clip": 0.01092682, + "auxiliary_loss_mlp": 0.01041989, + "balance_loss_clip": 1.03756714, + "balance_loss_mlp": 1.02659929, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 2.0262548926383723, + "language_loss": 0.69492704, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71627373, + "num_input_tokens_seen": 162891000, + "step": 7593, + "time_per_iteration": 2.5015547275543213 + }, + { + "auxiliary_loss_clip": 0.01099924, + "auxiliary_loss_mlp": 0.01029716, + "balance_loss_clip": 1.03694177, + "balance_loss_mlp": 1.01770556, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 1.9634593911032778, + "language_loss": 0.84365898, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86495543, + "num_input_tokens_seen": 162910120, + "step": 7594, + "time_per_iteration": 2.5480031967163086 + }, + { + "auxiliary_loss_clip": 0.01033417, + "auxiliary_loss_mlp": 0.00772001, + "balance_loss_clip": 1.01620317, + "balance_loss_mlp": 1.00659323, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.8129282383523677, + "language_loss": 0.52773058, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54578477, + "num_input_tokens_seen": 162963720, + "step": 7595, + "time_per_iteration": 3.119999408721924 + }, + { + "auxiliary_loss_clip": 0.01087868, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.03932357, + "balance_loss_mlp": 1.0185039, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.0422655435570922, + "language_loss": 0.87240541, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89361262, + "num_input_tokens_seen": 162975760, + "step": 7596, + "time_per_iteration": 2.5413429737091064 + }, + { + "auxiliary_loss_clip": 0.01112869, + "auxiliary_loss_mlp": 0.01040828, + "balance_loss_clip": 1.04262829, + "balance_loss_mlp": 1.02725589, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 1.6840430860729967, + "language_loss": 0.77552772, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79706478, + "num_input_tokens_seen": 162994865, + "step": 7597, + "time_per_iteration": 2.493898630142212 + }, + { + "auxiliary_loss_clip": 0.01108369, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.04021645, + "balance_loss_mlp": 1.0215553, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 2.351601000664415, + "language_loss": 0.78603959, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80747575, + "num_input_tokens_seen": 163014730, + "step": 7598, + "time_per_iteration": 2.5423309803009033 + }, + { + "auxiliary_loss_clip": 0.01118073, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.04119968, + "balance_loss_mlp": 1.01639462, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 2.8115651274903604, + "language_loss": 0.71335077, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73482209, + "num_input_tokens_seen": 163033405, + "step": 7599, + "time_per_iteration": 2.520876407623291 + }, + { + "auxiliary_loss_clip": 0.01084907, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.03826666, + "balance_loss_mlp": 1.0189935, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 1.8847014825033102, + "language_loss": 0.69735563, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71852982, + "num_input_tokens_seen": 163051400, + "step": 7600, + "time_per_iteration": 2.580392599105835 + }, + { + "auxiliary_loss_clip": 0.01058003, + "auxiliary_loss_mlp": 0.01033901, + "balance_loss_clip": 1.0433495, + "balance_loss_mlp": 1.02093685, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 1.9031234603011618, + "language_loss": 0.7933526, + "learning_rate": 2.373431223132319e-06, + "loss": 0.81427169, + "num_input_tokens_seen": 163069250, + "step": 7601, + "time_per_iteration": 2.6294960975646973 + }, + { + "auxiliary_loss_clip": 0.01091037, + "auxiliary_loss_mlp": 0.0103892, + "balance_loss_clip": 1.03888965, + "balance_loss_mlp": 1.02590263, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 1.721885691913234, + "language_loss": 0.71676493, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73806453, + "num_input_tokens_seen": 163091755, + "step": 7602, + "time_per_iteration": 2.6990182399749756 + }, + { + "auxiliary_loss_clip": 0.01105011, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.03896666, + "balance_loss_mlp": 1.01894546, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 2.252512470955106, + "language_loss": 0.73047626, + "learning_rate": 2.372665969608729e-06, + "loss": 0.75187218, + "num_input_tokens_seen": 163111600, + "step": 7603, + "time_per_iteration": 2.543278217315674 + }, + { + "auxiliary_loss_clip": 0.01107366, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.04149461, + "balance_loss_mlp": 1.01888537, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.8718344049941582, + "language_loss": 0.83037108, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85177922, + "num_input_tokens_seen": 163127350, + "step": 7604, + "time_per_iteration": 2.5536022186279297 + }, + { + "auxiliary_loss_clip": 0.01102427, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.04756379, + "balance_loss_mlp": 1.02431691, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 1.996124146096969, + "language_loss": 0.85733104, + "learning_rate": 2.371900659559016e-06, + "loss": 0.87874722, + "num_input_tokens_seen": 163145855, + "step": 7605, + "time_per_iteration": 3.9621567726135254 + }, + { + "auxiliary_loss_clip": 0.01073793, + "auxiliary_loss_mlp": 0.01038877, + "balance_loss_clip": 1.03840542, + "balance_loss_mlp": 1.02439892, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 2.4353757228506656, + "language_loss": 0.73794538, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75907207, + "num_input_tokens_seen": 163163830, + "step": 7606, + "time_per_iteration": 3.9344241619110107 + }, + { + "auxiliary_loss_clip": 0.0108751, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.03937411, + "balance_loss_mlp": 1.02042079, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 2.0676747463116576, + "language_loss": 0.80022287, + "learning_rate": 2.371135293099262e-06, + "loss": 0.82144356, + "num_input_tokens_seen": 163180700, + "step": 7607, + "time_per_iteration": 2.505862236022949 + }, + { + "auxiliary_loss_clip": 0.01091, + "auxiliary_loss_mlp": 0.0103589, + "balance_loss_clip": 1.04783189, + "balance_loss_mlp": 1.0217334, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 1.7714052568667134, + "language_loss": 0.80896032, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83022916, + "num_input_tokens_seen": 163199450, + "step": 7608, + "time_per_iteration": 2.601933479309082 + }, + { + "auxiliary_loss_clip": 0.01098089, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.03991508, + "balance_loss_mlp": 1.02394307, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 1.5982228151972964, + "language_loss": 0.68430746, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70567679, + "num_input_tokens_seen": 163217875, + "step": 7609, + "time_per_iteration": 2.5782577991485596 + }, + { + "auxiliary_loss_clip": 0.01092135, + "auxiliary_loss_mlp": 0.01042401, + "balance_loss_clip": 1.04431975, + "balance_loss_mlp": 1.02806592, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 1.6947926137591287, + "language_loss": 0.80750495, + "learning_rate": 2.369987137894757e-06, + "loss": 0.82885027, + "num_input_tokens_seen": 163237430, + "step": 7610, + "time_per_iteration": 3.9914653301239014 + }, + { + "auxiliary_loss_clip": 0.01111081, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.04205537, + "balance_loss_mlp": 1.02206337, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 2.1088040958240946, + "language_loss": 0.82442546, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84590507, + "num_input_tokens_seen": 163253905, + "step": 7611, + "time_per_iteration": 2.5525200366973877 + }, + { + "auxiliary_loss_clip": 0.01113607, + "auxiliary_loss_mlp": 0.01030166, + "balance_loss_clip": 1.04379201, + "balance_loss_mlp": 1.01568794, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 1.7814302346055144, + "language_loss": 0.73668349, + "learning_rate": 2.369221630917819e-06, + "loss": 0.75812125, + "num_input_tokens_seen": 163274285, + "step": 7612, + "time_per_iteration": 3.9727871417999268 + }, + { + "auxiliary_loss_clip": 0.01092458, + "auxiliary_loss_mlp": 0.01037321, + "balance_loss_clip": 1.03717506, + "balance_loss_mlp": 1.022277, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 1.590652309877682, + "language_loss": 0.85070229, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87200004, + "num_input_tokens_seen": 163293150, + "step": 7613, + "time_per_iteration": 2.5711684226989746 + }, + { + "auxiliary_loss_clip": 0.01083488, + "auxiliary_loss_mlp": 0.01032615, + "balance_loss_clip": 1.03823304, + "balance_loss_mlp": 1.01875103, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 2.230470479130553, + "language_loss": 0.7548058, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77596682, + "num_input_tokens_seen": 163310065, + "step": 7614, + "time_per_iteration": 2.535323143005371 + }, + { + "auxiliary_loss_clip": 0.01117654, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.0413506, + "balance_loss_mlp": 1.01758623, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.4667500321312488, + "language_loss": 0.74462497, + "learning_rate": 2.368073265481791e-06, + "loss": 0.76610887, + "num_input_tokens_seen": 163329415, + "step": 7615, + "time_per_iteration": 2.53828501701355 + }, + { + "auxiliary_loss_clip": 0.01027824, + "auxiliary_loss_mlp": 0.01011728, + "balance_loss_clip": 1.02092576, + "balance_loss_mlp": 1.01021373, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7859753188367176, + "language_loss": 0.57693172, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59732723, + "num_input_tokens_seen": 163385875, + "step": 7616, + "time_per_iteration": 3.115879774093628 + }, + { + "auxiliary_loss_clip": 0.01089357, + "auxiliary_loss_mlp": 0.0079253, + "balance_loss_clip": 1.03827643, + "balance_loss_mlp": 1.01796889, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.661409550548061, + "language_loss": 0.70831835, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.72713721, + "num_input_tokens_seen": 163405170, + "step": 7617, + "time_per_iteration": 2.5494508743286133 + }, + { + "auxiliary_loss_clip": 0.01121328, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.04333103, + "balance_loss_mlp": 1.02131724, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 3.1471515872120253, + "language_loss": 0.76268744, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78425229, + "num_input_tokens_seen": 163423155, + "step": 7618, + "time_per_iteration": 2.5000972747802734 + }, + { + "auxiliary_loss_clip": 0.0108475, + "auxiliary_loss_mlp": 0.01040274, + "balance_loss_clip": 1.04117751, + "balance_loss_mlp": 1.02689874, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 1.646988952337748, + "language_loss": 0.76904356, + "learning_rate": 2.366541916231585e-06, + "loss": 0.79029381, + "num_input_tokens_seen": 163442450, + "step": 7619, + "time_per_iteration": 2.6207408905029297 + }, + { + "auxiliary_loss_clip": 0.01118006, + "auxiliary_loss_mlp": 0.01037168, + "balance_loss_clip": 1.0417397, + "balance_loss_mlp": 1.02474046, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 1.7959546743044283, + "language_loss": 0.71756518, + "learning_rate": 2.366159044134473e-06, + "loss": 0.73911691, + "num_input_tokens_seen": 163459810, + "step": 7620, + "time_per_iteration": 2.4641294479370117 + }, + { + "auxiliary_loss_clip": 0.01093194, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.04045558, + "balance_loss_mlp": 1.01717925, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 1.55379691382913, + "language_loss": 0.77805609, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.79929042, + "num_input_tokens_seen": 163482970, + "step": 7621, + "time_per_iteration": 2.755755662918091 + }, + { + "auxiliary_loss_clip": 0.01032376, + "auxiliary_loss_mlp": 0.01002387, + "balance_loss_clip": 1.01493573, + "balance_loss_mlp": 1.00097466, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7859044406718821, + "language_loss": 0.65025854, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67060626, + "num_input_tokens_seen": 163545330, + "step": 7622, + "time_per_iteration": 3.1740617752075195 + }, + { + "auxiliary_loss_clip": 0.0110539, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.04168653, + "balance_loss_mlp": 1.01737022, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 1.768788089588182, + "language_loss": 0.79664278, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81801283, + "num_input_tokens_seen": 163564620, + "step": 7623, + "time_per_iteration": 2.5738155841827393 + }, + { + "auxiliary_loss_clip": 0.01070845, + "auxiliary_loss_mlp": 0.01036976, + "balance_loss_clip": 1.03885829, + "balance_loss_mlp": 1.02244401, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 2.5129157285400803, + "language_loss": 0.70921087, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.7302891, + "num_input_tokens_seen": 163581010, + "step": 7624, + "time_per_iteration": 2.6099324226379395 + }, + { + "auxiliary_loss_clip": 0.01090713, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.03759336, + "balance_loss_mlp": 1.02426004, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 1.8986518305475755, + "language_loss": 0.72960842, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75090331, + "num_input_tokens_seen": 163599955, + "step": 7625, + "time_per_iteration": 2.634599447250366 + }, + { + "auxiliary_loss_clip": 0.01112431, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.04246819, + "balance_loss_mlp": 1.02556419, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 2.5742670433707215, + "language_loss": 0.7777921, + "learning_rate": 2.363861520479451e-06, + "loss": 0.79930717, + "num_input_tokens_seen": 163618545, + "step": 7626, + "time_per_iteration": 2.5304484367370605 + }, + { + "auxiliary_loss_clip": 0.01122124, + "auxiliary_loss_mlp": 0.01042399, + "balance_loss_clip": 1.04216182, + "balance_loss_mlp": 1.02854049, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.730047676486694, + "language_loss": 0.84948349, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.87112868, + "num_input_tokens_seen": 163636055, + "step": 7627, + "time_per_iteration": 2.441471576690674 + }, + { + "auxiliary_loss_clip": 0.01123868, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.04135537, + "balance_loss_mlp": 1.02281487, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 1.5863784464980368, + "language_loss": 0.69371521, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71532381, + "num_input_tokens_seen": 163657485, + "step": 7628, + "time_per_iteration": 2.5832228660583496 + }, + { + "auxiliary_loss_clip": 0.0110529, + "auxiliary_loss_mlp": 0.0103064, + "balance_loss_clip": 1.03810275, + "balance_loss_mlp": 1.01687765, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.6224546273552722, + "language_loss": 0.78075004, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.8021093, + "num_input_tokens_seen": 163676030, + "step": 7629, + "time_per_iteration": 2.4919581413269043 + }, + { + "auxiliary_loss_clip": 0.01103298, + "auxiliary_loss_mlp": 0.01039635, + "balance_loss_clip": 1.04149127, + "balance_loss_mlp": 1.02461517, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.1339777208700546, + "language_loss": 0.79405236, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.81548166, + "num_input_tokens_seen": 163694490, + "step": 7630, + "time_per_iteration": 2.596471071243286 + }, + { + "auxiliary_loss_clip": 0.01100869, + "auxiliary_loss_mlp": 0.01037962, + "balance_loss_clip": 1.04167938, + "balance_loss_mlp": 1.0233829, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 1.7609221671558626, + "language_loss": 0.71637946, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.73776776, + "num_input_tokens_seen": 163717035, + "step": 7631, + "time_per_iteration": 2.623997211456299 + }, + { + "auxiliary_loss_clip": 0.01084976, + "auxiliary_loss_mlp": 0.0104287, + "balance_loss_clip": 1.04098201, + "balance_loss_mlp": 1.02708673, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.144255544167101, + "language_loss": 0.7130543, + "learning_rate": 2.361563500108531e-06, + "loss": 0.73433274, + "num_input_tokens_seen": 163734525, + "step": 7632, + "time_per_iteration": 2.5423994064331055 + }, + { + "auxiliary_loss_clip": 0.01072048, + "auxiliary_loss_mlp": 0.00791533, + "balance_loss_clip": 1.03816247, + "balance_loss_mlp": 1.01211751, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 2.7195490602890846, + "language_loss": 0.69922262, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.71785843, + "num_input_tokens_seen": 163752860, + "step": 7633, + "time_per_iteration": 2.57700514793396 + }, + { + "auxiliary_loss_clip": 0.01108469, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.04038715, + "balance_loss_mlp": 1.0247972, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.5397566883618532, + "language_loss": 0.81118381, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.83265758, + "num_input_tokens_seen": 163772495, + "step": 7634, + "time_per_iteration": 2.5451271533966064 + }, + { + "auxiliary_loss_clip": 0.01110907, + "auxiliary_loss_mlp": 0.00793467, + "balance_loss_clip": 1.04217982, + "balance_loss_mlp": 1.01617789, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.8397511767835342, + "language_loss": 0.8146944, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.83373815, + "num_input_tokens_seen": 163791475, + "step": 7635, + "time_per_iteration": 2.5213987827301025 + }, + { + "auxiliary_loss_clip": 0.01096505, + "auxiliary_loss_mlp": 0.010403, + "balance_loss_clip": 1.04109836, + "balance_loss_mlp": 1.02680516, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.8742781608241168, + "language_loss": 0.64844489, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.66981292, + "num_input_tokens_seen": 163812995, + "step": 7636, + "time_per_iteration": 2.681605100631714 + }, + { + "auxiliary_loss_clip": 0.01104089, + "auxiliary_loss_mlp": 0.01030441, + "balance_loss_clip": 1.04196239, + "balance_loss_mlp": 1.01695275, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.4884766745810083, + "language_loss": 0.80778164, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82912689, + "num_input_tokens_seen": 163833945, + "step": 7637, + "time_per_iteration": 2.5660412311553955 + }, + { + "auxiliary_loss_clip": 0.01092463, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.0392096, + "balance_loss_mlp": 1.02168322, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.4506137287407685, + "language_loss": 0.75485563, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77615613, + "num_input_tokens_seen": 163853885, + "step": 7638, + "time_per_iteration": 2.558304786682129 + }, + { + "auxiliary_loss_clip": 0.01106265, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.04026937, + "balance_loss_mlp": 1.02074742, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 1.7010931043479793, + "language_loss": 0.74211645, + "learning_rate": 2.358881852733989e-06, + "loss": 0.7635237, + "num_input_tokens_seen": 163871855, + "step": 7639, + "time_per_iteration": 2.509464979171753 + }, + { + "auxiliary_loss_clip": 0.01121694, + "auxiliary_loss_mlp": 0.01034818, + "balance_loss_clip": 1.04186797, + "balance_loss_mlp": 1.02132952, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 1.5916016146333272, + "language_loss": 0.68320984, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70477498, + "num_input_tokens_seen": 163891450, + "step": 7640, + "time_per_iteration": 2.5085716247558594 + }, + { + "auxiliary_loss_clip": 0.01101929, + "auxiliary_loss_mlp": 0.01036831, + "balance_loss_clip": 1.03980827, + "balance_loss_mlp": 1.02272868, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 1.6915228940210105, + "language_loss": 0.75646693, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77785456, + "num_input_tokens_seen": 163909345, + "step": 7641, + "time_per_iteration": 2.534411907196045 + }, + { + "auxiliary_loss_clip": 0.01097747, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.04293299, + "balance_loss_mlp": 1.01634622, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 1.6774200501312442, + "language_loss": 0.74939781, + "learning_rate": 2.357732370864668e-06, + "loss": 0.77068841, + "num_input_tokens_seen": 163926940, + "step": 7642, + "time_per_iteration": 2.548884153366089 + }, + { + "auxiliary_loss_clip": 0.01042152, + "auxiliary_loss_mlp": 0.01003357, + "balance_loss_clip": 1.03218675, + "balance_loss_mlp": 1.00184917, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8419302966472683, + "language_loss": 0.58213031, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60258532, + "num_input_tokens_seen": 163977785, + "step": 7643, + "time_per_iteration": 4.252629518508911 + }, + { + "auxiliary_loss_clip": 0.0111339, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.03975976, + "balance_loss_mlp": 1.01926494, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 1.5301806325876963, + "language_loss": 0.93004668, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95151556, + "num_input_tokens_seen": 163996630, + "step": 7644, + "time_per_iteration": 2.534205436706543 + }, + { + "auxiliary_loss_clip": 0.01108079, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.04201889, + "balance_loss_mlp": 1.02155066, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 2.06181017288064, + "language_loss": 0.82900751, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.85044456, + "num_input_tokens_seen": 164013190, + "step": 7645, + "time_per_iteration": 3.869338035583496 + }, + { + "auxiliary_loss_clip": 0.01015267, + "auxiliary_loss_mlp": 0.01005511, + "balance_loss_clip": 1.01651347, + "balance_loss_mlp": 1.00388992, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7646749821338306, + "language_loss": 0.59887481, + "learning_rate": 2.356199538526593e-06, + "loss": 0.61908263, + "num_input_tokens_seen": 164074030, + "step": 7646, + "time_per_iteration": 3.106841564178467 + }, + { + "auxiliary_loss_clip": 0.01105205, + "auxiliary_loss_mlp": 0.01033821, + "balance_loss_clip": 1.04034245, + "balance_loss_mlp": 1.01901531, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 2.077900568253787, + "language_loss": 0.72630507, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74769533, + "num_input_tokens_seen": 164095515, + "step": 7647, + "time_per_iteration": 2.565035104751587 + }, + { + "auxiliary_loss_clip": 0.01086973, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.03834581, + "balance_loss_mlp": 1.01850772, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 1.698220443095354, + "language_loss": 0.66848505, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.68967617, + "num_input_tokens_seen": 164117270, + "step": 7648, + "time_per_iteration": 2.6183369159698486 + }, + { + "auxiliary_loss_clip": 0.01109208, + "auxiliary_loss_mlp": 0.01029101, + "balance_loss_clip": 1.03942943, + "balance_loss_mlp": 1.01552308, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.4286224261872482, + "language_loss": 0.78756607, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.80894911, + "num_input_tokens_seen": 164137850, + "step": 7649, + "time_per_iteration": 3.909003734588623 + }, + { + "auxiliary_loss_clip": 0.01061418, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.04134846, + "balance_loss_mlp": 1.02091122, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 1.6865603816500283, + "language_loss": 0.69116116, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.71212828, + "num_input_tokens_seen": 164157960, + "step": 7650, + "time_per_iteration": 2.6528146266937256 + }, + { + "auxiliary_loss_clip": 0.01113658, + "auxiliary_loss_mlp": 0.0103579, + "balance_loss_clip": 1.04118979, + "balance_loss_mlp": 1.02044725, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 1.9014868323277268, + "language_loss": 0.83910304, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86059749, + "num_input_tokens_seen": 164174590, + "step": 7651, + "time_per_iteration": 3.8575491905212402 + }, + { + "auxiliary_loss_clip": 0.0109899, + "auxiliary_loss_mlp": 0.0079038, + "balance_loss_clip": 1.041381, + "balance_loss_mlp": 1.01572144, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 2.1826951985825187, + "language_loss": 0.75960147, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.77849519, + "num_input_tokens_seen": 164192935, + "step": 7652, + "time_per_iteration": 2.5263516902923584 + }, + { + "auxiliary_loss_clip": 0.01073694, + "auxiliary_loss_mlp": 0.01029385, + "balance_loss_clip": 1.03804576, + "balance_loss_mlp": 1.0145911, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.7107258164037007, + "language_loss": 0.76244694, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.78347772, + "num_input_tokens_seen": 164213160, + "step": 7653, + "time_per_iteration": 2.6429028511047363 + }, + { + "auxiliary_loss_clip": 0.01080334, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.04263592, + "balance_loss_mlp": 1.02138376, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 2.018157079197094, + "language_loss": 0.6607368, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68191814, + "num_input_tokens_seen": 164229330, + "step": 7654, + "time_per_iteration": 2.553253650665283 + }, + { + "auxiliary_loss_clip": 0.01095063, + "auxiliary_loss_mlp": 0.01035969, + "balance_loss_clip": 1.03785217, + "balance_loss_mlp": 1.02162218, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.7556587733254012, + "language_loss": 0.79282081, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81413114, + "num_input_tokens_seen": 164248240, + "step": 7655, + "time_per_iteration": 2.5941550731658936 + }, + { + "auxiliary_loss_clip": 0.01081913, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.03999507, + "balance_loss_mlp": 1.01734638, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 1.5479810482361755, + "language_loss": 0.67717469, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69830966, + "num_input_tokens_seen": 164268020, + "step": 7656, + "time_per_iteration": 2.6021289825439453 + }, + { + "auxiliary_loss_clip": 0.0109807, + "auxiliary_loss_mlp": 0.01031397, + "balance_loss_clip": 1.03947306, + "balance_loss_mlp": 1.01744962, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 1.8122647939833152, + "language_loss": 0.8100704, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83136505, + "num_input_tokens_seen": 164287305, + "step": 7657, + "time_per_iteration": 2.611483573913574 + }, + { + "auxiliary_loss_clip": 0.01119171, + "auxiliary_loss_mlp": 0.00788671, + "balance_loss_clip": 1.03998661, + "balance_loss_mlp": 1.01264083, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 2.012227400347502, + "language_loss": 0.7062875, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.725366, + "num_input_tokens_seen": 164306835, + "step": 7658, + "time_per_iteration": 2.5044569969177246 + }, + { + "auxiliary_loss_clip": 0.01030385, + "auxiliary_loss_mlp": 0.01004023, + "balance_loss_clip": 1.01189709, + "balance_loss_mlp": 1.00210381, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9541891940241639, + "language_loss": 0.62080538, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64114946, + "num_input_tokens_seen": 164367095, + "step": 7659, + "time_per_iteration": 3.202589988708496 + }, + { + "auxiliary_loss_clip": 0.01071169, + "auxiliary_loss_mlp": 0.01041641, + "balance_loss_clip": 1.03894782, + "balance_loss_mlp": 1.02561378, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 1.5576642004348105, + "language_loss": 0.68752116, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70864928, + "num_input_tokens_seen": 164388895, + "step": 7660, + "time_per_iteration": 2.6544415950775146 + }, + { + "auxiliary_loss_clip": 0.01107435, + "auxiliary_loss_mlp": 0.01036002, + "balance_loss_clip": 1.03891981, + "balance_loss_mlp": 1.02121449, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 1.7053034197622479, + "language_loss": 0.77097237, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.79240668, + "num_input_tokens_seen": 164409080, + "step": 7661, + "time_per_iteration": 2.5493686199188232 + }, + { + "auxiliary_loss_clip": 0.01103326, + "auxiliary_loss_mlp": 0.01042335, + "balance_loss_clip": 1.03970325, + "balance_loss_mlp": 1.02660489, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.765423355802958, + "language_loss": 0.74655795, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.76801455, + "num_input_tokens_seen": 164427585, + "step": 7662, + "time_per_iteration": 2.5316247940063477 + }, + { + "auxiliary_loss_clip": 0.01096165, + "auxiliary_loss_mlp": 0.01036538, + "balance_loss_clip": 1.03880656, + "balance_loss_mlp": 1.02073121, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 3.071430590193815, + "language_loss": 0.80206239, + "learning_rate": 2.349682601310998e-06, + "loss": 0.82338947, + "num_input_tokens_seen": 164438455, + "step": 7663, + "time_per_iteration": 2.4879274368286133 + }, + { + "auxiliary_loss_clip": 0.01105761, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.03975666, + "balance_loss_mlp": 1.01855135, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 1.8812340339377516, + "language_loss": 0.73201686, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.75339931, + "num_input_tokens_seen": 164456830, + "step": 7664, + "time_per_iteration": 2.476667642593384 + }, + { + "auxiliary_loss_clip": 0.01086414, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.04016805, + "balance_loss_mlp": 1.02176785, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 1.763503066088783, + "language_loss": 0.72367668, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74489331, + "num_input_tokens_seen": 164475375, + "step": 7665, + "time_per_iteration": 2.550098180770874 + }, + { + "auxiliary_loss_clip": 0.01089384, + "auxiliary_loss_mlp": 0.01031269, + "balance_loss_clip": 1.03883672, + "balance_loss_mlp": 1.01800132, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 1.6729094451826851, + "language_loss": 0.78279579, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80400229, + "num_input_tokens_seen": 164492040, + "step": 7666, + "time_per_iteration": 2.525930404663086 + }, + { + "auxiliary_loss_clip": 0.0107826, + "auxiliary_loss_mlp": 0.01034797, + "balance_loss_clip": 1.03968883, + "balance_loss_mlp": 1.01916838, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.3671635653073348, + "language_loss": 0.7382766, + "learning_rate": 2.348148644753088e-06, + "loss": 0.75940716, + "num_input_tokens_seen": 164513665, + "step": 7667, + "time_per_iteration": 2.6547675132751465 + }, + { + "auxiliary_loss_clip": 0.01073016, + "auxiliary_loss_mlp": 0.01033071, + "balance_loss_clip": 1.03946304, + "balance_loss_mlp": 1.01971912, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.4117675009464312, + "language_loss": 0.76319575, + "learning_rate": 2.347765122572676e-06, + "loss": 0.78425658, + "num_input_tokens_seen": 164533890, + "step": 7668, + "time_per_iteration": 2.6041359901428223 + }, + { + "auxiliary_loss_clip": 0.01069712, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.04336298, + "balance_loss_mlp": 1.0178628, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 1.5044116422787515, + "language_loss": 0.78235716, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80336201, + "num_input_tokens_seen": 164553815, + "step": 7669, + "time_per_iteration": 2.619983673095703 + }, + { + "auxiliary_loss_clip": 0.01103597, + "auxiliary_loss_mlp": 0.01037644, + "balance_loss_clip": 1.0378896, + "balance_loss_mlp": 1.02172971, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 2.003135880179364, + "language_loss": 0.8251074, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84651983, + "num_input_tokens_seen": 164573125, + "step": 7670, + "time_per_iteration": 2.5754334926605225 + }, + { + "auxiliary_loss_clip": 0.01106972, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.03882694, + "balance_loss_mlp": 1.01949894, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.6971171386104273, + "language_loss": 0.63679534, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.65819538, + "num_input_tokens_seen": 164592575, + "step": 7671, + "time_per_iteration": 2.5200605392456055 + }, + { + "auxiliary_loss_clip": 0.01024823, + "auxiliary_loss_mlp": 0.0100358, + "balance_loss_clip": 1.01737523, + "balance_loss_mlp": 1.00195873, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.6989707125355941, + "language_loss": 0.55913389, + "learning_rate": 2.346230902123583e-06, + "loss": 0.57941794, + "num_input_tokens_seen": 164659795, + "step": 7672, + "time_per_iteration": 3.2214865684509277 + }, + { + "auxiliary_loss_clip": 0.01110825, + "auxiliary_loss_mlp": 0.01035798, + "balance_loss_clip": 1.04137814, + "balance_loss_mlp": 1.02210045, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.8526541007839679, + "language_loss": 0.71173739, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.73320365, + "num_input_tokens_seen": 164678735, + "step": 7673, + "time_per_iteration": 2.471381664276123 + }, + { + "auxiliary_loss_clip": 0.01094855, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.03973222, + "balance_loss_mlp": 1.01810503, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.7210410752930523, + "language_loss": 0.70637441, + "learning_rate": 2.345463713066195e-06, + "loss": 0.72764355, + "num_input_tokens_seen": 164700885, + "step": 7674, + "time_per_iteration": 2.6591646671295166 + }, + { + "auxiliary_loss_clip": 0.01094978, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.038414, + "balance_loss_mlp": 1.02358866, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 1.5110608931118745, + "language_loss": 0.65785176, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67917633, + "num_input_tokens_seen": 164726960, + "step": 7675, + "time_per_iteration": 2.671116828918457 + }, + { + "auxiliary_loss_clip": 0.01041789, + "auxiliary_loss_mlp": 0.01001623, + "balance_loss_clip": 1.0143019, + "balance_loss_mlp": 1.00020409, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.8220956935109288, + "language_loss": 0.58616233, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60659641, + "num_input_tokens_seen": 164788525, + "step": 7676, + "time_per_iteration": 3.106959104537964 + }, + { + "auxiliary_loss_clip": 0.01015114, + "auxiliary_loss_mlp": 0.01000722, + "balance_loss_clip": 1.01788282, + "balance_loss_mlp": 0.99926722, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7888027773719017, + "language_loss": 0.62663752, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64679587, + "num_input_tokens_seen": 164843525, + "step": 7677, + "time_per_iteration": 3.0071446895599365 + }, + { + "auxiliary_loss_clip": 0.0109576, + "auxiliary_loss_mlp": 0.01031816, + "balance_loss_clip": 1.04063618, + "balance_loss_mlp": 1.01916814, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 2.2101449459433122, + "language_loss": 0.76387417, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78514999, + "num_input_tokens_seen": 164859895, + "step": 7678, + "time_per_iteration": 2.5216281414031982 + }, + { + "auxiliary_loss_clip": 0.01121224, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.04209828, + "balance_loss_mlp": 1.01750207, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 1.9514510588442076, + "language_loss": 0.66871035, + "learning_rate": 2.343545511426974e-06, + "loss": 0.69023782, + "num_input_tokens_seen": 164878030, + "step": 7679, + "time_per_iteration": 2.4820895195007324 + }, + { + "auxiliary_loss_clip": 0.01087137, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.04032731, + "balance_loss_mlp": 1.02287829, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 1.8177495470725336, + "language_loss": 0.70005095, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.72128403, + "num_input_tokens_seen": 164895710, + "step": 7680, + "time_per_iteration": 2.564502716064453 + }, + { + "auxiliary_loss_clip": 0.01128247, + "auxiliary_loss_mlp": 0.01044718, + "balance_loss_clip": 1.04595828, + "balance_loss_mlp": 1.03015614, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 1.6948636091688467, + "language_loss": 0.63357991, + "learning_rate": 2.342778139478487e-06, + "loss": 0.65530956, + "num_input_tokens_seen": 164913365, + "step": 7681, + "time_per_iteration": 2.461862564086914 + }, + { + "auxiliary_loss_clip": 0.01105626, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.03897905, + "balance_loss_mlp": 1.01769948, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 1.4304829874092857, + "language_loss": 0.67451274, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69587582, + "num_input_tokens_seen": 164931620, + "step": 7682, + "time_per_iteration": 3.886735677719116 + }, + { + "auxiliary_loss_clip": 0.01081577, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.04018164, + "balance_loss_mlp": 1.02411473, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.0372989331438967, + "language_loss": 0.73769724, + "learning_rate": 2.342010715537275e-06, + "loss": 0.75889671, + "num_input_tokens_seen": 164950905, + "step": 7683, + "time_per_iteration": 2.623055934906006 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.04215288, + "balance_loss_mlp": 1.0204854, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.7280956742610316, + "language_loss": 0.76175058, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78328574, + "num_input_tokens_seen": 164970950, + "step": 7684, + "time_per_iteration": 4.328751087188721 + }, + { + "auxiliary_loss_clip": 0.01126335, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.04404974, + "balance_loss_mlp": 1.02515495, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 1.8526383160449416, + "language_loss": 0.79703903, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.81869656, + "num_input_tokens_seen": 164989855, + "step": 7685, + "time_per_iteration": 2.466585636138916 + }, + { + "auxiliary_loss_clip": 0.01081243, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.04605615, + "balance_loss_mlp": 1.02641678, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 1.6851256628803053, + "language_loss": 0.66549659, + "learning_rate": 2.340859482393731e-06, + "loss": 0.68672293, + "num_input_tokens_seen": 165012290, + "step": 7686, + "time_per_iteration": 2.6781020164489746 + }, + { + "auxiliary_loss_clip": 0.01100377, + "auxiliary_loss_mlp": 0.00789326, + "balance_loss_clip": 1.04238904, + "balance_loss_mlp": 1.01172185, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 2.3162017582398384, + "language_loss": 0.73812568, + "learning_rate": 2.340475712142296e-06, + "loss": 0.75702274, + "num_input_tokens_seen": 165030810, + "step": 7687, + "time_per_iteration": 4.028199195861816 + }, + { + "auxiliary_loss_clip": 0.01056044, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.04256916, + "balance_loss_mlp": 1.01635885, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.7328188495499823, + "language_loss": 0.74931192, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.7701776, + "num_input_tokens_seen": 165050205, + "step": 7688, + "time_per_iteration": 2.642214059829712 + }, + { + "auxiliary_loss_clip": 0.01069587, + "auxiliary_loss_mlp": 0.007884, + "balance_loss_clip": 1.03822267, + "balance_loss_mlp": 1.0111419, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.8254642045945535, + "language_loss": 0.79026461, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.80884451, + "num_input_tokens_seen": 165069370, + "step": 7689, + "time_per_iteration": 4.026331424713135 + }, + { + "auxiliary_loss_clip": 0.01109207, + "auxiliary_loss_mlp": 0.01036642, + "balance_loss_clip": 1.03988111, + "balance_loss_mlp": 1.02172899, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 2.286629313103284, + "language_loss": 0.57157046, + "learning_rate": 2.339324323980964e-06, + "loss": 0.5930289, + "num_input_tokens_seen": 165089610, + "step": 7690, + "time_per_iteration": 2.5442843437194824 + }, + { + "auxiliary_loss_clip": 0.01109013, + "auxiliary_loss_mlp": 0.01034134, + "balance_loss_clip": 1.04087782, + "balance_loss_mlp": 1.02018619, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 1.988708727304612, + "language_loss": 0.82691473, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.84834617, + "num_input_tokens_seen": 165109050, + "step": 7691, + "time_per_iteration": 2.492231845855713 + }, + { + "auxiliary_loss_clip": 0.0109876, + "auxiliary_loss_mlp": 0.01027752, + "balance_loss_clip": 1.04081774, + "balance_loss_mlp": 1.01453793, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.5312903978158867, + "language_loss": 0.75474274, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77600789, + "num_input_tokens_seen": 165130130, + "step": 7692, + "time_per_iteration": 2.5442233085632324 + }, + { + "auxiliary_loss_clip": 0.01090252, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.04133844, + "balance_loss_mlp": 1.02262831, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 1.6199728044497228, + "language_loss": 0.74061632, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76188904, + "num_input_tokens_seen": 165152685, + "step": 7693, + "time_per_iteration": 2.6615331172943115 + }, + { + "auxiliary_loss_clip": 0.01079445, + "auxiliary_loss_mlp": 0.01046493, + "balance_loss_clip": 1.04113126, + "balance_loss_mlp": 1.03107977, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.5392312943629898, + "language_loss": 0.85664201, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87790138, + "num_input_tokens_seen": 165173315, + "step": 7694, + "time_per_iteration": 2.5553884506225586 + }, + { + "auxiliary_loss_clip": 0.01100079, + "auxiliary_loss_mlp": 0.0104082, + "balance_loss_clip": 1.04042149, + "balance_loss_mlp": 1.02743936, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 1.984011202539224, + "language_loss": 0.78849781, + "learning_rate": 2.337405086561902e-06, + "loss": 0.80990684, + "num_input_tokens_seen": 165192395, + "step": 7695, + "time_per_iteration": 2.555619478225708 + }, + { + "auxiliary_loss_clip": 0.01104425, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.0392797, + "balance_loss_mlp": 1.01910377, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.7699832113288594, + "language_loss": 0.72457474, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74594134, + "num_input_tokens_seen": 165211355, + "step": 7696, + "time_per_iteration": 2.470168352127075 + }, + { + "auxiliary_loss_clip": 0.010982, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.04032063, + "balance_loss_mlp": 1.0231328, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.5835126638543793, + "language_loss": 0.69817591, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71953177, + "num_input_tokens_seen": 165229380, + "step": 7697, + "time_per_iteration": 2.501528739929199 + }, + { + "auxiliary_loss_clip": 0.01119059, + "auxiliary_loss_mlp": 0.01029787, + "balance_loss_clip": 1.04218268, + "balance_loss_mlp": 1.01672149, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 2.3272337162454857, + "language_loss": 0.84601039, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.86749887, + "num_input_tokens_seen": 165247200, + "step": 7698, + "time_per_iteration": 2.4781899452209473 + }, + { + "auxiliary_loss_clip": 0.0111914, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.04183519, + "balance_loss_mlp": 1.0199697, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 2.0092190696938625, + "language_loss": 0.7102077, + "learning_rate": 2.335869466239502e-06, + "loss": 0.7317301, + "num_input_tokens_seen": 165265825, + "step": 7699, + "time_per_iteration": 2.4430227279663086 + }, + { + "auxiliary_loss_clip": 0.01069208, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.039253, + "balance_loss_mlp": 1.01895392, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 1.8784100116108031, + "language_loss": 0.71567023, + "learning_rate": 2.335485529281996e-06, + "loss": 0.73669583, + "num_input_tokens_seen": 165284380, + "step": 7700, + "time_per_iteration": 2.597999334335327 + }, + { + "auxiliary_loss_clip": 0.01117006, + "auxiliary_loss_mlp": 0.00786505, + "balance_loss_clip": 1.03965521, + "balance_loss_mlp": 1.00879729, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 2.0507341327455255, + "language_loss": 0.7250576, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74409264, + "num_input_tokens_seen": 165300320, + "step": 7701, + "time_per_iteration": 2.4569966793060303 + }, + { + "auxiliary_loss_clip": 0.01077481, + "auxiliary_loss_mlp": 0.01035126, + "balance_loss_clip": 1.04007864, + "balance_loss_mlp": 1.02077353, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 2.091919920128807, + "language_loss": 0.64534295, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.66646898, + "num_input_tokens_seen": 165318130, + "step": 7702, + "time_per_iteration": 2.6831111907958984 + }, + { + "auxiliary_loss_clip": 0.01092241, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.03940725, + "balance_loss_mlp": 1.01445794, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 2.052567610124491, + "language_loss": 0.73763227, + "learning_rate": 2.33433364213785e-06, + "loss": 0.75883275, + "num_input_tokens_seen": 165336225, + "step": 7703, + "time_per_iteration": 2.528388261795044 + }, + { + "auxiliary_loss_clip": 0.01098096, + "auxiliary_loss_mlp": 0.01033677, + "balance_loss_clip": 1.04039407, + "balance_loss_mlp": 1.01933014, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 1.7230614529144024, + "language_loss": 0.69168186, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.71299958, + "num_input_tokens_seen": 165355005, + "step": 7704, + "time_per_iteration": 2.5417864322662354 + }, + { + "auxiliary_loss_clip": 0.01109415, + "auxiliary_loss_mlp": 0.01026997, + "balance_loss_clip": 1.04257524, + "balance_loss_mlp": 1.01303196, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 2.8049463022045926, + "language_loss": 0.81378317, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83514726, + "num_input_tokens_seen": 165374910, + "step": 7705, + "time_per_iteration": 2.5289628505706787 + }, + { + "auxiliary_loss_clip": 0.01112798, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.04030728, + "balance_loss_mlp": 1.01836514, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.8889933376802064, + "language_loss": 0.77424723, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79569209, + "num_input_tokens_seen": 165392590, + "step": 7706, + "time_per_iteration": 2.468928813934326 + }, + { + "auxiliary_loss_clip": 0.01089299, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.04187787, + "balance_loss_mlp": 1.01559854, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 1.9210812452162274, + "language_loss": 0.70211184, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.72328937, + "num_input_tokens_seen": 165411195, + "step": 7707, + "time_per_iteration": 2.516860246658325 + }, + { + "auxiliary_loss_clip": 0.0109902, + "auxiliary_loss_mlp": 0.01035662, + "balance_loss_clip": 1.03833234, + "balance_loss_mlp": 1.02080822, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 2.217064240128672, + "language_loss": 0.60830623, + "learning_rate": 2.332413576865791e-06, + "loss": 0.62965304, + "num_input_tokens_seen": 165430150, + "step": 7708, + "time_per_iteration": 2.649106979370117 + }, + { + "auxiliary_loss_clip": 0.01081182, + "auxiliary_loss_mlp": 0.01029262, + "balance_loss_clip": 1.04025376, + "balance_loss_mlp": 1.01549947, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 1.9385647275717128, + "language_loss": 0.77335364, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.79445809, + "num_input_tokens_seen": 165450595, + "step": 7709, + "time_per_iteration": 2.63010835647583 + }, + { + "auxiliary_loss_clip": 0.01121136, + "auxiliary_loss_mlp": 0.01038412, + "balance_loss_clip": 1.041605, + "balance_loss_mlp": 1.02444625, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 2.2393731375008277, + "language_loss": 0.76939744, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79099292, + "num_input_tokens_seen": 165469515, + "step": 7710, + "time_per_iteration": 2.457853317260742 + }, + { + "auxiliary_loss_clip": 0.01111419, + "auxiliary_loss_mlp": 0.01032317, + "balance_loss_clip": 1.04102743, + "balance_loss_mlp": 1.01729631, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 2.2805322457174664, + "language_loss": 0.73436117, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75579852, + "num_input_tokens_seen": 165488125, + "step": 7711, + "time_per_iteration": 2.505941867828369 + }, + { + "auxiliary_loss_clip": 0.0110033, + "auxiliary_loss_mlp": 0.01044622, + "balance_loss_clip": 1.04200947, + "balance_loss_mlp": 1.03055501, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.3153904872717346, + "language_loss": 0.71470094, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73615044, + "num_input_tokens_seen": 165509225, + "step": 7712, + "time_per_iteration": 2.5354371070861816 + }, + { + "auxiliary_loss_clip": 0.01096787, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.04048181, + "balance_loss_mlp": 1.01999485, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 1.826385248557772, + "language_loss": 0.72965932, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.75098789, + "num_input_tokens_seen": 165529945, + "step": 7713, + "time_per_iteration": 2.5611751079559326 + }, + { + "auxiliary_loss_clip": 0.0108618, + "auxiliary_loss_mlp": 0.01036593, + "balance_loss_clip": 1.03890359, + "balance_loss_mlp": 1.02159643, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.8639041408237584, + "language_loss": 0.58601058, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60723829, + "num_input_tokens_seen": 165550690, + "step": 7714, + "time_per_iteration": 2.5711097717285156 + }, + { + "auxiliary_loss_clip": 0.01106237, + "auxiliary_loss_mlp": 0.01027717, + "balance_loss_clip": 1.03976274, + "balance_loss_mlp": 1.01429439, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 2.0394830182093013, + "language_loss": 0.70224643, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72358596, + "num_input_tokens_seen": 165567775, + "step": 7715, + "time_per_iteration": 2.4826362133026123 + }, + { + "auxiliary_loss_clip": 0.01125258, + "auxiliary_loss_mlp": 0.01032213, + "balance_loss_clip": 1.04203713, + "balance_loss_mlp": 1.01833689, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 1.842812977558961, + "language_loss": 0.68472016, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70629489, + "num_input_tokens_seen": 165587010, + "step": 7716, + "time_per_iteration": 2.4966373443603516 + }, + { + "auxiliary_loss_clip": 0.01122155, + "auxiliary_loss_mlp": 0.01028961, + "balance_loss_clip": 1.04184914, + "balance_loss_mlp": 1.01429832, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.5544837750882015, + "language_loss": 0.80950403, + "learning_rate": 2.328956666474691e-06, + "loss": 0.83101517, + "num_input_tokens_seen": 165607850, + "step": 7717, + "time_per_iteration": 2.5004947185516357 + }, + { + "auxiliary_loss_clip": 0.01120239, + "auxiliary_loss_mlp": 0.01032207, + "balance_loss_clip": 1.04096198, + "balance_loss_mlp": 1.01820588, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.7732864542382256, + "language_loss": 0.73185825, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75338268, + "num_input_tokens_seen": 165627175, + "step": 7718, + "time_per_iteration": 2.4812440872192383 + }, + { + "auxiliary_loss_clip": 0.01117014, + "auxiliary_loss_mlp": 0.00788514, + "balance_loss_clip": 1.03952599, + "balance_loss_mlp": 1.01359737, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.6892350551723885, + "language_loss": 0.70344579, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72250116, + "num_input_tokens_seen": 165648340, + "step": 7719, + "time_per_iteration": 2.581648826599121 + }, + { + "auxiliary_loss_clip": 0.01094536, + "auxiliary_loss_mlp": 0.01038906, + "balance_loss_clip": 1.04018426, + "balance_loss_mlp": 1.02463698, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 1.7572685751858477, + "language_loss": 0.86780953, + "learning_rate": 2.327804137953357e-06, + "loss": 0.889144, + "num_input_tokens_seen": 165667195, + "step": 7720, + "time_per_iteration": 2.5443360805511475 + }, + { + "auxiliary_loss_clip": 0.01023437, + "auxiliary_loss_mlp": 0.01001766, + "balance_loss_clip": 1.01634669, + "balance_loss_mlp": 1.00006092, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7345087865902721, + "language_loss": 0.55043364, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57068563, + "num_input_tokens_seen": 165726760, + "step": 7721, + "time_per_iteration": 4.473818063735962 + }, + { + "auxiliary_loss_clip": 0.01097783, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.04168725, + "balance_loss_mlp": 1.02377295, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 1.9485585225214987, + "language_loss": 0.79649895, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.81785429, + "num_input_tokens_seen": 165745005, + "step": 7722, + "time_per_iteration": 2.5162031650543213 + }, + { + "auxiliary_loss_clip": 0.01124674, + "auxiliary_loss_mlp": 0.01034889, + "balance_loss_clip": 1.04344118, + "balance_loss_mlp": 1.02097094, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 1.5233806165953534, + "language_loss": 0.7783348, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.79993045, + "num_input_tokens_seen": 165765750, + "step": 7723, + "time_per_iteration": 3.8636393547058105 + }, + { + "auxiliary_loss_clip": 0.01027051, + "auxiliary_loss_mlp": 0.01035562, + "balance_loss_clip": 1.04024005, + "balance_loss_mlp": 1.02228236, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.752273523095809, + "language_loss": 0.68474835, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70537448, + "num_input_tokens_seen": 165787515, + "step": 7724, + "time_per_iteration": 2.8889546394348145 + }, + { + "auxiliary_loss_clip": 0.01101981, + "auxiliary_loss_mlp": 0.01032939, + "balance_loss_clip": 1.041412, + "balance_loss_mlp": 1.01915216, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 2.5042636373490317, + "language_loss": 0.67005563, + "learning_rate": 2.325883008671415e-06, + "loss": 0.69140482, + "num_input_tokens_seen": 165806675, + "step": 7725, + "time_per_iteration": 2.68361234664917 + }, + { + "auxiliary_loss_clip": 0.0110366, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.04111969, + "balance_loss_mlp": 1.02777767, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.7304440232393894, + "language_loss": 0.65004623, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.67148274, + "num_input_tokens_seen": 165829835, + "step": 7726, + "time_per_iteration": 4.290181398391724 + }, + { + "auxiliary_loss_clip": 0.01097556, + "auxiliary_loss_mlp": 0.00789469, + "balance_loss_clip": 1.04345274, + "balance_loss_mlp": 1.01277804, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 1.7559302670064163, + "language_loss": 0.74857241, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.76744264, + "num_input_tokens_seen": 165849380, + "step": 7727, + "time_per_iteration": 2.5930943489074707 + }, + { + "auxiliary_loss_clip": 0.01096691, + "auxiliary_loss_mlp": 0.01043322, + "balance_loss_clip": 1.04132688, + "balance_loss_mlp": 1.02923203, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 1.974641867602, + "language_loss": 0.7852428, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.80664295, + "num_input_tokens_seen": 165868620, + "step": 7728, + "time_per_iteration": 4.205172061920166 + }, + { + "auxiliary_loss_clip": 0.01087579, + "auxiliary_loss_mlp": 0.0104157, + "balance_loss_clip": 1.04251158, + "balance_loss_mlp": 1.02755094, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 2.296302998201104, + "language_loss": 0.76009619, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78138769, + "num_input_tokens_seen": 165885915, + "step": 7729, + "time_per_iteration": 2.5763397216796875 + }, + { + "auxiliary_loss_clip": 0.01096444, + "auxiliary_loss_mlp": 0.01048863, + "balance_loss_clip": 1.04140365, + "balance_loss_mlp": 1.03322279, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 1.7039507964934366, + "language_loss": 0.80081117, + "learning_rate": 2.323961570451588e-06, + "loss": 0.82226419, + "num_input_tokens_seen": 165905465, + "step": 7730, + "time_per_iteration": 2.584210157394409 + }, + { + "auxiliary_loss_clip": 0.01119555, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.04214478, + "balance_loss_mlp": 1.02255309, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.53602550272045, + "language_loss": 0.7719748, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.79352552, + "num_input_tokens_seen": 165924640, + "step": 7731, + "time_per_iteration": 2.4774513244628906 + }, + { + "auxiliary_loss_clip": 0.01079136, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.04084063, + "balance_loss_mlp": 1.01711667, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 4.3548211996039905, + "language_loss": 0.66017175, + "learning_rate": 2.323192909069061e-06, + "loss": 0.68126869, + "num_input_tokens_seen": 165945765, + "step": 7732, + "time_per_iteration": 2.656026840209961 + }, + { + "auxiliary_loss_clip": 0.01099452, + "auxiliary_loss_mlp": 0.01043894, + "balance_loss_clip": 1.04157043, + "balance_loss_mlp": 1.02849197, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.3000012640744343, + "language_loss": 0.72725958, + "learning_rate": 2.32280855998725e-06, + "loss": 0.74869299, + "num_input_tokens_seen": 165964025, + "step": 7733, + "time_per_iteration": 2.548311471939087 + }, + { + "auxiliary_loss_clip": 0.01042071, + "auxiliary_loss_mlp": 0.01005962, + "balance_loss_clip": 1.01558065, + "balance_loss_mlp": 1.00429344, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.2256078106869954, + "language_loss": 0.5191825, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.53966278, + "num_input_tokens_seen": 166021950, + "step": 7734, + "time_per_iteration": 3.016254186630249 + }, + { + "auxiliary_loss_clip": 0.01097316, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.04528999, + "balance_loss_mlp": 1.01651502, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 1.9338551794930403, + "language_loss": 0.75945437, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.7807318, + "num_input_tokens_seen": 166039675, + "step": 7735, + "time_per_iteration": 2.52327823638916 + }, + { + "auxiliary_loss_clip": 0.01075503, + "auxiliary_loss_mlp": 0.0103847, + "balance_loss_clip": 1.04125381, + "balance_loss_mlp": 1.02439713, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 1.8272204273418973, + "language_loss": 0.69826007, + "learning_rate": 2.321655439354519e-06, + "loss": 0.71939981, + "num_input_tokens_seen": 166057745, + "step": 7736, + "time_per_iteration": 2.5655429363250732 + }, + { + "auxiliary_loss_clip": 0.01119166, + "auxiliary_loss_mlp": 0.01031816, + "balance_loss_clip": 1.04420066, + "balance_loss_mlp": 1.0194068, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.6239642428290886, + "language_loss": 0.72159749, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74310732, + "num_input_tokens_seen": 166076440, + "step": 7737, + "time_per_iteration": 2.4780867099761963 + }, + { + "auxiliary_loss_clip": 0.0110596, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.04677153, + "balance_loss_mlp": 1.02085066, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 1.9133728843776778, + "language_loss": 0.83646643, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85787976, + "num_input_tokens_seen": 166092520, + "step": 7738, + "time_per_iteration": 2.4867942333221436 + }, + { + "auxiliary_loss_clip": 0.01031819, + "auxiliary_loss_mlp": 0.01002502, + "balance_loss_clip": 1.01491046, + "balance_loss_mlp": 1.00092888, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7611536118798935, + "language_loss": 0.5783515, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59869468, + "num_input_tokens_seen": 166156285, + "step": 7739, + "time_per_iteration": 3.1521754264831543 + }, + { + "auxiliary_loss_clip": 0.0110212, + "auxiliary_loss_mlp": 0.01038262, + "balance_loss_clip": 1.04559946, + "balance_loss_mlp": 1.02473783, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 1.838040744933791, + "language_loss": 0.85096025, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.87236404, + "num_input_tokens_seen": 166173455, + "step": 7740, + "time_per_iteration": 2.544041872024536 + }, + { + "auxiliary_loss_clip": 0.01097522, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.04482722, + "balance_loss_mlp": 1.02038908, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 2.144216194559229, + "language_loss": 0.75812727, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.77944958, + "num_input_tokens_seen": 166194370, + "step": 7741, + "time_per_iteration": 2.554856061935425 + }, + { + "auxiliary_loss_clip": 0.01092479, + "auxiliary_loss_mlp": 0.01036915, + "balance_loss_clip": 1.04293716, + "balance_loss_mlp": 1.02349186, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 1.9452364256123638, + "language_loss": 0.81020474, + "learning_rate": 2.319348869158064e-06, + "loss": 0.83149868, + "num_input_tokens_seen": 166213195, + "step": 7742, + "time_per_iteration": 2.5594208240509033 + }, + { + "auxiliary_loss_clip": 0.01100494, + "auxiliary_loss_mlp": 0.01039096, + "balance_loss_clip": 1.04219377, + "balance_loss_mlp": 1.02477944, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 1.6604379484289034, + "language_loss": 0.72637701, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74777293, + "num_input_tokens_seen": 166231350, + "step": 7743, + "time_per_iteration": 2.522688627243042 + }, + { + "auxiliary_loss_clip": 0.01087408, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.04212797, + "balance_loss_mlp": 1.01545215, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 2.4220844355991926, + "language_loss": 0.71052551, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73169476, + "num_input_tokens_seen": 166250530, + "step": 7744, + "time_per_iteration": 2.5307161808013916 + }, + { + "auxiliary_loss_clip": 0.01078532, + "auxiliary_loss_mlp": 0.01026822, + "balance_loss_clip": 1.04600739, + "balance_loss_mlp": 1.01421022, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.498853352547733, + "language_loss": 0.84978235, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87083584, + "num_input_tokens_seen": 166272545, + "step": 7745, + "time_per_iteration": 2.733593225479126 + }, + { + "auxiliary_loss_clip": 0.0110838, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.04254234, + "balance_loss_mlp": 1.02133894, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.3545103482103438, + "language_loss": 0.72861671, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75005257, + "num_input_tokens_seen": 166292135, + "step": 7746, + "time_per_iteration": 2.539642095565796 + }, + { + "auxiliary_loss_clip": 0.01107687, + "auxiliary_loss_mlp": 0.01035249, + "balance_loss_clip": 1.04511094, + "balance_loss_mlp": 1.02202892, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.7019900797669245, + "language_loss": 0.70105875, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.72248816, + "num_input_tokens_seen": 166316710, + "step": 7747, + "time_per_iteration": 2.8398334980010986 + }, + { + "auxiliary_loss_clip": 0.0107973, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.03959417, + "balance_loss_mlp": 1.02261806, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 1.4594100560552439, + "language_loss": 0.67525494, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69642949, + "num_input_tokens_seen": 166338535, + "step": 7748, + "time_per_iteration": 2.661412477493286 + }, + { + "auxiliary_loss_clip": 0.01085955, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.04428136, + "balance_loss_mlp": 1.0202086, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 1.9434076440979862, + "language_loss": 0.6396181, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.66083473, + "num_input_tokens_seen": 166355540, + "step": 7749, + "time_per_iteration": 2.5158324241638184 + }, + { + "auxiliary_loss_clip": 0.0111475, + "auxiliary_loss_mlp": 0.01034149, + "balance_loss_clip": 1.04484582, + "balance_loss_mlp": 1.01932502, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 2.1289787477738256, + "language_loss": 0.74871409, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.77020299, + "num_input_tokens_seen": 166372635, + "step": 7750, + "time_per_iteration": 2.5047662258148193 + }, + { + "auxiliary_loss_clip": 0.01106027, + "auxiliary_loss_mlp": 0.01028411, + "balance_loss_clip": 1.0439347, + "balance_loss_mlp": 1.01398683, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 2.0242678847766613, + "language_loss": 0.74018157, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.76152599, + "num_input_tokens_seen": 166393175, + "step": 7751, + "time_per_iteration": 2.6138193607330322 + }, + { + "auxiliary_loss_clip": 0.01090812, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.04297495, + "balance_loss_mlp": 1.01754451, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 1.80002853460792, + "language_loss": 0.73253942, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.75376803, + "num_input_tokens_seen": 166408630, + "step": 7752, + "time_per_iteration": 2.598094940185547 + }, + { + "auxiliary_loss_clip": 0.01095325, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.04391468, + "balance_loss_mlp": 1.01839614, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 2.2877430454946346, + "language_loss": 0.68848234, + "learning_rate": 2.315119027142644e-06, + "loss": 0.7097562, + "num_input_tokens_seen": 166428170, + "step": 7753, + "time_per_iteration": 2.575422763824463 + }, + { + "auxiliary_loss_clip": 0.01094217, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.04422128, + "balance_loss_mlp": 1.0193789, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 1.8567069868149921, + "language_loss": 0.72733343, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.74860704, + "num_input_tokens_seen": 166446705, + "step": 7754, + "time_per_iteration": 2.575521230697632 + }, + { + "auxiliary_loss_clip": 0.01101046, + "auxiliary_loss_mlp": 0.01028362, + "balance_loss_clip": 1.04231429, + "balance_loss_mlp": 1.01423585, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.5780660496469205, + "language_loss": 0.78762019, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.80891424, + "num_input_tokens_seen": 166466750, + "step": 7755, + "time_per_iteration": 2.561568021774292 + }, + { + "auxiliary_loss_clip": 0.0110499, + "auxiliary_loss_mlp": 0.01027175, + "balance_loss_clip": 1.04136801, + "balance_loss_mlp": 1.01409817, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.7536979059193405, + "language_loss": 0.72348523, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74480689, + "num_input_tokens_seen": 166485400, + "step": 7756, + "time_per_iteration": 2.532601833343506 + }, + { + "auxiliary_loss_clip": 0.0110671, + "auxiliary_loss_mlp": 0.0102983, + "balance_loss_clip": 1.04089355, + "balance_loss_mlp": 1.01622224, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 1.6691703228491073, + "language_loss": 0.77888852, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80025387, + "num_input_tokens_seen": 166505730, + "step": 7757, + "time_per_iteration": 2.5754804611206055 + }, + { + "auxiliary_loss_clip": 0.01080074, + "auxiliary_loss_mlp": 0.01030242, + "balance_loss_clip": 1.03928137, + "balance_loss_mlp": 1.01724219, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 1.9725481817779147, + "language_loss": 0.66091484, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68201798, + "num_input_tokens_seen": 166523770, + "step": 7758, + "time_per_iteration": 2.5847082138061523 + }, + { + "auxiliary_loss_clip": 0.01092305, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.04262066, + "balance_loss_mlp": 1.0218184, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.7121169313829245, + "language_loss": 0.74617004, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.76744896, + "num_input_tokens_seen": 166542935, + "step": 7759, + "time_per_iteration": 2.508730888366699 + }, + { + "auxiliary_loss_clip": 0.01098881, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.0432632, + "balance_loss_mlp": 1.02168643, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.5440393967651211, + "language_loss": 0.77907872, + "learning_rate": 2.312426555462893e-06, + "loss": 0.80041164, + "num_input_tokens_seen": 166563935, + "step": 7760, + "time_per_iteration": 3.9425315856933594 + }, + { + "auxiliary_loss_clip": 0.01091909, + "auxiliary_loss_mlp": 0.01028966, + "balance_loss_clip": 1.03896379, + "balance_loss_mlp": 1.0159725, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.910291063183726, + "language_loss": 0.74280322, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76401198, + "num_input_tokens_seen": 166582175, + "step": 7761, + "time_per_iteration": 3.89027738571167 + }, + { + "auxiliary_loss_clip": 0.0110887, + "auxiliary_loss_mlp": 0.01038883, + "balance_loss_clip": 1.04518032, + "balance_loss_mlp": 1.02367759, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.740244403852362, + "language_loss": 0.78757137, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.80904889, + "num_input_tokens_seen": 166601870, + "step": 7762, + "time_per_iteration": 2.5022125244140625 + }, + { + "auxiliary_loss_clip": 0.01028445, + "auxiliary_loss_mlp": 0.01003018, + "balance_loss_clip": 1.01245606, + "balance_loss_mlp": 1.00144434, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7945625112745611, + "language_loss": 0.59798241, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61829704, + "num_input_tokens_seen": 166668960, + "step": 7763, + "time_per_iteration": 3.1578493118286133 + }, + { + "auxiliary_loss_clip": 0.01081604, + "auxiliary_loss_mlp": 0.01036536, + "balance_loss_clip": 1.04097986, + "balance_loss_mlp": 1.0211997, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 2.489887650683663, + "language_loss": 0.78928566, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.81046706, + "num_input_tokens_seen": 166686110, + "step": 7764, + "time_per_iteration": 3.9242348670959473 + }, + { + "auxiliary_loss_clip": 0.01088922, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.04300296, + "balance_loss_mlp": 1.02038443, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.8246583984764693, + "language_loss": 0.72423416, + "learning_rate": 2.310503005696839e-06, + "loss": 0.74545348, + "num_input_tokens_seen": 166703930, + "step": 7765, + "time_per_iteration": 2.531252384185791 + }, + { + "auxiliary_loss_clip": 0.01076088, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.04068065, + "balance_loss_mlp": 1.01812911, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 1.9175954286081385, + "language_loss": 0.77831745, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.79939556, + "num_input_tokens_seen": 166719940, + "step": 7766, + "time_per_iteration": 2.5352258682250977 + }, + { + "auxiliary_loss_clip": 0.01102645, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.03975749, + "balance_loss_mlp": 1.01993728, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 3.620814830873186, + "language_loss": 0.64600128, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.66736162, + "num_input_tokens_seen": 166738285, + "step": 7767, + "time_per_iteration": 3.856947422027588 + }, + { + "auxiliary_loss_clip": 0.01108676, + "auxiliary_loss_mlp": 0.01037764, + "balance_loss_clip": 1.04229057, + "balance_loss_mlp": 1.02434123, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 1.979717476635691, + "language_loss": 0.74515891, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76662326, + "num_input_tokens_seen": 166758170, + "step": 7768, + "time_per_iteration": 2.5261070728302 + }, + { + "auxiliary_loss_clip": 0.01090034, + "auxiliary_loss_mlp": 0.01026116, + "balance_loss_clip": 1.04488611, + "balance_loss_mlp": 1.01314616, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.757373041646167, + "language_loss": 0.70678699, + "learning_rate": 2.308963953858982e-06, + "loss": 0.72794855, + "num_input_tokens_seen": 166775750, + "step": 7769, + "time_per_iteration": 2.5165154933929443 + }, + { + "auxiliary_loss_clip": 0.01117767, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.04103374, + "balance_loss_mlp": 1.01890564, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 1.9186799145266846, + "language_loss": 0.81349564, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83499062, + "num_input_tokens_seen": 166791720, + "step": 7770, + "time_per_iteration": 2.4689180850982666 + }, + { + "auxiliary_loss_clip": 0.010363, + "auxiliary_loss_mlp": 0.01002437, + "balance_loss_clip": 1.00999415, + "balance_loss_mlp": 1.00087547, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.8202379013701127, + "language_loss": 0.55595064, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57633793, + "num_input_tokens_seen": 166856360, + "step": 7771, + "time_per_iteration": 3.073331832885742 + }, + { + "auxiliary_loss_clip": 0.0110471, + "auxiliary_loss_mlp": 0.00785829, + "balance_loss_clip": 1.04168379, + "balance_loss_mlp": 1.00958645, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 1.9086491710168587, + "language_loss": 0.66117942, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.68008482, + "num_input_tokens_seen": 166875925, + "step": 7772, + "time_per_iteration": 2.5498743057250977 + }, + { + "auxiliary_loss_clip": 0.01103612, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.04354715, + "balance_loss_mlp": 1.01859856, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 1.8745131584463803, + "language_loss": 0.6389088, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.66025674, + "num_input_tokens_seen": 166896520, + "step": 7773, + "time_per_iteration": 2.6010642051696777 + }, + { + "auxiliary_loss_clip": 0.01097536, + "auxiliary_loss_mlp": 0.01034008, + "balance_loss_clip": 1.04027724, + "balance_loss_mlp": 1.02015007, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 1.8147572773654406, + "language_loss": 0.80300564, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.82432109, + "num_input_tokens_seen": 166915370, + "step": 7774, + "time_per_iteration": 2.505866050720215 + }, + { + "auxiliary_loss_clip": 0.0108598, + "auxiliary_loss_mlp": 0.01029989, + "balance_loss_clip": 1.04209292, + "balance_loss_mlp": 1.01629162, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.6090827505560474, + "language_loss": 0.77447701, + "learning_rate": 2.306655024915726e-06, + "loss": 0.79563665, + "num_input_tokens_seen": 166934875, + "step": 7775, + "time_per_iteration": 2.5865416526794434 + }, + { + "auxiliary_loss_clip": 0.01085391, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.04042137, + "balance_loss_mlp": 1.0188179, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 11.02748829987365, + "language_loss": 0.69357753, + "learning_rate": 2.306270162640694e-06, + "loss": 0.71475422, + "num_input_tokens_seen": 166954285, + "step": 7776, + "time_per_iteration": 2.5515105724334717 + }, + { + "auxiliary_loss_clip": 0.01109643, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.04358172, + "balance_loss_mlp": 1.01690102, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.5096878050670963, + "language_loss": 0.73525393, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.75664276, + "num_input_tokens_seen": 166975975, + "step": 7777, + "time_per_iteration": 2.5589287281036377 + }, + { + "auxiliary_loss_clip": 0.01106795, + "auxiliary_loss_mlp": 0.01030944, + "balance_loss_clip": 1.04172242, + "balance_loss_mlp": 1.01782477, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.243164947834768, + "language_loss": 0.69572246, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71709985, + "num_input_tokens_seen": 166996140, + "step": 7778, + "time_per_iteration": 2.52089262008667 + }, + { + "auxiliary_loss_clip": 0.01108111, + "auxiliary_loss_mlp": 0.01039829, + "balance_loss_clip": 1.04287267, + "balance_loss_mlp": 1.02661443, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 2.1247482545681096, + "language_loss": 0.73655951, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75803888, + "num_input_tokens_seen": 167016105, + "step": 7779, + "time_per_iteration": 2.5840559005737305 + }, + { + "auxiliary_loss_clip": 0.01072687, + "auxiliary_loss_mlp": 0.01037005, + "balance_loss_clip": 1.03887057, + "balance_loss_mlp": 1.02442884, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.6326438102656085, + "language_loss": 0.72078359, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74188048, + "num_input_tokens_seen": 167036185, + "step": 7780, + "time_per_iteration": 2.6115572452545166 + }, + { + "auxiliary_loss_clip": 0.01093028, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.04099655, + "balance_loss_mlp": 1.03211331, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 1.851462687193211, + "language_loss": 0.74498624, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76639807, + "num_input_tokens_seen": 167054515, + "step": 7781, + "time_per_iteration": 2.564302444458008 + }, + { + "auxiliary_loss_clip": 0.01107927, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.04054487, + "balance_loss_mlp": 1.02113867, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 1.6428876065416453, + "language_loss": 0.62826061, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.64968932, + "num_input_tokens_seen": 167077245, + "step": 7782, + "time_per_iteration": 2.5739686489105225 + }, + { + "auxiliary_loss_clip": 0.01097712, + "auxiliary_loss_mlp": 0.01038862, + "balance_loss_clip": 1.03938222, + "balance_loss_mlp": 1.02558231, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 2.4752652224333445, + "language_loss": 0.63278008, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65414584, + "num_input_tokens_seen": 167097235, + "step": 7783, + "time_per_iteration": 2.563169002532959 + }, + { + "auxiliary_loss_clip": 0.01109193, + "auxiliary_loss_mlp": 0.01036532, + "balance_loss_clip": 1.04439998, + "balance_loss_mlp": 1.02201855, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.5885363019199414, + "language_loss": 0.67707098, + "learning_rate": 2.303190847569801e-06, + "loss": 0.69852823, + "num_input_tokens_seen": 167113155, + "step": 7784, + "time_per_iteration": 2.465085744857788 + }, + { + "auxiliary_loss_clip": 0.0108769, + "auxiliary_loss_mlp": 0.01027551, + "balance_loss_clip": 1.04178929, + "balance_loss_mlp": 1.01518321, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 2.0317813450862934, + "language_loss": 0.84221959, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.86337203, + "num_input_tokens_seen": 167131765, + "step": 7785, + "time_per_iteration": 2.503460645675659 + }, + { + "auxiliary_loss_clip": 0.01082189, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.04066288, + "balance_loss_mlp": 1.01882911, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 1.8095435468508374, + "language_loss": 0.77291536, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79406255, + "num_input_tokens_seen": 167149030, + "step": 7786, + "time_per_iteration": 2.531744956970215 + }, + { + "auxiliary_loss_clip": 0.0110056, + "auxiliary_loss_mlp": 0.01027167, + "balance_loss_clip": 1.03814387, + "balance_loss_mlp": 1.01519227, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 2.051352693624312, + "language_loss": 0.7424162, + "learning_rate": 2.302035914315856e-06, + "loss": 0.76369345, + "num_input_tokens_seen": 167167375, + "step": 7787, + "time_per_iteration": 2.5101876258850098 + }, + { + "auxiliary_loss_clip": 0.01089649, + "auxiliary_loss_mlp": 0.0103599, + "balance_loss_clip": 1.03978109, + "balance_loss_mlp": 1.02253091, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.6813381866235253, + "language_loss": 0.65419388, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67545021, + "num_input_tokens_seen": 167188065, + "step": 7788, + "time_per_iteration": 2.6235709190368652 + }, + { + "auxiliary_loss_clip": 0.01105487, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.04091871, + "balance_loss_mlp": 1.01737392, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.7185100266540785, + "language_loss": 0.64017069, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.66151714, + "num_input_tokens_seen": 167209675, + "step": 7789, + "time_per_iteration": 2.5686206817626953 + }, + { + "auxiliary_loss_clip": 0.01030313, + "auxiliary_loss_mlp": 0.01001798, + "balance_loss_clip": 1.01331091, + "balance_loss_mlp": 1.00040913, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.7071927290681167, + "language_loss": 0.61891329, + "learning_rate": 2.300880877982825e-06, + "loss": 0.63923442, + "num_input_tokens_seen": 167273940, + "step": 7790, + "time_per_iteration": 3.1747379302978516 + }, + { + "auxiliary_loss_clip": 0.01075716, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.04040253, + "balance_loss_mlp": 1.02319241, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.5664434720871605, + "language_loss": 0.79323459, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81436509, + "num_input_tokens_seen": 167292730, + "step": 7791, + "time_per_iteration": 2.575923442840576 + }, + { + "auxiliary_loss_clip": 0.01107139, + "auxiliary_loss_mlp": 0.01031444, + "balance_loss_clip": 1.04109466, + "balance_loss_mlp": 1.01816988, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.6835025185055241, + "language_loss": 0.74798787, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.76937371, + "num_input_tokens_seen": 167313460, + "step": 7792, + "time_per_iteration": 2.516066312789917 + }, + { + "auxiliary_loss_clip": 0.01085837, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.03865528, + "balance_loss_mlp": 1.02282214, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.498163292303882, + "language_loss": 0.68419969, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70542526, + "num_input_tokens_seen": 167335385, + "step": 7793, + "time_per_iteration": 2.596992015838623 + }, + { + "auxiliary_loss_clip": 0.01105482, + "auxiliary_loss_mlp": 0.00786867, + "balance_loss_clip": 1.04161644, + "balance_loss_mlp": 1.01322246, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.599227051108829, + "language_loss": 0.73983198, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.75875551, + "num_input_tokens_seen": 167353625, + "step": 7794, + "time_per_iteration": 2.508617877960205 + }, + { + "auxiliary_loss_clip": 0.01088755, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.04218686, + "balance_loss_mlp": 1.01731086, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.5223590421742774, + "language_loss": 0.63093823, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65214121, + "num_input_tokens_seen": 167374565, + "step": 7795, + "time_per_iteration": 2.610881805419922 + }, + { + "auxiliary_loss_clip": 0.01078247, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.03787279, + "balance_loss_mlp": 1.01374435, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 1.7803469037435196, + "language_loss": 0.68408775, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70514202, + "num_input_tokens_seen": 167395010, + "step": 7796, + "time_per_iteration": 2.6751792430877686 + }, + { + "auxiliary_loss_clip": 0.01119174, + "auxiliary_loss_mlp": 0.00788559, + "balance_loss_clip": 1.04212332, + "balance_loss_mlp": 1.01383114, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.8148000599888063, + "language_loss": 0.70293629, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.72201359, + "num_input_tokens_seen": 167415285, + "step": 7797, + "time_per_iteration": 2.5049824714660645 + }, + { + "auxiliary_loss_clip": 0.01104133, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.04361916, + "balance_loss_mlp": 1.01776171, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 1.9703223898059004, + "language_loss": 0.6723181, + "learning_rate": 2.297800280150454e-06, + "loss": 0.6936841, + "num_input_tokens_seen": 167432405, + "step": 7798, + "time_per_iteration": 2.509965419769287 + }, + { + "auxiliary_loss_clip": 0.01032918, + "auxiliary_loss_mlp": 0.01003541, + "balance_loss_clip": 1.01530409, + "balance_loss_mlp": 1.00196767, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9482300437868589, + "language_loss": 0.64604592, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66641045, + "num_input_tokens_seen": 167499365, + "step": 7799, + "time_per_iteration": 4.639041900634766 + }, + { + "auxiliary_loss_clip": 0.01087453, + "auxiliary_loss_mlp": 0.01024358, + "balance_loss_clip": 1.04077923, + "balance_loss_mlp": 1.01144791, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.4925565328097643, + "language_loss": 0.72374094, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.7448591, + "num_input_tokens_seen": 167520390, + "step": 7800, + "time_per_iteration": 3.969536542892456 + }, + { + "auxiliary_loss_clip": 0.01115944, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.0423094, + "balance_loss_mlp": 1.0189147, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 2.83580568768042, + "language_loss": 0.72071826, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74218732, + "num_input_tokens_seen": 167539865, + "step": 7801, + "time_per_iteration": 2.489851236343384 + }, + { + "auxiliary_loss_clip": 0.01089613, + "auxiliary_loss_mlp": 0.01039249, + "balance_loss_clip": 1.04181612, + "balance_loss_mlp": 1.02388334, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 1.874913106942222, + "language_loss": 0.62451738, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.64580607, + "num_input_tokens_seen": 167558190, + "step": 7802, + "time_per_iteration": 2.5229990482330322 + }, + { + "auxiliary_loss_clip": 0.01120996, + "auxiliary_loss_mlp": 0.01036524, + "balance_loss_clip": 1.04214168, + "balance_loss_mlp": 1.02333307, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 1.9867974776887167, + "language_loss": 0.7339707, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.75554591, + "num_input_tokens_seen": 167577685, + "step": 7803, + "time_per_iteration": 3.8851864337921143 + }, + { + "auxiliary_loss_clip": 0.01094057, + "auxiliary_loss_mlp": 0.00787738, + "balance_loss_clip": 1.04124904, + "balance_loss_mlp": 1.01060212, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 1.5668638514281588, + "language_loss": 0.77616018, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.79497814, + "num_input_tokens_seen": 167596390, + "step": 7804, + "time_per_iteration": 2.498448133468628 + }, + { + "auxiliary_loss_clip": 0.01093564, + "auxiliary_loss_mlp": 0.01027735, + "balance_loss_clip": 1.04298663, + "balance_loss_mlp": 1.01500356, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 1.6935922166858808, + "language_loss": 0.77065015, + "learning_rate": 2.295104163929305e-06, + "loss": 0.7918632, + "num_input_tokens_seen": 167614980, + "step": 7805, + "time_per_iteration": 3.8941218852996826 + }, + { + "auxiliary_loss_clip": 0.01127043, + "auxiliary_loss_mlp": 0.01041301, + "balance_loss_clip": 1.04518533, + "balance_loss_mlp": 1.02706695, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.6163692188574958, + "language_loss": 0.82652223, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.84820569, + "num_input_tokens_seen": 167635895, + "step": 7806, + "time_per_iteration": 2.5460362434387207 + }, + { + "auxiliary_loss_clip": 0.01099772, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.04292583, + "balance_loss_mlp": 1.02376795, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 1.9314458856024193, + "language_loss": 0.76935846, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79073513, + "num_input_tokens_seen": 167657440, + "step": 7807, + "time_per_iteration": 2.647977352142334 + }, + { + "auxiliary_loss_clip": 0.01096758, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.04472041, + "balance_loss_mlp": 1.02008986, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 2.377820017390032, + "language_loss": 0.51276386, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53407037, + "num_input_tokens_seen": 167675025, + "step": 7808, + "time_per_iteration": 2.5327816009521484 + }, + { + "auxiliary_loss_clip": 0.01005202, + "auxiliary_loss_mlp": 0.01000234, + "balance_loss_clip": 1.02427244, + "balance_loss_mlp": 0.99878591, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.7826410186412291, + "language_loss": 0.5780341, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59808844, + "num_input_tokens_seen": 167729635, + "step": 7809, + "time_per_iteration": 3.0488524436950684 + }, + { + "auxiliary_loss_clip": 0.01085108, + "auxiliary_loss_mlp": 0.01037032, + "balance_loss_clip": 1.04521918, + "balance_loss_mlp": 1.02322721, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 2.419743901055374, + "language_loss": 0.71696436, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.73818576, + "num_input_tokens_seen": 167745135, + "step": 7810, + "time_per_iteration": 2.594684362411499 + }, + { + "auxiliary_loss_clip": 0.01112083, + "auxiliary_loss_mlp": 0.010365, + "balance_loss_clip": 1.0443666, + "balance_loss_mlp": 1.02343488, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 2.830552721719309, + "language_loss": 0.8116833, + "learning_rate": 2.29279277055369e-06, + "loss": 0.8331691, + "num_input_tokens_seen": 167763875, + "step": 7811, + "time_per_iteration": 2.5199778079986572 + }, + { + "auxiliary_loss_clip": 0.01107054, + "auxiliary_loss_mlp": 0.01036457, + "balance_loss_clip": 1.04494643, + "balance_loss_mlp": 1.02254534, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.6177238215744718, + "language_loss": 0.8043142, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82574934, + "num_input_tokens_seen": 167784895, + "step": 7812, + "time_per_iteration": 2.5386803150177 + }, + { + "auxiliary_loss_clip": 0.01075692, + "auxiliary_loss_mlp": 0.01036389, + "balance_loss_clip": 1.04327106, + "balance_loss_mlp": 1.02279305, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.9781537206190236, + "language_loss": 0.74206185, + "learning_rate": 2.292022217117477e-06, + "loss": 0.76318264, + "num_input_tokens_seen": 167803185, + "step": 7813, + "time_per_iteration": 2.6149747371673584 + }, + { + "auxiliary_loss_clip": 0.01095721, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.04365444, + "balance_loss_mlp": 1.01819801, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.3773481080205885, + "language_loss": 0.85037363, + "learning_rate": 2.291636923781798e-06, + "loss": 0.87164867, + "num_input_tokens_seen": 167816550, + "step": 7814, + "time_per_iteration": 2.4940598011016846 + }, + { + "auxiliary_loss_clip": 0.01097618, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.04150248, + "balance_loss_mlp": 1.02376747, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 1.9373464713664799, + "language_loss": 0.81707901, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83841777, + "num_input_tokens_seen": 167831845, + "step": 7815, + "time_per_iteration": 2.486842632293701 + }, + { + "auxiliary_loss_clip": 0.01079787, + "auxiliary_loss_mlp": 0.01037067, + "balance_loss_clip": 1.04562485, + "balance_loss_mlp": 1.02241015, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 2.2724251067961716, + "language_loss": 0.77227294, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79344147, + "num_input_tokens_seen": 167850360, + "step": 7816, + "time_per_iteration": 2.604668378829956 + }, + { + "auxiliary_loss_clip": 0.01042301, + "auxiliary_loss_mlp": 0.01005859, + "balance_loss_clip": 1.01578307, + "balance_loss_mlp": 1.00442839, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.8754150850695498, + "language_loss": 0.59076893, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61125052, + "num_input_tokens_seen": 167908660, + "step": 7817, + "time_per_iteration": 3.0702075958251953 + }, + { + "auxiliary_loss_clip": 0.01098391, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.0446223, + "balance_loss_mlp": 1.02160013, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.7342451905176604, + "language_loss": 0.79205859, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81339395, + "num_input_tokens_seen": 167927905, + "step": 7818, + "time_per_iteration": 2.5409815311431885 + }, + { + "auxiliary_loss_clip": 0.01120619, + "auxiliary_loss_mlp": 0.01032249, + "balance_loss_clip": 1.04311085, + "balance_loss_mlp": 1.01928532, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 2.3553082068980813, + "language_loss": 0.83618325, + "learning_rate": 2.289710291512104e-06, + "loss": 0.85771197, + "num_input_tokens_seen": 167945995, + "step": 7819, + "time_per_iteration": 2.4858360290527344 + }, + { + "auxiliary_loss_clip": 0.01093717, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.0424633, + "balance_loss_mlp": 1.02112031, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 2.333842257968576, + "language_loss": 0.76694727, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78824353, + "num_input_tokens_seen": 167963380, + "step": 7820, + "time_per_iteration": 2.520315170288086 + }, + { + "auxiliary_loss_clip": 0.01106047, + "auxiliary_loss_mlp": 0.01037779, + "balance_loss_clip": 1.04667997, + "balance_loss_mlp": 1.02407539, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 2.0501009044392866, + "language_loss": 0.74571764, + "learning_rate": 2.288939561601039e-06, + "loss": 0.76715589, + "num_input_tokens_seen": 167981740, + "step": 7821, + "time_per_iteration": 2.483879566192627 + }, + { + "auxiliary_loss_clip": 0.01119841, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.04466391, + "balance_loss_mlp": 1.0267992, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 1.8577670756257596, + "language_loss": 0.8893311, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.9109233, + "num_input_tokens_seen": 167999380, + "step": 7822, + "time_per_iteration": 2.497555732727051 + }, + { + "auxiliary_loss_clip": 0.01104449, + "auxiliary_loss_mlp": 0.01033828, + "balance_loss_clip": 1.04547012, + "balance_loss_mlp": 1.02106667, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.5696363460056841, + "language_loss": 0.79657233, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.81795508, + "num_input_tokens_seen": 168018395, + "step": 7823, + "time_per_iteration": 2.5196025371551514 + }, + { + "auxiliary_loss_clip": 0.01039711, + "auxiliary_loss_mlp": 0.01002739, + "balance_loss_clip": 1.0380075, + "balance_loss_mlp": 1.00118959, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.688773522865109, + "language_loss": 0.56644595, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.58687049, + "num_input_tokens_seen": 168084080, + "step": 7824, + "time_per_iteration": 3.2227060794830322 + }, + { + "auxiliary_loss_clip": 0.01097695, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.04258275, + "balance_loss_mlp": 1.02022123, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.775493884160791, + "language_loss": 0.81207913, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83339918, + "num_input_tokens_seen": 168101555, + "step": 7825, + "time_per_iteration": 2.541686534881592 + }, + { + "auxiliary_loss_clip": 0.01102238, + "auxiliary_loss_mlp": 0.01030419, + "balance_loss_clip": 1.0446701, + "balance_loss_mlp": 1.01679945, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.5666679305659326, + "language_loss": 0.66119087, + "learning_rate": 2.287012545338324e-06, + "loss": 0.68251741, + "num_input_tokens_seen": 168121530, + "step": 7826, + "time_per_iteration": 2.549367904663086 + }, + { + "auxiliary_loss_clip": 0.01098056, + "auxiliary_loss_mlp": 0.01035617, + "balance_loss_clip": 1.04243529, + "balance_loss_mlp": 1.02149045, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 1.6692542552206366, + "language_loss": 0.83901948, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.86035627, + "num_input_tokens_seen": 168140335, + "step": 7827, + "time_per_iteration": 2.5409069061279297 + }, + { + "auxiliary_loss_clip": 0.01024521, + "auxiliary_loss_mlp": 0.01003383, + "balance_loss_clip": 1.01735473, + "balance_loss_mlp": 1.00177407, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.8610311408973074, + "language_loss": 0.55682701, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57710606, + "num_input_tokens_seen": 168200535, + "step": 7828, + "time_per_iteration": 3.111931085586548 + }, + { + "auxiliary_loss_clip": 0.01118422, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.04340935, + "balance_loss_mlp": 1.01743269, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 1.8425627780204024, + "language_loss": 0.80984652, + "learning_rate": 2.285856204861245e-06, + "loss": 0.83134323, + "num_input_tokens_seen": 168219610, + "step": 7829, + "time_per_iteration": 2.471048355102539 + }, + { + "auxiliary_loss_clip": 0.01120703, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.04541135, + "balance_loss_mlp": 1.01766062, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.5783653202488326, + "language_loss": 0.75909483, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78060156, + "num_input_tokens_seen": 168242505, + "step": 7830, + "time_per_iteration": 2.5308735370635986 + }, + { + "auxiliary_loss_clip": 0.01087555, + "auxiliary_loss_mlp": 0.01030064, + "balance_loss_clip": 1.04439831, + "balance_loss_mlp": 1.01606905, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 2.0672343326669322, + "language_loss": 0.78963065, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.81080687, + "num_input_tokens_seen": 168260220, + "step": 7831, + "time_per_iteration": 2.5483953952789307 + }, + { + "auxiliary_loss_clip": 0.01088349, + "auxiliary_loss_mlp": 0.01042904, + "balance_loss_clip": 1.04201126, + "balance_loss_mlp": 1.0249393, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 1.9436963257224102, + "language_loss": 0.75794828, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.77926075, + "num_input_tokens_seen": 168277360, + "step": 7832, + "time_per_iteration": 2.6287007331848145 + }, + { + "auxiliary_loss_clip": 0.01095006, + "auxiliary_loss_mlp": 0.01024493, + "balance_loss_clip": 1.04665768, + "balance_loss_mlp": 1.01216054, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.2996970511585297, + "language_loss": 0.74528241, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76647741, + "num_input_tokens_seen": 168296605, + "step": 7833, + "time_per_iteration": 2.5544190406799316 + }, + { + "auxiliary_loss_clip": 0.01109942, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.04435849, + "balance_loss_mlp": 1.0242126, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.7413351217128663, + "language_loss": 0.7531215, + "learning_rate": 2.283928754133762e-06, + "loss": 0.77459651, + "num_input_tokens_seen": 168316205, + "step": 7834, + "time_per_iteration": 2.5212562084198 + }, + { + "auxiliary_loss_clip": 0.01075123, + "auxiliary_loss_mlp": 0.0103065, + "balance_loss_clip": 1.04543972, + "balance_loss_mlp": 1.01795983, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.4618629374053351, + "language_loss": 0.66071272, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68177044, + "num_input_tokens_seen": 168338935, + "step": 7835, + "time_per_iteration": 2.7880265712738037 + }, + { + "auxiliary_loss_clip": 0.01034263, + "auxiliary_loss_mlp": 0.00769413, + "balance_loss_clip": 1.01788187, + "balance_loss_mlp": 1.00876665, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8692285815251268, + "language_loss": 0.62083316, + "learning_rate": 2.283157698374194e-06, + "loss": 0.63887, + "num_input_tokens_seen": 168392800, + "step": 7836, + "time_per_iteration": 3.0945003032684326 + }, + { + "auxiliary_loss_clip": 0.01087043, + "auxiliary_loss_mlp": 0.00790056, + "balance_loss_clip": 1.04354787, + "balance_loss_mlp": 1.01240993, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 1.5660742697408774, + "language_loss": 0.69760656, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71637756, + "num_input_tokens_seen": 168412940, + "step": 7837, + "time_per_iteration": 2.6097195148468018 + }, + { + "auxiliary_loss_clip": 0.01117851, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.04707813, + "balance_loss_mlp": 1.02103591, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 1.710675075871333, + "language_loss": 0.66048002, + "learning_rate": 2.282386599665153e-06, + "loss": 0.6820128, + "num_input_tokens_seen": 168431995, + "step": 7838, + "time_per_iteration": 5.204299211502075 + }, + { + "auxiliary_loss_clip": 0.01096684, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.04153037, + "balance_loss_mlp": 1.01870584, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.7531822055245156, + "language_loss": 0.77314091, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79444182, + "num_input_tokens_seen": 168454585, + "step": 7839, + "time_per_iteration": 2.59617018699646 + }, + { + "auxiliary_loss_clip": 0.01082936, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.04307461, + "balance_loss_mlp": 1.01790667, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 2.5594549056912546, + "language_loss": 0.72397053, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.74510676, + "num_input_tokens_seen": 168471265, + "step": 7840, + "time_per_iteration": 2.580916404724121 + }, + { + "auxiliary_loss_clip": 0.01094262, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.04370677, + "balance_loss_mlp": 1.0151186, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.6452167510457716, + "language_loss": 0.74973953, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77096659, + "num_input_tokens_seen": 168491360, + "step": 7841, + "time_per_iteration": 3.928304672241211 + }, + { + "auxiliary_loss_clip": 0.01100921, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.04623127, + "balance_loss_mlp": 1.01679397, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.7741685167163892, + "language_loss": 0.70594919, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72725749, + "num_input_tokens_seen": 168511335, + "step": 7842, + "time_per_iteration": 2.5535922050476074 + }, + { + "auxiliary_loss_clip": 0.01113939, + "auxiliary_loss_mlp": 0.0103105, + "balance_loss_clip": 1.04729843, + "balance_loss_mlp": 1.01789498, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 3.5798273298963963, + "language_loss": 0.78538764, + "learning_rate": 2.280458665756177e-06, + "loss": 0.80683756, + "num_input_tokens_seen": 168529920, + "step": 7843, + "time_per_iteration": 3.889812707901001 + }, + { + "auxiliary_loss_clip": 0.01113656, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.04398429, + "balance_loss_mlp": 1.01795292, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.6752335315284426, + "language_loss": 0.74167001, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76311409, + "num_input_tokens_seen": 168550595, + "step": 7844, + "time_per_iteration": 2.5311155319213867 + }, + { + "auxiliary_loss_clip": 0.01096476, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.04676473, + "balance_loss_mlp": 1.02396202, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 1.4484723075210906, + "language_loss": 0.78517878, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80651987, + "num_input_tokens_seen": 168569765, + "step": 7845, + "time_per_iteration": 2.5316691398620605 + }, + { + "auxiliary_loss_clip": 0.01108813, + "auxiliary_loss_mlp": 0.01035473, + "balance_loss_clip": 1.04236627, + "balance_loss_mlp": 1.02271175, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.529334355987423, + "language_loss": 0.73058677, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.7520296, + "num_input_tokens_seen": 168591525, + "step": 7846, + "time_per_iteration": 2.539745807647705 + }, + { + "auxiliary_loss_clip": 0.01106496, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.04348421, + "balance_loss_mlp": 1.02039719, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.3156578756186819, + "language_loss": 0.74193841, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76334155, + "num_input_tokens_seen": 168611235, + "step": 7847, + "time_per_iteration": 2.532700300216675 + }, + { + "auxiliary_loss_clip": 0.01072013, + "auxiliary_loss_mlp": 0.01034028, + "balance_loss_clip": 1.04354692, + "balance_loss_mlp": 1.02073038, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 1.6682808156659112, + "language_loss": 0.80529869, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82635915, + "num_input_tokens_seen": 168628710, + "step": 7848, + "time_per_iteration": 2.5631067752838135 + }, + { + "auxiliary_loss_clip": 0.01112697, + "auxiliary_loss_mlp": 0.01035283, + "balance_loss_clip": 1.04700828, + "balance_loss_mlp": 1.02185404, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 1.9010332329509758, + "language_loss": 0.70322335, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72470313, + "num_input_tokens_seen": 168645645, + "step": 7849, + "time_per_iteration": 2.461242914199829 + }, + { + "auxiliary_loss_clip": 0.01094413, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.04442549, + "balance_loss_mlp": 1.01957917, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.0793600970583075, + "language_loss": 0.69129956, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71259189, + "num_input_tokens_seen": 168664165, + "step": 7850, + "time_per_iteration": 2.5414345264434814 + }, + { + "auxiliary_loss_clip": 0.01066811, + "auxiliary_loss_mlp": 0.01029284, + "balance_loss_clip": 1.04422486, + "balance_loss_mlp": 1.01527131, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 1.8475353107094923, + "language_loss": 0.74707144, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.76803237, + "num_input_tokens_seen": 168681940, + "step": 7851, + "time_per_iteration": 2.6205170154571533 + }, + { + "auxiliary_loss_clip": 0.01059648, + "auxiliary_loss_mlp": 0.01050332, + "balance_loss_clip": 1.04149461, + "balance_loss_mlp": 1.03284407, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 2.2788699304554303, + "language_loss": 0.76157784, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78267765, + "num_input_tokens_seen": 168698830, + "step": 7852, + "time_per_iteration": 2.5995140075683594 + }, + { + "auxiliary_loss_clip": 0.01086114, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.04507947, + "balance_loss_mlp": 1.0163132, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.570091584615071, + "language_loss": 0.68924516, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71040952, + "num_input_tokens_seen": 168718305, + "step": 7853, + "time_per_iteration": 2.5585246086120605 + }, + { + "auxiliary_loss_clip": 0.01013121, + "auxiliary_loss_mlp": 0.01003499, + "balance_loss_clip": 1.02467847, + "balance_loss_mlp": 1.00190151, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.7088281950419517, + "language_loss": 0.50131404, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52148032, + "num_input_tokens_seen": 168782365, + "step": 7854, + "time_per_iteration": 3.3216397762298584 + }, + { + "auxiliary_loss_clip": 0.01114526, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.0461874, + "balance_loss_mlp": 1.0180217, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 1.8691712893956496, + "language_loss": 0.64020348, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.66167361, + "num_input_tokens_seen": 168800485, + "step": 7855, + "time_per_iteration": 2.5090510845184326 + }, + { + "auxiliary_loss_clip": 0.01110962, + "auxiliary_loss_mlp": 0.01035165, + "balance_loss_clip": 1.04548383, + "balance_loss_mlp": 1.02109861, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 2.198089473204247, + "language_loss": 0.75841093, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.77987218, + "num_input_tokens_seen": 168818965, + "step": 7856, + "time_per_iteration": 2.553051233291626 + }, + { + "auxiliary_loss_clip": 0.01099619, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.04647315, + "balance_loss_mlp": 1.0178411, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.8782847456208085, + "language_loss": 0.74852812, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.7698307, + "num_input_tokens_seen": 168840355, + "step": 7857, + "time_per_iteration": 2.5784435272216797 + }, + { + "auxiliary_loss_clip": 0.0110286, + "auxiliary_loss_mlp": 0.0104119, + "balance_loss_clip": 1.04674232, + "balance_loss_mlp": 1.02820241, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.4448572417124614, + "language_loss": 0.64798969, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.66943026, + "num_input_tokens_seen": 168861765, + "step": 7858, + "time_per_iteration": 2.5971856117248535 + }, + { + "auxiliary_loss_clip": 0.01109065, + "auxiliary_loss_mlp": 0.00804305, + "balance_loss_clip": 1.04224455, + "balance_loss_mlp": 1.04014659, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 1.6956906450657268, + "language_loss": 0.70479804, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72393173, + "num_input_tokens_seen": 168881310, + "step": 7859, + "time_per_iteration": 2.496814727783203 + }, + { + "auxiliary_loss_clip": 0.01128545, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.04651523, + "balance_loss_mlp": 1.02404809, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 1.574895694484457, + "language_loss": 0.61589599, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.63755763, + "num_input_tokens_seen": 168899470, + "step": 7860, + "time_per_iteration": 2.445223331451416 + }, + { + "auxiliary_loss_clip": 0.011032, + "auxiliary_loss_mlp": 0.01039639, + "balance_loss_clip": 1.04640949, + "balance_loss_mlp": 1.0256083, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.013879950426509, + "language_loss": 0.71912366, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.74055201, + "num_input_tokens_seen": 168921495, + "step": 7861, + "time_per_iteration": 2.637028932571411 + }, + { + "auxiliary_loss_clip": 0.01100782, + "auxiliary_loss_mlp": 0.01037113, + "balance_loss_clip": 1.04730034, + "balance_loss_mlp": 1.02301073, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 1.8787148958137825, + "language_loss": 0.85305953, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87443846, + "num_input_tokens_seen": 168940515, + "step": 7862, + "time_per_iteration": 2.512751579284668 + }, + { + "auxiliary_loss_clip": 0.01124052, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.04486907, + "balance_loss_mlp": 1.02168798, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 1.985312065509785, + "language_loss": 0.84358662, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86517477, + "num_input_tokens_seen": 168958340, + "step": 7863, + "time_per_iteration": 2.456998586654663 + }, + { + "auxiliary_loss_clip": 0.01098597, + "auxiliary_loss_mlp": 0.01036043, + "balance_loss_clip": 1.04470229, + "balance_loss_mlp": 1.02313864, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 2.0777094078912635, + "language_loss": 0.66088885, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68223524, + "num_input_tokens_seen": 168974850, + "step": 7864, + "time_per_iteration": 2.5085079669952393 + }, + { + "auxiliary_loss_clip": 0.01122901, + "auxiliary_loss_mlp": 0.01032621, + "balance_loss_clip": 1.04537392, + "balance_loss_mlp": 1.01841736, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 2.1358083086522384, + "language_loss": 0.65667403, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67822921, + "num_input_tokens_seen": 168992860, + "step": 7865, + "time_per_iteration": 2.454620122909546 + }, + { + "auxiliary_loss_clip": 0.0109619, + "auxiliary_loss_mlp": 0.00796908, + "balance_loss_clip": 1.0468657, + "balance_loss_mlp": 1.02657783, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 1.677259134499498, + "language_loss": 0.74465954, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76359046, + "num_input_tokens_seen": 169010325, + "step": 7866, + "time_per_iteration": 2.5188405513763428 + }, + { + "auxiliary_loss_clip": 0.01124373, + "auxiliary_loss_mlp": 0.01032116, + "balance_loss_clip": 1.0450002, + "balance_loss_mlp": 1.01896739, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 1.8417876940695144, + "language_loss": 0.82646149, + "learning_rate": 2.271200914239451e-06, + "loss": 0.84802634, + "num_input_tokens_seen": 169029840, + "step": 7867, + "time_per_iteration": 2.4861769676208496 + }, + { + "auxiliary_loss_clip": 0.01108496, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.04338634, + "balance_loss_mlp": 1.01710999, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.6854733221413818, + "language_loss": 0.79784518, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81923264, + "num_input_tokens_seen": 169049975, + "step": 7868, + "time_per_iteration": 2.4869577884674072 + }, + { + "auxiliary_loss_clip": 0.01047843, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.04658508, + "balance_loss_mlp": 1.0187881, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 1.7643923253600329, + "language_loss": 0.74591804, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.76673096, + "num_input_tokens_seen": 169069540, + "step": 7869, + "time_per_iteration": 2.654275894165039 + }, + { + "auxiliary_loss_clip": 0.0110309, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.04459739, + "balance_loss_mlp": 1.0270133, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 1.7137113779126913, + "language_loss": 0.73861885, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.76007175, + "num_input_tokens_seen": 169089940, + "step": 7870, + "time_per_iteration": 2.525596857070923 + }, + { + "auxiliary_loss_clip": 0.01132272, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.04959464, + "balance_loss_mlp": 1.01834869, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 1.9799381224910668, + "language_loss": 0.81075835, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83241451, + "num_input_tokens_seen": 169109650, + "step": 7871, + "time_per_iteration": 2.5570945739746094 + }, + { + "auxiliary_loss_clip": 0.01110673, + "auxiliary_loss_mlp": 0.01035853, + "balance_loss_clip": 1.0450722, + "balance_loss_mlp": 1.02224541, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.6501589608011695, + "language_loss": 0.75898409, + "learning_rate": 2.269271463701879e-06, + "loss": 0.78044939, + "num_input_tokens_seen": 169128990, + "step": 7872, + "time_per_iteration": 2.508674383163452 + }, + { + "auxiliary_loss_clip": 0.010883, + "auxiliary_loss_mlp": 0.01033499, + "balance_loss_clip": 1.04230344, + "balance_loss_mlp": 1.01977229, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 2.163357922429626, + "language_loss": 0.6794734, + "learning_rate": 2.268885542903428e-06, + "loss": 0.70069134, + "num_input_tokens_seen": 169154645, + "step": 7873, + "time_per_iteration": 2.745468854904175 + }, + { + "auxiliary_loss_clip": 0.01112973, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.04639578, + "balance_loss_mlp": 1.01761472, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.7948968175629223, + "language_loss": 0.7243942, + "learning_rate": 2.26849961190881e-06, + "loss": 0.7458334, + "num_input_tokens_seen": 169174995, + "step": 7874, + "time_per_iteration": 2.5028791427612305 + }, + { + "auxiliary_loss_clip": 0.01104997, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.04604602, + "balance_loss_mlp": 1.02040732, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.3135753533911116, + "language_loss": 0.6502344, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67162085, + "num_input_tokens_seen": 169191815, + "step": 7875, + "time_per_iteration": 2.491527557373047 + }, + { + "auxiliary_loss_clip": 0.01077567, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.04841447, + "balance_loss_mlp": 1.01927257, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.6277602920891683, + "language_loss": 0.81160057, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83271289, + "num_input_tokens_seen": 169210430, + "step": 7876, + "time_per_iteration": 3.9887137413024902 + }, + { + "auxiliary_loss_clip": 0.01099623, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.04241133, + "balance_loss_mlp": 1.02283335, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.6772959476146978, + "language_loss": 0.79430848, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81567585, + "num_input_tokens_seen": 169229295, + "step": 7877, + "time_per_iteration": 3.9000935554504395 + }, + { + "auxiliary_loss_clip": 0.01110987, + "auxiliary_loss_mlp": 0.0078808, + "balance_loss_clip": 1.04309833, + "balance_loss_mlp": 1.01328754, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 1.9019161483187086, + "language_loss": 0.71163207, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.73062277, + "num_input_tokens_seen": 169247855, + "step": 7878, + "time_per_iteration": 2.5203518867492676 + }, + { + "auxiliary_loss_clip": 0.01082708, + "auxiliary_loss_mlp": 0.01030845, + "balance_loss_clip": 1.04672337, + "balance_loss_mlp": 1.01808989, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 2.1897986426624754, + "language_loss": 0.75165552, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77279103, + "num_input_tokens_seen": 169268860, + "step": 7879, + "time_per_iteration": 2.6223394870758057 + }, + { + "auxiliary_loss_clip": 0.01028255, + "auxiliary_loss_mlp": 0.0100513, + "balance_loss_clip": 1.02314484, + "balance_loss_mlp": 1.00363433, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7380541007532991, + "language_loss": 0.61276722, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63310111, + "num_input_tokens_seen": 169331855, + "step": 7880, + "time_per_iteration": 3.12857985496521 + }, + { + "auxiliary_loss_clip": 0.01098855, + "auxiliary_loss_mlp": 0.01034712, + "balance_loss_clip": 1.04316902, + "balance_loss_mlp": 1.02041316, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 3.7712616456423977, + "language_loss": 0.67881912, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70015478, + "num_input_tokens_seen": 169352175, + "step": 7881, + "time_per_iteration": 3.9322140216827393 + }, + { + "auxiliary_loss_clip": 0.01055675, + "auxiliary_loss_mlp": 0.01026281, + "balance_loss_clip": 1.0423038, + "balance_loss_mlp": 1.01350725, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.6720640069037578, + "language_loss": 0.77328652, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79410607, + "num_input_tokens_seen": 169371215, + "step": 7882, + "time_per_iteration": 4.012754678726196 + }, + { + "auxiliary_loss_clip": 0.01108917, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.04257596, + "balance_loss_mlp": 1.01815093, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.5283700066439676, + "language_loss": 0.76204574, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.7834537, + "num_input_tokens_seen": 169391745, + "step": 7883, + "time_per_iteration": 2.539094924926758 + }, + { + "auxiliary_loss_clip": 0.01101106, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.04510975, + "balance_loss_mlp": 1.01939297, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.7409257644805376, + "language_loss": 0.72069812, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74202919, + "num_input_tokens_seen": 169409845, + "step": 7884, + "time_per_iteration": 2.535902976989746 + }, + { + "auxiliary_loss_clip": 0.01115072, + "auxiliary_loss_mlp": 0.01032762, + "balance_loss_clip": 1.04584384, + "balance_loss_mlp": 1.01851058, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 1.9230322521204166, + "language_loss": 0.81940675, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.8408851, + "num_input_tokens_seen": 169426085, + "step": 7885, + "time_per_iteration": 2.466367721557617 + }, + { + "auxiliary_loss_clip": 0.01092952, + "auxiliary_loss_mlp": 0.01051328, + "balance_loss_clip": 1.04568911, + "balance_loss_mlp": 1.03528845, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 2.019541119891715, + "language_loss": 0.7347672, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75620997, + "num_input_tokens_seen": 169444705, + "step": 7886, + "time_per_iteration": 2.513636827468872 + }, + { + "auxiliary_loss_clip": 0.01104759, + "auxiliary_loss_mlp": 0.01031339, + "balance_loss_clip": 1.04805231, + "balance_loss_mlp": 1.01724851, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 2.011873528410458, + "language_loss": 0.74302578, + "learning_rate": 2.263481587786849e-06, + "loss": 0.76438677, + "num_input_tokens_seen": 169460850, + "step": 7887, + "time_per_iteration": 2.5251471996307373 + }, + { + "auxiliary_loss_clip": 0.01108833, + "auxiliary_loss_mlp": 0.01026238, + "balance_loss_clip": 1.04304671, + "balance_loss_mlp": 1.01413238, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.8357431524510155, + "language_loss": 0.7745316, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79588228, + "num_input_tokens_seen": 169478890, + "step": 7888, + "time_per_iteration": 2.528989315032959 + }, + { + "auxiliary_loss_clip": 0.01112893, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.04553425, + "balance_loss_mlp": 1.01834595, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.8562524681909491, + "language_loss": 0.72367698, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.74512225, + "num_input_tokens_seen": 169499690, + "step": 7889, + "time_per_iteration": 2.639451503753662 + }, + { + "auxiliary_loss_clip": 0.01045352, + "auxiliary_loss_mlp": 0.01002741, + "balance_loss_clip": 1.01943314, + "balance_loss_mlp": 1.00135791, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 0.7114644091171369, + "language_loss": 0.56075668, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58123755, + "num_input_tokens_seen": 169560475, + "step": 7890, + "time_per_iteration": 3.1443679332733154 + }, + { + "auxiliary_loss_clip": 0.01114261, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.04875112, + "balance_loss_mlp": 1.02004457, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 1.8856654805758213, + "language_loss": 0.6575532, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67904699, + "num_input_tokens_seen": 169580110, + "step": 7891, + "time_per_iteration": 2.516343832015991 + }, + { + "auxiliary_loss_clip": 0.01127774, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.04667401, + "balance_loss_mlp": 1.01815152, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.1510240608337297, + "language_loss": 0.69735885, + "learning_rate": 2.26155112714642e-06, + "loss": 0.71896839, + "num_input_tokens_seen": 169597510, + "step": 7892, + "time_per_iteration": 2.477475881576538 + }, + { + "auxiliary_loss_clip": 0.01045705, + "auxiliary_loss_mlp": 0.01002059, + "balance_loss_clip": 1.04469252, + "balance_loss_mlp": 1.00016403, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 1.5400359516014954, + "language_loss": 0.58623183, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60670942, + "num_input_tokens_seen": 169660010, + "step": 7893, + "time_per_iteration": 3.2231192588806152 + }, + { + "auxiliary_loss_clip": 0.0111291, + "auxiliary_loss_mlp": 0.01032103, + "balance_loss_clip": 1.04746151, + "balance_loss_mlp": 1.01941884, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 2.2568191013430074, + "language_loss": 0.77532029, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.79677033, + "num_input_tokens_seen": 169678485, + "step": 7894, + "time_per_iteration": 2.471271514892578 + }, + { + "auxiliary_loss_clip": 0.01112303, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.04475832, + "balance_loss_mlp": 1.01940036, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 1.74479702897336, + "language_loss": 0.74589288, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76734203, + "num_input_tokens_seen": 169697335, + "step": 7895, + "time_per_iteration": 2.5045104026794434 + }, + { + "auxiliary_loss_clip": 0.01110755, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.0445559, + "balance_loss_mlp": 1.01684248, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 1.8843327432471817, + "language_loss": 0.82327378, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84469277, + "num_input_tokens_seen": 169715395, + "step": 7896, + "time_per_iteration": 2.4799885749816895 + }, + { + "auxiliary_loss_clip": 0.01111688, + "auxiliary_loss_mlp": 0.01031695, + "balance_loss_clip": 1.04618239, + "balance_loss_mlp": 1.01759887, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 1.7529518095034993, + "language_loss": 0.75357842, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77501225, + "num_input_tokens_seen": 169733755, + "step": 7897, + "time_per_iteration": 2.4667129516601562 + }, + { + "auxiliary_loss_clip": 0.01100867, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.04713607, + "balance_loss_mlp": 1.02019596, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 2.4089453231735503, + "language_loss": 0.63765466, + "learning_rate": 2.25923424724351e-06, + "loss": 0.65900135, + "num_input_tokens_seen": 169751390, + "step": 7898, + "time_per_iteration": 2.5213942527770996 + }, + { + "auxiliary_loss_clip": 0.01088989, + "auxiliary_loss_mlp": 0.01045183, + "balance_loss_clip": 1.04433858, + "balance_loss_mlp": 1.02900052, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 2.086891726909934, + "language_loss": 0.70067298, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72201478, + "num_input_tokens_seen": 169769500, + "step": 7899, + "time_per_iteration": 2.5454118251800537 + }, + { + "auxiliary_loss_clip": 0.01114291, + "auxiliary_loss_mlp": 0.01037254, + "balance_loss_clip": 1.04686189, + "balance_loss_mlp": 1.023175, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 2.3551802891398976, + "language_loss": 0.68554485, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70706022, + "num_input_tokens_seen": 169789215, + "step": 7900, + "time_per_iteration": 2.554299831390381 + }, + { + "auxiliary_loss_clip": 0.01086875, + "auxiliary_loss_mlp": 0.01038304, + "balance_loss_clip": 1.04686034, + "balance_loss_mlp": 1.024369, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 1.9842125223785956, + "language_loss": 0.70866525, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72991699, + "num_input_tokens_seen": 169808825, + "step": 7901, + "time_per_iteration": 2.6424949169158936 + }, + { + "auxiliary_loss_clip": 0.01101185, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_clip": 1.04841745, + "balance_loss_mlp": 1.03569961, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.8829046912479168, + "language_loss": 0.73688036, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75839967, + "num_input_tokens_seen": 169827590, + "step": 7902, + "time_per_iteration": 2.564584732055664 + }, + { + "auxiliary_loss_clip": 0.01084355, + "auxiliary_loss_mlp": 0.01034364, + "balance_loss_clip": 1.04687369, + "balance_loss_mlp": 1.02166867, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 1.9668977996043955, + "language_loss": 0.68993938, + "learning_rate": 2.257303243526688e-06, + "loss": 0.71112657, + "num_input_tokens_seen": 169844925, + "step": 7903, + "time_per_iteration": 2.548550844192505 + }, + { + "auxiliary_loss_clip": 0.01098501, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.04421973, + "balance_loss_mlp": 1.02002096, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.4669599633873274, + "language_loss": 0.72007382, + "learning_rate": 2.256917013453848e-06, + "loss": 0.7413851, + "num_input_tokens_seen": 169862705, + "step": 7904, + "time_per_iteration": 2.5268239974975586 + }, + { + "auxiliary_loss_clip": 0.0106323, + "auxiliary_loss_mlp": 0.01041948, + "balance_loss_clip": 1.04060245, + "balance_loss_mlp": 1.02758288, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.5730561824044802, + "language_loss": 0.86303055, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88408238, + "num_input_tokens_seen": 169880155, + "step": 7905, + "time_per_iteration": 2.60209321975708 + }, + { + "auxiliary_loss_clip": 0.01103136, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.04284191, + "balance_loss_mlp": 1.02098811, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.561531353196979, + "language_loss": 0.82168734, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84305465, + "num_input_tokens_seen": 169901525, + "step": 7906, + "time_per_iteration": 2.5461008548736572 + }, + { + "auxiliary_loss_clip": 0.01028043, + "auxiliary_loss_mlp": 0.01000803, + "balance_loss_clip": 1.03249574, + "balance_loss_mlp": 0.99935418, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6676522658775641, + "language_loss": 0.58979368, + "learning_rate": 2.255758264840002e-06, + "loss": 0.61008215, + "num_input_tokens_seen": 169970345, + "step": 7907, + "time_per_iteration": 3.273958921432495 + }, + { + "auxiliary_loss_clip": 0.01110473, + "auxiliary_loss_mlp": 0.01036867, + "balance_loss_clip": 1.04586864, + "balance_loss_mlp": 1.02359951, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 2.1791024539493633, + "language_loss": 0.81223208, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83370554, + "num_input_tokens_seen": 169986440, + "step": 7908, + "time_per_iteration": 2.473233699798584 + }, + { + "auxiliary_loss_clip": 0.01113691, + "auxiliary_loss_mlp": 0.01043627, + "balance_loss_clip": 1.04707849, + "balance_loss_mlp": 1.02957249, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.6361442303373408, + "language_loss": 0.73846221, + "learning_rate": 2.254985717247797e-06, + "loss": 0.7600354, + "num_input_tokens_seen": 170005705, + "step": 7909, + "time_per_iteration": 2.5353007316589355 + }, + { + "auxiliary_loss_clip": 0.01097418, + "auxiliary_loss_mlp": 0.01040114, + "balance_loss_clip": 1.0461942, + "balance_loss_mlp": 1.02612472, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.5886671237432652, + "language_loss": 0.7530821, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.7744574, + "num_input_tokens_seen": 170023415, + "step": 7910, + "time_per_iteration": 2.5333316326141357 + }, + { + "auxiliary_loss_clip": 0.0110793, + "auxiliary_loss_mlp": 0.01031387, + "balance_loss_clip": 1.04362202, + "balance_loss_mlp": 1.01929331, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.9210796249640134, + "language_loss": 0.78899139, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81038457, + "num_input_tokens_seen": 170042395, + "step": 7911, + "time_per_iteration": 2.516482353210449 + }, + { + "auxiliary_loss_clip": 0.01095383, + "auxiliary_loss_mlp": 0.00790299, + "balance_loss_clip": 1.04274929, + "balance_loss_mlp": 1.01078188, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 1.671895736457528, + "language_loss": 0.76453614, + "learning_rate": 2.253826823377983e-06, + "loss": 0.78339297, + "num_input_tokens_seen": 170061610, + "step": 7912, + "time_per_iteration": 2.5568339824676514 + }, + { + "auxiliary_loss_clip": 0.01121226, + "auxiliary_loss_mlp": 0.0104083, + "balance_loss_clip": 1.04505527, + "balance_loss_mlp": 1.02795553, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.590925139416758, + "language_loss": 0.74086475, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76248533, + "num_input_tokens_seen": 170083505, + "step": 7913, + "time_per_iteration": 2.5217502117156982 + }, + { + "auxiliary_loss_clip": 0.01104029, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.04732132, + "balance_loss_mlp": 1.01940775, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 2.0117762738409053, + "language_loss": 0.72284842, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74422562, + "num_input_tokens_seen": 170100690, + "step": 7914, + "time_per_iteration": 2.484370470046997 + }, + { + "auxiliary_loss_clip": 0.01102276, + "auxiliary_loss_mlp": 0.01037785, + "balance_loss_clip": 1.04976499, + "balance_loss_mlp": 1.02491081, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 2.633992041386149, + "language_loss": 0.64630121, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.66770184, + "num_input_tokens_seen": 170119240, + "step": 7915, + "time_per_iteration": 3.911830425262451 + }, + { + "auxiliary_loss_clip": 0.01118066, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.04475534, + "balance_loss_mlp": 1.02325964, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.7175023619391758, + "language_loss": 0.77001595, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.79156148, + "num_input_tokens_seen": 170136450, + "step": 7916, + "time_per_iteration": 3.809297561645508 + }, + { + "auxiliary_loss_clip": 0.01122374, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.04574382, + "balance_loss_mlp": 1.02098703, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 2.0782437350111653, + "language_loss": 0.64476168, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66632098, + "num_input_tokens_seen": 170155295, + "step": 7917, + "time_per_iteration": 2.46280837059021 + }, + { + "auxiliary_loss_clip": 0.01026135, + "auxiliary_loss_mlp": 0.01003682, + "balance_loss_clip": 1.03013968, + "balance_loss_mlp": 1.00223351, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8396465506933027, + "language_loss": 0.65727633, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67757452, + "num_input_tokens_seen": 170222325, + "step": 7918, + "time_per_iteration": 3.1705453395843506 + }, + { + "auxiliary_loss_clip": 0.0111256, + "auxiliary_loss_mlp": 0.00787714, + "balance_loss_clip": 1.04609179, + "balance_loss_mlp": 1.00953531, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 1.5987381903912339, + "language_loss": 0.68757474, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70657742, + "num_input_tokens_seen": 170241625, + "step": 7919, + "time_per_iteration": 3.9664504528045654 + }, + { + "auxiliary_loss_clip": 0.0110193, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.04592621, + "balance_loss_mlp": 1.01939034, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 2.6398904412739266, + "language_loss": 0.74782836, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.76917523, + "num_input_tokens_seen": 170262470, + "step": 7920, + "time_per_iteration": 2.596222162246704 + }, + { + "auxiliary_loss_clip": 0.01104455, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.04631662, + "balance_loss_mlp": 1.01638579, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.5695924284316067, + "language_loss": 0.77119309, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79254395, + "num_input_tokens_seen": 170283460, + "step": 7921, + "time_per_iteration": 3.9479143619537354 + }, + { + "auxiliary_loss_clip": 0.01103807, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.04562604, + "balance_loss_mlp": 1.02472925, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 1.587347881364558, + "language_loss": 0.78345311, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80488563, + "num_input_tokens_seen": 170304225, + "step": 7922, + "time_per_iteration": 2.563905954360962 + }, + { + "auxiliary_loss_clip": 0.0109222, + "auxiliary_loss_mlp": 0.0103327, + "balance_loss_clip": 1.04757655, + "balance_loss_mlp": 1.01851153, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.7929784272903542, + "language_loss": 0.72654396, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.74779886, + "num_input_tokens_seen": 170322110, + "step": 7923, + "time_per_iteration": 2.533186912536621 + }, + { + "auxiliary_loss_clip": 0.0109164, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.04366875, + "balance_loss_mlp": 1.02012873, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 2.0167660621365147, + "language_loss": 0.81609678, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.83734751, + "num_input_tokens_seen": 170340700, + "step": 7924, + "time_per_iteration": 2.566941738128662 + }, + { + "auxiliary_loss_clip": 0.01122371, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.05002296, + "balance_loss_mlp": 1.0203476, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 1.7953145071776975, + "language_loss": 0.80332279, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82489669, + "num_input_tokens_seen": 170359780, + "step": 7925, + "time_per_iteration": 2.5432941913604736 + }, + { + "auxiliary_loss_clip": 0.01098328, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.04421604, + "balance_loss_mlp": 1.02476764, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 1.6033672469144027, + "language_loss": 0.72047043, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74183226, + "num_input_tokens_seen": 170381260, + "step": 7926, + "time_per_iteration": 2.6178035736083984 + }, + { + "auxiliary_loss_clip": 0.01117145, + "auxiliary_loss_mlp": 0.01031365, + "balance_loss_clip": 1.04817152, + "balance_loss_mlp": 1.01697636, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 2.0501690344687744, + "language_loss": 0.67980039, + "learning_rate": 2.248031062546432e-06, + "loss": 0.70128548, + "num_input_tokens_seen": 170400595, + "step": 7927, + "time_per_iteration": 2.5520544052124023 + }, + { + "auxiliary_loss_clip": 0.01087383, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.0466857, + "balance_loss_mlp": 1.01704431, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.5605122569709726, + "language_loss": 0.6821239, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70329559, + "num_input_tokens_seen": 170421110, + "step": 7928, + "time_per_iteration": 2.629636764526367 + }, + { + "auxiliary_loss_clip": 0.01123896, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.04616499, + "balance_loss_mlp": 1.01990843, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 2.00702516433815, + "language_loss": 0.7849443, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.80651551, + "num_input_tokens_seen": 170436700, + "step": 7929, + "time_per_iteration": 2.427865505218506 + }, + { + "auxiliary_loss_clip": 0.01101275, + "auxiliary_loss_mlp": 0.01033625, + "balance_loss_clip": 1.04529786, + "balance_loss_mlp": 1.02041042, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 1.8095765079488166, + "language_loss": 0.66343653, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68478549, + "num_input_tokens_seen": 170459555, + "step": 7930, + "time_per_iteration": 2.6742751598358154 + }, + { + "auxiliary_loss_clip": 0.01115544, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.04621625, + "balance_loss_mlp": 1.01940536, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 1.6642785757148895, + "language_loss": 0.79614997, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.8176313, + "num_input_tokens_seen": 170479175, + "step": 7931, + "time_per_iteration": 2.5198466777801514 + }, + { + "auxiliary_loss_clip": 0.01095633, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.04190636, + "balance_loss_mlp": 1.01947021, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 2.1045854461336755, + "language_loss": 0.76201361, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78330898, + "num_input_tokens_seen": 170498450, + "step": 7932, + "time_per_iteration": 2.5161991119384766 + }, + { + "auxiliary_loss_clip": 0.01101995, + "auxiliary_loss_mlp": 0.00789817, + "balance_loss_clip": 1.04675627, + "balance_loss_mlp": 1.01639545, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 1.7107425499347664, + "language_loss": 0.79247594, + "learning_rate": 2.245712162906593e-06, + "loss": 0.8113941, + "num_input_tokens_seen": 170516255, + "step": 7933, + "time_per_iteration": 2.518139123916626 + }, + { + "auxiliary_loss_clip": 0.01120134, + "auxiliary_loss_mlp": 0.01040114, + "balance_loss_clip": 1.04700589, + "balance_loss_mlp": 1.02446151, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 8.693257429788233, + "language_loss": 0.73857754, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.76018006, + "num_input_tokens_seen": 170532705, + "step": 7934, + "time_per_iteration": 2.4681923389434814 + }, + { + "auxiliary_loss_clip": 0.0111536, + "auxiliary_loss_mlp": 0.01032328, + "balance_loss_clip": 1.04652548, + "balance_loss_mlp": 1.01898265, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 1.7949289412479341, + "language_loss": 0.79820287, + "learning_rate": 2.244939121664211e-06, + "loss": 0.81967974, + "num_input_tokens_seen": 170551925, + "step": 7935, + "time_per_iteration": 2.5077433586120605 + }, + { + "auxiliary_loss_clip": 0.0109932, + "auxiliary_loss_mlp": 0.01041884, + "balance_loss_clip": 1.04692435, + "balance_loss_mlp": 1.02679193, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 1.760701819261821, + "language_loss": 0.71020305, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73161507, + "num_input_tokens_seen": 170572320, + "step": 7936, + "time_per_iteration": 2.626458168029785 + }, + { + "auxiliary_loss_clip": 0.01127231, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.04676819, + "balance_loss_mlp": 1.02127862, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 1.9104246093182893, + "language_loss": 0.67686248, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.69848013, + "num_input_tokens_seen": 170589470, + "step": 7937, + "time_per_iteration": 2.4908578395843506 + }, + { + "auxiliary_loss_clip": 0.01037597, + "auxiliary_loss_mlp": 0.0100328, + "balance_loss_clip": 1.02235472, + "balance_loss_mlp": 1.00184917, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7110192493229954, + "language_loss": 0.56349671, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58390546, + "num_input_tokens_seen": 170662265, + "step": 7938, + "time_per_iteration": 3.2714197635650635 + }, + { + "auxiliary_loss_clip": 0.01099767, + "auxiliary_loss_mlp": 0.01050533, + "balance_loss_clip": 1.04658461, + "balance_loss_mlp": 1.03404665, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 1.699524750303965, + "language_loss": 0.88906789, + "learning_rate": 2.243392927839317e-06, + "loss": 0.91057086, + "num_input_tokens_seen": 170679680, + "step": 7939, + "time_per_iteration": 2.5305051803588867 + }, + { + "auxiliary_loss_clip": 0.01112727, + "auxiliary_loss_mlp": 0.01037542, + "balance_loss_clip": 1.04337811, + "balance_loss_mlp": 1.02433968, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 1.8968093786999194, + "language_loss": 0.77089846, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.79240119, + "num_input_tokens_seen": 170697340, + "step": 7940, + "time_per_iteration": 2.483955144882202 + }, + { + "auxiliary_loss_clip": 0.01099375, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.04545951, + "balance_loss_mlp": 1.02530873, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.7398917826531954, + "language_loss": 0.84929073, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.87066686, + "num_input_tokens_seen": 170714905, + "step": 7941, + "time_per_iteration": 2.516010046005249 + }, + { + "auxiliary_loss_clip": 0.01107913, + "auxiliary_loss_mlp": 0.01037107, + "balance_loss_clip": 1.04793012, + "balance_loss_mlp": 1.0230881, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 1.833841966006201, + "language_loss": 0.76032317, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78177339, + "num_input_tokens_seen": 170731810, + "step": 7942, + "time_per_iteration": 2.5157697200775146 + }, + { + "auxiliary_loss_clip": 0.01115821, + "auxiliary_loss_mlp": 0.01036617, + "balance_loss_clip": 1.05051303, + "balance_loss_mlp": 1.02314651, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.808400357675475, + "language_loss": 0.64583564, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66736001, + "num_input_tokens_seen": 170750270, + "step": 7943, + "time_per_iteration": 2.5122225284576416 + }, + { + "auxiliary_loss_clip": 0.0109831, + "auxiliary_loss_mlp": 0.01038951, + "balance_loss_clip": 1.0486815, + "balance_loss_mlp": 1.02374005, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 1.597904530364636, + "language_loss": 0.73229277, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75366533, + "num_input_tokens_seen": 170769015, + "step": 7944, + "time_per_iteration": 2.582731008529663 + }, + { + "auxiliary_loss_clip": 0.01116027, + "auxiliary_loss_mlp": 0.01038434, + "balance_loss_clip": 1.05278254, + "balance_loss_mlp": 1.02372324, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 2.166749323689797, + "language_loss": 0.67973548, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70128012, + "num_input_tokens_seen": 170785725, + "step": 7945, + "time_per_iteration": 2.498991012573242 + }, + { + "auxiliary_loss_clip": 0.0108794, + "auxiliary_loss_mlp": 0.00793568, + "balance_loss_clip": 1.04279232, + "balance_loss_mlp": 1.01537585, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.8123795503095148, + "language_loss": 0.75423908, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77305412, + "num_input_tokens_seen": 170804600, + "step": 7946, + "time_per_iteration": 2.642031669616699 + }, + { + "auxiliary_loss_clip": 0.01110198, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.04856277, + "balance_loss_mlp": 1.02900743, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.7604116918559216, + "language_loss": 0.79638869, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81793082, + "num_input_tokens_seen": 170824230, + "step": 7947, + "time_per_iteration": 2.5756144523620605 + }, + { + "auxiliary_loss_clip": 0.01092892, + "auxiliary_loss_mlp": 0.01037165, + "balance_loss_clip": 1.04896283, + "balance_loss_mlp": 1.02346146, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.7826879333591938, + "language_loss": 0.7381711, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75947165, + "num_input_tokens_seen": 170843365, + "step": 7948, + "time_per_iteration": 2.539172410964966 + }, + { + "auxiliary_loss_clip": 0.01105724, + "auxiliary_loss_mlp": 0.01034616, + "balance_loss_clip": 1.04681802, + "balance_loss_mlp": 1.01964355, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.484903326231955, + "language_loss": 0.78211546, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80351883, + "num_input_tokens_seen": 170863515, + "step": 7949, + "time_per_iteration": 2.536595106124878 + }, + { + "auxiliary_loss_clip": 0.01095368, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.04348207, + "balance_loss_mlp": 1.01999378, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 2.0141560676206165, + "language_loss": 0.73682326, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.75811183, + "num_input_tokens_seen": 170881245, + "step": 7950, + "time_per_iteration": 2.5155513286590576 + }, + { + "auxiliary_loss_clip": 0.01093854, + "auxiliary_loss_mlp": 0.01044925, + "balance_loss_clip": 1.04504406, + "balance_loss_mlp": 1.02916574, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.689160258156553, + "language_loss": 0.73826838, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.75965613, + "num_input_tokens_seen": 170901285, + "step": 7951, + "time_per_iteration": 2.614638328552246 + }, + { + "auxiliary_loss_clip": 0.01093145, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.04725981, + "balance_loss_mlp": 1.02050018, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 1.9087978471958877, + "language_loss": 0.80701482, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82829946, + "num_input_tokens_seen": 170919740, + "step": 7952, + "time_per_iteration": 2.604689836502075 + }, + { + "auxiliary_loss_clip": 0.01106182, + "auxiliary_loss_mlp": 0.01041333, + "balance_loss_clip": 1.04605055, + "balance_loss_mlp": 1.02644348, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.651025336400082, + "language_loss": 0.78296095, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80443615, + "num_input_tokens_seen": 170938510, + "step": 7953, + "time_per_iteration": 2.533233642578125 + }, + { + "auxiliary_loss_clip": 0.0111994, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.04947746, + "balance_loss_mlp": 1.02298796, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 2.2192884360369445, + "language_loss": 0.84048921, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86206043, + "num_input_tokens_seen": 170951170, + "step": 7954, + "time_per_iteration": 3.8337700366973877 + }, + { + "auxiliary_loss_clip": 0.01095608, + "auxiliary_loss_mlp": 0.01040932, + "balance_loss_clip": 1.045017, + "balance_loss_mlp": 1.02649009, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.570816463641743, + "language_loss": 0.70689481, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72826016, + "num_input_tokens_seen": 170970990, + "step": 7955, + "time_per_iteration": 3.910712957382202 + }, + { + "auxiliary_loss_clip": 0.01109581, + "auxiliary_loss_mlp": 0.01037219, + "balance_loss_clip": 1.04923904, + "balance_loss_mlp": 1.02373636, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 1.5825527638432113, + "language_loss": 0.81722689, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.83869493, + "num_input_tokens_seen": 170991215, + "step": 7956, + "time_per_iteration": 2.5663464069366455 + }, + { + "auxiliary_loss_clip": 0.01106545, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.04988432, + "balance_loss_mlp": 1.02253079, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 1.989595018942994, + "language_loss": 0.84586465, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.86730957, + "num_input_tokens_seen": 171007325, + "step": 7957, + "time_per_iteration": 2.5528128147125244 + }, + { + "auxiliary_loss_clip": 0.01115839, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.04694676, + "balance_loss_mlp": 1.02495456, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.6605531612423299, + "language_loss": 0.79764867, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81919181, + "num_input_tokens_seen": 171025650, + "step": 7958, + "time_per_iteration": 3.8734793663024902 + }, + { + "auxiliary_loss_clip": 0.01086009, + "auxiliary_loss_mlp": 0.00794047, + "balance_loss_clip": 1.04212773, + "balance_loss_mlp": 1.01255167, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 1.9865551539004418, + "language_loss": 0.82634449, + "learning_rate": 2.235659762404047e-06, + "loss": 0.84514499, + "num_input_tokens_seen": 171045045, + "step": 7959, + "time_per_iteration": 2.586444139480591 + }, + { + "auxiliary_loss_clip": 0.01089134, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.046875, + "balance_loss_mlp": 1.02063251, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.590380949950751, + "language_loss": 0.7345264, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75575191, + "num_input_tokens_seen": 171062910, + "step": 7960, + "time_per_iteration": 4.0475547313690186 + }, + { + "auxiliary_loss_clip": 0.01089162, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.04598641, + "balance_loss_mlp": 1.02339721, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.9727759984894584, + "language_loss": 0.76854444, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.78979921, + "num_input_tokens_seen": 171080875, + "step": 7961, + "time_per_iteration": 2.5597259998321533 + }, + { + "auxiliary_loss_clip": 0.01086172, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.04772818, + "balance_loss_mlp": 1.01707292, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.7352285306360908, + "language_loss": 0.77921069, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.80037653, + "num_input_tokens_seen": 171099190, + "step": 7962, + "time_per_iteration": 2.546804904937744 + }, + { + "auxiliary_loss_clip": 0.01100811, + "auxiliary_loss_mlp": 0.01037213, + "balance_loss_clip": 1.04947925, + "balance_loss_mlp": 1.02359927, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 1.6786853142608151, + "language_loss": 0.6504966, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.67187691, + "num_input_tokens_seen": 171119060, + "step": 7963, + "time_per_iteration": 2.594326972961426 + }, + { + "auxiliary_loss_clip": 0.01115553, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.04705954, + "balance_loss_mlp": 1.01712084, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 2.148445581241582, + "language_loss": 0.77539259, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.7968635, + "num_input_tokens_seen": 171141900, + "step": 7964, + "time_per_iteration": 2.7331790924072266 + }, + { + "auxiliary_loss_clip": 0.0111937, + "auxiliary_loss_mlp": 0.01033811, + "balance_loss_clip": 1.0482614, + "balance_loss_mlp": 1.01817024, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 1.6952171789585355, + "language_loss": 0.76253617, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78406787, + "num_input_tokens_seen": 171161045, + "step": 7965, + "time_per_iteration": 2.5991547107696533 + }, + { + "auxiliary_loss_clip": 0.01073149, + "auxiliary_loss_mlp": 0.0104244, + "balance_loss_clip": 1.04382575, + "balance_loss_mlp": 1.02732444, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 2.715780852327255, + "language_loss": 0.74698782, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76814365, + "num_input_tokens_seen": 171179675, + "step": 7966, + "time_per_iteration": 2.5939178466796875 + }, + { + "auxiliary_loss_clip": 0.01101344, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.0493474, + "balance_loss_mlp": 1.02138865, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.6130462295433614, + "language_loss": 0.73149097, + "learning_rate": 2.232565488801655e-06, + "loss": 0.75286651, + "num_input_tokens_seen": 171201175, + "step": 7967, + "time_per_iteration": 2.605095148086548 + }, + { + "auxiliary_loss_clip": 0.01099671, + "auxiliary_loss_mlp": 0.01030357, + "balance_loss_clip": 1.04426003, + "balance_loss_mlp": 1.01677942, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 1.8537947714380258, + "language_loss": 0.79240012, + "learning_rate": 2.232178664762267e-06, + "loss": 0.81370044, + "num_input_tokens_seen": 171221750, + "step": 7968, + "time_per_iteration": 2.590744972229004 + }, + { + "auxiliary_loss_clip": 0.0102481, + "auxiliary_loss_mlp": 0.01001746, + "balance_loss_clip": 1.02739024, + "balance_loss_mlp": 1.00016057, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7591894852888579, + "language_loss": 0.6227926, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64305818, + "num_input_tokens_seen": 171292235, + "step": 7969, + "time_per_iteration": 3.2979280948638916 + }, + { + "auxiliary_loss_clip": 0.01087407, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.04925132, + "balance_loss_mlp": 1.01807094, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.3992240489168608, + "language_loss": 0.7716403, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79282784, + "num_input_tokens_seen": 171312215, + "step": 7970, + "time_per_iteration": 2.5915300846099854 + }, + { + "auxiliary_loss_clip": 0.01110429, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.04323077, + "balance_loss_mlp": 1.02144432, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 1.7241747510640428, + "language_loss": 0.70566279, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72712374, + "num_input_tokens_seen": 171332975, + "step": 7971, + "time_per_iteration": 2.552602767944336 + }, + { + "auxiliary_loss_clip": 0.01079764, + "auxiliary_loss_mlp": 0.01031007, + "balance_loss_clip": 1.0461762, + "balance_loss_mlp": 1.01582003, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.3396476627013838, + "language_loss": 0.80201268, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82312042, + "num_input_tokens_seen": 171353880, + "step": 7972, + "time_per_iteration": 2.6406657695770264 + }, + { + "auxiliary_loss_clip": 0.01118018, + "auxiliary_loss_mlp": 0.01031018, + "balance_loss_clip": 1.0493952, + "balance_loss_mlp": 1.01668942, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 1.9637957087103555, + "language_loss": 0.69868326, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.72017366, + "num_input_tokens_seen": 171370930, + "step": 7973, + "time_per_iteration": 2.4983367919921875 + }, + { + "auxiliary_loss_clip": 0.01117865, + "auxiliary_loss_mlp": 0.01035348, + "balance_loss_clip": 1.05195057, + "balance_loss_mlp": 1.02246189, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.7663354622427891, + "language_loss": 0.78812718, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.80965924, + "num_input_tokens_seen": 171387575, + "step": 7974, + "time_per_iteration": 2.5343334674835205 + }, + { + "auxiliary_loss_clip": 0.01034345, + "auxiliary_loss_mlp": 0.0102342, + "balance_loss_clip": 1.02848315, + "balance_loss_mlp": 1.02150047, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7705526526284165, + "language_loss": 0.54045796, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56103557, + "num_input_tokens_seen": 171449980, + "step": 7975, + "time_per_iteration": 3.1808247566223145 + }, + { + "auxiliary_loss_clip": 0.01111551, + "auxiliary_loss_mlp": 0.010396, + "balance_loss_clip": 1.04666221, + "balance_loss_mlp": 1.02427626, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 1.9090714661436667, + "language_loss": 0.89613652, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.91764802, + "num_input_tokens_seen": 171465290, + "step": 7976, + "time_per_iteration": 2.5180437564849854 + }, + { + "auxiliary_loss_clip": 0.01132828, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.05128086, + "balance_loss_mlp": 1.0232687, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.042549544397765, + "language_loss": 0.74030507, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.76201552, + "num_input_tokens_seen": 171481130, + "step": 7977, + "time_per_iteration": 2.469179391860962 + }, + { + "auxiliary_loss_clip": 0.01115423, + "auxiliary_loss_mlp": 0.00790689, + "balance_loss_clip": 1.04755938, + "balance_loss_mlp": 1.01507974, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.5191511809010236, + "language_loss": 0.78275061, + "learning_rate": 2.228309942555734e-06, + "loss": 0.8018117, + "num_input_tokens_seen": 171501140, + "step": 7978, + "time_per_iteration": 2.5458803176879883 + }, + { + "auxiliary_loss_clip": 0.01104341, + "auxiliary_loss_mlp": 0.0104026, + "balance_loss_clip": 1.04708838, + "balance_loss_mlp": 1.02596724, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.7589351522953791, + "language_loss": 0.8929801, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91442609, + "num_input_tokens_seen": 171519835, + "step": 7979, + "time_per_iteration": 2.560894727706909 + }, + { + "auxiliary_loss_clip": 0.01119673, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.04921412, + "balance_loss_mlp": 1.02327657, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.4680783202913248, + "language_loss": 0.7701292, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79171193, + "num_input_tokens_seen": 171540980, + "step": 7980, + "time_per_iteration": 2.5638155937194824 + }, + { + "auxiliary_loss_clip": 0.01096145, + "auxiliary_loss_mlp": 0.01039311, + "balance_loss_clip": 1.04366946, + "balance_loss_mlp": 1.02381372, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 1.7711988242157224, + "language_loss": 0.71917713, + "learning_rate": 2.227149156404295e-06, + "loss": 0.74053168, + "num_input_tokens_seen": 171563600, + "step": 7981, + "time_per_iteration": 2.67795467376709 + }, + { + "auxiliary_loss_clip": 0.01124364, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.04866433, + "balance_loss_mlp": 1.01709414, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 2.3529311507642032, + "language_loss": 0.70075023, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72229648, + "num_input_tokens_seen": 171580700, + "step": 7982, + "time_per_iteration": 2.482614278793335 + }, + { + "auxiliary_loss_clip": 0.0109771, + "auxiliary_loss_mlp": 0.01032394, + "balance_loss_clip": 1.04504037, + "balance_loss_mlp": 1.02035999, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 1.6224412701761382, + "language_loss": 0.70710027, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.7284013, + "num_input_tokens_seen": 171602035, + "step": 7983, + "time_per_iteration": 2.5717480182647705 + }, + { + "auxiliary_loss_clip": 0.01039282, + "auxiliary_loss_mlp": 0.00775621, + "balance_loss_clip": 1.0227412, + "balance_loss_mlp": 1.01854312, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.7945471295294019, + "language_loss": 0.59447157, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61262059, + "num_input_tokens_seen": 171659215, + "step": 7984, + "time_per_iteration": 3.061324119567871 + }, + { + "auxiliary_loss_clip": 0.01068264, + "auxiliary_loss_mlp": 0.0104908, + "balance_loss_clip": 1.04175973, + "balance_loss_mlp": 1.03271282, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.6277658017439955, + "language_loss": 0.66462791, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.68580139, + "num_input_tokens_seen": 171675710, + "step": 7985, + "time_per_iteration": 2.5727028846740723 + }, + { + "auxiliary_loss_clip": 0.01103497, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.04410672, + "balance_loss_mlp": 1.02342367, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 2.0236654254725783, + "language_loss": 0.70275807, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72417176, + "num_input_tokens_seen": 171692510, + "step": 7986, + "time_per_iteration": 2.5134100914001465 + }, + { + "auxiliary_loss_clip": 0.01090282, + "auxiliary_loss_mlp": 0.01038651, + "balance_loss_clip": 1.04899478, + "balance_loss_mlp": 1.0241791, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 1.8262548037559965, + "language_loss": 0.78741789, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.80870724, + "num_input_tokens_seen": 171710235, + "step": 7987, + "time_per_iteration": 2.5443408489227295 + }, + { + "auxiliary_loss_clip": 0.01074386, + "auxiliary_loss_mlp": 0.01042989, + "balance_loss_clip": 1.04479837, + "balance_loss_mlp": 1.02895761, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 1.940270962493836, + "language_loss": 0.74895096, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.77012467, + "num_input_tokens_seen": 171726715, + "step": 7988, + "time_per_iteration": 2.583739995956421 + }, + { + "auxiliary_loss_clip": 0.0109371, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.04683912, + "balance_loss_mlp": 1.02054477, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 1.9571861851088301, + "language_loss": 0.78846782, + "learning_rate": 2.224053348748365e-06, + "loss": 0.80974638, + "num_input_tokens_seen": 171743605, + "step": 7989, + "time_per_iteration": 2.5617904663085938 + }, + { + "auxiliary_loss_clip": 0.01106184, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.04488981, + "balance_loss_mlp": 1.0247407, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.9483961266067635, + "language_loss": 0.73559642, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75705504, + "num_input_tokens_seen": 171765445, + "step": 7990, + "time_per_iteration": 2.669351577758789 + }, + { + "auxiliary_loss_clip": 0.01037884, + "auxiliary_loss_mlp": 0.00771339, + "balance_loss_clip": 1.02086329, + "balance_loss_mlp": 1.01046467, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 0.7620089327861589, + "language_loss": 0.59031105, + "learning_rate": 2.223279311579633e-06, + "loss": 0.60840327, + "num_input_tokens_seen": 171830115, + "step": 7991, + "time_per_iteration": 3.2058353424072266 + }, + { + "auxiliary_loss_clip": 0.01113461, + "auxiliary_loss_mlp": 0.00789998, + "balance_loss_clip": 1.04475379, + "balance_loss_mlp": 1.01277518, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 1.8491582114963667, + "language_loss": 0.67547894, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69451356, + "num_input_tokens_seen": 171849135, + "step": 7992, + "time_per_iteration": 3.9340479373931885 + }, + { + "auxiliary_loss_clip": 0.01100241, + "auxiliary_loss_mlp": 0.01037667, + "balance_loss_clip": 1.04228377, + "balance_loss_mlp": 1.02321887, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 1.6055940471661165, + "language_loss": 0.76036125, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78174031, + "num_input_tokens_seen": 171868880, + "step": 7993, + "time_per_iteration": 2.5635008811950684 + }, + { + "auxiliary_loss_clip": 0.01085436, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.04647148, + "balance_loss_mlp": 1.02256322, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.6382368159501521, + "language_loss": 0.78782767, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80904496, + "num_input_tokens_seen": 171889455, + "step": 7994, + "time_per_iteration": 3.992957830429077 + }, + { + "auxiliary_loss_clip": 0.01103584, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.04339409, + "balance_loss_mlp": 1.01938105, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 1.9498878008857032, + "language_loss": 0.79708552, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81845409, + "num_input_tokens_seen": 171906070, + "step": 7995, + "time_per_iteration": 2.5124006271362305 + }, + { + "auxiliary_loss_clip": 0.01077832, + "auxiliary_loss_mlp": 0.0103577, + "balance_loss_clip": 1.04481721, + "balance_loss_mlp": 1.02200699, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.4070564968906234, + "language_loss": 0.82778364, + "learning_rate": 2.2213440707461e-06, + "loss": 0.84891963, + "num_input_tokens_seen": 171926515, + "step": 7996, + "time_per_iteration": 3.989732027053833 + }, + { + "auxiliary_loss_clip": 0.01055952, + "auxiliary_loss_mlp": 0.01035714, + "balance_loss_clip": 1.0447911, + "balance_loss_mlp": 1.02156997, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.7227742658126144, + "language_loss": 0.81089115, + "learning_rate": 2.220956997340516e-06, + "loss": 0.83180785, + "num_input_tokens_seen": 171943845, + "step": 7997, + "time_per_iteration": 2.635472536087036 + }, + { + "auxiliary_loss_clip": 0.01074309, + "auxiliary_loss_mlp": 0.01035421, + "balance_loss_clip": 1.04094899, + "balance_loss_mlp": 1.02134848, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 1.6900927296619095, + "language_loss": 0.72196531, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74306262, + "num_input_tokens_seen": 171964970, + "step": 7998, + "time_per_iteration": 2.6376898288726807 + }, + { + "auxiliary_loss_clip": 0.01122586, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.04459012, + "balance_loss_mlp": 1.02318287, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.9081667847185362, + "language_loss": 0.7096349, + "learning_rate": 2.220182825407892e-06, + "loss": 0.73123348, + "num_input_tokens_seen": 171986340, + "step": 7999, + "time_per_iteration": 3.9272279739379883 + }, + { + "auxiliary_loss_clip": 0.0111552, + "auxiliary_loss_mlp": 0.01041962, + "balance_loss_clip": 1.04428041, + "balance_loss_mlp": 1.02775252, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 1.6514655308337884, + "language_loss": 0.71156329, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73313808, + "num_input_tokens_seen": 172007300, + "step": 8000, + "time_per_iteration": 2.525585412979126 + }, + { + "auxiliary_loss_clip": 0.01112499, + "auxiliary_loss_mlp": 0.01041162, + "balance_loss_clip": 1.04567921, + "balance_loss_mlp": 1.02682757, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.6597023065713772, + "language_loss": 0.74955338, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77109003, + "num_input_tokens_seen": 172029585, + "step": 8001, + "time_per_iteration": 2.6539931297302246 + }, + { + "auxiliary_loss_clip": 0.0111224, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.04421353, + "balance_loss_mlp": 1.02355909, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.685485422686156, + "language_loss": 0.81354845, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83504075, + "num_input_tokens_seen": 172047495, + "step": 8002, + "time_per_iteration": 2.500030994415283 + }, + { + "auxiliary_loss_clip": 0.01117619, + "auxiliary_loss_mlp": 0.01037832, + "balance_loss_clip": 1.04646564, + "balance_loss_mlp": 1.0231514, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 2.156423837600068, + "language_loss": 0.71788907, + "learning_rate": 2.218634381467819e-06, + "loss": 0.73944354, + "num_input_tokens_seen": 172067625, + "step": 8003, + "time_per_iteration": 2.559398889541626 + }, + { + "auxiliary_loss_clip": 0.01106115, + "auxiliary_loss_mlp": 0.01037714, + "balance_loss_clip": 1.04408574, + "balance_loss_mlp": 1.02452326, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.644911104529485, + "language_loss": 0.82822073, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84965897, + "num_input_tokens_seen": 172087885, + "step": 8004, + "time_per_iteration": 2.521289587020874 + }, + { + "auxiliary_loss_clip": 0.01105942, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.04618657, + "balance_loss_mlp": 1.02741921, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.1749628497112075, + "language_loss": 0.77248269, + "learning_rate": 2.217860109695239e-06, + "loss": 0.79397511, + "num_input_tokens_seen": 172105815, + "step": 8005, + "time_per_iteration": 2.5096490383148193 + }, + { + "auxiliary_loss_clip": 0.01106079, + "auxiliary_loss_mlp": 0.01036391, + "balance_loss_clip": 1.04356909, + "balance_loss_mlp": 1.02281928, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 1.8507650394411095, + "language_loss": 0.70384717, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72527194, + "num_input_tokens_seen": 172126125, + "step": 8006, + "time_per_iteration": 2.528286933898926 + }, + { + "auxiliary_loss_clip": 0.01094962, + "auxiliary_loss_mlp": 0.01040489, + "balance_loss_clip": 1.04188347, + "balance_loss_mlp": 1.02580249, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.7821661923740897, + "language_loss": 0.70166117, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72301567, + "num_input_tokens_seen": 172141945, + "step": 8007, + "time_per_iteration": 2.5516135692596436 + }, + { + "auxiliary_loss_clip": 0.01124542, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.04480386, + "balance_loss_mlp": 1.02250695, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 2.2019587951159467, + "language_loss": 0.71662116, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.73823357, + "num_input_tokens_seen": 172161095, + "step": 8008, + "time_per_iteration": 2.4734253883361816 + }, + { + "auxiliary_loss_clip": 0.01097363, + "auxiliary_loss_mlp": 0.01042396, + "balance_loss_clip": 1.04553401, + "balance_loss_mlp": 1.02735198, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 2.0053163149361706, + "language_loss": 0.60602337, + "learning_rate": 2.216311467132199e-06, + "loss": 0.62742096, + "num_input_tokens_seen": 172178750, + "step": 8009, + "time_per_iteration": 2.572190523147583 + }, + { + "auxiliary_loss_clip": 0.01029817, + "auxiliary_loss_mlp": 0.01006484, + "balance_loss_clip": 1.02358317, + "balance_loss_mlp": 1.00511312, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.8631709055799563, + "language_loss": 0.61366773, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63403082, + "num_input_tokens_seen": 172240235, + "step": 8010, + "time_per_iteration": 3.1639304161071777 + }, + { + "auxiliary_loss_clip": 0.01116484, + "auxiliary_loss_mlp": 0.01047555, + "balance_loss_clip": 1.04741108, + "balance_loss_mlp": 1.03271973, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 1.6450113516418963, + "language_loss": 0.73827612, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75991648, + "num_input_tokens_seen": 172259875, + "step": 8011, + "time_per_iteration": 2.5265486240386963 + }, + { + "auxiliary_loss_clip": 0.01099077, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.04377937, + "balance_loss_mlp": 1.02381754, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.7998846031039775, + "language_loss": 0.7934078, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.81476814, + "num_input_tokens_seen": 172280150, + "step": 8012, + "time_per_iteration": 2.5880823135375977 + }, + { + "auxiliary_loss_clip": 0.01089424, + "auxiliary_loss_mlp": 0.01041885, + "balance_loss_clip": 1.04727674, + "balance_loss_mlp": 1.02704906, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 1.8985004389057338, + "language_loss": 0.73571336, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75702643, + "num_input_tokens_seen": 172300810, + "step": 8013, + "time_per_iteration": 2.6248950958251953 + }, + { + "auxiliary_loss_clip": 0.01097941, + "auxiliary_loss_mlp": 0.01028232, + "balance_loss_clip": 1.04761219, + "balance_loss_mlp": 1.01520884, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 1.9895398990617188, + "language_loss": 0.90716505, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92842686, + "num_input_tokens_seen": 172317930, + "step": 8014, + "time_per_iteration": 2.52713680267334 + }, + { + "auxiliary_loss_clip": 0.01127322, + "auxiliary_loss_mlp": 0.01038637, + "balance_loss_clip": 1.04556227, + "balance_loss_mlp": 1.02394462, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 2.0312465884024027, + "language_loss": 0.74541664, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76707625, + "num_input_tokens_seen": 172336340, + "step": 8015, + "time_per_iteration": 2.463778018951416 + }, + { + "auxiliary_loss_clip": 0.01107328, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.04411995, + "balance_loss_mlp": 1.02359974, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 1.888295930245216, + "language_loss": 0.80672365, + "learning_rate": 2.213601027413894e-06, + "loss": 0.8281787, + "num_input_tokens_seen": 172354315, + "step": 8016, + "time_per_iteration": 2.5680277347564697 + }, + { + "auxiliary_loss_clip": 0.0110834, + "auxiliary_loss_mlp": 0.01032918, + "balance_loss_clip": 1.04708862, + "balance_loss_mlp": 1.01945901, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 1.7199304680982905, + "language_loss": 0.77431905, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.79573166, + "num_input_tokens_seen": 172372695, + "step": 8017, + "time_per_iteration": 2.502817392349243 + }, + { + "auxiliary_loss_clip": 0.01108557, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.0454185, + "balance_loss_mlp": 1.01694727, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 1.8198048844894492, + "language_loss": 0.80788171, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82927811, + "num_input_tokens_seen": 172390905, + "step": 8018, + "time_per_iteration": 2.5359535217285156 + }, + { + "auxiliary_loss_clip": 0.01099141, + "auxiliary_loss_mlp": 0.01031325, + "balance_loss_clip": 1.04795361, + "balance_loss_mlp": 1.0181706, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.6358633695959177, + "language_loss": 0.76281679, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.78412151, + "num_input_tokens_seen": 172412295, + "step": 8019, + "time_per_iteration": 2.6041665077209473 + }, + { + "auxiliary_loss_clip": 0.01088168, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.04522681, + "balance_loss_mlp": 1.02197397, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.6053213852591102, + "language_loss": 0.79011923, + "learning_rate": 2.212052026199701e-06, + "loss": 0.81136072, + "num_input_tokens_seen": 172432625, + "step": 8020, + "time_per_iteration": 2.6097640991210938 + }, + { + "auxiliary_loss_clip": 0.01120396, + "auxiliary_loss_mlp": 0.0103496, + "balance_loss_clip": 1.04421973, + "balance_loss_mlp": 1.0212568, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 1.9303905443822806, + "language_loss": 0.69620752, + "learning_rate": 2.211664755756855e-06, + "loss": 0.7177611, + "num_input_tokens_seen": 172450010, + "step": 8021, + "time_per_iteration": 2.448242664337158 + }, + { + "auxiliary_loss_clip": 0.01099282, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.04470956, + "balance_loss_mlp": 1.01832259, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 1.6925421995993188, + "language_loss": 0.63056177, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.65188706, + "num_input_tokens_seen": 172469080, + "step": 8022, + "time_per_iteration": 2.5611445903778076 + }, + { + "auxiliary_loss_clip": 0.01099057, + "auxiliary_loss_mlp": 0.0079135, + "balance_loss_clip": 1.04456615, + "balance_loss_mlp": 1.01697516, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.2580684518982514, + "language_loss": 0.66276914, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.68167317, + "num_input_tokens_seen": 172484850, + "step": 8023, + "time_per_iteration": 2.5222325325012207 + }, + { + "auxiliary_loss_clip": 0.01049726, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.03846383, + "balance_loss_mlp": 1.02802277, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 2.0431999219666577, + "language_loss": 0.76597983, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78691351, + "num_input_tokens_seen": 172503525, + "step": 8024, + "time_per_iteration": 2.7242565155029297 + }, + { + "auxiliary_loss_clip": 0.01098628, + "auxiliary_loss_mlp": 0.0103598, + "balance_loss_clip": 1.04198337, + "balance_loss_mlp": 1.02214587, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.5314800273362705, + "language_loss": 0.75274265, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77408868, + "num_input_tokens_seen": 172524360, + "step": 8025, + "time_per_iteration": 2.5508241653442383 + }, + { + "auxiliary_loss_clip": 0.01120816, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.04406047, + "balance_loss_mlp": 1.01915145, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 2.208461433576477, + "language_loss": 0.71061063, + "learning_rate": 2.209728283441112e-06, + "loss": 0.73214066, + "num_input_tokens_seen": 172541480, + "step": 8026, + "time_per_iteration": 2.4596126079559326 + }, + { + "auxiliary_loss_clip": 0.01111994, + "auxiliary_loss_mlp": 0.01042607, + "balance_loss_clip": 1.04457629, + "balance_loss_mlp": 1.02685928, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 2.2022314326436185, + "language_loss": 0.74541032, + "learning_rate": 2.209340965060465e-06, + "loss": 0.76695633, + "num_input_tokens_seen": 172559005, + "step": 8027, + "time_per_iteration": 2.497581720352173 + }, + { + "auxiliary_loss_clip": 0.01097733, + "auxiliary_loss_mlp": 0.01035645, + "balance_loss_clip": 1.04534197, + "balance_loss_mlp": 1.02230549, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.633610579920971, + "language_loss": 0.67147934, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.6928131, + "num_input_tokens_seen": 172578435, + "step": 8028, + "time_per_iteration": 2.5563721656799316 + }, + { + "auxiliary_loss_clip": 0.01100639, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.04374409, + "balance_loss_mlp": 1.02265167, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 1.664348241645772, + "language_loss": 0.72752446, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.74889404, + "num_input_tokens_seen": 172596095, + "step": 8029, + "time_per_iteration": 2.496027708053589 + }, + { + "auxiliary_loss_clip": 0.01099044, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.04616046, + "balance_loss_mlp": 1.01805091, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 2.2927232451485566, + "language_loss": 0.84567171, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.86698824, + "num_input_tokens_seen": 172615255, + "step": 8030, + "time_per_iteration": 2.5523974895477295 + }, + { + "auxiliary_loss_clip": 0.01092082, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.04350376, + "balance_loss_mlp": 1.02149773, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 2.373108753315014, + "language_loss": 0.74026269, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76153487, + "num_input_tokens_seen": 172633185, + "step": 8031, + "time_per_iteration": 3.9009711742401123 + }, + { + "auxiliary_loss_clip": 0.01094642, + "auxiliary_loss_mlp": 0.0104698, + "balance_loss_clip": 1.04175663, + "balance_loss_mlp": 1.03128588, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 1.6482639868287778, + "language_loss": 0.71899223, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.74040842, + "num_input_tokens_seen": 172654280, + "step": 8032, + "time_per_iteration": 4.063181638717651 + }, + { + "auxiliary_loss_clip": 0.01103946, + "auxiliary_loss_mlp": 0.01042394, + "balance_loss_clip": 1.03987479, + "balance_loss_mlp": 1.02739704, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 1.6089281863100537, + "language_loss": 0.74130857, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76277196, + "num_input_tokens_seen": 172675545, + "step": 8033, + "time_per_iteration": 2.528202772140503 + }, + { + "auxiliary_loss_clip": 0.01068944, + "auxiliary_loss_mlp": 0.01034933, + "balance_loss_clip": 1.04533207, + "balance_loss_mlp": 1.02115297, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.4784240599143876, + "language_loss": 0.8342582, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85529703, + "num_input_tokens_seen": 172696455, + "step": 8034, + "time_per_iteration": 2.6595723628997803 + }, + { + "auxiliary_loss_clip": 0.01085981, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.04177272, + "balance_loss_mlp": 1.01750088, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 1.6495252519259012, + "language_loss": 0.79664314, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81780583, + "num_input_tokens_seen": 172716720, + "step": 8035, + "time_per_iteration": 4.010541200637817 + }, + { + "auxiliary_loss_clip": 0.01097017, + "auxiliary_loss_mlp": 0.00790615, + "balance_loss_clip": 1.04088974, + "balance_loss_mlp": 1.01177144, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 2.455228954952055, + "language_loss": 0.69607526, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71495152, + "num_input_tokens_seen": 172737435, + "step": 8036, + "time_per_iteration": 2.6928060054779053 + }, + { + "auxiliary_loss_clip": 0.01106709, + "auxiliary_loss_mlp": 0.01033809, + "balance_loss_clip": 1.04055691, + "balance_loss_mlp": 1.02021861, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 1.7806490882050996, + "language_loss": 0.72682321, + "learning_rate": 2.205467347074847e-06, + "loss": 0.74822843, + "num_input_tokens_seen": 172755700, + "step": 8037, + "time_per_iteration": 2.4832327365875244 + }, + { + "auxiliary_loss_clip": 0.01075281, + "auxiliary_loss_mlp": 0.01039803, + "balance_loss_clip": 1.04387951, + "balance_loss_mlp": 1.02339935, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 2.0360338876128137, + "language_loss": 0.69039655, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71154743, + "num_input_tokens_seen": 172775185, + "step": 8038, + "time_per_iteration": 3.977391481399536 + }, + { + "auxiliary_loss_clip": 0.01085256, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.04075265, + "balance_loss_mlp": 1.02085328, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.4612275800319958, + "language_loss": 0.79194593, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81314933, + "num_input_tokens_seen": 172796990, + "step": 8039, + "time_per_iteration": 2.6699161529541016 + }, + { + "auxiliary_loss_clip": 0.01108263, + "auxiliary_loss_mlp": 0.01029922, + "balance_loss_clip": 1.04388165, + "balance_loss_mlp": 1.01682734, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 1.7034153962295335, + "language_loss": 0.77699292, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79837477, + "num_input_tokens_seen": 172814915, + "step": 8040, + "time_per_iteration": 2.5116214752197266 + }, + { + "auxiliary_loss_clip": 0.01107793, + "auxiliary_loss_mlp": 0.01039403, + "balance_loss_clip": 1.04019368, + "balance_loss_mlp": 1.02460909, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.8037629274348885, + "language_loss": 0.75449324, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77596521, + "num_input_tokens_seen": 172837060, + "step": 8041, + "time_per_iteration": 2.6189022064208984 + }, + { + "auxiliary_loss_clip": 0.01085498, + "auxiliary_loss_mlp": 0.01037252, + "balance_loss_clip": 1.04266047, + "balance_loss_mlp": 1.02299464, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.9666504543158554, + "language_loss": 0.66742814, + "learning_rate": 2.203530244988624e-06, + "loss": 0.68865561, + "num_input_tokens_seen": 172856545, + "step": 8042, + "time_per_iteration": 2.6303889751434326 + }, + { + "auxiliary_loss_clip": 0.0103154, + "auxiliary_loss_mlp": 0.01006055, + "balance_loss_clip": 1.02686262, + "balance_loss_mlp": 1.00463009, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.691919962628056, + "language_loss": 0.58555067, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60592663, + "num_input_tokens_seen": 172923055, + "step": 8043, + "time_per_iteration": 3.218369960784912 + }, + { + "auxiliary_loss_clip": 0.01098506, + "auxiliary_loss_mlp": 0.01041109, + "balance_loss_clip": 1.04154694, + "balance_loss_mlp": 1.02484894, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 2.091096202790917, + "language_loss": 0.71887302, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.74026918, + "num_input_tokens_seen": 172940700, + "step": 8044, + "time_per_iteration": 2.4962875843048096 + }, + { + "auxiliary_loss_clip": 0.01079079, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.04973173, + "balance_loss_mlp": 1.02091289, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.2904734858580333, + "language_loss": 0.76047087, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78162384, + "num_input_tokens_seen": 172961125, + "step": 8045, + "time_per_iteration": 2.6222071647644043 + }, + { + "auxiliary_loss_clip": 0.01075026, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.04565001, + "balance_loss_mlp": 1.0208348, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 1.820455420435124, + "language_loss": 0.69157314, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71266776, + "num_input_tokens_seen": 172980405, + "step": 8046, + "time_per_iteration": 2.613757610321045 + }, + { + "auxiliary_loss_clip": 0.01120903, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.04292965, + "balance_loss_mlp": 1.02037692, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 1.854325052014233, + "language_loss": 0.82130551, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84286237, + "num_input_tokens_seen": 172999105, + "step": 8047, + "time_per_iteration": 2.5068960189819336 + }, + { + "auxiliary_loss_clip": 0.010895, + "auxiliary_loss_mlp": 0.01036576, + "balance_loss_clip": 1.03982663, + "balance_loss_mlp": 1.02305186, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.9411473043732483, + "language_loss": 0.80556458, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82682538, + "num_input_tokens_seen": 173019935, + "step": 8048, + "time_per_iteration": 2.5656442642211914 + }, + { + "auxiliary_loss_clip": 0.01112461, + "auxiliary_loss_mlp": 0.01035908, + "balance_loss_clip": 1.04302192, + "balance_loss_mlp": 1.0221808, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.735927416868701, + "language_loss": 0.81283885, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83432257, + "num_input_tokens_seen": 173039700, + "step": 8049, + "time_per_iteration": 2.5722410678863525 + }, + { + "auxiliary_loss_clip": 0.01101002, + "auxiliary_loss_mlp": 0.01031981, + "balance_loss_clip": 1.04440165, + "balance_loss_mlp": 1.01932716, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.624811572387827, + "language_loss": 0.72720158, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74853146, + "num_input_tokens_seen": 173059170, + "step": 8050, + "time_per_iteration": 2.5246028900146484 + }, + { + "auxiliary_loss_clip": 0.01038378, + "auxiliary_loss_mlp": 0.0077353, + "balance_loss_clip": 1.02284312, + "balance_loss_mlp": 1.01269627, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.6958646383943926, + "language_loss": 0.56389439, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58201349, + "num_input_tokens_seen": 173119000, + "step": 8051, + "time_per_iteration": 3.1603474617004395 + }, + { + "auxiliary_loss_clip": 0.0108534, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.04448593, + "balance_loss_mlp": 1.01821089, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 2.72459774369678, + "language_loss": 0.7515325, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77270824, + "num_input_tokens_seen": 173137570, + "step": 8052, + "time_per_iteration": 2.5679843425750732 + }, + { + "auxiliary_loss_clip": 0.01107296, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.04607463, + "balance_loss_mlp": 1.01681352, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 2.087045168635755, + "language_loss": 0.66287655, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68424678, + "num_input_tokens_seen": 173154355, + "step": 8053, + "time_per_iteration": 2.5085248947143555 + }, + { + "auxiliary_loss_clip": 0.0110683, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.04169178, + "balance_loss_mlp": 1.01741385, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 1.931049164115971, + "language_loss": 0.69593501, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71730882, + "num_input_tokens_seen": 173174845, + "step": 8054, + "time_per_iteration": 2.5860259532928467 + }, + { + "auxiliary_loss_clip": 0.01058627, + "auxiliary_loss_mlp": 0.01032984, + "balance_loss_clip": 1.03959489, + "balance_loss_mlp": 1.01935267, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 1.6836944741831343, + "language_loss": 0.69572872, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.71664488, + "num_input_tokens_seen": 173195025, + "step": 8055, + "time_per_iteration": 2.7091689109802246 + }, + { + "auxiliary_loss_clip": 0.01113528, + "auxiliary_loss_mlp": 0.01037876, + "balance_loss_clip": 1.04603899, + "balance_loss_mlp": 1.0238328, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.3204561315998107, + "language_loss": 0.63325429, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65476829, + "num_input_tokens_seen": 173213065, + "step": 8056, + "time_per_iteration": 2.4866795539855957 + }, + { + "auxiliary_loss_clip": 0.01107602, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.04094923, + "balance_loss_mlp": 1.01636326, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.928495605151099, + "language_loss": 0.67061591, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69199806, + "num_input_tokens_seen": 173234545, + "step": 8057, + "time_per_iteration": 2.5925471782684326 + }, + { + "auxiliary_loss_clip": 0.0108087, + "auxiliary_loss_mlp": 0.01039968, + "balance_loss_clip": 1.03869903, + "balance_loss_mlp": 1.02450657, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.5810196737291642, + "language_loss": 0.81700504, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83821338, + "num_input_tokens_seen": 173252175, + "step": 8058, + "time_per_iteration": 2.546215534210205 + }, + { + "auxiliary_loss_clip": 0.01108649, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.04285967, + "balance_loss_mlp": 1.02248764, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.5883840125623667, + "language_loss": 0.79572439, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.81717813, + "num_input_tokens_seen": 173268790, + "step": 8059, + "time_per_iteration": 2.5413990020751953 + }, + { + "auxiliary_loss_clip": 0.01126615, + "auxiliary_loss_mlp": 0.01039704, + "balance_loss_clip": 1.04565454, + "balance_loss_mlp": 1.02492261, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 2.072628164023448, + "language_loss": 0.66365272, + "learning_rate": 2.196555093055352e-06, + "loss": 0.68531591, + "num_input_tokens_seen": 173288030, + "step": 8060, + "time_per_iteration": 2.6061532497406006 + }, + { + "auxiliary_loss_clip": 0.01109042, + "auxiliary_loss_mlp": 0.01037398, + "balance_loss_clip": 1.04587615, + "balance_loss_mlp": 1.02321172, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 1.7953507557679207, + "language_loss": 0.67175353, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69321799, + "num_input_tokens_seen": 173305965, + "step": 8061, + "time_per_iteration": 2.538194417953491 + }, + { + "auxiliary_loss_clip": 0.01104356, + "auxiliary_loss_mlp": 0.01040843, + "balance_loss_clip": 1.04501653, + "balance_loss_mlp": 1.02531016, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 1.9579794918130407, + "language_loss": 0.82165825, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.8431102, + "num_input_tokens_seen": 173321985, + "step": 8062, + "time_per_iteration": 2.5257492065429688 + }, + { + "auxiliary_loss_clip": 0.0106197, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.04705429, + "balance_loss_mlp": 1.0229857, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.5645273839193174, + "language_loss": 0.74499762, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76598209, + "num_input_tokens_seen": 173341315, + "step": 8063, + "time_per_iteration": 2.65763783454895 + }, + { + "auxiliary_loss_clip": 0.01098927, + "auxiliary_loss_mlp": 0.01033278, + "balance_loss_clip": 1.04450142, + "balance_loss_mlp": 1.01922882, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 1.6381294335013055, + "language_loss": 0.79016471, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.81148672, + "num_input_tokens_seen": 173361055, + "step": 8064, + "time_per_iteration": 2.6216468811035156 + }, + { + "auxiliary_loss_clip": 0.01117728, + "auxiliary_loss_mlp": 0.00791995, + "balance_loss_clip": 1.04405642, + "balance_loss_mlp": 1.02030241, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.7414379390169912, + "language_loss": 0.79106331, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81016052, + "num_input_tokens_seen": 173379255, + "step": 8065, + "time_per_iteration": 2.495110511779785 + }, + { + "auxiliary_loss_clip": 0.0110174, + "auxiliary_loss_mlp": 0.00789491, + "balance_loss_clip": 1.04047585, + "balance_loss_mlp": 1.01603293, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 1.536726861859149, + "language_loss": 0.75938404, + "learning_rate": 2.194229501534644e-06, + "loss": 0.77829635, + "num_input_tokens_seen": 173398370, + "step": 8066, + "time_per_iteration": 2.5116422176361084 + }, + { + "auxiliary_loss_clip": 0.01119907, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.0449667, + "balance_loss_mlp": 1.02063131, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.349258332726653, + "language_loss": 0.72234511, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74387914, + "num_input_tokens_seen": 173419595, + "step": 8067, + "time_per_iteration": 2.5332400798797607 + }, + { + "auxiliary_loss_clip": 0.0105807, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.05013061, + "balance_loss_mlp": 1.01884842, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 2.0045816852838754, + "language_loss": 0.78928649, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.81018949, + "num_input_tokens_seen": 173435390, + "step": 8068, + "time_per_iteration": 2.6218581199645996 + }, + { + "auxiliary_loss_clip": 0.01091965, + "auxiliary_loss_mlp": 0.01032422, + "balance_loss_clip": 1.04138136, + "balance_loss_mlp": 1.02005458, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.4168946530767803, + "language_loss": 0.84502017, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86626399, + "num_input_tokens_seen": 173454095, + "step": 8069, + "time_per_iteration": 3.9706478118896484 + }, + { + "auxiliary_loss_clip": 0.01087441, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.04649746, + "balance_loss_mlp": 1.01798725, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.7905385944213423, + "language_loss": 0.78190672, + "learning_rate": 2.192678959687493e-06, + "loss": 0.80308747, + "num_input_tokens_seen": 173475300, + "step": 8070, + "time_per_iteration": 2.607417345046997 + }, + { + "auxiliary_loss_clip": 0.01062934, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.04338765, + "balance_loss_mlp": 1.01821125, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 1.9883996317401373, + "language_loss": 0.77856141, + "learning_rate": 2.192291305922943e-06, + "loss": 0.7995103, + "num_input_tokens_seen": 173492005, + "step": 8071, + "time_per_iteration": 3.954204559326172 + }, + { + "auxiliary_loss_clip": 0.01062736, + "auxiliary_loss_mlp": 0.01031877, + "balance_loss_clip": 1.0433073, + "balance_loss_mlp": 1.01823306, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 2.047495110874624, + "language_loss": 0.71805716, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.73900324, + "num_input_tokens_seen": 173511995, + "step": 8072, + "time_per_iteration": 2.6705756187438965 + }, + { + "auxiliary_loss_clip": 0.01083582, + "auxiliary_loss_mlp": 0.01037228, + "balance_loss_clip": 1.04679, + "balance_loss_mlp": 1.0232383, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 2.9145693944352784, + "language_loss": 0.87731016, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.8985182, + "num_input_tokens_seen": 173530215, + "step": 8073, + "time_per_iteration": 2.5898513793945312 + }, + { + "auxiliary_loss_clip": 0.01080516, + "auxiliary_loss_mlp": 0.01037457, + "balance_loss_clip": 1.04322577, + "balance_loss_mlp": 1.02283621, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.741053796057841, + "language_loss": 0.60822344, + "learning_rate": 2.19112830093786e-06, + "loss": 0.62940317, + "num_input_tokens_seen": 173550920, + "step": 8074, + "time_per_iteration": 4.005762815475464 + }, + { + "auxiliary_loss_clip": 0.01084992, + "auxiliary_loss_mlp": 0.0078769, + "balance_loss_clip": 1.04244399, + "balance_loss_mlp": 1.00885046, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.5644213884341898, + "language_loss": 0.7304672, + "learning_rate": 2.19074061809469e-06, + "loss": 0.74919403, + "num_input_tokens_seen": 173569065, + "step": 8075, + "time_per_iteration": 2.5772905349731445 + }, + { + "auxiliary_loss_clip": 0.01115605, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.04364038, + "balance_loss_mlp": 1.02027035, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.513104823085422, + "language_loss": 0.81688601, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.83836901, + "num_input_tokens_seen": 173596085, + "step": 8076, + "time_per_iteration": 4.3069748878479 + }, + { + "auxiliary_loss_clip": 0.01100896, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_clip": 1.04442561, + "balance_loss_mlp": 1.01799345, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 1.7976956511031954, + "language_loss": 0.86342371, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88476384, + "num_input_tokens_seen": 173613900, + "step": 8077, + "time_per_iteration": 2.5475127696990967 + }, + { + "auxiliary_loss_clip": 0.0102125, + "auxiliary_loss_mlp": 0.01001125, + "balance_loss_clip": 1.0251441, + "balance_loss_mlp": 0.99969465, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.9110556763079507, + "language_loss": 0.58458692, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60481066, + "num_input_tokens_seen": 173671305, + "step": 8078, + "time_per_iteration": 3.1366753578186035 + }, + { + "auxiliary_loss_clip": 0.01123985, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.04551184, + "balance_loss_mlp": 1.01918459, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 1.5690117770070544, + "language_loss": 0.72231007, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.74386948, + "num_input_tokens_seen": 173692070, + "step": 8079, + "time_per_iteration": 2.6014459133148193 + }, + { + "auxiliary_loss_clip": 0.01085168, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.04521048, + "balance_loss_mlp": 1.02164292, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 3.0704592043172254, + "language_loss": 0.79344553, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81464761, + "num_input_tokens_seen": 173709785, + "step": 8080, + "time_per_iteration": 2.55288028717041 + }, + { + "auxiliary_loss_clip": 0.01096101, + "auxiliary_loss_mlp": 0.01035649, + "balance_loss_clip": 1.04326391, + "balance_loss_mlp": 1.02251792, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 1.930115923571366, + "language_loss": 0.83935159, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86066908, + "num_input_tokens_seen": 173728770, + "step": 8081, + "time_per_iteration": 2.5439059734344482 + }, + { + "auxiliary_loss_clip": 0.01106556, + "auxiliary_loss_mlp": 0.0103689, + "balance_loss_clip": 1.04207873, + "balance_loss_mlp": 1.02217984, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.6142836334549737, + "language_loss": 0.83102417, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85245866, + "num_input_tokens_seen": 173747355, + "step": 8082, + "time_per_iteration": 2.5086960792541504 + }, + { + "auxiliary_loss_clip": 0.01099711, + "auxiliary_loss_mlp": 0.01036217, + "balance_loss_clip": 1.04653335, + "balance_loss_mlp": 1.02353919, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 1.9256235275155948, + "language_loss": 0.87131095, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89267021, + "num_input_tokens_seen": 173764825, + "step": 8083, + "time_per_iteration": 2.5098512172698975 + }, + { + "auxiliary_loss_clip": 0.01073338, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.04475081, + "balance_loss_mlp": 1.0281229, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.5835787809866984, + "language_loss": 0.80791634, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.82905388, + "num_input_tokens_seen": 173783215, + "step": 8084, + "time_per_iteration": 2.5649662017822266 + }, + { + "auxiliary_loss_clip": 0.01109753, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.04381752, + "balance_loss_mlp": 1.02455926, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 2.262186526282106, + "language_loss": 0.68337119, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70484823, + "num_input_tokens_seen": 173801905, + "step": 8085, + "time_per_iteration": 2.524003505706787 + }, + { + "auxiliary_loss_clip": 0.01109734, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.04473948, + "balance_loss_mlp": 1.02256417, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.5424836260046138, + "language_loss": 0.77530861, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79676211, + "num_input_tokens_seen": 173824690, + "step": 8086, + "time_per_iteration": 2.5818378925323486 + }, + { + "auxiliary_loss_clip": 0.01119708, + "auxiliary_loss_mlp": 0.01029038, + "balance_loss_clip": 1.04351377, + "balance_loss_mlp": 1.01591873, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 2.1830885465321646, + "language_loss": 0.69795281, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.71944022, + "num_input_tokens_seen": 173844450, + "step": 8087, + "time_per_iteration": 2.593947649002075 + }, + { + "auxiliary_loss_clip": 0.01112712, + "auxiliary_loss_mlp": 0.0103913, + "balance_loss_clip": 1.04513669, + "balance_loss_mlp": 1.02510524, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.7765760825699368, + "language_loss": 0.72726607, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.74878454, + "num_input_tokens_seen": 173864975, + "step": 8088, + "time_per_iteration": 2.598118782043457 + }, + { + "auxiliary_loss_clip": 0.01095207, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.03989434, + "balance_loss_mlp": 1.02282536, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.47348351548496, + "language_loss": 0.75322795, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77453929, + "num_input_tokens_seen": 173883805, + "step": 8089, + "time_per_iteration": 2.5351123809814453 + }, + { + "auxiliary_loss_clip": 0.01089177, + "auxiliary_loss_mlp": 0.0103385, + "balance_loss_clip": 1.0440346, + "balance_loss_mlp": 1.02014065, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 2.093287653790774, + "language_loss": 0.84154546, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86277574, + "num_input_tokens_seen": 173903520, + "step": 8090, + "time_per_iteration": 2.5421924591064453 + }, + { + "auxiliary_loss_clip": 0.01116458, + "auxiliary_loss_mlp": 0.01029755, + "balance_loss_clip": 1.0429616, + "balance_loss_mlp": 1.0172019, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.5122913225045533, + "language_loss": 0.76171499, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78317714, + "num_input_tokens_seen": 173924255, + "step": 8091, + "time_per_iteration": 2.5014290809631348 + }, + { + "auxiliary_loss_clip": 0.01111727, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.04444051, + "balance_loss_mlp": 1.01543045, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.5154144875098614, + "language_loss": 0.80394107, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82534522, + "num_input_tokens_seen": 173943285, + "step": 8092, + "time_per_iteration": 2.5358846187591553 + }, + { + "auxiliary_loss_clip": 0.01096227, + "auxiliary_loss_mlp": 0.00793595, + "balance_loss_clip": 1.04417777, + "balance_loss_mlp": 1.01579905, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.553319426938984, + "language_loss": 0.72031498, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73921317, + "num_input_tokens_seen": 173962205, + "step": 8093, + "time_per_iteration": 2.5700132846832275 + }, + { + "auxiliary_loss_clip": 0.01119111, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.04392397, + "balance_loss_mlp": 1.02181208, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 1.892545253406927, + "language_loss": 0.67960298, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.7011354, + "num_input_tokens_seen": 173980945, + "step": 8094, + "time_per_iteration": 2.4802374839782715 + }, + { + "auxiliary_loss_clip": 0.01104176, + "auxiliary_loss_mlp": 0.01038143, + "balance_loss_clip": 1.04653108, + "balance_loss_mlp": 1.02432108, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 2.2632429911363126, + "language_loss": 0.66322744, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.68465066, + "num_input_tokens_seen": 173998860, + "step": 8095, + "time_per_iteration": 2.4903366565704346 + }, + { + "auxiliary_loss_clip": 0.01104543, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.0433352, + "balance_loss_mlp": 1.02109122, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 2.0457537628089195, + "language_loss": 0.78377211, + "learning_rate": 2.182597630229345e-06, + "loss": 0.80517018, + "num_input_tokens_seen": 174016665, + "step": 8096, + "time_per_iteration": 2.4708750247955322 + }, + { + "auxiliary_loss_clip": 0.01093148, + "auxiliary_loss_mlp": 0.01036048, + "balance_loss_clip": 1.0427959, + "balance_loss_mlp": 1.02232671, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 1.7913719425930572, + "language_loss": 0.67421496, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69550693, + "num_input_tokens_seen": 174034800, + "step": 8097, + "time_per_iteration": 2.5455687046051025 + }, + { + "auxiliary_loss_clip": 0.01092317, + "auxiliary_loss_mlp": 0.01037859, + "balance_loss_clip": 1.04289055, + "balance_loss_mlp": 1.02429307, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.4461231272134294, + "language_loss": 0.71358031, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73488206, + "num_input_tokens_seen": 174054445, + "step": 8098, + "time_per_iteration": 2.5258939266204834 + }, + { + "auxiliary_loss_clip": 0.01114057, + "auxiliary_loss_mlp": 0.01034923, + "balance_loss_clip": 1.04391003, + "balance_loss_mlp": 1.02085078, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 2.071861821066187, + "language_loss": 0.66022909, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68171883, + "num_input_tokens_seen": 174077890, + "step": 8099, + "time_per_iteration": 2.670231342315674 + }, + { + "auxiliary_loss_clip": 0.01069077, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.04140329, + "balance_loss_mlp": 1.02392316, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.9499664476428467, + "language_loss": 0.66903949, + "learning_rate": 2.181046234549138e-06, + "loss": 0.69010168, + "num_input_tokens_seen": 174097460, + "step": 8100, + "time_per_iteration": 2.620561122894287 + }, + { + "auxiliary_loss_clip": 0.01083296, + "auxiliary_loss_mlp": 0.01031217, + "balance_loss_clip": 1.04127455, + "balance_loss_mlp": 1.01884365, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.3640007621508292, + "language_loss": 0.76923907, + "learning_rate": 2.180658368429088e-06, + "loss": 0.79038417, + "num_input_tokens_seen": 174120775, + "step": 8101, + "time_per_iteration": 2.6080002784729004 + }, + { + "auxiliary_loss_clip": 0.01043951, + "auxiliary_loss_mlp": 0.01004653, + "balance_loss_clip": 1.018538, + "balance_loss_mlp": 1.00310373, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6753748363785065, + "language_loss": 0.52264321, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54312921, + "num_input_tokens_seen": 174189135, + "step": 8102, + "time_per_iteration": 3.209597110748291 + }, + { + "auxiliary_loss_clip": 0.01094413, + "auxiliary_loss_mlp": 0.01029807, + "balance_loss_clip": 1.04813147, + "balance_loss_mlp": 1.01699769, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 2.129419546935938, + "language_loss": 0.73577547, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.75701767, + "num_input_tokens_seen": 174203250, + "step": 8103, + "time_per_iteration": 2.4841301441192627 + }, + { + "auxiliary_loss_clip": 0.01112477, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.04553795, + "balance_loss_mlp": 1.02323341, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 1.6432199233469378, + "language_loss": 0.63237584, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.65387309, + "num_input_tokens_seen": 174224145, + "step": 8104, + "time_per_iteration": 2.5205342769622803 + }, + { + "auxiliary_loss_clip": 0.01119659, + "auxiliary_loss_mlp": 0.01028788, + "balance_loss_clip": 1.0446806, + "balance_loss_mlp": 1.01557326, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 1.5259325128706405, + "language_loss": 0.69243836, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71392286, + "num_input_tokens_seen": 174244435, + "step": 8105, + "time_per_iteration": 2.5404717922210693 + }, + { + "auxiliary_loss_clip": 0.01084546, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.04322159, + "balance_loss_mlp": 1.01571774, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.6274532734657456, + "language_loss": 0.73808122, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75920779, + "num_input_tokens_seen": 174262710, + "step": 8106, + "time_per_iteration": 2.530871868133545 + }, + { + "auxiliary_loss_clip": 0.01101996, + "auxiliary_loss_mlp": 0.00791149, + "balance_loss_clip": 1.04640186, + "balance_loss_mlp": 1.01616895, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 2.2555824197212986, + "language_loss": 0.76555526, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.78448677, + "num_input_tokens_seen": 174281545, + "step": 8107, + "time_per_iteration": 3.9273533821105957 + }, + { + "auxiliary_loss_clip": 0.01069989, + "auxiliary_loss_mlp": 0.01027448, + "balance_loss_clip": 1.05000997, + "balance_loss_mlp": 1.01488936, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 1.5323598903890174, + "language_loss": 0.74862814, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.76960254, + "num_input_tokens_seen": 174300290, + "step": 8108, + "time_per_iteration": 2.6327316761016846 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01027144, + "balance_loss_clip": 1.0425694, + "balance_loss_mlp": 1.01601624, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 2.032787600270871, + "language_loss": 0.73974454, + "learning_rate": 2.177555194083212e-06, + "loss": 0.76106501, + "num_input_tokens_seen": 174318490, + "step": 8109, + "time_per_iteration": 3.8498401641845703 + }, + { + "auxiliary_loss_clip": 0.01104728, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.04164934, + "balance_loss_mlp": 1.01967561, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 2.173245237349726, + "language_loss": 0.78550184, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80687958, + "num_input_tokens_seen": 174335505, + "step": 8110, + "time_per_iteration": 2.4934215545654297 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01035965, + "balance_loss_clip": 1.04535818, + "balance_loss_mlp": 1.02299523, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 1.8832152702783895, + "language_loss": 0.72686356, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74830091, + "num_input_tokens_seen": 174353990, + "step": 8111, + "time_per_iteration": 2.4821102619171143 + }, + { + "auxiliary_loss_clip": 0.01106484, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.04365933, + "balance_loss_mlp": 1.02100897, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.5198546934688855, + "language_loss": 0.7613883, + "learning_rate": 2.17639139220597e-06, + "loss": 0.78279299, + "num_input_tokens_seen": 174373425, + "step": 8112, + "time_per_iteration": 3.9488158226013184 + }, + { + "auxiliary_loss_clip": 0.01110645, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.04330277, + "balance_loss_mlp": 1.02374995, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 1.471544131966931, + "language_loss": 0.75022453, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77169955, + "num_input_tokens_seen": 174393070, + "step": 8113, + "time_per_iteration": 2.527771472930908 + }, + { + "auxiliary_loss_clip": 0.01028129, + "auxiliary_loss_mlp": 0.0076805, + "balance_loss_clip": 1.02232218, + "balance_loss_mlp": 1.00289619, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.7765191612344156, + "language_loss": 0.4885354, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.5064972, + "num_input_tokens_seen": 174446880, + "step": 8114, + "time_per_iteration": 3.025965690612793 + }, + { + "auxiliary_loss_clip": 0.01093277, + "auxiliary_loss_mlp": 0.01038653, + "balance_loss_clip": 1.04246044, + "balance_loss_mlp": 1.02429974, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.3282334923925103, + "language_loss": 0.76622808, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78754735, + "num_input_tokens_seen": 174468485, + "step": 8115, + "time_per_iteration": 4.069878339767456 + }, + { + "auxiliary_loss_clip": 0.01101799, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.04689503, + "balance_loss_mlp": 1.02009273, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 2.1200828757190924, + "language_loss": 0.72356194, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74491799, + "num_input_tokens_seen": 174486360, + "step": 8116, + "time_per_iteration": 2.529463291168213 + }, + { + "auxiliary_loss_clip": 0.01083968, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.04188156, + "balance_loss_mlp": 1.01946068, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 1.6563634175350912, + "language_loss": 0.62728024, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.64844251, + "num_input_tokens_seen": 174505075, + "step": 8117, + "time_per_iteration": 2.5554823875427246 + }, + { + "auxiliary_loss_clip": 0.0108886, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.04035211, + "balance_loss_mlp": 1.02263713, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.764715300942369, + "language_loss": 0.79514009, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81640053, + "num_input_tokens_seen": 174523385, + "step": 8118, + "time_per_iteration": 2.5467748641967773 + }, + { + "auxiliary_loss_clip": 0.0109593, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.04235959, + "balance_loss_mlp": 1.01958525, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 1.605964734107816, + "language_loss": 0.63049293, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65177745, + "num_input_tokens_seen": 174542200, + "step": 8119, + "time_per_iteration": 2.5297837257385254 + }, + { + "auxiliary_loss_clip": 0.01064031, + "auxiliary_loss_mlp": 0.00788711, + "balance_loss_clip": 1.04449069, + "balance_loss_mlp": 1.01524878, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.8106293666514048, + "language_loss": 0.72157699, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74010444, + "num_input_tokens_seen": 174563620, + "step": 8120, + "time_per_iteration": 2.6518826484680176 + }, + { + "auxiliary_loss_clip": 0.01108767, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.04120183, + "balance_loss_mlp": 1.01704693, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 1.9834580113204097, + "language_loss": 0.63527429, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.65666813, + "num_input_tokens_seen": 174586465, + "step": 8121, + "time_per_iteration": 2.604771852493286 + }, + { + "auxiliary_loss_clip": 0.01107259, + "auxiliary_loss_mlp": 0.01038214, + "balance_loss_clip": 1.04153943, + "balance_loss_mlp": 1.02393293, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 2.049899536914779, + "language_loss": 0.82906401, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.8505187, + "num_input_tokens_seen": 174604035, + "step": 8122, + "time_per_iteration": 2.5064961910247803 + }, + { + "auxiliary_loss_clip": 0.01108529, + "auxiliary_loss_mlp": 0.01042636, + "balance_loss_clip": 1.04306912, + "balance_loss_mlp": 1.02747846, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 1.685721105364919, + "language_loss": 0.84874392, + "learning_rate": 2.172123606640866e-06, + "loss": 0.87025559, + "num_input_tokens_seen": 174621715, + "step": 8123, + "time_per_iteration": 2.472954273223877 + }, + { + "auxiliary_loss_clip": 0.01083099, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_clip": 1.04154336, + "balance_loss_mlp": 1.01613355, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 1.4409330014640631, + "language_loss": 0.85731828, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87843752, + "num_input_tokens_seen": 174643835, + "step": 8124, + "time_per_iteration": 2.589334011077881 + }, + { + "auxiliary_loss_clip": 0.010965, + "auxiliary_loss_mlp": 0.01033152, + "balance_loss_clip": 1.04508674, + "balance_loss_mlp": 1.02003312, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 1.8014661625210506, + "language_loss": 0.7934413, + "learning_rate": 2.171347560204948e-06, + "loss": 0.8147378, + "num_input_tokens_seen": 174660955, + "step": 8125, + "time_per_iteration": 2.517944574356079 + }, + { + "auxiliary_loss_clip": 0.01073206, + "auxiliary_loss_mlp": 0.01038533, + "balance_loss_clip": 1.04199886, + "balance_loss_mlp": 1.024562, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 1.8015891234723764, + "language_loss": 0.72931683, + "learning_rate": 2.170959527233356e-06, + "loss": 0.75043416, + "num_input_tokens_seen": 174678270, + "step": 8126, + "time_per_iteration": 2.5601820945739746 + }, + { + "auxiliary_loss_clip": 0.01106437, + "auxiliary_loss_mlp": 0.01035455, + "balance_loss_clip": 1.04032207, + "balance_loss_mlp": 1.02227688, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 1.6226084213284804, + "language_loss": 0.68716657, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.70858544, + "num_input_tokens_seen": 174698360, + "step": 8127, + "time_per_iteration": 2.603609323501587 + }, + { + "auxiliary_loss_clip": 0.01119016, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.04029047, + "balance_loss_mlp": 1.02217019, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 1.8006941149214921, + "language_loss": 0.75913417, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78068203, + "num_input_tokens_seen": 174716755, + "step": 8128, + "time_per_iteration": 2.4768261909484863 + }, + { + "auxiliary_loss_clip": 0.01120039, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.04355311, + "balance_loss_mlp": 1.01926458, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.5527896005823756, + "language_loss": 0.7595433, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78106058, + "num_input_tokens_seen": 174735560, + "step": 8129, + "time_per_iteration": 2.4578840732574463 + }, + { + "auxiliary_loss_clip": 0.01106915, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.04175377, + "balance_loss_mlp": 1.0185225, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 2.1324097630163967, + "language_loss": 0.64730191, + "learning_rate": 2.169407330666114e-06, + "loss": 0.66869259, + "num_input_tokens_seen": 174752730, + "step": 8130, + "time_per_iteration": 2.4855897426605225 + }, + { + "auxiliary_loss_clip": 0.01080492, + "auxiliary_loss_mlp": 0.01026184, + "balance_loss_clip": 1.03758669, + "balance_loss_mlp": 1.01320231, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 2.131177809456845, + "language_loss": 0.72187507, + "learning_rate": 2.169019265427658e-06, + "loss": 0.7429418, + "num_input_tokens_seen": 174772520, + "step": 8131, + "time_per_iteration": 2.581220865249634 + }, + { + "auxiliary_loss_clip": 0.01110038, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.04270315, + "balance_loss_mlp": 1.02092242, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.362130759186972, + "language_loss": 0.69617534, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.71762049, + "num_input_tokens_seen": 174796540, + "step": 8132, + "time_per_iteration": 2.67264723777771 + }, + { + "auxiliary_loss_clip": 0.01110478, + "auxiliary_loss_mlp": 0.0102957, + "balance_loss_clip": 1.04317725, + "balance_loss_mlp": 1.01626658, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.3418582900106162, + "language_loss": 0.70487022, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72627068, + "num_input_tokens_seen": 174817840, + "step": 8133, + "time_per_iteration": 2.5461366176605225 + }, + { + "auxiliary_loss_clip": 0.01064592, + "auxiliary_loss_mlp": 0.01040548, + "balance_loss_clip": 1.04325628, + "balance_loss_mlp": 1.02706587, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 2.001697220511099, + "language_loss": 0.7036562, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.7247076, + "num_input_tokens_seen": 174837885, + "step": 8134, + "time_per_iteration": 2.6415648460388184 + }, + { + "auxiliary_loss_clip": 0.01079893, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.04276013, + "balance_loss_mlp": 1.01799083, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 1.9612867255745032, + "language_loss": 0.80016589, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82128435, + "num_input_tokens_seen": 174855240, + "step": 8135, + "time_per_iteration": 2.6160531044006348 + }, + { + "auxiliary_loss_clip": 0.01116213, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.04117036, + "balance_loss_mlp": 1.02286959, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.5139315342196416, + "language_loss": 0.74057674, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.76208436, + "num_input_tokens_seen": 174875145, + "step": 8136, + "time_per_iteration": 2.5476903915405273 + }, + { + "auxiliary_loss_clip": 0.01093483, + "auxiliary_loss_mlp": 0.01038322, + "balance_loss_clip": 1.04297185, + "balance_loss_mlp": 1.02464867, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 1.9199294323693348, + "language_loss": 0.73642403, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75774205, + "num_input_tokens_seen": 174894770, + "step": 8137, + "time_per_iteration": 2.583580255508423 + }, + { + "auxiliary_loss_clip": 0.01048045, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.03976607, + "balance_loss_mlp": 1.01931643, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 4.192472822841957, + "language_loss": 0.75560141, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.77640939, + "num_input_tokens_seen": 174912780, + "step": 8138, + "time_per_iteration": 2.6532206535339355 + }, + { + "auxiliary_loss_clip": 0.01085305, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.04516327, + "balance_loss_mlp": 1.01955163, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.7596590423628173, + "language_loss": 0.74386334, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76503599, + "num_input_tokens_seen": 174931250, + "step": 8139, + "time_per_iteration": 2.586831569671631 + }, + { + "auxiliary_loss_clip": 0.01109783, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.04308248, + "balance_loss_mlp": 1.02063096, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 1.7945618229847682, + "language_loss": 0.62678093, + "learning_rate": 2.165526391632255e-06, + "loss": 0.648206, + "num_input_tokens_seen": 174951105, + "step": 8140, + "time_per_iteration": 2.5224621295928955 + }, + { + "auxiliary_loss_clip": 0.01085635, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.04068983, + "balance_loss_mlp": 1.02271128, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 5.436897034256438, + "language_loss": 0.82428455, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84550756, + "num_input_tokens_seen": 174969120, + "step": 8141, + "time_per_iteration": 2.563535213470459 + }, + { + "auxiliary_loss_clip": 0.0109143, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.04844356, + "balance_loss_mlp": 1.0198276, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.5747471227180858, + "language_loss": 0.72389442, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74514329, + "num_input_tokens_seen": 174991295, + "step": 8142, + "time_per_iteration": 2.699516773223877 + }, + { + "auxiliary_loss_clip": 0.01116483, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.0416038, + "balance_loss_mlp": 1.02266526, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 1.6841293516879212, + "language_loss": 0.6709801, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.69249606, + "num_input_tokens_seen": 175012830, + "step": 8143, + "time_per_iteration": 2.559818983078003 + }, + { + "auxiliary_loss_clip": 0.01107251, + "auxiliary_loss_mlp": 0.00788635, + "balance_loss_clip": 1.04220676, + "balance_loss_mlp": 1.01701093, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.5193816723872193, + "language_loss": 0.74939322, + "learning_rate": 2.163973839444793e-06, + "loss": 0.76835209, + "num_input_tokens_seen": 175035695, + "step": 8144, + "time_per_iteration": 2.669614791870117 + }, + { + "auxiliary_loss_clip": 0.01094989, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.04217386, + "balance_loss_mlp": 1.02027845, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 1.363853225907896, + "language_loss": 0.75677049, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.77805382, + "num_input_tokens_seen": 175056425, + "step": 8145, + "time_per_iteration": 2.5973618030548096 + }, + { + "auxiliary_loss_clip": 0.01099817, + "auxiliary_loss_mlp": 0.00788891, + "balance_loss_clip": 1.041857, + "balance_loss_mlp": 1.01273489, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.8168382473016025, + "language_loss": 0.8061372, + "learning_rate": 2.163197525984761e-06, + "loss": 0.82502425, + "num_input_tokens_seen": 175074800, + "step": 8146, + "time_per_iteration": 3.8753280639648438 + }, + { + "auxiliary_loss_clip": 0.01102648, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.03956544, + "balance_loss_mlp": 1.01799059, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 7.772188255698242, + "language_loss": 0.74265862, + "learning_rate": 2.162809359964687e-06, + "loss": 0.7639932, + "num_input_tokens_seen": 175094500, + "step": 8147, + "time_per_iteration": 2.5145633220672607 + }, + { + "auxiliary_loss_clip": 0.01097369, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.04662931, + "balance_loss_mlp": 1.01835382, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 2.027271021788659, + "language_loss": 0.82957882, + "learning_rate": 2.162421187770864e-06, + "loss": 0.8508687, + "num_input_tokens_seen": 175112920, + "step": 8148, + "time_per_iteration": 3.86165714263916 + }, + { + "auxiliary_loss_clip": 0.01084006, + "auxiliary_loss_mlp": 0.01030095, + "balance_loss_clip": 1.04030752, + "balance_loss_mlp": 1.01868081, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 1.706604264247751, + "language_loss": 0.73782754, + "learning_rate": 2.162033009418015e-06, + "loss": 0.75896847, + "num_input_tokens_seen": 175129910, + "step": 8149, + "time_per_iteration": 2.5396735668182373 + }, + { + "auxiliary_loss_clip": 0.0112385, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.04451156, + "balance_loss_mlp": 1.01786327, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 1.7595967727109412, + "language_loss": 0.75966281, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.78122067, + "num_input_tokens_seen": 175148705, + "step": 8150, + "time_per_iteration": 2.5234949588775635 + }, + { + "auxiliary_loss_clip": 0.01097136, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.04675341, + "balance_loss_mlp": 1.01717138, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 1.8445976789335938, + "language_loss": 0.72935802, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.75063312, + "num_input_tokens_seen": 175167425, + "step": 8151, + "time_per_iteration": 4.036783456802368 + }, + { + "auxiliary_loss_clip": 0.01017781, + "auxiliary_loss_mlp": 0.01011436, + "balance_loss_clip": 1.02117801, + "balance_loss_mlp": 1.01014888, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8319905186555866, + "language_loss": 0.54320681, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56349897, + "num_input_tokens_seen": 175227985, + "step": 8152, + "time_per_iteration": 3.118217945098877 + }, + { + "auxiliary_loss_clip": 0.01070979, + "auxiliary_loss_mlp": 0.01034556, + "balance_loss_clip": 1.04408002, + "balance_loss_mlp": 1.02143121, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.8257337771520936, + "language_loss": 0.60662997, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.62768531, + "num_input_tokens_seen": 175251895, + "step": 8153, + "time_per_iteration": 4.287131309509277 + }, + { + "auxiliary_loss_clip": 0.01084782, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.04303563, + "balance_loss_mlp": 1.0217191, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.5351365461960136, + "language_loss": 0.77033615, + "learning_rate": 2.160092025783549e-06, + "loss": 0.79153478, + "num_input_tokens_seen": 175272770, + "step": 8154, + "time_per_iteration": 2.6193149089813232 + }, + { + "auxiliary_loss_clip": 0.01024639, + "auxiliary_loss_mlp": 0.01002443, + "balance_loss_clip": 1.01804638, + "balance_loss_mlp": 1.0010184, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9862919204668381, + "language_loss": 0.67072666, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69099748, + "num_input_tokens_seen": 175336320, + "step": 8155, + "time_per_iteration": 3.2242379188537598 + }, + { + "auxiliary_loss_clip": 0.01119356, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.04346871, + "balance_loss_mlp": 1.01888502, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 1.9189194030993046, + "language_loss": 0.76798302, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.7894913, + "num_input_tokens_seen": 175353540, + "step": 8156, + "time_per_iteration": 2.465203285217285 + }, + { + "auxiliary_loss_clip": 0.01108051, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_clip": 1.04309487, + "balance_loss_mlp": 1.02356136, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.1078381562573805, + "language_loss": 0.8385272, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85996974, + "num_input_tokens_seen": 175370445, + "step": 8157, + "time_per_iteration": 2.5290350914001465 + }, + { + "auxiliary_loss_clip": 0.01109237, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.04240417, + "balance_loss_mlp": 1.02135527, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.8783022424843117, + "language_loss": 0.79737127, + "learning_rate": 2.158539129514956e-06, + "loss": 0.81880927, + "num_input_tokens_seen": 175389020, + "step": 8158, + "time_per_iteration": 2.4776599407196045 + }, + { + "auxiliary_loss_clip": 0.01121501, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.04308701, + "balance_loss_mlp": 1.0186348, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 1.5501797550037004, + "language_loss": 0.69289935, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71443331, + "num_input_tokens_seen": 175409545, + "step": 8159, + "time_per_iteration": 2.531996250152588 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01041428, + "balance_loss_clip": 1.04014874, + "balance_loss_mlp": 1.0269022, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 2.4179240226182435, + "language_loss": 0.73228616, + "learning_rate": 2.157762645250854e-06, + "loss": 0.75371695, + "num_input_tokens_seen": 175429335, + "step": 8160, + "time_per_iteration": 2.488171339035034 + }, + { + "auxiliary_loss_clip": 0.01106691, + "auxiliary_loss_mlp": 0.01041741, + "balance_loss_clip": 1.04269314, + "balance_loss_mlp": 1.02822268, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 2.0219950610111503, + "language_loss": 0.71496272, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.7364471, + "num_input_tokens_seen": 175446955, + "step": 8161, + "time_per_iteration": 2.4966702461242676 + }, + { + "auxiliary_loss_clip": 0.01074796, + "auxiliary_loss_mlp": 0.01040495, + "balance_loss_clip": 1.04266167, + "balance_loss_mlp": 1.02760291, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 2.349048493282997, + "language_loss": 0.68559825, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70675123, + "num_input_tokens_seen": 175468195, + "step": 8162, + "time_per_iteration": 2.6270155906677246 + }, + { + "auxiliary_loss_clip": 0.01107748, + "auxiliary_loss_mlp": 0.01037921, + "balance_loss_clip": 1.04360032, + "balance_loss_mlp": 1.02327013, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 2.485934092439231, + "language_loss": 0.63295412, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65441084, + "num_input_tokens_seen": 175487455, + "step": 8163, + "time_per_iteration": 2.526893138885498 + }, + { + "auxiliary_loss_clip": 0.01082337, + "auxiliary_loss_mlp": 0.01035008, + "balance_loss_clip": 1.04263115, + "balance_loss_mlp": 1.0220263, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 2.1156573574833026, + "language_loss": 0.77520347, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.79637688, + "num_input_tokens_seen": 175504450, + "step": 8164, + "time_per_iteration": 2.5376946926116943 + }, + { + "auxiliary_loss_clip": 0.01102297, + "auxiliary_loss_mlp": 0.01037919, + "balance_loss_clip": 1.03904188, + "balance_loss_mlp": 1.02298248, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.5613918788906072, + "language_loss": 0.76597595, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78737807, + "num_input_tokens_seen": 175523600, + "step": 8165, + "time_per_iteration": 2.502760171890259 + }, + { + "auxiliary_loss_clip": 0.01098356, + "auxiliary_loss_mlp": 0.01033258, + "balance_loss_clip": 1.04367399, + "balance_loss_mlp": 1.02036572, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.5894416628421937, + "language_loss": 0.7737875, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79510367, + "num_input_tokens_seen": 175542720, + "step": 8166, + "time_per_iteration": 2.5327980518341064 + }, + { + "auxiliary_loss_clip": 0.01035128, + "auxiliary_loss_mlp": 0.01001411, + "balance_loss_clip": 1.01836753, + "balance_loss_mlp": 0.99993914, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.8073798051043309, + "language_loss": 0.5421083, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56247377, + "num_input_tokens_seen": 175598640, + "step": 8167, + "time_per_iteration": 3.1865673065185547 + }, + { + "auxiliary_loss_clip": 0.01072112, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.03988433, + "balance_loss_mlp": 1.01947868, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 2.083874731765354, + "language_loss": 0.86155379, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.8825978, + "num_input_tokens_seen": 175615675, + "step": 8168, + "time_per_iteration": 2.554624080657959 + }, + { + "auxiliary_loss_clip": 0.01099465, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.03933775, + "balance_loss_mlp": 1.02197862, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.6722816474304885, + "language_loss": 0.73383689, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75519079, + "num_input_tokens_seen": 175632255, + "step": 8169, + "time_per_iteration": 2.512869358062744 + }, + { + "auxiliary_loss_clip": 0.01104601, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.04063165, + "balance_loss_mlp": 1.01695979, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.5203929831468295, + "language_loss": 0.77745038, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.79878354, + "num_input_tokens_seen": 175651625, + "step": 8170, + "time_per_iteration": 2.5029845237731934 + }, + { + "auxiliary_loss_clip": 0.01092141, + "auxiliary_loss_mlp": 0.01038853, + "balance_loss_clip": 1.04196894, + "balance_loss_mlp": 1.02651453, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 2.3122093064033806, + "language_loss": 0.75967467, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.78098464, + "num_input_tokens_seen": 175669265, + "step": 8171, + "time_per_iteration": 2.5726845264434814 + }, + { + "auxiliary_loss_clip": 0.01098075, + "auxiliary_loss_mlp": 0.01035548, + "balance_loss_clip": 1.04071212, + "balance_loss_mlp": 1.02275658, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 1.7433070699281912, + "language_loss": 0.81155986, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83289611, + "num_input_tokens_seen": 175686065, + "step": 8172, + "time_per_iteration": 2.511493444442749 + }, + { + "auxiliary_loss_clip": 0.01032847, + "auxiliary_loss_mlp": 0.01007874, + "balance_loss_clip": 1.01721597, + "balance_loss_mlp": 1.00632465, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6920903811599551, + "language_loss": 0.53301167, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55341887, + "num_input_tokens_seen": 175748595, + "step": 8173, + "time_per_iteration": 3.091998815536499 + }, + { + "auxiliary_loss_clip": 0.01110386, + "auxiliary_loss_mlp": 0.0078913, + "balance_loss_clip": 1.04217362, + "balance_loss_mlp": 1.01544785, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 1.948709681630015, + "language_loss": 0.62824488, + "learning_rate": 2.152326591972107e-06, + "loss": 0.64724004, + "num_input_tokens_seen": 175766770, + "step": 8174, + "time_per_iteration": 2.4670445919036865 + }, + { + "auxiliary_loss_clip": 0.01080843, + "auxiliary_loss_mlp": 0.01040713, + "balance_loss_clip": 1.04117, + "balance_loss_mlp": 1.02587152, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 1.8175318427883584, + "language_loss": 0.68825454, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.70947009, + "num_input_tokens_seen": 175783605, + "step": 8175, + "time_per_iteration": 2.54353666305542 + }, + { + "auxiliary_loss_clip": 0.01106559, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.04069161, + "balance_loss_mlp": 1.01651239, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.9204395516689636, + "language_loss": 0.74159741, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76294994, + "num_input_tokens_seen": 175801390, + "step": 8176, + "time_per_iteration": 2.485837936401367 + }, + { + "auxiliary_loss_clip": 0.01108691, + "auxiliary_loss_mlp": 0.01040028, + "balance_loss_clip": 1.04147863, + "balance_loss_mlp": 1.0270406, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.6189458434969857, + "language_loss": 0.69592416, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.7174114, + "num_input_tokens_seen": 175819830, + "step": 8177, + "time_per_iteration": 2.487250328063965 + }, + { + "auxiliary_loss_clip": 0.01024525, + "auxiliary_loss_mlp": 0.00777279, + "balance_loss_clip": 1.01947427, + "balance_loss_mlp": 1.02545762, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.6872048288824115, + "language_loss": 0.46260095, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48061901, + "num_input_tokens_seen": 175881765, + "step": 8178, + "time_per_iteration": 3.1001181602478027 + }, + { + "auxiliary_loss_clip": 0.01122505, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.0434767, + "balance_loss_mlp": 1.01926851, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.801389592271759, + "language_loss": 0.65958345, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68114269, + "num_input_tokens_seen": 175901795, + "step": 8179, + "time_per_iteration": 2.471752405166626 + }, + { + "auxiliary_loss_clip": 0.01043284, + "auxiliary_loss_mlp": 0.01047214, + "balance_loss_clip": 1.0449059, + "balance_loss_mlp": 1.0314188, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 1.8918293400675485, + "language_loss": 0.70324445, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72414947, + "num_input_tokens_seen": 175917770, + "step": 8180, + "time_per_iteration": 2.690161943435669 + }, + { + "auxiliary_loss_clip": 0.01093835, + "auxiliary_loss_mlp": 0.01029206, + "balance_loss_clip": 1.04067683, + "balance_loss_mlp": 1.01606894, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.7080577910753465, + "language_loss": 0.84273708, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86396748, + "num_input_tokens_seen": 175937000, + "step": 8181, + "time_per_iteration": 2.715656042098999 + }, + { + "auxiliary_loss_clip": 0.01115778, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.04210734, + "balance_loss_mlp": 1.01971149, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 2.3301684490852135, + "language_loss": 0.72564888, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.74712789, + "num_input_tokens_seen": 175955170, + "step": 8182, + "time_per_iteration": 2.4785163402557373 + }, + { + "auxiliary_loss_clip": 0.01081984, + "auxiliary_loss_mlp": 0.01036288, + "balance_loss_clip": 1.04353404, + "balance_loss_mlp": 1.02366364, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 1.7835190969667145, + "language_loss": 0.72403538, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.74521804, + "num_input_tokens_seen": 175973725, + "step": 8183, + "time_per_iteration": 2.565495491027832 + }, + { + "auxiliary_loss_clip": 0.01061324, + "auxiliary_loss_mlp": 0.01032347, + "balance_loss_clip": 1.0381583, + "balance_loss_mlp": 1.01755333, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 1.9981745164762756, + "language_loss": 0.77233094, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79326761, + "num_input_tokens_seen": 175993885, + "step": 8184, + "time_per_iteration": 2.597191572189331 + }, + { + "auxiliary_loss_clip": 0.01090303, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.04309297, + "balance_loss_mlp": 1.01785314, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.62097660916587, + "language_loss": 0.70611238, + "learning_rate": 2.148054610995789e-06, + "loss": 0.72732127, + "num_input_tokens_seen": 176014210, + "step": 8185, + "time_per_iteration": 4.228662014007568 + }, + { + "auxiliary_loss_clip": 0.01100955, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.04361618, + "balance_loss_mlp": 1.02252603, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 1.6830366602612554, + "language_loss": 0.74962902, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77100551, + "num_input_tokens_seen": 176033890, + "step": 8186, + "time_per_iteration": 2.593404769897461 + }, + { + "auxiliary_loss_clip": 0.01106909, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.04278398, + "balance_loss_mlp": 1.02312398, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 1.833931425884278, + "language_loss": 0.67632341, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.6977545, + "num_input_tokens_seen": 176052720, + "step": 8187, + "time_per_iteration": 3.8550517559051514 + }, + { + "auxiliary_loss_clip": 0.01073542, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.04005408, + "balance_loss_mlp": 1.01814842, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.4090276286909484, + "language_loss": 0.66701066, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.68806255, + "num_input_tokens_seen": 176072545, + "step": 8188, + "time_per_iteration": 2.650702953338623 + }, + { + "auxiliary_loss_clip": 0.01109799, + "auxiliary_loss_mlp": 0.01025596, + "balance_loss_clip": 1.04404426, + "balance_loss_mlp": 1.01369345, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.7132627030585639, + "language_loss": 0.74939322, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.77074718, + "num_input_tokens_seen": 176091490, + "step": 8189, + "time_per_iteration": 3.964816093444824 + }, + { + "auxiliary_loss_clip": 0.01094647, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.03959036, + "balance_loss_mlp": 1.01555026, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.5044756666948194, + "language_loss": 0.64630812, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66753912, + "num_input_tokens_seen": 176113200, + "step": 8190, + "time_per_iteration": 2.6320598125457764 + }, + { + "auxiliary_loss_clip": 0.01116716, + "auxiliary_loss_mlp": 0.01027166, + "balance_loss_clip": 1.04219639, + "balance_loss_mlp": 1.0139699, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 1.9581182575716516, + "language_loss": 0.71478641, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73622525, + "num_input_tokens_seen": 176132485, + "step": 8191, + "time_per_iteration": 2.446629285812378 + }, + { + "auxiliary_loss_clip": 0.01116914, + "auxiliary_loss_mlp": 0.00808011, + "balance_loss_clip": 1.04081917, + "balance_loss_mlp": 1.05142498, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.5021267755938106, + "language_loss": 0.71638829, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.73563755, + "num_input_tokens_seen": 176155755, + "step": 8192, + "time_per_iteration": 4.032135963439941 + }, + { + "auxiliary_loss_clip": 0.01023999, + "auxiliary_loss_mlp": 0.01002789, + "balance_loss_clip": 1.01788235, + "balance_loss_mlp": 1.00123942, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.7173293710945046, + "language_loss": 0.52089703, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54116493, + "num_input_tokens_seen": 176216295, + "step": 8193, + "time_per_iteration": 3.1938841342926025 + }, + { + "auxiliary_loss_clip": 0.01118458, + "auxiliary_loss_mlp": 0.0103662, + "balance_loss_clip": 1.04418921, + "balance_loss_mlp": 1.02375114, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.3849243611771886, + "language_loss": 0.76958716, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79113793, + "num_input_tokens_seen": 176235925, + "step": 8194, + "time_per_iteration": 2.4860715866088867 + }, + { + "auxiliary_loss_clip": 0.01091339, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.039729, + "balance_loss_mlp": 1.01713085, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 2.170937148500071, + "language_loss": 0.7013936, + "learning_rate": 2.144170401915341e-06, + "loss": 0.72260201, + "num_input_tokens_seen": 176253865, + "step": 8195, + "time_per_iteration": 2.534700870513916 + }, + { + "auxiliary_loss_clip": 0.01076064, + "auxiliary_loss_mlp": 0.01026794, + "balance_loss_clip": 1.04099345, + "balance_loss_mlp": 1.01446176, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 1.8678204686243725, + "language_loss": 0.80775541, + "learning_rate": 2.143781950696001e-06, + "loss": 0.82878399, + "num_input_tokens_seen": 176271525, + "step": 8196, + "time_per_iteration": 2.608266592025757 + }, + { + "auxiliary_loss_clip": 0.01085893, + "auxiliary_loss_mlp": 0.01035814, + "balance_loss_clip": 1.03935099, + "balance_loss_mlp": 1.02165771, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.7068693132646402, + "language_loss": 0.70260966, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.72382665, + "num_input_tokens_seen": 176290810, + "step": 8197, + "time_per_iteration": 2.5475881099700928 + }, + { + "auxiliary_loss_clip": 0.01104205, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.04301095, + "balance_loss_mlp": 1.02031732, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 1.7722197562865314, + "language_loss": 0.83681887, + "learning_rate": 2.143005031915374e-06, + "loss": 0.85818511, + "num_input_tokens_seen": 176309165, + "step": 8198, + "time_per_iteration": 2.4652392864227295 + }, + { + "auxiliary_loss_clip": 0.01111305, + "auxiliary_loss_mlp": 0.01035592, + "balance_loss_clip": 1.04295754, + "balance_loss_mlp": 1.02214575, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 1.7837995826813415, + "language_loss": 0.76211095, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78357995, + "num_input_tokens_seen": 176324960, + "step": 8199, + "time_per_iteration": 2.4514119625091553 + }, + { + "auxiliary_loss_clip": 0.01100827, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.04276049, + "balance_loss_mlp": 1.0212431, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.6454780080899907, + "language_loss": 0.60006422, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.6214236, + "num_input_tokens_seen": 176346195, + "step": 8200, + "time_per_iteration": 2.566474676132202 + }, + { + "auxiliary_loss_clip": 0.01104552, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.04122996, + "balance_loss_mlp": 1.02284336, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.4222360666984644, + "language_loss": 0.793917, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81531566, + "num_input_tokens_seen": 176366735, + "step": 8201, + "time_per_iteration": 2.4994237422943115 + }, + { + "auxiliary_loss_clip": 0.01110423, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.04084098, + "balance_loss_mlp": 1.01808977, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 2.100316512426948, + "language_loss": 0.67547423, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69689918, + "num_input_tokens_seen": 176384475, + "step": 8202, + "time_per_iteration": 2.4602534770965576 + }, + { + "auxiliary_loss_clip": 0.01092109, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.04063725, + "balance_loss_mlp": 1.01949215, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 1.978266229650916, + "language_loss": 0.7520625, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77330482, + "num_input_tokens_seen": 176402645, + "step": 8203, + "time_per_iteration": 2.574143648147583 + }, + { + "auxiliary_loss_clip": 0.01066459, + "auxiliary_loss_mlp": 0.01032823, + "balance_loss_clip": 1.04338944, + "balance_loss_mlp": 1.02004409, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 2.3128701371195124, + "language_loss": 0.80443156, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.82542437, + "num_input_tokens_seen": 176416715, + "step": 8204, + "time_per_iteration": 2.5704519748687744 + }, + { + "auxiliary_loss_clip": 0.01103939, + "auxiliary_loss_mlp": 0.01033059, + "balance_loss_clip": 1.04117048, + "balance_loss_mlp": 1.02121568, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 1.886996923939048, + "language_loss": 0.66605663, + "learning_rate": 2.140285646139455e-06, + "loss": 0.68742657, + "num_input_tokens_seen": 176435755, + "step": 8205, + "time_per_iteration": 2.4763028621673584 + }, + { + "auxiliary_loss_clip": 0.01122677, + "auxiliary_loss_mlp": 0.01035712, + "balance_loss_clip": 1.04249549, + "balance_loss_mlp": 1.0212816, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 1.603523546904001, + "language_loss": 0.65942907, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68101293, + "num_input_tokens_seen": 176453915, + "step": 8206, + "time_per_iteration": 2.4716951847076416 + }, + { + "auxiliary_loss_clip": 0.01074138, + "auxiliary_loss_mlp": 0.01038538, + "balance_loss_clip": 1.04042101, + "balance_loss_mlp": 1.02302885, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.55371843668325, + "language_loss": 0.76413816, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.78526491, + "num_input_tokens_seen": 176475175, + "step": 8207, + "time_per_iteration": 2.6033709049224854 + }, + { + "auxiliary_loss_clip": 0.01099875, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.04211402, + "balance_loss_mlp": 1.01998651, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.141399692839832, + "language_loss": 0.60236597, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.62370324, + "num_input_tokens_seen": 176494250, + "step": 8208, + "time_per_iteration": 2.555893898010254 + }, + { + "auxiliary_loss_clip": 0.01099271, + "auxiliary_loss_mlp": 0.01031106, + "balance_loss_clip": 1.04118562, + "balance_loss_mlp": 1.01790917, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 1.955368470513386, + "language_loss": 0.7840665, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.80537021, + "num_input_tokens_seen": 176513325, + "step": 8209, + "time_per_iteration": 2.527578592300415 + }, + { + "auxiliary_loss_clip": 0.0109113, + "auxiliary_loss_mlp": 0.00805273, + "balance_loss_clip": 1.04020905, + "balance_loss_mlp": 1.03762448, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 1.9276409194238697, + "language_loss": 0.78949094, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80845493, + "num_input_tokens_seen": 176532915, + "step": 8210, + "time_per_iteration": 2.5510919094085693 + }, + { + "auxiliary_loss_clip": 0.0110848, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.04342842, + "balance_loss_mlp": 1.0201894, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 1.6911658739001645, + "language_loss": 0.81184864, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.8332724, + "num_input_tokens_seen": 176552775, + "step": 8211, + "time_per_iteration": 2.5586726665496826 + }, + { + "auxiliary_loss_clip": 0.01077549, + "auxiliary_loss_mlp": 0.01047554, + "balance_loss_clip": 1.04179156, + "balance_loss_mlp": 1.03175879, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.3032584658075077, + "language_loss": 0.9135769, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93482792, + "num_input_tokens_seen": 176572185, + "step": 8212, + "time_per_iteration": 2.6023058891296387 + }, + { + "auxiliary_loss_clip": 0.01076205, + "auxiliary_loss_mlp": 0.01043381, + "balance_loss_clip": 1.04086995, + "balance_loss_mlp": 1.02892709, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 1.7418930394709895, + "language_loss": 0.65006077, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.6712566, + "num_input_tokens_seen": 176591490, + "step": 8213, + "time_per_iteration": 2.5764567852020264 + }, + { + "auxiliary_loss_clip": 0.01068894, + "auxiliary_loss_mlp": 0.00800115, + "balance_loss_clip": 1.03929889, + "balance_loss_mlp": 1.03407586, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 1.7439613070307782, + "language_loss": 0.75564373, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77433378, + "num_input_tokens_seen": 176612715, + "step": 8214, + "time_per_iteration": 2.7135825157165527 + }, + { + "auxiliary_loss_clip": 0.01119665, + "auxiliary_loss_mlp": 0.01038105, + "balance_loss_clip": 1.04347944, + "balance_loss_mlp": 1.02460456, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 1.7723785978564133, + "language_loss": 0.8471694, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86874712, + "num_input_tokens_seen": 176631950, + "step": 8215, + "time_per_iteration": 2.4965803623199463 + }, + { + "auxiliary_loss_clip": 0.01100935, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.03995395, + "balance_loss_mlp": 1.01974249, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.5841864041273628, + "language_loss": 0.83359343, + "learning_rate": 2.136011800934292e-06, + "loss": 0.85491592, + "num_input_tokens_seen": 176653060, + "step": 8216, + "time_per_iteration": 2.5763843059539795 + }, + { + "auxiliary_loss_clip": 0.01089987, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.0424757, + "balance_loss_mlp": 1.01866615, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.5605228003675826, + "language_loss": 0.74527383, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76648772, + "num_input_tokens_seen": 176673895, + "step": 8217, + "time_per_iteration": 2.5487473011016846 + }, + { + "auxiliary_loss_clip": 0.01115838, + "auxiliary_loss_mlp": 0.00795905, + "balance_loss_clip": 1.04222, + "balance_loss_mlp": 1.02696741, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 1.552105665876363, + "language_loss": 0.78706884, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80618632, + "num_input_tokens_seen": 176692550, + "step": 8218, + "time_per_iteration": 2.4653639793395996 + }, + { + "auxiliary_loss_clip": 0.01068799, + "auxiliary_loss_mlp": 0.00790134, + "balance_loss_clip": 1.04076374, + "balance_loss_mlp": 1.01710963, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.0003490260863708, + "language_loss": 0.76008332, + "learning_rate": 2.134846097653142e-06, + "loss": 0.77867264, + "num_input_tokens_seen": 176709335, + "step": 8219, + "time_per_iteration": 2.5492849349975586 + }, + { + "auxiliary_loss_clip": 0.01094751, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.04090226, + "balance_loss_mlp": 1.01978827, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 2.0574417411977017, + "language_loss": 0.62444079, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64571571, + "num_input_tokens_seen": 176727715, + "step": 8220, + "time_per_iteration": 2.486302137374878 + }, + { + "auxiliary_loss_clip": 0.0111588, + "auxiliary_loss_mlp": 0.0103041, + "balance_loss_clip": 1.04026604, + "balance_loss_mlp": 1.01790547, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 1.9065258085619627, + "language_loss": 0.72104102, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74250388, + "num_input_tokens_seen": 176747530, + "step": 8221, + "time_per_iteration": 2.4592385292053223 + }, + { + "auxiliary_loss_clip": 0.01082769, + "auxiliary_loss_mlp": 0.0103572, + "balance_loss_clip": 1.04552722, + "balance_loss_mlp": 1.02322054, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.6278659193754357, + "language_loss": 0.79071248, + "learning_rate": 2.133680348351595e-06, + "loss": 0.8118974, + "num_input_tokens_seen": 176765260, + "step": 8222, + "time_per_iteration": 2.5291025638580322 + }, + { + "auxiliary_loss_clip": 0.01105867, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.04186487, + "balance_loss_mlp": 1.02523899, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 2.229720205338952, + "language_loss": 0.72220767, + "learning_rate": 2.133291755093088e-06, + "loss": 0.74364877, + "num_input_tokens_seen": 176781770, + "step": 8223, + "time_per_iteration": 2.44869327545166 + }, + { + "auxiliary_loss_clip": 0.01108918, + "auxiliary_loss_mlp": 0.01038011, + "balance_loss_clip": 1.04276705, + "balance_loss_mlp": 1.02463543, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 1.8686341889274716, + "language_loss": 0.75132382, + "learning_rate": 2.132903156780144e-06, + "loss": 0.77279305, + "num_input_tokens_seen": 176800655, + "step": 8224, + "time_per_iteration": 3.846817970275879 + }, + { + "auxiliary_loss_clip": 0.0110344, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.04538703, + "balance_loss_mlp": 1.01722884, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.0135140411923, + "language_loss": 0.63577646, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.65710843, + "num_input_tokens_seen": 176820610, + "step": 8225, + "time_per_iteration": 3.9198665618896484 + }, + { + "auxiliary_loss_clip": 0.01098277, + "auxiliary_loss_mlp": 0.01033758, + "balance_loss_clip": 1.04285073, + "balance_loss_mlp": 1.02075863, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 2.9966261578870705, + "language_loss": 0.76168728, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78300762, + "num_input_tokens_seen": 176840520, + "step": 8226, + "time_per_iteration": 2.5569581985473633 + }, + { + "auxiliary_loss_clip": 0.0111888, + "auxiliary_loss_mlp": 0.01037322, + "balance_loss_clip": 1.04131126, + "balance_loss_mlp": 1.02280235, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.5930431820923325, + "language_loss": 0.7089982, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73056024, + "num_input_tokens_seen": 176860265, + "step": 8227, + "time_per_iteration": 2.498302459716797 + }, + { + "auxiliary_loss_clip": 0.01099905, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.04320192, + "balance_loss_mlp": 1.02070892, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 2.151800891737294, + "language_loss": 0.71889138, + "learning_rate": 2.131348713278718e-06, + "loss": 0.74023014, + "num_input_tokens_seen": 176882910, + "step": 8228, + "time_per_iteration": 4.022500991821289 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.01027738, + "balance_loss_clip": 1.041942, + "balance_loss_mlp": 1.01506603, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.4343638101702127, + "language_loss": 0.83861768, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.86005753, + "num_input_tokens_seen": 176903030, + "step": 8229, + "time_per_iteration": 2.493532180786133 + }, + { + "auxiliary_loss_clip": 0.01106864, + "auxiliary_loss_mlp": 0.01035363, + "balance_loss_clip": 1.03864026, + "balance_loss_mlp": 1.02078938, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 1.9470988235146696, + "language_loss": 0.7460494, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.76747167, + "num_input_tokens_seen": 176919025, + "step": 8230, + "time_per_iteration": 2.491112470626831 + }, + { + "auxiliary_loss_clip": 0.01102112, + "auxiliary_loss_mlp": 0.01030215, + "balance_loss_clip": 1.04261851, + "balance_loss_mlp": 1.01777506, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 2.7357266854396025, + "language_loss": 0.79139841, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.81272173, + "num_input_tokens_seen": 176937945, + "step": 8231, + "time_per_iteration": 3.9317104816436768 + }, + { + "auxiliary_loss_clip": 0.01038907, + "auxiliary_loss_mlp": 0.01005086, + "balance_loss_clip": 1.02473545, + "balance_loss_mlp": 1.00364971, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7567164058654038, + "language_loss": 0.60184014, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62228006, + "num_input_tokens_seen": 177004575, + "step": 8232, + "time_per_iteration": 3.226020097732544 + }, + { + "auxiliary_loss_clip": 0.01098705, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.04238272, + "balance_loss_mlp": 1.01984274, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 3.2247624155933683, + "language_loss": 0.69048202, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71180922, + "num_input_tokens_seen": 177024155, + "step": 8233, + "time_per_iteration": 2.5669918060302734 + }, + { + "auxiliary_loss_clip": 0.01065076, + "auxiliary_loss_mlp": 0.01034137, + "balance_loss_clip": 1.04140592, + "balance_loss_mlp": 1.01989174, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 2.1147745658915196, + "language_loss": 0.66628325, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68727541, + "num_input_tokens_seen": 177046185, + "step": 8234, + "time_per_iteration": 2.661149501800537 + }, + { + "auxiliary_loss_clip": 0.0103074, + "auxiliary_loss_mlp": 0.01009624, + "balance_loss_clip": 1.02841949, + "balance_loss_mlp": 1.00834298, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.7996729967131923, + "language_loss": 0.58016485, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60056847, + "num_input_tokens_seen": 177099025, + "step": 8235, + "time_per_iteration": 3.056988000869751 + }, + { + "auxiliary_loss_clip": 0.01085857, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.03782916, + "balance_loss_mlp": 1.02045369, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.6370835549888947, + "language_loss": 0.76989377, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.79109871, + "num_input_tokens_seen": 177118365, + "step": 8236, + "time_per_iteration": 2.55275559425354 + }, + { + "auxiliary_loss_clip": 0.01077641, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.04453015, + "balance_loss_mlp": 1.01763356, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.7194510867408521, + "language_loss": 0.72785258, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.74893248, + "num_input_tokens_seen": 177136415, + "step": 8237, + "time_per_iteration": 2.5933873653411865 + }, + { + "auxiliary_loss_clip": 0.01115543, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.0417732, + "balance_loss_mlp": 1.0222466, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.9113703578944181, + "language_loss": 0.75778323, + "learning_rate": 2.127462257935406e-06, + "loss": 0.77929538, + "num_input_tokens_seen": 177155690, + "step": 8238, + "time_per_iteration": 2.4981977939605713 + }, + { + "auxiliary_loss_clip": 0.01075034, + "auxiliary_loss_mlp": 0.01039101, + "balance_loss_clip": 1.041363, + "balance_loss_mlp": 1.02354991, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.1417790602362508, + "language_loss": 0.74613726, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.76727867, + "num_input_tokens_seen": 177173350, + "step": 8239, + "time_per_iteration": 2.5486977100372314 + }, + { + "auxiliary_loss_clip": 0.01030846, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.0373894, + "balance_loss_mlp": 1.0202384, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 2.4496463608147914, + "language_loss": 0.78849626, + "learning_rate": 2.126684908394552e-06, + "loss": 0.80915844, + "num_input_tokens_seen": 177191115, + "step": 8240, + "time_per_iteration": 2.8489527702331543 + }, + { + "auxiliary_loss_clip": 0.01104155, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.04082799, + "balance_loss_mlp": 1.02581072, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 2.114649149884295, + "language_loss": 0.85849869, + "learning_rate": 2.126296226410898e-06, + "loss": 0.87992436, + "num_input_tokens_seen": 177206155, + "step": 8241, + "time_per_iteration": 2.802905797958374 + }, + { + "auxiliary_loss_clip": 0.01065379, + "auxiliary_loss_mlp": 0.01029922, + "balance_loss_clip": 1.04510927, + "balance_loss_mlp": 1.01695204, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 3.556286279935567, + "language_loss": 0.7700932, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.7910462, + "num_input_tokens_seen": 177224815, + "step": 8242, + "time_per_iteration": 2.577310562133789 + }, + { + "auxiliary_loss_clip": 0.01094192, + "auxiliary_loss_mlp": 0.00788736, + "balance_loss_clip": 1.04036832, + "balance_loss_mlp": 1.01332009, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 1.7650318253653374, + "language_loss": 0.67253149, + "learning_rate": 2.125518848090833e-06, + "loss": 0.69136077, + "num_input_tokens_seen": 177244490, + "step": 8243, + "time_per_iteration": 2.5766868591308594 + }, + { + "auxiliary_loss_clip": 0.01102861, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.04324484, + "balance_loss_mlp": 1.01559317, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 1.7879303114443938, + "language_loss": 0.68124151, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70254815, + "num_input_tokens_seen": 177264340, + "step": 8244, + "time_per_iteration": 2.5274124145507812 + }, + { + "auxiliary_loss_clip": 0.01097277, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.0423367, + "balance_loss_mlp": 1.01994181, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 2.0981901438317774, + "language_loss": 0.75122911, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77254546, + "num_input_tokens_seen": 177283055, + "step": 8245, + "time_per_iteration": 2.514234781265259 + }, + { + "auxiliary_loss_clip": 0.0110625, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.04293156, + "balance_loss_mlp": 1.01653767, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 1.749115349064246, + "language_loss": 0.8133688, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83472544, + "num_input_tokens_seen": 177301140, + "step": 8246, + "time_per_iteration": 2.473923683166504 + }, + { + "auxiliary_loss_clip": 0.01082053, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.04619312, + "balance_loss_mlp": 1.02149582, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.6925442231089483, + "language_loss": 0.83501804, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85620213, + "num_input_tokens_seen": 177323095, + "step": 8247, + "time_per_iteration": 2.58484148979187 + }, + { + "auxiliary_loss_clip": 0.01087246, + "auxiliary_loss_mlp": 0.01028547, + "balance_loss_clip": 1.04121399, + "balance_loss_mlp": 1.01589286, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 2.0322335961317433, + "language_loss": 0.83493853, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85609651, + "num_input_tokens_seen": 177339845, + "step": 8248, + "time_per_iteration": 2.5696663856506348 + }, + { + "auxiliary_loss_clip": 0.01108867, + "auxiliary_loss_mlp": 0.01027978, + "balance_loss_clip": 1.04189014, + "balance_loss_mlp": 1.01417375, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 2.169301649188192, + "language_loss": 0.73727107, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75863945, + "num_input_tokens_seen": 177359980, + "step": 8249, + "time_per_iteration": 2.529914617538452 + }, + { + "auxiliary_loss_clip": 0.01098777, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.04141212, + "balance_loss_mlp": 1.02838051, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 1.6690860858099164, + "language_loss": 0.75954348, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78095424, + "num_input_tokens_seen": 177378580, + "step": 8250, + "time_per_iteration": 2.5000510215759277 + }, + { + "auxiliary_loss_clip": 0.01120342, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.04261708, + "balance_loss_mlp": 1.02016485, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 1.8884236459554091, + "language_loss": 0.70307249, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72461665, + "num_input_tokens_seen": 177398790, + "step": 8251, + "time_per_iteration": 2.4961278438568115 + }, + { + "auxiliary_loss_clip": 0.01081991, + "auxiliary_loss_mlp": 0.00787999, + "balance_loss_clip": 1.04330492, + "balance_loss_mlp": 1.0115304, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 2.0339657502628836, + "language_loss": 0.80320144, + "learning_rate": 2.122020411748461e-06, + "loss": 0.82190132, + "num_input_tokens_seen": 177416515, + "step": 8252, + "time_per_iteration": 2.5906784534454346 + }, + { + "auxiliary_loss_clip": 0.01119823, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.04259717, + "balance_loss_mlp": 1.01668859, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.733846591994071, + "language_loss": 0.80749488, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.829014, + "num_input_tokens_seen": 177434425, + "step": 8253, + "time_per_iteration": 2.4527575969696045 + }, + { + "auxiliary_loss_clip": 0.01084869, + "auxiliary_loss_mlp": 0.0103154, + "balance_loss_clip": 1.03841114, + "balance_loss_mlp": 1.018659, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.4359889427262444, + "language_loss": 0.67382818, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.6949923, + "num_input_tokens_seen": 177459675, + "step": 8254, + "time_per_iteration": 2.663331985473633 + }, + { + "auxiliary_loss_clip": 0.01081584, + "auxiliary_loss_mlp": 0.01045159, + "balance_loss_clip": 1.04036856, + "balance_loss_mlp": 1.02945924, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.999028083230742, + "language_loss": 0.74119014, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76245761, + "num_input_tokens_seen": 177478895, + "step": 8255, + "time_per_iteration": 2.5629055500030518 + }, + { + "auxiliary_loss_clip": 0.01088395, + "auxiliary_loss_mlp": 0.0103663, + "balance_loss_clip": 1.0406096, + "balance_loss_mlp": 1.02315342, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 1.8298861570806362, + "language_loss": 0.81195819, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.83320832, + "num_input_tokens_seen": 177494920, + "step": 8256, + "time_per_iteration": 2.4840705394744873 + }, + { + "auxiliary_loss_clip": 0.01091504, + "auxiliary_loss_mlp": 0.01026729, + "balance_loss_clip": 1.04250872, + "balance_loss_mlp": 1.01419449, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.4316603479984693, + "language_loss": 0.80890715, + "learning_rate": 2.120076673368901e-06, + "loss": 0.83008957, + "num_input_tokens_seen": 177515455, + "step": 8257, + "time_per_iteration": 2.5427818298339844 + }, + { + "auxiliary_loss_clip": 0.01121028, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.0410006, + "balance_loss_mlp": 1.02659369, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 1.9434309262960543, + "language_loss": 0.6604377, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68205667, + "num_input_tokens_seen": 177534040, + "step": 8258, + "time_per_iteration": 2.4589219093322754 + }, + { + "auxiliary_loss_clip": 0.01103835, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.04080963, + "balance_loss_mlp": 1.01714468, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 1.504772974661805, + "language_loss": 0.77591479, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79724747, + "num_input_tokens_seen": 177554510, + "step": 8259, + "time_per_iteration": 2.508404016494751 + }, + { + "auxiliary_loss_clip": 0.01091971, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.04319739, + "balance_loss_mlp": 1.02153969, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.4535594909619085, + "language_loss": 0.78662145, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80789113, + "num_input_tokens_seen": 177575780, + "step": 8260, + "time_per_iteration": 2.592573642730713 + }, + { + "auxiliary_loss_clip": 0.01096943, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.04222417, + "balance_loss_mlp": 1.02107358, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 2.699834418626499, + "language_loss": 0.76301008, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78433037, + "num_input_tokens_seen": 177588965, + "step": 8261, + "time_per_iteration": 2.4924728870391846 + }, + { + "auxiliary_loss_clip": 0.01068778, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.04025197, + "balance_loss_mlp": 1.02340627, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 1.7672741715571316, + "language_loss": 0.89215088, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91320366, + "num_input_tokens_seen": 177608425, + "step": 8262, + "time_per_iteration": 3.9917151927948 + }, + { + "auxiliary_loss_clip": 0.01062561, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.03879166, + "balance_loss_mlp": 1.01782274, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.4595109239297823, + "language_loss": 0.74021685, + "learning_rate": 2.11774403721606e-06, + "loss": 0.76114786, + "num_input_tokens_seen": 177628240, + "step": 8263, + "time_per_iteration": 2.641376256942749 + }, + { + "auxiliary_loss_clip": 0.01078911, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.04718328, + "balance_loss_mlp": 1.02121258, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 1.8564942184666366, + "language_loss": 0.69077015, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.71192425, + "num_input_tokens_seen": 177645920, + "step": 8264, + "time_per_iteration": 4.352736949920654 + }, + { + "auxiliary_loss_clip": 0.01093772, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.04242849, + "balance_loss_mlp": 1.01666331, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.5329417271442203, + "language_loss": 0.64855707, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.66979325, + "num_input_tokens_seen": 177667185, + "step": 8265, + "time_per_iteration": 2.572495222091675 + }, + { + "auxiliary_loss_clip": 0.0102586, + "auxiliary_loss_mlp": 0.01014397, + "balance_loss_clip": 1.02034211, + "balance_loss_mlp": 1.01285958, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.8252788363779046, + "language_loss": 0.53484285, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55524546, + "num_input_tokens_seen": 177733020, + "step": 8266, + "time_per_iteration": 3.1974520683288574 + }, + { + "auxiliary_loss_clip": 0.0110382, + "auxiliary_loss_mlp": 0.01029441, + "balance_loss_clip": 1.04078603, + "balance_loss_mlp": 1.0168165, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 1.6195972177143436, + "language_loss": 0.79405022, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81538278, + "num_input_tokens_seen": 177753370, + "step": 8267, + "time_per_iteration": 3.937965154647827 + }, + { + "auxiliary_loss_clip": 0.01095416, + "auxiliary_loss_mlp": 0.01031467, + "balance_loss_clip": 1.04329419, + "balance_loss_mlp": 1.01759052, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.265842549311999, + "language_loss": 0.74567807, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.76694679, + "num_input_tokens_seen": 177771530, + "step": 8268, + "time_per_iteration": 2.600353479385376 + }, + { + "auxiliary_loss_clip": 0.01106834, + "auxiliary_loss_mlp": 0.00785669, + "balance_loss_clip": 1.04018569, + "balance_loss_mlp": 1.00859439, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.3780759505113545, + "language_loss": 0.67920923, + "learning_rate": 2.115411240328073e-06, + "loss": 0.69813424, + "num_input_tokens_seen": 177796355, + "step": 8269, + "time_per_iteration": 4.095922946929932 + }, + { + "auxiliary_loss_clip": 0.01090047, + "auxiliary_loss_mlp": 0.01034709, + "balance_loss_clip": 1.04288852, + "balance_loss_mlp": 1.02124429, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.4635929748015757, + "language_loss": 0.85351253, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87476003, + "num_input_tokens_seen": 177814300, + "step": 8270, + "time_per_iteration": 2.533360004425049 + }, + { + "auxiliary_loss_clip": 0.01074215, + "auxiliary_loss_mlp": 0.00788274, + "balance_loss_clip": 1.0408473, + "balance_loss_mlp": 1.01257157, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.6923897967273798, + "language_loss": 0.70878959, + "learning_rate": 2.114633606196899e-06, + "loss": 0.72741449, + "num_input_tokens_seen": 177833615, + "step": 8271, + "time_per_iteration": 2.5946452617645264 + }, + { + "auxiliary_loss_clip": 0.0110408, + "auxiliary_loss_mlp": 0.01033471, + "balance_loss_clip": 1.04351509, + "balance_loss_mlp": 1.01978612, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.6867031908897805, + "language_loss": 0.78347111, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80484653, + "num_input_tokens_seen": 177855315, + "step": 8272, + "time_per_iteration": 2.518927574157715 + }, + { + "auxiliary_loss_clip": 0.01089391, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.04304445, + "balance_loss_mlp": 1.02369201, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 2.3337748567340157, + "language_loss": 0.66458786, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68584931, + "num_input_tokens_seen": 177875590, + "step": 8273, + "time_per_iteration": 2.6774890422821045 + }, + { + "auxiliary_loss_clip": 0.01089566, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.0422008, + "balance_loss_mlp": 1.01827633, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.6234890371802386, + "language_loss": 0.78005719, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80126405, + "num_input_tokens_seen": 177894175, + "step": 8274, + "time_per_iteration": 2.541865587234497 + }, + { + "auxiliary_loss_clip": 0.01085264, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.04453993, + "balance_loss_mlp": 1.02025282, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 1.7102443936000828, + "language_loss": 0.75710166, + "learning_rate": 2.113078285889493e-06, + "loss": 0.77829909, + "num_input_tokens_seen": 177913920, + "step": 8275, + "time_per_iteration": 2.6322953701019287 + }, + { + "auxiliary_loss_clip": 0.01108286, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.04438305, + "balance_loss_mlp": 1.02020073, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 1.9323262937798913, + "language_loss": 0.83768767, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.85912526, + "num_input_tokens_seen": 177930425, + "step": 8276, + "time_per_iteration": 2.4601824283599854 + }, + { + "auxiliary_loss_clip": 0.01112732, + "auxiliary_loss_mlp": 0.00786133, + "balance_loss_clip": 1.04028809, + "balance_loss_mlp": 1.01058018, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.4279676044015661, + "language_loss": 0.70351017, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72249877, + "num_input_tokens_seen": 177949885, + "step": 8277, + "time_per_iteration": 2.4973554611206055 + }, + { + "auxiliary_loss_clip": 0.01103589, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.04526651, + "balance_loss_mlp": 1.02201772, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 2.02878644567125, + "language_loss": 0.82437861, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84577006, + "num_input_tokens_seen": 177965720, + "step": 8278, + "time_per_iteration": 2.4844093322753906 + }, + { + "auxiliary_loss_clip": 0.01108557, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.04083514, + "balance_loss_mlp": 1.02426457, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 2.066400570371332, + "language_loss": 0.67696869, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69842762, + "num_input_tokens_seen": 177983190, + "step": 8279, + "time_per_iteration": 2.4708824157714844 + }, + { + "auxiliary_loss_clip": 0.0110886, + "auxiliary_loss_mlp": 0.01035812, + "balance_loss_clip": 1.04022825, + "balance_loss_mlp": 1.02154279, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 2.210821753885307, + "language_loss": 0.71667689, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.73812354, + "num_input_tokens_seen": 178000155, + "step": 8280, + "time_per_iteration": 2.4534268379211426 + }, + { + "auxiliary_loss_clip": 0.01085837, + "auxiliary_loss_mlp": 0.01035687, + "balance_loss_clip": 1.04127645, + "balance_loss_mlp": 1.02195442, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 1.6253790667484762, + "language_loss": 0.64472961, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.66594487, + "num_input_tokens_seen": 178021060, + "step": 8281, + "time_per_iteration": 2.580148458480835 + }, + { + "auxiliary_loss_clip": 0.01112235, + "auxiliary_loss_mlp": 0.01032598, + "balance_loss_clip": 1.04339468, + "balance_loss_mlp": 1.01897228, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 1.9004071881717206, + "language_loss": 0.72860074, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.75004911, + "num_input_tokens_seen": 178038180, + "step": 8282, + "time_per_iteration": 2.4723243713378906 + }, + { + "auxiliary_loss_clip": 0.01091799, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.04412222, + "balance_loss_mlp": 1.02022791, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.4670804018170165, + "language_loss": 0.72923625, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75047064, + "num_input_tokens_seen": 178057565, + "step": 8283, + "time_per_iteration": 2.58441424369812 + }, + { + "auxiliary_loss_clip": 0.01063508, + "auxiliary_loss_mlp": 0.01048211, + "balance_loss_clip": 1.04073596, + "balance_loss_mlp": 1.03232026, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 1.7031671598654679, + "language_loss": 0.7834301, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.80454731, + "num_input_tokens_seen": 178076965, + "step": 8284, + "time_per_iteration": 2.595801591873169 + }, + { + "auxiliary_loss_clip": 0.0110144, + "auxiliary_loss_mlp": 0.01038357, + "balance_loss_clip": 1.04657888, + "balance_loss_mlp": 1.02328861, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 2.182713384667605, + "language_loss": 0.73730111, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75869912, + "num_input_tokens_seen": 178095105, + "step": 8285, + "time_per_iteration": 2.618523597717285 + }, + { + "auxiliary_loss_clip": 0.01114204, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.04663348, + "balance_loss_mlp": 1.02028275, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.619073990137544, + "language_loss": 0.74184692, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76332754, + "num_input_tokens_seen": 178114505, + "step": 8286, + "time_per_iteration": 2.4945077896118164 + }, + { + "auxiliary_loss_clip": 0.0110234, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.0465827, + "balance_loss_mlp": 1.03120792, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.6591269687158565, + "language_loss": 0.85065866, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87213176, + "num_input_tokens_seen": 178131595, + "step": 8287, + "time_per_iteration": 2.5278849601745605 + }, + { + "auxiliary_loss_clip": 0.01077971, + "auxiliary_loss_mlp": 0.01027204, + "balance_loss_clip": 1.04207039, + "balance_loss_mlp": 1.0136677, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.7161703889111457, + "language_loss": 0.72539276, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74644446, + "num_input_tokens_seen": 178152055, + "step": 8288, + "time_per_iteration": 2.6799209117889404 + }, + { + "auxiliary_loss_clip": 0.01102303, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.04235888, + "balance_loss_mlp": 1.02329803, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 2.805373891426384, + "language_loss": 0.80313712, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82454395, + "num_input_tokens_seen": 178168150, + "step": 8289, + "time_per_iteration": 2.511997938156128 + }, + { + "auxiliary_loss_clip": 0.01108133, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.04140103, + "balance_loss_mlp": 1.0211159, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.236405902196735, + "language_loss": 0.73419756, + "learning_rate": 2.107245231409784e-06, + "loss": 0.75562346, + "num_input_tokens_seen": 178186150, + "step": 8290, + "time_per_iteration": 2.5093703269958496 + }, + { + "auxiliary_loss_clip": 0.01112265, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.04502344, + "balance_loss_mlp": 1.01955199, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.5531516323638668, + "language_loss": 0.838732, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86021131, + "num_input_tokens_seen": 178207665, + "step": 8291, + "time_per_iteration": 2.5451269149780273 + }, + { + "auxiliary_loss_clip": 0.01100723, + "auxiliary_loss_mlp": 0.01047171, + "balance_loss_clip": 1.04451764, + "balance_loss_mlp": 1.0304693, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.685407558999026, + "language_loss": 0.66790563, + "learning_rate": 2.106467420591409e-06, + "loss": 0.68938458, + "num_input_tokens_seen": 178226325, + "step": 8292, + "time_per_iteration": 2.5298144817352295 + }, + { + "auxiliary_loss_clip": 0.01120679, + "auxiliary_loss_mlp": 0.01033613, + "balance_loss_clip": 1.04440463, + "balance_loss_mlp": 1.02060699, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.6234432827980412, + "language_loss": 0.66481757, + "learning_rate": 2.106078509118965e-06, + "loss": 0.68636048, + "num_input_tokens_seen": 178244960, + "step": 8293, + "time_per_iteration": 2.448354721069336 + }, + { + "auxiliary_loss_clip": 0.01107038, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.04348874, + "balance_loss_mlp": 1.01693988, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 1.7356468875953197, + "language_loss": 0.81552148, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.83689696, + "num_input_tokens_seen": 178265400, + "step": 8294, + "time_per_iteration": 2.5063512325286865 + }, + { + "auxiliary_loss_clip": 0.01111318, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.04295301, + "balance_loss_mlp": 1.01941395, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 1.7694458269328155, + "language_loss": 0.73167044, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.75312054, + "num_input_tokens_seen": 178284535, + "step": 8295, + "time_per_iteration": 2.5252130031585693 + }, + { + "auxiliary_loss_clip": 0.01066127, + "auxiliary_loss_mlp": 0.01036102, + "balance_loss_clip": 1.04441404, + "balance_loss_mlp": 1.02288163, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.8343158368498729, + "language_loss": 0.67166209, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69268435, + "num_input_tokens_seen": 178302425, + "step": 8296, + "time_per_iteration": 2.612426280975342 + }, + { + "auxiliary_loss_clip": 0.0110577, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.04502773, + "balance_loss_mlp": 1.02188945, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 1.8091622422287084, + "language_loss": 0.64141661, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.66283405, + "num_input_tokens_seen": 178323065, + "step": 8297, + "time_per_iteration": 2.6240246295928955 + }, + { + "auxiliary_loss_clip": 0.01072491, + "auxiliary_loss_mlp": 0.0103525, + "balance_loss_clip": 1.04259777, + "balance_loss_mlp": 1.0231086, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.7107852096567888, + "language_loss": 0.69397879, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71505618, + "num_input_tokens_seen": 178343985, + "step": 8298, + "time_per_iteration": 2.6101129055023193 + }, + { + "auxiliary_loss_clip": 0.01115646, + "auxiliary_loss_mlp": 0.01034633, + "balance_loss_clip": 1.04128814, + "balance_loss_mlp": 1.02160978, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 1.773800760619909, + "language_loss": 0.84174359, + "learning_rate": 2.103744956327814e-06, + "loss": 0.86324644, + "num_input_tokens_seen": 178362345, + "step": 8299, + "time_per_iteration": 2.442152261734009 + }, + { + "auxiliary_loss_clip": 0.01089915, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.04108739, + "balance_loss_mlp": 1.02243388, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 2.087036896447266, + "language_loss": 0.69062155, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71189308, + "num_input_tokens_seen": 178383190, + "step": 8300, + "time_per_iteration": 3.952367067337036 + }, + { + "auxiliary_loss_clip": 0.01049197, + "auxiliary_loss_mlp": 0.00999463, + "balance_loss_clip": 1.05027223, + "balance_loss_mlp": 0.99795455, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7753404980013431, + "language_loss": 0.51178277, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53226936, + "num_input_tokens_seen": 178444250, + "step": 8301, + "time_per_iteration": 3.2598118782043457 + }, + { + "auxiliary_loss_clip": 0.01090249, + "auxiliary_loss_mlp": 0.01038638, + "balance_loss_clip": 1.04272902, + "balance_loss_mlp": 1.02574003, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.7220747278854545, + "language_loss": 0.84425759, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86554646, + "num_input_tokens_seen": 178463250, + "step": 8302, + "time_per_iteration": 3.940723419189453 + }, + { + "auxiliary_loss_clip": 0.01104913, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.04245794, + "balance_loss_mlp": 1.01770353, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 1.8053186726669859, + "language_loss": 0.69153666, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71288902, + "num_input_tokens_seen": 178481340, + "step": 8303, + "time_per_iteration": 2.492274522781372 + }, + { + "auxiliary_loss_clip": 0.01120545, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.04267526, + "balance_loss_mlp": 1.0213424, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.6288999303678884, + "language_loss": 0.72740132, + "learning_rate": 2.101800220681144e-06, + "loss": 0.74895406, + "num_input_tokens_seen": 178501545, + "step": 8304, + "time_per_iteration": 2.5359036922454834 + }, + { + "auxiliary_loss_clip": 0.01108181, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.04286361, + "balance_loss_mlp": 1.02348065, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 2.5173084494061615, + "language_loss": 0.80925357, + "learning_rate": 2.10141126191199e-06, + "loss": 0.83068854, + "num_input_tokens_seen": 178519700, + "step": 8305, + "time_per_iteration": 3.968463182449341 + }, + { + "auxiliary_loss_clip": 0.01023546, + "auxiliary_loss_mlp": 0.01017304, + "balance_loss_clip": 1.0263238, + "balance_loss_mlp": 1.01593351, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7159508786839746, + "language_loss": 0.56861955, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.589028, + "num_input_tokens_seen": 178576740, + "step": 8306, + "time_per_iteration": 3.2563695907592773 + }, + { + "auxiliary_loss_clip": 0.01120917, + "auxiliary_loss_mlp": 0.01038373, + "balance_loss_clip": 1.04466963, + "balance_loss_mlp": 1.02397823, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.7746295249646513, + "language_loss": 0.8258794, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.84747231, + "num_input_tokens_seen": 178594745, + "step": 8307, + "time_per_iteration": 2.4751482009887695 + }, + { + "auxiliary_loss_clip": 0.01118389, + "auxiliary_loss_mlp": 0.01035619, + "balance_loss_clip": 1.04283535, + "balance_loss_mlp": 1.02220178, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 1.8963119590418955, + "language_loss": 0.6083709, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.62991095, + "num_input_tokens_seen": 178614110, + "step": 8308, + "time_per_iteration": 3.9243569374084473 + }, + { + "auxiliary_loss_clip": 0.01114351, + "auxiliary_loss_mlp": 0.01034349, + "balance_loss_clip": 1.04029083, + "balance_loss_mlp": 1.02159357, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.5392031226942395, + "language_loss": 0.74820209, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76968908, + "num_input_tokens_seen": 178634170, + "step": 8309, + "time_per_iteration": 2.479135751724243 + }, + { + "auxiliary_loss_clip": 0.01096451, + "auxiliary_loss_mlp": 0.01038101, + "balance_loss_clip": 1.04047871, + "balance_loss_mlp": 1.02501774, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 2.3759373225641776, + "language_loss": 0.79886603, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82021153, + "num_input_tokens_seen": 178651775, + "step": 8310, + "time_per_iteration": 2.5267388820648193 + }, + { + "auxiliary_loss_clip": 0.01113241, + "auxiliary_loss_mlp": 0.01038815, + "balance_loss_clip": 1.0436089, + "balance_loss_mlp": 1.02604151, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.4853859276698715, + "language_loss": 0.70698142, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.72850192, + "num_input_tokens_seen": 178669720, + "step": 8311, + "time_per_iteration": 2.4571785926818848 + }, + { + "auxiliary_loss_clip": 0.01094513, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.04648066, + "balance_loss_mlp": 1.02627087, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.8909078889077193, + "language_loss": 0.7707873, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79211998, + "num_input_tokens_seen": 178686765, + "step": 8312, + "time_per_iteration": 2.5197155475616455 + }, + { + "auxiliary_loss_clip": 0.01088037, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.0415833, + "balance_loss_mlp": 1.02322495, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.8073231747630303, + "language_loss": 0.84653461, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86777747, + "num_input_tokens_seen": 178705845, + "step": 8313, + "time_per_iteration": 2.6040706634521484 + }, + { + "auxiliary_loss_clip": 0.01095167, + "auxiliary_loss_mlp": 0.01034221, + "balance_loss_clip": 1.04311907, + "balance_loss_mlp": 1.0204227, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 1.9232233904885856, + "language_loss": 0.80765975, + "learning_rate": 2.097910461710939e-06, + "loss": 0.82895362, + "num_input_tokens_seen": 178723410, + "step": 8314, + "time_per_iteration": 2.550657272338867 + }, + { + "auxiliary_loss_clip": 0.01089821, + "auxiliary_loss_mlp": 0.00790799, + "balance_loss_clip": 1.04377961, + "balance_loss_mlp": 1.01382256, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 1.6976015436860743, + "language_loss": 0.79403436, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81284052, + "num_input_tokens_seen": 178743560, + "step": 8315, + "time_per_iteration": 2.575409412384033 + }, + { + "auxiliary_loss_clip": 0.01118394, + "auxiliary_loss_mlp": 0.01036156, + "balance_loss_clip": 1.04334402, + "balance_loss_mlp": 1.02368665, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 1.8001269282836145, + "language_loss": 0.74541283, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.7669583, + "num_input_tokens_seen": 178767225, + "step": 8316, + "time_per_iteration": 2.691697359085083 + }, + { + "auxiliary_loss_clip": 0.01102825, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.04529154, + "balance_loss_mlp": 1.01967108, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.7634050928814375, + "language_loss": 0.81316793, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83451211, + "num_input_tokens_seen": 178786810, + "step": 8317, + "time_per_iteration": 2.5452218055725098 + }, + { + "auxiliary_loss_clip": 0.01096502, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.04122663, + "balance_loss_mlp": 1.02071631, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 2.1710839300990163, + "language_loss": 0.83173037, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85304368, + "num_input_tokens_seen": 178805660, + "step": 8318, + "time_per_iteration": 2.513848304748535 + }, + { + "auxiliary_loss_clip": 0.01108307, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.04295683, + "balance_loss_mlp": 1.02296591, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.9303645935194018, + "language_loss": 0.81663924, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83807796, + "num_input_tokens_seen": 178824780, + "step": 8319, + "time_per_iteration": 2.516273021697998 + }, + { + "auxiliary_loss_clip": 0.01077373, + "auxiliary_loss_mlp": 0.01031205, + "balance_loss_clip": 1.04015779, + "balance_loss_mlp": 1.01819921, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.650030160563755, + "language_loss": 0.71858674, + "learning_rate": 2.095576427171635e-06, + "loss": 0.73967254, + "num_input_tokens_seen": 178845640, + "step": 8320, + "time_per_iteration": 2.601989269256592 + }, + { + "auxiliary_loss_clip": 0.01092565, + "auxiliary_loss_mlp": 0.01039989, + "balance_loss_clip": 1.04522562, + "balance_loss_mlp": 1.02591038, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 4.627201082572055, + "language_loss": 0.76974654, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.79107213, + "num_input_tokens_seen": 178862290, + "step": 8321, + "time_per_iteration": 2.5370495319366455 + }, + { + "auxiliary_loss_clip": 0.01108808, + "auxiliary_loss_mlp": 0.00790283, + "balance_loss_clip": 1.04253054, + "balance_loss_mlp": 1.01375437, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 1.7035437152457817, + "language_loss": 0.82761467, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.84660554, + "num_input_tokens_seen": 178879805, + "step": 8322, + "time_per_iteration": 2.463927745819092 + }, + { + "auxiliary_loss_clip": 0.01110869, + "auxiliary_loss_mlp": 0.01036983, + "balance_loss_clip": 1.04285789, + "balance_loss_mlp": 1.02383983, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.78765820846744, + "language_loss": 0.73320967, + "learning_rate": 2.094409360775228e-06, + "loss": 0.75468826, + "num_input_tokens_seen": 178896985, + "step": 8323, + "time_per_iteration": 2.50274395942688 + }, + { + "auxiliary_loss_clip": 0.01082674, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.04422081, + "balance_loss_mlp": 1.0217433, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.5467413601669846, + "language_loss": 0.69575584, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71693289, + "num_input_tokens_seen": 178920605, + "step": 8324, + "time_per_iteration": 2.627406120300293 + }, + { + "auxiliary_loss_clip": 0.01104864, + "auxiliary_loss_mlp": 0.00786575, + "balance_loss_clip": 1.04323494, + "balance_loss_mlp": 1.00875092, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 1.9134528655293435, + "language_loss": 0.72068781, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.73960221, + "num_input_tokens_seen": 178937760, + "step": 8325, + "time_per_iteration": 2.511399507522583 + }, + { + "auxiliary_loss_clip": 0.0108392, + "auxiliary_loss_mlp": 0.01037578, + "balance_loss_clip": 1.04034901, + "balance_loss_mlp": 1.02283156, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 1.6107122135647332, + "language_loss": 0.73869491, + "learning_rate": 2.093242262158709e-06, + "loss": 0.75990987, + "num_input_tokens_seen": 178957985, + "step": 8326, + "time_per_iteration": 2.5955214500427246 + }, + { + "auxiliary_loss_clip": 0.01092702, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.0419116, + "balance_loss_mlp": 1.02042007, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.6999796156529614, + "language_loss": 0.78296912, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.8042295, + "num_input_tokens_seen": 178977070, + "step": 8327, + "time_per_iteration": 2.5630922317504883 + }, + { + "auxiliary_loss_clip": 0.01121394, + "auxiliary_loss_mlp": 0.01040196, + "balance_loss_clip": 1.04362106, + "balance_loss_mlp": 1.0268805, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 2.197871095785912, + "language_loss": 0.87982273, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90143859, + "num_input_tokens_seen": 178994175, + "step": 8328, + "time_per_iteration": 2.5174624919891357 + }, + { + "auxiliary_loss_clip": 0.01086122, + "auxiliary_loss_mlp": 0.01034344, + "balance_loss_clip": 1.04282236, + "balance_loss_mlp": 1.0212965, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 2.091605870323064, + "language_loss": 0.74248588, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76369053, + "num_input_tokens_seen": 179013710, + "step": 8329, + "time_per_iteration": 2.5970966815948486 + }, + { + "auxiliary_loss_clip": 0.01118476, + "auxiliary_loss_mlp": 0.01034034, + "balance_loss_clip": 1.04362738, + "balance_loss_mlp": 1.0215826, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 1.5951068969860314, + "language_loss": 0.79709327, + "learning_rate": 2.091686081238281e-06, + "loss": 0.8186183, + "num_input_tokens_seen": 179035255, + "step": 8330, + "time_per_iteration": 2.550640344619751 + }, + { + "auxiliary_loss_clip": 0.01040266, + "auxiliary_loss_mlp": 0.00788151, + "balance_loss_clip": 1.04207885, + "balance_loss_mlp": 1.04400337, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7398171423400741, + "language_loss": 0.56050718, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.57879138, + "num_input_tokens_seen": 179090915, + "step": 8331, + "time_per_iteration": 2.9876041412353516 + }, + { + "auxiliary_loss_clip": 0.01106812, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.04333484, + "balance_loss_mlp": 1.01763678, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 1.904676676204854, + "language_loss": 0.65000272, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.67136687, + "num_input_tokens_seen": 179109160, + "step": 8332, + "time_per_iteration": 2.5699105262756348 + }, + { + "auxiliary_loss_clip": 0.01114272, + "auxiliary_loss_mlp": 0.01032045, + "balance_loss_clip": 1.04036605, + "balance_loss_mlp": 1.02035081, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.4382196612078442, + "language_loss": 0.74899262, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.77045572, + "num_input_tokens_seen": 179130610, + "step": 8333, + "time_per_iteration": 2.5824947357177734 + }, + { + "auxiliary_loss_clip": 0.01118958, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.04153872, + "balance_loss_mlp": 1.01971745, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 2.212567988923357, + "language_loss": 0.8039124, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82542461, + "num_input_tokens_seen": 179147860, + "step": 8334, + "time_per_iteration": 2.4836337566375732 + }, + { + "auxiliary_loss_clip": 0.01032896, + "auxiliary_loss_mlp": 0.0100399, + "balance_loss_clip": 1.01674795, + "balance_loss_mlp": 1.00277448, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.9001251420511123, + "language_loss": 0.62674439, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64711332, + "num_input_tokens_seen": 179210490, + "step": 8335, + "time_per_iteration": 3.0748324394226074 + }, + { + "auxiliary_loss_clip": 0.01103816, + "auxiliary_loss_mlp": 0.01029961, + "balance_loss_clip": 1.03994548, + "balance_loss_mlp": 1.01758718, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.385136279422082, + "language_loss": 0.79632974, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81766748, + "num_input_tokens_seen": 179231360, + "step": 8336, + "time_per_iteration": 2.5285725593566895 + }, + { + "auxiliary_loss_clip": 0.01079704, + "auxiliary_loss_mlp": 0.01031669, + "balance_loss_clip": 1.04044843, + "balance_loss_mlp": 1.01831162, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.7242012536977913, + "language_loss": 0.79654658, + "learning_rate": 2.088962631340836e-06, + "loss": 0.81766027, + "num_input_tokens_seen": 179250625, + "step": 8337, + "time_per_iteration": 2.5640838146209717 + }, + { + "auxiliary_loss_clip": 0.01120688, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.04047585, + "balance_loss_mlp": 1.02046108, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 1.9652573496082573, + "language_loss": 0.79249191, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.81403702, + "num_input_tokens_seen": 179267360, + "step": 8338, + "time_per_iteration": 2.4686102867126465 + }, + { + "auxiliary_loss_clip": 0.01093061, + "auxiliary_loss_mlp": 0.0102707, + "balance_loss_clip": 1.04058933, + "balance_loss_mlp": 1.01414168, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.5582642459852403, + "language_loss": 0.85071665, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87191796, + "num_input_tokens_seen": 179289810, + "step": 8339, + "time_per_iteration": 3.9518470764160156 + }, + { + "auxiliary_loss_clip": 0.01104002, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.03941965, + "balance_loss_mlp": 1.01978767, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.5023565926583609, + "language_loss": 0.70657265, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.7279371, + "num_input_tokens_seen": 179310620, + "step": 8340, + "time_per_iteration": 2.578184127807617 + }, + { + "auxiliary_loss_clip": 0.01084743, + "auxiliary_loss_mlp": 0.01041899, + "balance_loss_clip": 1.04122651, + "balance_loss_mlp": 1.02672315, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 1.8615752404837116, + "language_loss": 0.77822018, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.79948652, + "num_input_tokens_seen": 179329005, + "step": 8341, + "time_per_iteration": 3.919377088546753 + }, + { + "auxiliary_loss_clip": 0.01092707, + "auxiliary_loss_mlp": 0.01038028, + "balance_loss_clip": 1.04465115, + "balance_loss_mlp": 1.02388978, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 2.5044583866212085, + "language_loss": 0.89393723, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91524458, + "num_input_tokens_seen": 179343785, + "step": 8342, + "time_per_iteration": 2.526456832885742 + }, + { + "auxiliary_loss_clip": 0.01092642, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.03812397, + "balance_loss_mlp": 1.01915288, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 1.678121041228467, + "language_loss": 0.76508659, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.78633153, + "num_input_tokens_seen": 179364070, + "step": 8343, + "time_per_iteration": 2.5638020038604736 + }, + { + "auxiliary_loss_clip": 0.01105893, + "auxiliary_loss_mlp": 0.01029349, + "balance_loss_clip": 1.04182792, + "balance_loss_mlp": 1.01725578, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 1.6715845985726676, + "language_loss": 0.66906053, + "learning_rate": 2.086239016143293e-06, + "loss": 0.690413, + "num_input_tokens_seen": 179384225, + "step": 8344, + "time_per_iteration": 3.8884036540985107 + }, + { + "auxiliary_loss_clip": 0.01096983, + "auxiliary_loss_mlp": 0.01032424, + "balance_loss_clip": 1.04104829, + "balance_loss_mlp": 1.01996088, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 2.3370189494265543, + "language_loss": 0.75373924, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77503324, + "num_input_tokens_seen": 179402595, + "step": 8345, + "time_per_iteration": 2.594977378845215 + }, + { + "auxiliary_loss_clip": 0.01106592, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.0470376, + "balance_loss_mlp": 1.02066064, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 2.019970485225884, + "language_loss": 0.78519583, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.8066051, + "num_input_tokens_seen": 179419635, + "step": 8346, + "time_per_iteration": 2.5184171199798584 + }, + { + "auxiliary_loss_clip": 0.01094346, + "auxiliary_loss_mlp": 0.00798231, + "balance_loss_clip": 1.03872895, + "balance_loss_mlp": 1.03484702, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.5663734697411769, + "language_loss": 0.69378477, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.7127105, + "num_input_tokens_seen": 179438770, + "step": 8347, + "time_per_iteration": 4.018402814865112 + }, + { + "auxiliary_loss_clip": 0.0108201, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.0413599, + "balance_loss_mlp": 1.02323318, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 2.1661089127939976, + "language_loss": 0.71219385, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.7333774, + "num_input_tokens_seen": 179457475, + "step": 8348, + "time_per_iteration": 2.5724430084228516 + }, + { + "auxiliary_loss_clip": 0.01104929, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.0423857, + "balance_loss_mlp": 1.01949358, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.4330641828254942, + "language_loss": 0.74088562, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76224709, + "num_input_tokens_seen": 179478140, + "step": 8349, + "time_per_iteration": 2.5157666206359863 + }, + { + "auxiliary_loss_clip": 0.01106736, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.04048491, + "balance_loss_mlp": 1.01614761, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.3345324305738875, + "language_loss": 0.64319181, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.66455686, + "num_input_tokens_seen": 179494325, + "step": 8350, + "time_per_iteration": 2.5016677379608154 + }, + { + "auxiliary_loss_clip": 0.01016, + "auxiliary_loss_mlp": 0.01010465, + "balance_loss_clip": 1.01904535, + "balance_loss_mlp": 1.00895071, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 0.7782445063299406, + "language_loss": 0.59862143, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.61888611, + "num_input_tokens_seen": 179553545, + "step": 8351, + "time_per_iteration": 3.3042829036712646 + }, + { + "auxiliary_loss_clip": 0.01092734, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.04183519, + "balance_loss_mlp": 1.02127349, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 1.6823765142920162, + "language_loss": 0.75188833, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77316284, + "num_input_tokens_seen": 179573645, + "step": 8352, + "time_per_iteration": 2.5590567588806152 + }, + { + "auxiliary_loss_clip": 0.01097948, + "auxiliary_loss_mlp": 0.01031693, + "balance_loss_clip": 1.04278874, + "balance_loss_mlp": 1.01813924, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 1.6179659374671547, + "language_loss": 0.71630204, + "learning_rate": 2.082736990429464e-06, + "loss": 0.73759848, + "num_input_tokens_seen": 179591435, + "step": 8353, + "time_per_iteration": 2.5196127891540527 + }, + { + "auxiliary_loss_clip": 0.01112838, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.04568052, + "balance_loss_mlp": 1.01904619, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 1.6711307480417537, + "language_loss": 0.74147379, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.762932, + "num_input_tokens_seen": 179609955, + "step": 8354, + "time_per_iteration": 2.5023159980773926 + }, + { + "auxiliary_loss_clip": 0.01094441, + "auxiliary_loss_mlp": 0.0103902, + "balance_loss_clip": 1.04148459, + "balance_loss_mlp": 1.02526283, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.668388441414166, + "language_loss": 0.7223568, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74369138, + "num_input_tokens_seen": 179630875, + "step": 8355, + "time_per_iteration": 2.57133150100708 + }, + { + "auxiliary_loss_clip": 0.01106881, + "auxiliary_loss_mlp": 0.01035775, + "balance_loss_clip": 1.04014313, + "balance_loss_mlp": 1.02195835, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 4.245687406793476, + "language_loss": 0.81649113, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83791769, + "num_input_tokens_seen": 179649835, + "step": 8356, + "time_per_iteration": 2.5295908451080322 + }, + { + "auxiliary_loss_clip": 0.01109294, + "auxiliary_loss_mlp": 0.01033785, + "balance_loss_clip": 1.0405395, + "balance_loss_mlp": 1.01908696, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 2.027064224657736, + "language_loss": 0.75739896, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.77882975, + "num_input_tokens_seen": 179667605, + "step": 8357, + "time_per_iteration": 2.4750051498413086 + }, + { + "auxiliary_loss_clip": 0.01106908, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.04140472, + "balance_loss_mlp": 1.02087522, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.776294459394396, + "language_loss": 0.76360142, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78502047, + "num_input_tokens_seen": 179686910, + "step": 8358, + "time_per_iteration": 2.505016326904297 + }, + { + "auxiliary_loss_clip": 0.01095827, + "auxiliary_loss_mlp": 0.01034181, + "balance_loss_clip": 1.04082155, + "balance_loss_mlp": 1.02044833, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.006728332983904, + "language_loss": 0.71918261, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74048269, + "num_input_tokens_seen": 179706395, + "step": 8359, + "time_per_iteration": 2.55633282661438 + }, + { + "auxiliary_loss_clip": 0.01091156, + "auxiliary_loss_mlp": 0.01040344, + "balance_loss_clip": 1.04306269, + "balance_loss_mlp": 1.02698708, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.587518122587937, + "language_loss": 0.76975691, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79107189, + "num_input_tokens_seen": 179725735, + "step": 8360, + "time_per_iteration": 2.5397491455078125 + }, + { + "auxiliary_loss_clip": 0.01085468, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.04627287, + "balance_loss_mlp": 1.02068686, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.9241471861288912, + "language_loss": 0.76819336, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78937787, + "num_input_tokens_seen": 179746150, + "step": 8361, + "time_per_iteration": 2.582095146179199 + }, + { + "auxiliary_loss_clip": 0.01087362, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.03914595, + "balance_loss_mlp": 1.01961792, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.5499227791851073, + "language_loss": 0.85201824, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.87323105, + "num_input_tokens_seen": 179767550, + "step": 8362, + "time_per_iteration": 2.5837697982788086 + }, + { + "auxiliary_loss_clip": 0.01093362, + "auxiliary_loss_mlp": 0.01032706, + "balance_loss_clip": 1.03812337, + "balance_loss_mlp": 1.02008748, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 1.5850197923234448, + "language_loss": 0.78325808, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80451882, + "num_input_tokens_seen": 179790075, + "step": 8363, + "time_per_iteration": 2.5604465007781982 + }, + { + "auxiliary_loss_clip": 0.01102411, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.04128313, + "balance_loss_mlp": 1.01683092, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 2.2138726464268768, + "language_loss": 0.75600857, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.77733058, + "num_input_tokens_seen": 179806515, + "step": 8364, + "time_per_iteration": 2.5202434062957764 + }, + { + "auxiliary_loss_clip": 0.0111425, + "auxiliary_loss_mlp": 0.01028072, + "balance_loss_clip": 1.04075074, + "balance_loss_mlp": 1.01551974, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.5403258390333454, + "language_loss": 0.69314456, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.71456778, + "num_input_tokens_seen": 179826450, + "step": 8365, + "time_per_iteration": 2.4808809757232666 + }, + { + "auxiliary_loss_clip": 0.0109873, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.04261589, + "balance_loss_mlp": 1.01978683, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.6974462876632457, + "language_loss": 0.73300099, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75432646, + "num_input_tokens_seen": 179846770, + "step": 8366, + "time_per_iteration": 2.554692506790161 + }, + { + "auxiliary_loss_clip": 0.01105989, + "auxiliary_loss_mlp": 0.01032146, + "balance_loss_clip": 1.04349041, + "balance_loss_mlp": 1.01980782, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.49011278551894, + "language_loss": 0.78182185, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80320323, + "num_input_tokens_seen": 179866585, + "step": 8367, + "time_per_iteration": 2.5213589668273926 + }, + { + "auxiliary_loss_clip": 0.0110373, + "auxiliary_loss_mlp": 0.01027098, + "balance_loss_clip": 1.0389185, + "balance_loss_mlp": 1.01465893, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.673509604444769, + "language_loss": 0.69807196, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.71938026, + "num_input_tokens_seen": 179885575, + "step": 8368, + "time_per_iteration": 2.4764156341552734 + }, + { + "auxiliary_loss_clip": 0.01031603, + "auxiliary_loss_mlp": 0.01011223, + "balance_loss_clip": 1.01519513, + "balance_loss_mlp": 1.00973272, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.8572507678480243, + "language_loss": 0.63373452, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65416276, + "num_input_tokens_seen": 179939650, + "step": 8369, + "time_per_iteration": 3.0477166175842285 + }, + { + "auxiliary_loss_clip": 0.01100808, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.04158235, + "balance_loss_mlp": 1.01903296, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 1.9000713188379037, + "language_loss": 0.60082871, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62214732, + "num_input_tokens_seen": 179961765, + "step": 8370, + "time_per_iteration": 2.559363603591919 + }, + { + "auxiliary_loss_clip": 0.01069761, + "auxiliary_loss_mlp": 0.01037001, + "balance_loss_clip": 1.04024315, + "balance_loss_mlp": 1.02285695, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.4947196692731288, + "language_loss": 0.68107289, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.70214045, + "num_input_tokens_seen": 179983015, + "step": 8371, + "time_per_iteration": 2.6963963508605957 + }, + { + "auxiliary_loss_clip": 0.01091771, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.0416894, + "balance_loss_mlp": 1.01618457, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 1.6292297431572758, + "language_loss": 0.67793065, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.69915277, + "num_input_tokens_seen": 180003210, + "step": 8372, + "time_per_iteration": 2.6160614490509033 + }, + { + "auxiliary_loss_clip": 0.01076952, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.0378325, + "balance_loss_mlp": 1.02118456, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 4.171915779559252, + "language_loss": 0.6657064, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68684572, + "num_input_tokens_seen": 180025530, + "step": 8373, + "time_per_iteration": 2.6331064701080322 + }, + { + "auxiliary_loss_clip": 0.01088916, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.0381918, + "balance_loss_mlp": 1.01678455, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.8738025898459554, + "language_loss": 0.74379241, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76497352, + "num_input_tokens_seen": 180043180, + "step": 8374, + "time_per_iteration": 2.5294289588928223 + }, + { + "auxiliary_loss_clip": 0.01093016, + "auxiliary_loss_mlp": 0.01036604, + "balance_loss_clip": 1.04265118, + "balance_loss_mlp": 1.02238274, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.6660186105529953, + "language_loss": 0.68219459, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70349079, + "num_input_tokens_seen": 180062905, + "step": 8375, + "time_per_iteration": 2.5358998775482178 + }, + { + "auxiliary_loss_clip": 0.01076513, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.04410505, + "balance_loss_mlp": 1.021209, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 1.6995363312710676, + "language_loss": 0.78610766, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.80722827, + "num_input_tokens_seen": 180082000, + "step": 8376, + "time_per_iteration": 2.5749588012695312 + }, + { + "auxiliary_loss_clip": 0.01108137, + "auxiliary_loss_mlp": 0.00788239, + "balance_loss_clip": 1.04034758, + "balance_loss_mlp": 1.01196957, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 1.955028721695317, + "language_loss": 0.59369361, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61265731, + "num_input_tokens_seen": 180101340, + "step": 8377, + "time_per_iteration": 3.8920562267303467 + }, + { + "auxiliary_loss_clip": 0.01093915, + "auxiliary_loss_mlp": 0.01038174, + "balance_loss_clip": 1.0399164, + "balance_loss_mlp": 1.02498353, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 1.9590073934771954, + "language_loss": 0.76073194, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78205287, + "num_input_tokens_seen": 180119160, + "step": 8378, + "time_per_iteration": 2.51405930519104 + }, + { + "auxiliary_loss_clip": 0.01084516, + "auxiliary_loss_mlp": 0.01034434, + "balance_loss_clip": 1.0423764, + "balance_loss_mlp": 1.02200031, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.9535416930969416, + "language_loss": 0.7472626, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.76845205, + "num_input_tokens_seen": 180138730, + "step": 8379, + "time_per_iteration": 4.035807847976685 + }, + { + "auxiliary_loss_clip": 0.01101964, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.0419085, + "balance_loss_mlp": 1.0222652, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 2.176342160680064, + "language_loss": 0.66610205, + "learning_rate": 2.072229431544548e-06, + "loss": 0.68747157, + "num_input_tokens_seen": 180158810, + "step": 8380, + "time_per_iteration": 2.5718562602996826 + }, + { + "auxiliary_loss_clip": 0.01065072, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.04503679, + "balance_loss_mlp": 1.01939559, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1.9382180392536867, + "language_loss": 0.63046485, + "learning_rate": 2.071840222561051e-06, + "loss": 0.65143198, + "num_input_tokens_seen": 180179700, + "step": 8381, + "time_per_iteration": 2.6786820888519287 + }, + { + "auxiliary_loss_clip": 0.0108933, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.03652084, + "balance_loss_mlp": 1.02558541, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.5035041232399846, + "language_loss": 0.67213088, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69340646, + "num_input_tokens_seen": 180199890, + "step": 8382, + "time_per_iteration": 2.5657899379730225 + }, + { + "auxiliary_loss_clip": 0.01102598, + "auxiliary_loss_mlp": 0.01039453, + "balance_loss_clip": 1.04116189, + "balance_loss_mlp": 1.02579153, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 1.675740406981606, + "language_loss": 0.62068951, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64211011, + "num_input_tokens_seen": 180217840, + "step": 8383, + "time_per_iteration": 3.9274253845214844 + }, + { + "auxiliary_loss_clip": 0.01083982, + "auxiliary_loss_mlp": 0.01028109, + "balance_loss_clip": 1.04009008, + "balance_loss_mlp": 1.01612878, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 1.7156675448869185, + "language_loss": 0.66710418, + "learning_rate": 2.070672579324465e-06, + "loss": 0.68822509, + "num_input_tokens_seen": 180236465, + "step": 8384, + "time_per_iteration": 2.5659804344177246 + }, + { + "auxiliary_loss_clip": 0.01100934, + "auxiliary_loss_mlp": 0.01038604, + "balance_loss_clip": 1.04214466, + "balance_loss_mlp": 1.02656996, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.9073153573329455, + "language_loss": 0.71046567, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.73186111, + "num_input_tokens_seen": 180258025, + "step": 8385, + "time_per_iteration": 2.57018780708313 + }, + { + "auxiliary_loss_clip": 0.01102447, + "auxiliary_loss_mlp": 0.01029635, + "balance_loss_clip": 1.04081357, + "balance_loss_mlp": 1.01739788, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 1.8334123881679454, + "language_loss": 0.83424425, + "learning_rate": 2.069894137075919e-06, + "loss": 0.85556507, + "num_input_tokens_seen": 180277825, + "step": 8386, + "time_per_iteration": 3.880866527557373 + }, + { + "auxiliary_loss_clip": 0.01101565, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.04210579, + "balance_loss_mlp": 1.01935875, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.8732999463607578, + "language_loss": 0.66957593, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.69090998, + "num_input_tokens_seen": 180300465, + "step": 8387, + "time_per_iteration": 2.5349254608154297 + }, + { + "auxiliary_loss_clip": 0.01060981, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.03958988, + "balance_loss_mlp": 1.02068591, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.4571745400429825, + "language_loss": 0.80275702, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82369906, + "num_input_tokens_seen": 180321050, + "step": 8388, + "time_per_iteration": 2.6253745555877686 + }, + { + "auxiliary_loss_clip": 0.01104212, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.04124272, + "balance_loss_mlp": 1.01968706, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 2.1590236706838164, + "language_loss": 0.70275986, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.72411847, + "num_input_tokens_seen": 180338870, + "step": 8389, + "time_per_iteration": 2.570636034011841 + }, + { + "auxiliary_loss_clip": 0.01085175, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.04025006, + "balance_loss_mlp": 1.02628374, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 1.6404694891923484, + "language_loss": 0.69485068, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71608818, + "num_input_tokens_seen": 180361285, + "step": 8390, + "time_per_iteration": 2.595111846923828 + }, + { + "auxiliary_loss_clip": 0.01023936, + "auxiliary_loss_mlp": 0.01008447, + "balance_loss_clip": 1.01851201, + "balance_loss_mlp": 1.00714195, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.8548741014068696, + "language_loss": 0.53004336, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55036724, + "num_input_tokens_seen": 180415170, + "step": 8391, + "time_per_iteration": 2.943901300430298 + }, + { + "auxiliary_loss_clip": 0.01015997, + "auxiliary_loss_mlp": 0.01011256, + "balance_loss_clip": 1.02420056, + "balance_loss_mlp": 1.00983715, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.8584399321609673, + "language_loss": 0.60789144, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62816393, + "num_input_tokens_seen": 180468060, + "step": 8392, + "time_per_iteration": 2.993121862411499 + }, + { + "auxiliary_loss_clip": 0.01075417, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.03665912, + "balance_loss_mlp": 1.01760268, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.612416800045726, + "language_loss": 0.84507912, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86613023, + "num_input_tokens_seen": 180486610, + "step": 8393, + "time_per_iteration": 2.559434413909912 + }, + { + "auxiliary_loss_clip": 0.01081874, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.03991294, + "balance_loss_mlp": 1.01669598, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 2.1378123720809854, + "language_loss": 0.51084638, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.53195399, + "num_input_tokens_seen": 180508135, + "step": 8394, + "time_per_iteration": 2.608208417892456 + }, + { + "auxiliary_loss_clip": 0.0111493, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.03938246, + "balance_loss_mlp": 1.01584327, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 2.3769163853923296, + "language_loss": 0.7518599, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.77330285, + "num_input_tokens_seen": 180527000, + "step": 8395, + "time_per_iteration": 2.4684853553771973 + }, + { + "auxiliary_loss_clip": 0.01099397, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.04087424, + "balance_loss_mlp": 1.01736355, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 1.843748109855953, + "language_loss": 0.67756963, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.69886196, + "num_input_tokens_seen": 180544715, + "step": 8396, + "time_per_iteration": 2.4828054904937744 + }, + { + "auxiliary_loss_clip": 0.01106409, + "auxiliary_loss_mlp": 0.01026244, + "balance_loss_clip": 1.04371953, + "balance_loss_mlp": 1.01459098, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 2.1504927008001844, + "language_loss": 0.78947288, + "learning_rate": 2.065612518371792e-06, + "loss": 0.81079936, + "num_input_tokens_seen": 180565365, + "step": 8397, + "time_per_iteration": 2.5937998294830322 + }, + { + "auxiliary_loss_clip": 0.01071171, + "auxiliary_loss_mlp": 0.0102644, + "balance_loss_clip": 1.03864324, + "balance_loss_mlp": 1.01419091, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 1.5773040752954472, + "language_loss": 0.66137278, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68234891, + "num_input_tokens_seen": 180586670, + "step": 8398, + "time_per_iteration": 2.635952949523926 + }, + { + "auxiliary_loss_clip": 0.01103362, + "auxiliary_loss_mlp": 0.00786081, + "balance_loss_clip": 1.04233491, + "balance_loss_mlp": 1.01142883, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.6423942580528361, + "language_loss": 0.71823138, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73712581, + "num_input_tokens_seen": 180605085, + "step": 8399, + "time_per_iteration": 2.496854782104492 + }, + { + "auxiliary_loss_clip": 0.01088005, + "auxiliary_loss_mlp": 0.01048203, + "balance_loss_clip": 1.04220414, + "balance_loss_mlp": 1.03402901, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 1.7224260386942996, + "language_loss": 0.81551576, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83687782, + "num_input_tokens_seen": 180624370, + "step": 8400, + "time_per_iteration": 2.5499682426452637 + }, + { + "auxiliary_loss_clip": 0.01078583, + "auxiliary_loss_mlp": 0.01036967, + "balance_loss_clip": 1.0408442, + "balance_loss_mlp": 1.02291179, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 1.9543535402900545, + "language_loss": 0.78482634, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.80598181, + "num_input_tokens_seen": 180642450, + "step": 8401, + "time_per_iteration": 2.571368932723999 + }, + { + "auxiliary_loss_clip": 0.01118805, + "auxiliary_loss_mlp": 0.00785619, + "balance_loss_clip": 1.04163337, + "balance_loss_mlp": 1.00819874, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 2.220460810037052, + "language_loss": 0.69922721, + "learning_rate": 2.063666227349593e-06, + "loss": 0.71827137, + "num_input_tokens_seen": 180665250, + "step": 8402, + "time_per_iteration": 2.599173069000244 + }, + { + "auxiliary_loss_clip": 0.01103932, + "auxiliary_loss_mlp": 0.00787385, + "balance_loss_clip": 1.03810143, + "balance_loss_mlp": 1.01063085, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 1.6194415505025195, + "language_loss": 0.69350284, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71241599, + "num_input_tokens_seen": 180687425, + "step": 8403, + "time_per_iteration": 2.5743980407714844 + }, + { + "auxiliary_loss_clip": 0.01103633, + "auxiliary_loss_mlp": 0.01036017, + "balance_loss_clip": 1.04155076, + "balance_loss_mlp": 1.02335143, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.4181534875648099, + "language_loss": 0.8563683, + "learning_rate": 2.062887693937781e-06, + "loss": 0.87776482, + "num_input_tokens_seen": 180708725, + "step": 8404, + "time_per_iteration": 2.530034303665161 + }, + { + "auxiliary_loss_clip": 0.01082937, + "auxiliary_loss_mlp": 0.00785783, + "balance_loss_clip": 1.04170132, + "balance_loss_mlp": 1.00983238, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.9053414524239496, + "language_loss": 0.75773805, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77642524, + "num_input_tokens_seen": 180727990, + "step": 8405, + "time_per_iteration": 2.5523202419281006 + }, + { + "auxiliary_loss_clip": 0.01119013, + "auxiliary_loss_mlp": 0.01025467, + "balance_loss_clip": 1.04306197, + "balance_loss_mlp": 1.01175249, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 1.6671367491852005, + "language_loss": 0.73220342, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75364822, + "num_input_tokens_seen": 180749765, + "step": 8406, + "time_per_iteration": 2.601382255554199 + }, + { + "auxiliary_loss_clip": 0.01078806, + "auxiliary_loss_mlp": 0.01027112, + "balance_loss_clip": 1.04232192, + "balance_loss_mlp": 1.01493478, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 1.7351113290178766, + "language_loss": 0.76697052, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.78802967, + "num_input_tokens_seen": 180769580, + "step": 8407, + "time_per_iteration": 2.605022668838501 + }, + { + "auxiliary_loss_clip": 0.01084533, + "auxiliary_loss_mlp": 0.01029959, + "balance_loss_clip": 1.03714442, + "balance_loss_mlp": 1.0178895, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.637995566245657, + "language_loss": 0.63068825, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.65183318, + "num_input_tokens_seen": 180790295, + "step": 8408, + "time_per_iteration": 2.6036229133605957 + }, + { + "auxiliary_loss_clip": 0.01090863, + "auxiliary_loss_mlp": 0.01037448, + "balance_loss_clip": 1.03879428, + "balance_loss_mlp": 1.0224756, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 2.32852673126693, + "language_loss": 0.63454366, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.65582675, + "num_input_tokens_seen": 180807875, + "step": 8409, + "time_per_iteration": 2.5086472034454346 + }, + { + "auxiliary_loss_clip": 0.01093199, + "auxiliary_loss_mlp": 0.01024442, + "balance_loss_clip": 1.04343963, + "balance_loss_mlp": 1.01310492, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.316193627172706, + "language_loss": 0.70765156, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.72882795, + "num_input_tokens_seen": 180831300, + "step": 8410, + "time_per_iteration": 2.5814642906188965 + }, + { + "auxiliary_loss_clip": 0.01091251, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.04200876, + "balance_loss_mlp": 1.02104068, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.5752895007860812, + "language_loss": 0.7915864, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81284112, + "num_input_tokens_seen": 180849055, + "step": 8411, + "time_per_iteration": 2.498537063598633 + }, + { + "auxiliary_loss_clip": 0.01118095, + "auxiliary_loss_mlp": 0.0103443, + "balance_loss_clip": 1.04213452, + "balance_loss_mlp": 1.01987433, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 2.8093825839201303, + "language_loss": 0.81804323, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.8395685, + "num_input_tokens_seen": 180867395, + "step": 8412, + "time_per_iteration": 2.4487051963806152 + }, + { + "auxiliary_loss_clip": 0.01098319, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.04329634, + "balance_loss_mlp": 1.02070975, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 1.7282082041217908, + "language_loss": 0.80373216, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82504982, + "num_input_tokens_seen": 180886670, + "step": 8413, + "time_per_iteration": 2.5198514461517334 + }, + { + "auxiliary_loss_clip": 0.01086645, + "auxiliary_loss_mlp": 0.0078815, + "balance_loss_clip": 1.04271638, + "balance_loss_mlp": 1.01414049, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 1.7481565294224306, + "language_loss": 0.80492842, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82367635, + "num_input_tokens_seen": 180904645, + "step": 8414, + "time_per_iteration": 2.5708065032958984 + }, + { + "auxiliary_loss_clip": 0.0110375, + "auxiliary_loss_mlp": 0.01029111, + "balance_loss_clip": 1.03886151, + "balance_loss_mlp": 1.01573563, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 1.938291123186673, + "language_loss": 0.62169158, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64302021, + "num_input_tokens_seen": 180922340, + "step": 8415, + "time_per_iteration": 4.009791851043701 + }, + { + "auxiliary_loss_clip": 0.01082351, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.0399704, + "balance_loss_mlp": 1.01780701, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.7056369403840987, + "language_loss": 0.82024163, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.84137231, + "num_input_tokens_seen": 180941350, + "step": 8416, + "time_per_iteration": 2.5643579959869385 + }, + { + "auxiliary_loss_clip": 0.01082369, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.04217124, + "balance_loss_mlp": 1.02465177, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 3.7272451435757907, + "language_loss": 0.79133308, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81252015, + "num_input_tokens_seen": 180960720, + "step": 8417, + "time_per_iteration": 2.5754427909851074 + }, + { + "auxiliary_loss_clip": 0.01067923, + "auxiliary_loss_mlp": 0.01033945, + "balance_loss_clip": 1.04260206, + "balance_loss_mlp": 1.02111256, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 8.64729736212539, + "language_loss": 0.62633532, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.64735401, + "num_input_tokens_seen": 180979725, + "step": 8418, + "time_per_iteration": 4.041779518127441 + }, + { + "auxiliary_loss_clip": 0.01085166, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.0401814, + "balance_loss_mlp": 1.02010381, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 2.4718902341180895, + "language_loss": 0.77418119, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79536009, + "num_input_tokens_seen": 180998980, + "step": 8419, + "time_per_iteration": 2.542443037033081 + }, + { + "auxiliary_loss_clip": 0.01062278, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.04149306, + "balance_loss_mlp": 1.01862931, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.9437993632229984, + "language_loss": 0.76838434, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.78932863, + "num_input_tokens_seen": 181019165, + "step": 8420, + "time_per_iteration": 2.6317360401153564 + }, + { + "auxiliary_loss_clip": 0.01115265, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.03962278, + "balance_loss_mlp": 1.02266645, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 1.7443460358000449, + "language_loss": 0.77435565, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79586798, + "num_input_tokens_seen": 181037110, + "step": 8421, + "time_per_iteration": 3.860727310180664 + }, + { + "auxiliary_loss_clip": 0.01098929, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.03840685, + "balance_loss_mlp": 1.02141643, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.4799360759487314, + "language_loss": 0.66523343, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.68656814, + "num_input_tokens_seen": 181057775, + "step": 8422, + "time_per_iteration": 2.561185121536255 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.04231596, + "balance_loss_mlp": 1.02093959, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.551843060205616, + "language_loss": 0.81736052, + "learning_rate": 2.05549116746431e-06, + "loss": 0.83885717, + "num_input_tokens_seen": 181078260, + "step": 8423, + "time_per_iteration": 2.5180530548095703 + }, + { + "auxiliary_loss_clip": 0.01117767, + "auxiliary_loss_mlp": 0.00789273, + "balance_loss_clip": 1.04139352, + "balance_loss_mlp": 1.01465607, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 6.583816641917283, + "language_loss": 0.74760264, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76667309, + "num_input_tokens_seen": 181098755, + "step": 8424, + "time_per_iteration": 3.921456813812256 + }, + { + "auxiliary_loss_clip": 0.01112463, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.03954947, + "balance_loss_mlp": 1.02309847, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.4076924634346928, + "language_loss": 0.7120589, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73354328, + "num_input_tokens_seen": 181121570, + "step": 8425, + "time_per_iteration": 2.5431594848632812 + }, + { + "auxiliary_loss_clip": 0.0107749, + "auxiliary_loss_mlp": 0.010393, + "balance_loss_clip": 1.03799415, + "balance_loss_mlp": 1.02547181, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 1.8312556042584032, + "language_loss": 0.78635436, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.8075223, + "num_input_tokens_seen": 181140240, + "step": 8426, + "time_per_iteration": 2.554767608642578 + }, + { + "auxiliary_loss_clip": 0.01105866, + "auxiliary_loss_mlp": 0.0103987, + "balance_loss_clip": 1.04233778, + "balance_loss_mlp": 1.02683997, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 2.2273907031298945, + "language_loss": 0.77974129, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80119866, + "num_input_tokens_seen": 181158630, + "step": 8427, + "time_per_iteration": 2.497082471847534 + }, + { + "auxiliary_loss_clip": 0.01110695, + "auxiliary_loss_mlp": 0.01027892, + "balance_loss_clip": 1.03863752, + "balance_loss_mlp": 1.01564884, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 1.7268489914193934, + "language_loss": 0.71977621, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.74116206, + "num_input_tokens_seen": 181176405, + "step": 8428, + "time_per_iteration": 2.4352338314056396 + }, + { + "auxiliary_loss_clip": 0.01104993, + "auxiliary_loss_mlp": 0.00789167, + "balance_loss_clip": 1.03911841, + "balance_loss_mlp": 1.0162816, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.6301540872370726, + "language_loss": 0.82731915, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84626079, + "num_input_tokens_seen": 181197595, + "step": 8429, + "time_per_iteration": 2.556468963623047 + }, + { + "auxiliary_loss_clip": 0.01087104, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.04222012, + "balance_loss_mlp": 1.0213809, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 1.6519146587137663, + "language_loss": 0.73263961, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75386381, + "num_input_tokens_seen": 181218560, + "step": 8430, + "time_per_iteration": 2.6230990886688232 + }, + { + "auxiliary_loss_clip": 0.01052622, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.035707, + "balance_loss_mlp": 1.02173114, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.628870406088993, + "language_loss": 0.76042998, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.78131568, + "num_input_tokens_seen": 181237095, + "step": 8431, + "time_per_iteration": 2.6198713779449463 + }, + { + "auxiliary_loss_clip": 0.01101067, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.04151106, + "balance_loss_mlp": 1.02107668, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.4580897092131913, + "language_loss": 0.71959198, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74094284, + "num_input_tokens_seen": 181255940, + "step": 8432, + "time_per_iteration": 2.4795711040496826 + }, + { + "auxiliary_loss_clip": 0.01000145, + "auxiliary_loss_mlp": 0.0100751, + "balance_loss_clip": 1.02003932, + "balance_loss_mlp": 1.00591242, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7572006243203351, + "language_loss": 0.63664293, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65671945, + "num_input_tokens_seen": 181316945, + "step": 8433, + "time_per_iteration": 3.229689836502075 + }, + { + "auxiliary_loss_clip": 0.01083446, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.04095042, + "balance_loss_mlp": 1.02800083, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 2.027464371140594, + "language_loss": 0.77640438, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79764938, + "num_input_tokens_seen": 181335555, + "step": 8434, + "time_per_iteration": 2.537355422973633 + }, + { + "auxiliary_loss_clip": 0.01094121, + "auxiliary_loss_mlp": 0.01035145, + "balance_loss_clip": 1.03935814, + "balance_loss_mlp": 1.02233553, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.7571671880483892, + "language_loss": 0.7080127, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.72930539, + "num_input_tokens_seen": 181354580, + "step": 8435, + "time_per_iteration": 2.526679515838623 + }, + { + "auxiliary_loss_clip": 0.01107302, + "auxiliary_loss_mlp": 0.01036525, + "balance_loss_clip": 1.04042554, + "balance_loss_mlp": 1.02295256, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 1.9047286734412912, + "language_loss": 0.72185457, + "learning_rate": 2.050429942372112e-06, + "loss": 0.74329293, + "num_input_tokens_seen": 181374320, + "step": 8436, + "time_per_iteration": 2.506650686264038 + }, + { + "auxiliary_loss_clip": 0.01116706, + "auxiliary_loss_mlp": 0.01036595, + "balance_loss_clip": 1.04163933, + "balance_loss_mlp": 1.02274287, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 1.559618948584808, + "language_loss": 0.83808744, + "learning_rate": 2.050040603565483e-06, + "loss": 0.85962045, + "num_input_tokens_seen": 181392190, + "step": 8437, + "time_per_iteration": 2.489640235900879 + }, + { + "auxiliary_loss_clip": 0.01100696, + "auxiliary_loss_mlp": 0.01029433, + "balance_loss_clip": 1.03826857, + "balance_loss_mlp": 1.01690459, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.4443720024593156, + "language_loss": 0.80518192, + "learning_rate": 2.049651262861309e-06, + "loss": 0.82648325, + "num_input_tokens_seen": 181413890, + "step": 8438, + "time_per_iteration": 2.53745174407959 + }, + { + "auxiliary_loss_clip": 0.0107599, + "auxiliary_loss_mlp": 0.01039016, + "balance_loss_clip": 1.042364, + "balance_loss_mlp": 1.02393007, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.8734157588145635, + "language_loss": 0.79491121, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81606126, + "num_input_tokens_seen": 181433240, + "step": 8439, + "time_per_iteration": 2.6027188301086426 + }, + { + "auxiliary_loss_clip": 0.01086594, + "auxiliary_loss_mlp": 0.00789586, + "balance_loss_clip": 1.04235137, + "balance_loss_mlp": 1.01187348, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.6145564089304947, + "language_loss": 0.71176219, + "learning_rate": 2.048872575819383e-06, + "loss": 0.730524, + "num_input_tokens_seen": 181453535, + "step": 8440, + "time_per_iteration": 2.5860679149627686 + }, + { + "auxiliary_loss_clip": 0.0109029, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.03911543, + "balance_loss_mlp": 1.01965082, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.5976432251225625, + "language_loss": 0.70990372, + "learning_rate": 2.048483229511158e-06, + "loss": 0.7311334, + "num_input_tokens_seen": 181474195, + "step": 8441, + "time_per_iteration": 2.5856683254241943 + }, + { + "auxiliary_loss_clip": 0.01105978, + "auxiliary_loss_mlp": 0.00786074, + "balance_loss_clip": 1.0394392, + "balance_loss_mlp": 1.00959945, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 1.5880638316458038, + "language_loss": 0.63820767, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65712821, + "num_input_tokens_seen": 181494000, + "step": 8442, + "time_per_iteration": 2.509188652038574 + }, + { + "auxiliary_loss_clip": 0.01061477, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.0414176, + "balance_loss_mlp": 1.01819551, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.4448797561748918, + "language_loss": 0.7106123, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73152781, + "num_input_tokens_seen": 181515955, + "step": 8443, + "time_per_iteration": 2.675706386566162 + }, + { + "auxiliary_loss_clip": 0.01035665, + "auxiliary_loss_mlp": 0.01042641, + "balance_loss_clip": 1.0385499, + "balance_loss_mlp": 1.02713215, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.337808592395016, + "language_loss": 0.62067145, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64145446, + "num_input_tokens_seen": 181540225, + "step": 8444, + "time_per_iteration": 2.783217430114746 + }, + { + "auxiliary_loss_clip": 0.01080601, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.03804421, + "balance_loss_mlp": 1.02062941, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.6606660395372757, + "language_loss": 0.63766843, + "learning_rate": 2.046925826041012e-06, + "loss": 0.65880567, + "num_input_tokens_seen": 181560125, + "step": 8445, + "time_per_iteration": 2.614365339279175 + }, + { + "auxiliary_loss_clip": 0.01014222, + "auxiliary_loss_mlp": 0.01001772, + "balance_loss_clip": 1.01780057, + "balance_loss_mlp": 1.00048435, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.829498800244111, + "language_loss": 0.61935854, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.63951844, + "num_input_tokens_seen": 181618830, + "step": 8446, + "time_per_iteration": 3.172197103500366 + }, + { + "auxiliary_loss_clip": 0.01075919, + "auxiliary_loss_mlp": 0.0102812, + "balance_loss_clip": 1.04047191, + "balance_loss_mlp": 1.0157702, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.708007794034752, + "language_loss": 0.80623627, + "learning_rate": 2.04614711357029e-06, + "loss": 0.82727671, + "num_input_tokens_seen": 181637120, + "step": 8447, + "time_per_iteration": 2.5604207515716553 + }, + { + "auxiliary_loss_clip": 0.01103641, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.04045546, + "balance_loss_mlp": 1.01831365, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.53814364491918, + "language_loss": 0.70370436, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72505236, + "num_input_tokens_seen": 181659965, + "step": 8448, + "time_per_iteration": 2.5771970748901367 + }, + { + "auxiliary_loss_clip": 0.01113468, + "auxiliary_loss_mlp": 0.00784896, + "balance_loss_clip": 1.04071712, + "balance_loss_mlp": 1.01076531, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 1.4912174781121486, + "language_loss": 0.72009009, + "learning_rate": 2.045368394099955e-06, + "loss": 0.73907375, + "num_input_tokens_seen": 181685290, + "step": 8449, + "time_per_iteration": 2.592029333114624 + }, + { + "auxiliary_loss_clip": 0.01088503, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.03767896, + "balance_loss_mlp": 1.0164572, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.4206283014566663, + "language_loss": 0.72786367, + "learning_rate": 2.044979031776844e-06, + "loss": 0.74903429, + "num_input_tokens_seen": 181706080, + "step": 8450, + "time_per_iteration": 2.5862224102020264 + }, + { + "auxiliary_loss_clip": 0.01117151, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.0414958, + "balance_loss_mlp": 1.02081084, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.7650605983807157, + "language_loss": 0.76565534, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.78716207, + "num_input_tokens_seen": 181724805, + "step": 8451, + "time_per_iteration": 2.4883201122283936 + }, + { + "auxiliary_loss_clip": 0.01115838, + "auxiliary_loss_mlp": 0.01034954, + "balance_loss_clip": 1.04077506, + "balance_loss_mlp": 1.02258015, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.6642244546573524, + "language_loss": 0.84962279, + "learning_rate": 2.044200302028559e-06, + "loss": 0.8711307, + "num_input_tokens_seen": 181743725, + "step": 8452, + "time_per_iteration": 2.483795166015625 + }, + { + "auxiliary_loss_clip": 0.01118991, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.04135275, + "balance_loss_mlp": 1.02131534, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 2.687465201865233, + "language_loss": 0.77168679, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.79322326, + "num_input_tokens_seen": 181757720, + "step": 8453, + "time_per_iteration": 2.4038193225860596 + }, + { + "auxiliary_loss_clip": 0.0107959, + "auxiliary_loss_mlp": 0.01034765, + "balance_loss_clip": 1.03957427, + "balance_loss_mlp": 1.02253401, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 1.71255675120932, + "language_loss": 0.76246387, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78360742, + "num_input_tokens_seen": 181778545, + "step": 8454, + "time_per_iteration": 3.948051929473877 + }, + { + "auxiliary_loss_clip": 0.01094485, + "auxiliary_loss_mlp": 0.01034388, + "balance_loss_clip": 1.03987026, + "balance_loss_mlp": 1.02091146, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.7522330201822223, + "language_loss": 0.89191318, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91320187, + "num_input_tokens_seen": 181799495, + "step": 8455, + "time_per_iteration": 2.5544421672821045 + }, + { + "auxiliary_loss_clip": 0.01100648, + "auxiliary_loss_mlp": 0.00786775, + "balance_loss_clip": 1.04043043, + "balance_loss_mlp": 1.00923765, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 1.729811049759438, + "language_loss": 0.62270033, + "learning_rate": 2.042642822537149e-06, + "loss": 0.6415745, + "num_input_tokens_seen": 181818400, + "step": 8456, + "time_per_iteration": 2.5391440391540527 + }, + { + "auxiliary_loss_clip": 0.01031895, + "auxiliary_loss_mlp": 0.01000606, + "balance_loss_clip": 1.01633096, + "balance_loss_mlp": 0.99928904, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.8328637254632598, + "language_loss": 0.62473732, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64506233, + "num_input_tokens_seen": 181875975, + "step": 8457, + "time_per_iteration": 4.340438604354858 + }, + { + "auxiliary_loss_clip": 0.01108196, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.0414288, + "balance_loss_mlp": 1.01794863, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.6915483785118888, + "language_loss": 0.67247725, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69387406, + "num_input_tokens_seen": 181896450, + "step": 8458, + "time_per_iteration": 2.509958505630493 + }, + { + "auxiliary_loss_clip": 0.01102526, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.0378058, + "balance_loss_mlp": 1.01850045, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 1.7673153416869285, + "language_loss": 0.77605283, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.79739422, + "num_input_tokens_seen": 181916770, + "step": 8459, + "time_per_iteration": 2.531630039215088 + }, + { + "auxiliary_loss_clip": 0.01123367, + "auxiliary_loss_mlp": 0.01035485, + "balance_loss_clip": 1.04563808, + "balance_loss_mlp": 1.02196085, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 1.883996362320534, + "language_loss": 0.80658954, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.82817811, + "num_input_tokens_seen": 181932710, + "step": 8460, + "time_per_iteration": 3.8592710494995117 + }, + { + "auxiliary_loss_clip": 0.01094328, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.03921711, + "balance_loss_mlp": 1.02313304, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.6313074723916021, + "language_loss": 0.68980557, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.71110988, + "num_input_tokens_seen": 181950665, + "step": 8461, + "time_per_iteration": 2.5057876110076904 + }, + { + "auxiliary_loss_clip": 0.01112191, + "auxiliary_loss_mlp": 0.0103037, + "balance_loss_clip": 1.03963757, + "balance_loss_mlp": 1.01745987, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.6392007372773147, + "language_loss": 0.7583248, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.77975041, + "num_input_tokens_seen": 181971270, + "step": 8462, + "time_per_iteration": 2.491183280944824 + }, + { + "auxiliary_loss_clip": 0.01079014, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.04060483, + "balance_loss_mlp": 1.0228076, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 1.978030416566282, + "language_loss": 0.81222528, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83338779, + "num_input_tokens_seen": 181988410, + "step": 8463, + "time_per_iteration": 2.526637315750122 + }, + { + "auxiliary_loss_clip": 0.01100473, + "auxiliary_loss_mlp": 0.01038952, + "balance_loss_clip": 1.04120779, + "balance_loss_mlp": 1.02580345, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 1.7174502814789807, + "language_loss": 0.76133454, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78272879, + "num_input_tokens_seen": 182006530, + "step": 8464, + "time_per_iteration": 3.9028267860412598 + }, + { + "auxiliary_loss_clip": 0.01030279, + "auxiliary_loss_mlp": 0.01001937, + "balance_loss_clip": 1.01459348, + "balance_loss_mlp": 1.0005722, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.6839108282888562, + "language_loss": 0.5941779, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61449999, + "num_input_tokens_seen": 182074240, + "step": 8465, + "time_per_iteration": 3.2179949283599854 + }, + { + "auxiliary_loss_clip": 0.01112183, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.03861642, + "balance_loss_mlp": 1.01977825, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 1.8699010143913102, + "language_loss": 0.80060565, + "learning_rate": 2.038749012684354e-06, + "loss": 0.82205033, + "num_input_tokens_seen": 182093360, + "step": 8466, + "time_per_iteration": 2.465264081954956 + }, + { + "auxiliary_loss_clip": 0.01103136, + "auxiliary_loss_mlp": 0.01029446, + "balance_loss_clip": 1.03935778, + "balance_loss_mlp": 1.01671422, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.5288649731756847, + "language_loss": 0.78713989, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80846578, + "num_input_tokens_seen": 182110170, + "step": 8467, + "time_per_iteration": 2.4934592247009277 + }, + { + "auxiliary_loss_clip": 0.01111274, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.03986096, + "balance_loss_mlp": 1.01694512, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 1.6737165431161254, + "language_loss": 0.74663037, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76803845, + "num_input_tokens_seen": 182129570, + "step": 8468, + "time_per_iteration": 2.472665309906006 + }, + { + "auxiliary_loss_clip": 0.01111538, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.03814864, + "balance_loss_mlp": 1.01871467, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 1.7516080704623367, + "language_loss": 0.77903152, + "learning_rate": 2.03758084040404e-06, + "loss": 0.80046225, + "num_input_tokens_seen": 182147565, + "step": 8469, + "time_per_iteration": 2.4365696907043457 + }, + { + "auxiliary_loss_clip": 0.0109964, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.04064012, + "balance_loss_mlp": 1.02096486, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.5561405095662946, + "language_loss": 0.69820547, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71955228, + "num_input_tokens_seen": 182169695, + "step": 8470, + "time_per_iteration": 2.5422539710998535 + }, + { + "auxiliary_loss_clip": 0.01089575, + "auxiliary_loss_mlp": 0.01042589, + "balance_loss_clip": 1.03766513, + "balance_loss_mlp": 1.02772939, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.0002968520619584, + "language_loss": 0.73626792, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75758952, + "num_input_tokens_seen": 182186385, + "step": 8471, + "time_per_iteration": 2.485443353652954 + }, + { + "auxiliary_loss_clip": 0.01038483, + "auxiliary_loss_mlp": 0.01000802, + "balance_loss_clip": 1.01219559, + "balance_loss_mlp": 0.99951476, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7482423278703207, + "language_loss": 0.5804947, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60088754, + "num_input_tokens_seen": 182247095, + "step": 8472, + "time_per_iteration": 3.0671560764312744 + }, + { + "auxiliary_loss_clip": 0.01064387, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.03913081, + "balance_loss_mlp": 1.02104032, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 2.0267896110819326, + "language_loss": 0.69016147, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71113884, + "num_input_tokens_seen": 182266380, + "step": 8473, + "time_per_iteration": 2.5876779556274414 + }, + { + "auxiliary_loss_clip": 0.01092375, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.04048944, + "balance_loss_mlp": 1.01805103, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 1.8318567928214506, + "language_loss": 0.85659617, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87782788, + "num_input_tokens_seen": 182284685, + "step": 8474, + "time_per_iteration": 2.5861573219299316 + }, + { + "auxiliary_loss_clip": 0.01098306, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.04160357, + "balance_loss_mlp": 1.0185883, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 2.1489455909799644, + "language_loss": 0.64890987, + "learning_rate": 2.035244457765222e-06, + "loss": 0.67020726, + "num_input_tokens_seen": 182301810, + "step": 8475, + "time_per_iteration": 2.5175931453704834 + }, + { + "auxiliary_loss_clip": 0.01097527, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.03992915, + "balance_loss_mlp": 1.02130413, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 2.6617836706042057, + "language_loss": 0.81966078, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84099472, + "num_input_tokens_seen": 182320285, + "step": 8476, + "time_per_iteration": 2.5134925842285156 + }, + { + "auxiliary_loss_clip": 0.01063856, + "auxiliary_loss_mlp": 0.01038334, + "balance_loss_clip": 1.03732347, + "balance_loss_mlp": 1.02238393, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 1.9236571564381704, + "language_loss": 0.80454385, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.8255657, + "num_input_tokens_seen": 182339465, + "step": 8477, + "time_per_iteration": 2.6243512630462646 + }, + { + "auxiliary_loss_clip": 0.01091397, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.04028118, + "balance_loss_mlp": 1.01356149, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 1.9018708146122782, + "language_loss": 0.61918348, + "learning_rate": 2.034076248204082e-06, + "loss": 0.64038259, + "num_input_tokens_seen": 182358375, + "step": 8478, + "time_per_iteration": 2.51712703704834 + }, + { + "auxiliary_loss_clip": 0.01099827, + "auxiliary_loss_mlp": 0.01038606, + "balance_loss_clip": 1.04236007, + "balance_loss_mlp": 1.02625585, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.7747420593673056, + "language_loss": 0.66177303, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.68315732, + "num_input_tokens_seen": 182377935, + "step": 8479, + "time_per_iteration": 2.5715749263763428 + }, + { + "auxiliary_loss_clip": 0.01101688, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.04017591, + "balance_loss_mlp": 1.01907277, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.6559938122407516, + "language_loss": 0.6949656, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71629769, + "num_input_tokens_seen": 182396440, + "step": 8480, + "time_per_iteration": 2.5230281352996826 + }, + { + "auxiliary_loss_clip": 0.01114796, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.0383774, + "balance_loss_mlp": 1.02019036, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 2.2008227530417273, + "language_loss": 0.79506421, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81654513, + "num_input_tokens_seen": 182415890, + "step": 8481, + "time_per_iteration": 2.5043692588806152 + }, + { + "auxiliary_loss_clip": 0.01096248, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.0368284, + "balance_loss_mlp": 1.02032709, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.6021707058205554, + "language_loss": 0.83377618, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85507029, + "num_input_tokens_seen": 182434235, + "step": 8482, + "time_per_iteration": 2.475356340408325 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.00788308, + "balance_loss_clip": 1.03903186, + "balance_loss_mlp": 1.01259327, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 6.717126338626177, + "language_loss": 0.8581934, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87715483, + "num_input_tokens_seen": 182454360, + "step": 8483, + "time_per_iteration": 2.5575509071350098 + }, + { + "auxiliary_loss_clip": 0.01101165, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.03721094, + "balance_loss_mlp": 1.02063835, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 2.3297139326039487, + "language_loss": 0.8309195, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85226095, + "num_input_tokens_seen": 182471940, + "step": 8484, + "time_per_iteration": 2.4820194244384766 + }, + { + "auxiliary_loss_clip": 0.01090087, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.04021943, + "balance_loss_mlp": 1.01599669, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 2.338739298438024, + "language_loss": 0.81667411, + "learning_rate": 2.031350381357736e-06, + "loss": 0.8378706, + "num_input_tokens_seen": 182490685, + "step": 8485, + "time_per_iteration": 2.5058841705322266 + }, + { + "auxiliary_loss_clip": 0.01085618, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.03466165, + "balance_loss_mlp": 1.01818943, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 1.9322666227458032, + "language_loss": 0.74030137, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.76146448, + "num_input_tokens_seen": 182508325, + "step": 8486, + "time_per_iteration": 2.490628242492676 + }, + { + "auxiliary_loss_clip": 0.01072678, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.03834391, + "balance_loss_mlp": 1.01427603, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.5000149869592727, + "language_loss": 0.70107067, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72208089, + "num_input_tokens_seen": 182527020, + "step": 8487, + "time_per_iteration": 2.564749002456665 + }, + { + "auxiliary_loss_clip": 0.01090768, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.038239, + "balance_loss_mlp": 1.01733959, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 2.521989436124946, + "language_loss": 0.7265504, + "learning_rate": 2.030182134581827e-06, + "loss": 0.74776828, + "num_input_tokens_seen": 182543505, + "step": 8488, + "time_per_iteration": 2.5093986988067627 + }, + { + "auxiliary_loss_clip": 0.01073016, + "auxiliary_loss_mlp": 0.00788837, + "balance_loss_clip": 1.03661394, + "balance_loss_mlp": 1.01471996, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 2.5924326841466256, + "language_loss": 0.69448793, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71310651, + "num_input_tokens_seen": 182562250, + "step": 8489, + "time_per_iteration": 2.5387628078460693 + }, + { + "auxiliary_loss_clip": 0.01095016, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.03738999, + "balance_loss_mlp": 1.0165453, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 1.8229367280545934, + "language_loss": 0.72329259, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.74453354, + "num_input_tokens_seen": 182581910, + "step": 8490, + "time_per_iteration": 2.5432918071746826 + }, + { + "auxiliary_loss_clip": 0.0108759, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.04130125, + "balance_loss_mlp": 1.01561427, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.5132712692514443, + "language_loss": 0.80656165, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82771337, + "num_input_tokens_seen": 182601350, + "step": 8491, + "time_per_iteration": 2.526655912399292 + }, + { + "auxiliary_loss_clip": 0.01099633, + "auxiliary_loss_mlp": 0.0102917, + "balance_loss_clip": 1.03737497, + "balance_loss_mlp": 1.01673663, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 4.285961807182633, + "language_loss": 0.78407836, + "learning_rate": 2.028624456259728e-06, + "loss": 0.80536634, + "num_input_tokens_seen": 182619660, + "step": 8492, + "time_per_iteration": 3.8466880321502686 + }, + { + "auxiliary_loss_clip": 0.01082895, + "auxiliary_loss_mlp": 0.01040736, + "balance_loss_clip": 1.0390631, + "balance_loss_mlp": 1.0270927, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 2.3670580246367523, + "language_loss": 0.77883375, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.80007011, + "num_input_tokens_seen": 182639815, + "step": 8493, + "time_per_iteration": 2.54646635055542 + }, + { + "auxiliary_loss_clip": 0.01072955, + "auxiliary_loss_mlp": 0.01025611, + "balance_loss_clip": 1.03940129, + "balance_loss_mlp": 1.01153219, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 2.331740091399332, + "language_loss": 0.83726615, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85825181, + "num_input_tokens_seen": 182659655, + "step": 8494, + "time_per_iteration": 2.5968892574310303 + }, + { + "auxiliary_loss_clip": 0.01115586, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.0398041, + "balance_loss_mlp": 1.02146435, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 2.0799626050217106, + "language_loss": 0.79026008, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81175196, + "num_input_tokens_seen": 182677075, + "step": 8495, + "time_per_iteration": 3.869382619857788 + }, + { + "auxiliary_loss_clip": 0.01080485, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.03772688, + "balance_loss_mlp": 1.01788282, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 2.4993039138549844, + "language_loss": 0.78538018, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80649555, + "num_input_tokens_seen": 182699625, + "step": 8496, + "time_per_iteration": 2.6154603958129883 + }, + { + "auxiliary_loss_clip": 0.01099764, + "auxiliary_loss_mlp": 0.01027523, + "balance_loss_clip": 1.03796947, + "balance_loss_mlp": 1.01533961, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 1.9278343638979454, + "language_loss": 0.78665555, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.80792844, + "num_input_tokens_seen": 182717020, + "step": 8497, + "time_per_iteration": 2.4776406288146973 + }, + { + "auxiliary_loss_clip": 0.01111241, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.03771973, + "balance_loss_mlp": 1.01863289, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.5942703418103918, + "language_loss": 0.81735247, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.83877659, + "num_input_tokens_seen": 182736955, + "step": 8498, + "time_per_iteration": 2.501861095428467 + }, + { + "auxiliary_loss_clip": 0.010833, + "auxiliary_loss_mlp": 0.00786907, + "balance_loss_clip": 1.0403372, + "balance_loss_mlp": 1.00919867, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 1.6919608992134287, + "language_loss": 0.70547915, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.72418123, + "num_input_tokens_seen": 182757620, + "step": 8499, + "time_per_iteration": 4.068626403808594 + }, + { + "auxiliary_loss_clip": 0.01058926, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.03816581, + "balance_loss_mlp": 1.02232075, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.452259863659716, + "language_loss": 0.72447097, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74542797, + "num_input_tokens_seen": 182780195, + "step": 8500, + "time_per_iteration": 2.688023328781128 + }, + { + "auxiliary_loss_clip": 0.01106402, + "auxiliary_loss_mlp": 0.01033261, + "balance_loss_clip": 1.0378406, + "balance_loss_mlp": 1.01897955, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 2.7629780418112753, + "language_loss": 0.63141894, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.65281558, + "num_input_tokens_seen": 182795765, + "step": 8501, + "time_per_iteration": 2.463298797607422 + }, + { + "auxiliary_loss_clip": 0.01112202, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.03539145, + "balance_loss_mlp": 1.01990843, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.8090197384645332, + "language_loss": 0.87373227, + "learning_rate": 2.024730186540907e-06, + "loss": 0.89518607, + "num_input_tokens_seen": 182813120, + "step": 8502, + "time_per_iteration": 3.8608434200286865 + }, + { + "auxiliary_loss_clip": 0.01096829, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.03561354, + "balance_loss_mlp": 1.0204916, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 4.329037807519145, + "language_loss": 0.822294, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84358501, + "num_input_tokens_seen": 182835745, + "step": 8503, + "time_per_iteration": 2.5430634021759033 + }, + { + "auxiliary_loss_clip": 0.01018893, + "auxiliary_loss_mlp": 0.01004878, + "balance_loss_clip": 1.0172354, + "balance_loss_mlp": 1.00352514, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8571187369412036, + "language_loss": 0.63892269, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65916044, + "num_input_tokens_seen": 182892540, + "step": 8504, + "time_per_iteration": 3.1448419094085693 + }, + { + "auxiliary_loss_clip": 0.01079456, + "auxiliary_loss_mlp": 0.00785556, + "balance_loss_clip": 1.03706396, + "balance_loss_mlp": 1.00715065, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 1.8494964572134716, + "language_loss": 0.8439728, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86262298, + "num_input_tokens_seen": 182911515, + "step": 8505, + "time_per_iteration": 2.5878028869628906 + }, + { + "auxiliary_loss_clip": 0.01101961, + "auxiliary_loss_mlp": 0.01025019, + "balance_loss_clip": 1.04045725, + "balance_loss_mlp": 1.01266897, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 1.857848781181716, + "language_loss": 0.74885678, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77012658, + "num_input_tokens_seen": 182930860, + "step": 8506, + "time_per_iteration": 2.5313973426818848 + }, + { + "auxiliary_loss_clip": 0.01114123, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.0384717, + "balance_loss_mlp": 1.02012515, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 2.039076578613514, + "language_loss": 0.57599789, + "learning_rate": 2.022783015592131e-06, + "loss": 0.59748125, + "num_input_tokens_seen": 182949960, + "step": 8507, + "time_per_iteration": 2.465965986251831 + }, + { + "auxiliary_loss_clip": 0.01106415, + "auxiliary_loss_mlp": 0.01044944, + "balance_loss_clip": 1.04207301, + "balance_loss_mlp": 1.03075171, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.876484982791679, + "language_loss": 0.8539052, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87541878, + "num_input_tokens_seen": 182968085, + "step": 8508, + "time_per_iteration": 2.4681830406188965 + }, + { + "auxiliary_loss_clip": 0.01083082, + "auxiliary_loss_mlp": 0.00787266, + "balance_loss_clip": 1.04010773, + "balance_loss_mlp": 1.00973368, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.6843393329277068, + "language_loss": 0.72307301, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74177647, + "num_input_tokens_seen": 182987275, + "step": 8509, + "time_per_iteration": 2.540834426879883 + }, + { + "auxiliary_loss_clip": 0.01111678, + "auxiliary_loss_mlp": 0.00787104, + "balance_loss_clip": 1.03836727, + "balance_loss_mlp": 1.01188612, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 1.8705904004860707, + "language_loss": 0.75991559, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.77890337, + "num_input_tokens_seen": 183004700, + "step": 8510, + "time_per_iteration": 2.4169161319732666 + }, + { + "auxiliary_loss_clip": 0.01112732, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.03974283, + "balance_loss_mlp": 1.02196169, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.528929768791269, + "language_loss": 0.70965147, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73112065, + "num_input_tokens_seen": 183025830, + "step": 8511, + "time_per_iteration": 2.529033660888672 + }, + { + "auxiliary_loss_clip": 0.010885, + "auxiliary_loss_mlp": 0.01027795, + "balance_loss_clip": 1.03839064, + "balance_loss_mlp": 1.01461029, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 2.6962623091539584, + "language_loss": 0.6648218, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68598473, + "num_input_tokens_seen": 183045140, + "step": 8512, + "time_per_iteration": 2.502185821533203 + }, + { + "auxiliary_loss_clip": 0.01060885, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.03628612, + "balance_loss_mlp": 1.02197146, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 1.704397315355566, + "language_loss": 0.66290194, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.68388033, + "num_input_tokens_seen": 183063935, + "step": 8513, + "time_per_iteration": 2.614583730697632 + }, + { + "auxiliary_loss_clip": 0.01072289, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.0384922, + "balance_loss_mlp": 1.01745391, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 1.8400632377403194, + "language_loss": 0.69256794, + "learning_rate": 2.0200569403921e-06, + "loss": 0.71360165, + "num_input_tokens_seen": 183084135, + "step": 8514, + "time_per_iteration": 2.5788586139678955 + }, + { + "auxiliary_loss_clip": 0.01111161, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.03791761, + "balance_loss_mlp": 1.01582956, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.5418333678729363, + "language_loss": 0.6604147, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68180871, + "num_input_tokens_seen": 183104570, + "step": 8515, + "time_per_iteration": 2.50140380859375 + }, + { + "auxiliary_loss_clip": 0.0110073, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.0379529, + "balance_loss_mlp": 1.01833582, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 2.0047258642285004, + "language_loss": 0.75152266, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77284169, + "num_input_tokens_seen": 183123850, + "step": 8516, + "time_per_iteration": 2.5207293033599854 + }, + { + "auxiliary_loss_clip": 0.01088858, + "auxiliary_loss_mlp": 0.01037793, + "balance_loss_clip": 1.04163325, + "balance_loss_mlp": 1.02391684, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 1.9310224234446716, + "language_loss": 0.77854902, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.79981554, + "num_input_tokens_seen": 183141725, + "step": 8517, + "time_per_iteration": 2.475924015045166 + }, + { + "auxiliary_loss_clip": 0.01104853, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.03863788, + "balance_loss_mlp": 1.01958334, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 2.6015770295702074, + "language_loss": 0.73833001, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.75971383, + "num_input_tokens_seen": 183161300, + "step": 8518, + "time_per_iteration": 2.542253017425537 + }, + { + "auxiliary_loss_clip": 0.01100062, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.03933835, + "balance_loss_mlp": 1.0232265, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 1.8037453586835346, + "language_loss": 0.78030556, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80167413, + "num_input_tokens_seen": 183180495, + "step": 8519, + "time_per_iteration": 2.4592535495758057 + }, + { + "auxiliary_loss_clip": 0.01115858, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.04075921, + "balance_loss_mlp": 1.01802063, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.5129154003819458, + "language_loss": 0.79281324, + "learning_rate": 2.017720274652497e-06, + "loss": 0.81428921, + "num_input_tokens_seen": 183200330, + "step": 8520, + "time_per_iteration": 2.493725299835205 + }, + { + "auxiliary_loss_clip": 0.0109773, + "auxiliary_loss_mlp": 0.01038216, + "balance_loss_clip": 1.03854382, + "balance_loss_mlp": 1.02350557, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.6400908849105167, + "language_loss": 0.81306684, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83442628, + "num_input_tokens_seen": 183218230, + "step": 8521, + "time_per_iteration": 2.4813666343688965 + }, + { + "auxiliary_loss_clip": 0.01100181, + "auxiliary_loss_mlp": 0.0102917, + "balance_loss_clip": 1.0343082, + "balance_loss_mlp": 1.01547861, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 1.7728031028366473, + "language_loss": 0.68200958, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70330298, + "num_input_tokens_seen": 183236735, + "step": 8522, + "time_per_iteration": 2.4941482543945312 + }, + { + "auxiliary_loss_clip": 0.01089388, + "auxiliary_loss_mlp": 0.01040951, + "balance_loss_clip": 1.04107571, + "balance_loss_mlp": 1.02419055, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 1.94464710511069, + "language_loss": 0.61520886, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.63651228, + "num_input_tokens_seen": 183257550, + "step": 8523, + "time_per_iteration": 2.590646266937256 + }, + { + "auxiliary_loss_clip": 0.01078036, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.03704572, + "balance_loss_mlp": 1.02492774, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 1.8659488603551109, + "language_loss": 0.77699399, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.79815984, + "num_input_tokens_seen": 183275515, + "step": 8524, + "time_per_iteration": 2.531919479370117 + }, + { + "auxiliary_loss_clip": 0.01090834, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.03833783, + "balance_loss_mlp": 1.02115214, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 1.8235639507165855, + "language_loss": 0.74825752, + "learning_rate": 2.015773034588706e-06, + "loss": 0.76950687, + "num_input_tokens_seen": 183293880, + "step": 8525, + "time_per_iteration": 2.49753999710083 + }, + { + "auxiliary_loss_clip": 0.01089542, + "auxiliary_loss_mlp": 0.01040189, + "balance_loss_clip": 1.03742075, + "balance_loss_mlp": 1.02464437, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.6084871907089167, + "language_loss": 0.74656641, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76786363, + "num_input_tokens_seen": 183315860, + "step": 8526, + "time_per_iteration": 2.6375234127044678 + }, + { + "auxiliary_loss_clip": 0.01105222, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.03904283, + "balance_loss_mlp": 1.02302313, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.633462355052535, + "language_loss": 0.64857852, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.66999495, + "num_input_tokens_seen": 183335480, + "step": 8527, + "time_per_iteration": 2.4771621227264404 + }, + { + "auxiliary_loss_clip": 0.01090536, + "auxiliary_loss_mlp": 0.01039145, + "balance_loss_clip": 1.0409534, + "balance_loss_mlp": 1.02699161, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.4988698568027683, + "language_loss": 0.74329489, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76459175, + "num_input_tokens_seen": 183354395, + "step": 8528, + "time_per_iteration": 2.528162717819214 + }, + { + "auxiliary_loss_clip": 0.01100507, + "auxiliary_loss_mlp": 0.0103178, + "balance_loss_clip": 1.0365777, + "balance_loss_mlp": 1.01848817, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.663616603328329, + "language_loss": 0.83027667, + "learning_rate": 2.014215231682995e-06, + "loss": 0.85159957, + "num_input_tokens_seen": 183372980, + "step": 8529, + "time_per_iteration": 2.4947621822357178 + }, + { + "auxiliary_loss_clip": 0.01064836, + "auxiliary_loss_mlp": 0.01033797, + "balance_loss_clip": 1.03843641, + "balance_loss_mlp": 1.0204215, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 1.9177734936778659, + "language_loss": 0.73940903, + "learning_rate": 2.01382577957204e-06, + "loss": 0.76039541, + "num_input_tokens_seen": 183390160, + "step": 8530, + "time_per_iteration": 2.565117597579956 + }, + { + "auxiliary_loss_clip": 0.01011947, + "auxiliary_loss_mlp": 0.01003082, + "balance_loss_clip": 1.02039075, + "balance_loss_mlp": 1.00172317, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7513837790134419, + "language_loss": 0.60844225, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.62859249, + "num_input_tokens_seen": 183455280, + "step": 8531, + "time_per_iteration": 4.57397985458374 + }, + { + "auxiliary_loss_clip": 0.01086054, + "auxiliary_loss_mlp": 0.01034251, + "balance_loss_clip": 1.03860235, + "balance_loss_mlp": 1.0192306, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.8214363311986992, + "language_loss": 0.76958299, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.79078609, + "num_input_tokens_seen": 183473955, + "step": 8532, + "time_per_iteration": 2.5325894355773926 + }, + { + "auxiliary_loss_clip": 0.01092134, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.03907561, + "balance_loss_mlp": 1.01956367, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 3.3401705694862573, + "language_loss": 0.66560805, + "learning_rate": 2.012657420152597e-06, + "loss": 0.68685806, + "num_input_tokens_seen": 183497195, + "step": 8533, + "time_per_iteration": 2.6420252323150635 + }, + { + "auxiliary_loss_clip": 0.01084196, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.03988624, + "balance_loss_mlp": 1.02159142, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 1.8812003050440453, + "language_loss": 0.82081807, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84201431, + "num_input_tokens_seen": 183513675, + "step": 8534, + "time_per_iteration": 3.8917222023010254 + }, + { + "auxiliary_loss_clip": 0.01102644, + "auxiliary_loss_mlp": 0.01037513, + "balance_loss_clip": 1.03678107, + "balance_loss_mlp": 1.02311277, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.445567655969626, + "language_loss": 0.63663131, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.65803289, + "num_input_tokens_seen": 183535165, + "step": 8535, + "time_per_iteration": 2.5275259017944336 + }, + { + "auxiliary_loss_clip": 0.0110612, + "auxiliary_loss_mlp": 0.01029183, + "balance_loss_clip": 1.04111242, + "balance_loss_mlp": 1.01610565, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.5491483535888144, + "language_loss": 0.6954205, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71677351, + "num_input_tokens_seen": 183553780, + "step": 8536, + "time_per_iteration": 2.476519823074341 + }, + { + "auxiliary_loss_clip": 0.01104998, + "auxiliary_loss_mlp": 0.01031496, + "balance_loss_clip": 1.03939915, + "balance_loss_mlp": 1.01726198, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 2.028299354143275, + "language_loss": 0.71351743, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73488235, + "num_input_tokens_seen": 183572285, + "step": 8537, + "time_per_iteration": 3.9539103507995605 + }, + { + "auxiliary_loss_clip": 0.01070915, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.0367825, + "balance_loss_mlp": 1.01873136, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 2.2462021568051833, + "language_loss": 0.81038719, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.8314274, + "num_input_tokens_seen": 183589330, + "step": 8538, + "time_per_iteration": 2.5341107845306396 + }, + { + "auxiliary_loss_clip": 0.01102783, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.03755832, + "balance_loss_mlp": 1.0139035, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 1.7264771425756702, + "language_loss": 0.78475606, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80605412, + "num_input_tokens_seen": 183609205, + "step": 8539, + "time_per_iteration": 2.515965223312378 + }, + { + "auxiliary_loss_clip": 0.0109309, + "auxiliary_loss_mlp": 0.01037779, + "balance_loss_clip": 1.03835499, + "balance_loss_mlp": 1.0240221, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 1.9639788220740237, + "language_loss": 0.76075006, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78205872, + "num_input_tokens_seen": 183629985, + "step": 8540, + "time_per_iteration": 2.570103168487549 + }, + { + "auxiliary_loss_clip": 0.0106693, + "auxiliary_loss_mlp": 0.01036062, + "balance_loss_clip": 1.04174554, + "balance_loss_mlp": 1.0217092, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 2.421915761297425, + "language_loss": 0.74761152, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76864147, + "num_input_tokens_seen": 183648220, + "step": 8541, + "time_per_iteration": 3.973365545272827 + }, + { + "auxiliary_loss_clip": 0.01061088, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.04061317, + "balance_loss_mlp": 1.02377772, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.6272618754516988, + "language_loss": 0.70229125, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.723279, + "num_input_tokens_seen": 183668230, + "step": 8542, + "time_per_iteration": 2.614380121231079 + }, + { + "auxiliary_loss_clip": 0.01095181, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.03928328, + "balance_loss_mlp": 1.02066624, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 1.9011353756502676, + "language_loss": 0.79268742, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.81398177, + "num_input_tokens_seen": 183687800, + "step": 8543, + "time_per_iteration": 2.5174856185913086 + }, + { + "auxiliary_loss_clip": 0.01095664, + "auxiliary_loss_mlp": 0.01037641, + "balance_loss_clip": 1.04577708, + "balance_loss_mlp": 1.02335405, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 4.541670995116027, + "language_loss": 0.67852771, + "learning_rate": 2.008373401689299e-06, + "loss": 0.69986069, + "num_input_tokens_seen": 183709025, + "step": 8544, + "time_per_iteration": 2.5607926845550537 + }, + { + "auxiliary_loss_clip": 0.01083944, + "auxiliary_loss_mlp": 0.01054567, + "balance_loss_clip": 1.04018867, + "balance_loss_mlp": 1.03918314, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 1.922452754399939, + "language_loss": 0.72351301, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74489808, + "num_input_tokens_seen": 183725740, + "step": 8545, + "time_per_iteration": 2.540252923965454 + }, + { + "auxiliary_loss_clip": 0.01108523, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.04002571, + "balance_loss_mlp": 1.02409053, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 1.9614383164009759, + "language_loss": 0.81687069, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.8383441, + "num_input_tokens_seen": 183743995, + "step": 8546, + "time_per_iteration": 2.440058946609497 + }, + { + "auxiliary_loss_clip": 0.0110423, + "auxiliary_loss_mlp": 0.01036442, + "balance_loss_clip": 1.03879285, + "balance_loss_mlp": 1.02233362, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 1.7579648262569623, + "language_loss": 0.73152816, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75293493, + "num_input_tokens_seen": 183764150, + "step": 8547, + "time_per_iteration": 2.515700340270996 + }, + { + "auxiliary_loss_clip": 0.01106029, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.03886485, + "balance_loss_mlp": 1.03129315, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.651548583529879, + "language_loss": 0.73236871, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75388044, + "num_input_tokens_seen": 183783280, + "step": 8548, + "time_per_iteration": 2.5228259563446045 + }, + { + "auxiliary_loss_clip": 0.01089247, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.0380789, + "balance_loss_mlp": 1.02102065, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 1.5465203950515087, + "language_loss": 0.81758618, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.83883291, + "num_input_tokens_seen": 183800725, + "step": 8549, + "time_per_iteration": 2.5294625759124756 + }, + { + "auxiliary_loss_clip": 0.01105525, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.04010272, + "balance_loss_mlp": 1.02032161, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 1.9996120092906235, + "language_loss": 0.72400796, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.74538815, + "num_input_tokens_seen": 183818735, + "step": 8550, + "time_per_iteration": 2.464104175567627 + }, + { + "auxiliary_loss_clip": 0.01105447, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.04095864, + "balance_loss_mlp": 1.02571893, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.8563793690085533, + "language_loss": 0.75494033, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77639163, + "num_input_tokens_seen": 183840015, + "step": 8551, + "time_per_iteration": 2.5059967041015625 + }, + { + "auxiliary_loss_clip": 0.01093192, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.04416585, + "balance_loss_mlp": 1.01847219, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.7560681170953631, + "language_loss": 0.69494355, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71619475, + "num_input_tokens_seen": 183860145, + "step": 8552, + "time_per_iteration": 2.5637855529785156 + }, + { + "auxiliary_loss_clip": 0.0110637, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.03978395, + "balance_loss_mlp": 1.02205098, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 1.8659987082348768, + "language_loss": 0.74622309, + "learning_rate": 2.004868266210965e-06, + "loss": 0.76765168, + "num_input_tokens_seen": 183880540, + "step": 8553, + "time_per_iteration": 2.5392589569091797 + }, + { + "auxiliary_loss_clip": 0.01116562, + "auxiliary_loss_mlp": 0.01038236, + "balance_loss_clip": 1.04065275, + "balance_loss_mlp": 1.02494979, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 1.8889408439106794, + "language_loss": 0.67895406, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70050204, + "num_input_tokens_seen": 183900895, + "step": 8554, + "time_per_iteration": 2.456629514694214 + }, + { + "auxiliary_loss_clip": 0.01108686, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_clip": 1.03826857, + "balance_loss_mlp": 1.02641904, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 2.2582367517775075, + "language_loss": 0.7293582, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75086993, + "num_input_tokens_seen": 183920335, + "step": 8555, + "time_per_iteration": 2.4943737983703613 + }, + { + "auxiliary_loss_clip": 0.01082023, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.04231262, + "balance_loss_mlp": 1.02557528, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.436324916022814, + "language_loss": 0.74615932, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76737022, + "num_input_tokens_seen": 183936220, + "step": 8556, + "time_per_iteration": 2.511085271835327 + }, + { + "auxiliary_loss_clip": 0.01083344, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.03810716, + "balance_loss_mlp": 1.01964021, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.9181443825740911, + "language_loss": 0.86331737, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88447618, + "num_input_tokens_seen": 183953250, + "step": 8557, + "time_per_iteration": 2.537074565887451 + }, + { + "auxiliary_loss_clip": 0.01097555, + "auxiliary_loss_mlp": 0.01042703, + "balance_loss_clip": 1.03719175, + "balance_loss_mlp": 1.02882123, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.6134952408352903, + "language_loss": 0.89045501, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.9118576, + "num_input_tokens_seen": 183973865, + "step": 8558, + "time_per_iteration": 2.4992167949676514 + }, + { + "auxiliary_loss_clip": 0.01112251, + "auxiliary_loss_mlp": 0.00787504, + "balance_loss_clip": 1.03897524, + "balance_loss_mlp": 1.01240468, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 1.871781169462433, + "language_loss": 0.64908433, + "learning_rate": 2.002531500253602e-06, + "loss": 0.66808188, + "num_input_tokens_seen": 183992555, + "step": 8559, + "time_per_iteration": 2.4540417194366455 + }, + { + "auxiliary_loss_clip": 0.01100767, + "auxiliary_loss_mlp": 0.00787521, + "balance_loss_clip": 1.04168081, + "balance_loss_mlp": 1.01117313, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.5867776149050856, + "language_loss": 0.63171953, + "learning_rate": 2.002142038838577e-06, + "loss": 0.6506024, + "num_input_tokens_seen": 184010825, + "step": 8560, + "time_per_iteration": 2.5214316844940186 + }, + { + "auxiliary_loss_clip": 0.01115182, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.04016554, + "balance_loss_mlp": 1.01971066, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.6517460657280272, + "language_loss": 0.70090306, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.7223841, + "num_input_tokens_seen": 184030155, + "step": 8561, + "time_per_iteration": 2.466305732727051 + }, + { + "auxiliary_loss_clip": 0.01092583, + "auxiliary_loss_mlp": 0.01032801, + "balance_loss_clip": 1.03860736, + "balance_loss_mlp": 1.02029586, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.484331517466963, + "language_loss": 0.66421539, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.68546927, + "num_input_tokens_seen": 184051440, + "step": 8562, + "time_per_iteration": 2.5400145053863525 + }, + { + "auxiliary_loss_clip": 0.01110105, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.04195166, + "balance_loss_mlp": 1.02031517, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.650623770853723, + "language_loss": 0.77521873, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79665905, + "num_input_tokens_seen": 184070205, + "step": 8563, + "time_per_iteration": 2.489290237426758 + }, + { + "auxiliary_loss_clip": 0.01109019, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_clip": 1.03962755, + "balance_loss_mlp": 1.02154708, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 1.9590385129349255, + "language_loss": 0.82628089, + "learning_rate": 2.0005841925139e-06, + "loss": 0.84773904, + "num_input_tokens_seen": 184087345, + "step": 8564, + "time_per_iteration": 2.4640767574310303 + }, + { + "auxiliary_loss_clip": 0.01100596, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.04047012, + "balance_loss_mlp": 1.01963711, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.6955493044883954, + "language_loss": 0.729792, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75113893, + "num_input_tokens_seen": 184107110, + "step": 8565, + "time_per_iteration": 2.5263516902923584 + }, + { + "auxiliary_loss_clip": 0.01110177, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.04088855, + "balance_loss_mlp": 1.01787281, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 2.5479449379220553, + "language_loss": 0.6869483, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70838678, + "num_input_tokens_seen": 184127105, + "step": 8566, + "time_per_iteration": 2.493751049041748 + }, + { + "auxiliary_loss_clip": 0.0111705, + "auxiliary_loss_mlp": 0.00789716, + "balance_loss_clip": 1.03705025, + "balance_loss_mlp": 1.01374412, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 1.9293801864144686, + "language_loss": 0.78126305, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.8003307, + "num_input_tokens_seen": 184148060, + "step": 8567, + "time_per_iteration": 2.50801944732666 + }, + { + "auxiliary_loss_clip": 0.01109856, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.04128003, + "balance_loss_mlp": 1.01873517, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 2.005405802350575, + "language_loss": 0.79084373, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81227803, + "num_input_tokens_seen": 184166175, + "step": 8568, + "time_per_iteration": 2.5143179893493652 + }, + { + "auxiliary_loss_clip": 0.01092709, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.03774607, + "balance_loss_mlp": 1.01744187, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 2.2223144145395315, + "language_loss": 0.90768683, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.92891997, + "num_input_tokens_seen": 184182600, + "step": 8569, + "time_per_iteration": 2.5098636150360107 + }, + { + "auxiliary_loss_clip": 0.01119809, + "auxiliary_loss_mlp": 0.01035851, + "balance_loss_clip": 1.0410316, + "balance_loss_mlp": 1.02182007, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 2.4055869379734034, + "language_loss": 0.76730418, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78886074, + "num_input_tokens_seen": 184202020, + "step": 8570, + "time_per_iteration": 4.401336431503296 + }, + { + "auxiliary_loss_clip": 0.0110532, + "auxiliary_loss_mlp": 0.0103797, + "balance_loss_clip": 1.03846729, + "balance_loss_mlp": 1.02296197, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.9808930605527721, + "language_loss": 0.73692054, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.75835341, + "num_input_tokens_seen": 184224850, + "step": 8571, + "time_per_iteration": 2.6345598697662354 + }, + { + "auxiliary_loss_clip": 0.0102522, + "auxiliary_loss_mlp": 0.01003603, + "balance_loss_clip": 1.01870131, + "balance_loss_mlp": 1.00211239, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7770042520564989, + "language_loss": 0.52896047, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.5492487, + "num_input_tokens_seen": 184288520, + "step": 8572, + "time_per_iteration": 3.238548994064331 + }, + { + "auxiliary_loss_clip": 0.01105502, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.04176402, + "balance_loss_mlp": 1.02392554, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.7892363458854494, + "language_loss": 0.75761038, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.77903289, + "num_input_tokens_seen": 184308565, + "step": 8573, + "time_per_iteration": 3.9702603816986084 + }, + { + "auxiliary_loss_clip": 0.01104523, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.03978264, + "balance_loss_mlp": 1.01516497, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.8610930435920277, + "language_loss": 0.77387792, + "learning_rate": 1.996689577219102e-06, + "loss": 0.7952143, + "num_input_tokens_seen": 184326795, + "step": 8574, + "time_per_iteration": 2.48578143119812 + }, + { + "auxiliary_loss_clip": 0.01100085, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.04223371, + "balance_loss_mlp": 1.01820338, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 1.7513908657195885, + "language_loss": 0.85479772, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87610888, + "num_input_tokens_seen": 184345990, + "step": 8575, + "time_per_iteration": 3.9542157649993896 + }, + { + "auxiliary_loss_clip": 0.01106625, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.03836823, + "balance_loss_mlp": 1.02271342, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.5475710772224927, + "language_loss": 0.76632905, + "learning_rate": 1.995910655193932e-06, + "loss": 0.78775692, + "num_input_tokens_seen": 184366300, + "step": 8576, + "time_per_iteration": 2.5008721351623535 + }, + { + "auxiliary_loss_clip": 0.01074987, + "auxiliary_loss_mlp": 0.00790632, + "balance_loss_clip": 1.04318547, + "balance_loss_mlp": 1.01209164, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 2.666310295926506, + "language_loss": 0.75199008, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.77064621, + "num_input_tokens_seen": 184383030, + "step": 8577, + "time_per_iteration": 2.5712759494781494 + }, + { + "auxiliary_loss_clip": 0.01094741, + "auxiliary_loss_mlp": 0.01047416, + "balance_loss_clip": 1.0407337, + "balance_loss_mlp": 1.03115559, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 1.8315170792185766, + "language_loss": 0.81215495, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.8335765, + "num_input_tokens_seen": 184403410, + "step": 8578, + "time_per_iteration": 2.553809642791748 + }, + { + "auxiliary_loss_clip": 0.01114676, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.04019427, + "balance_loss_mlp": 1.01690042, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 1.6977384414414518, + "language_loss": 0.76024151, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78169, + "num_input_tokens_seen": 184423830, + "step": 8579, + "time_per_iteration": 2.499141216278076 + }, + { + "auxiliary_loss_clip": 0.01082437, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.04215777, + "balance_loss_mlp": 1.01678884, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 2.059121257818275, + "language_loss": 0.79071826, + "learning_rate": 1.994352813122559e-06, + "loss": 0.81184304, + "num_input_tokens_seen": 184445050, + "step": 8580, + "time_per_iteration": 3.9380955696105957 + }, + { + "auxiliary_loss_clip": 0.01085147, + "auxiliary_loss_mlp": 0.0104807, + "balance_loss_clip": 1.04545593, + "balance_loss_mlp": 1.03205454, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 1.9886861527977173, + "language_loss": 0.7280103, + "learning_rate": 1.99396335310315e-06, + "loss": 0.74934244, + "num_input_tokens_seen": 184460775, + "step": 8581, + "time_per_iteration": 2.4991724491119385 + }, + { + "auxiliary_loss_clip": 0.01105123, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.04088664, + "balance_loss_mlp": 1.02003109, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.1893564940004455, + "language_loss": 0.73795068, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.75932819, + "num_input_tokens_seen": 184477365, + "step": 8582, + "time_per_iteration": 2.454841375350952 + }, + { + "auxiliary_loss_clip": 0.01079283, + "auxiliary_loss_mlp": 0.01036087, + "balance_loss_clip": 1.04117155, + "balance_loss_mlp": 1.02326047, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 1.6823874019931488, + "language_loss": 0.65850091, + "learning_rate": 1.99318443376583e-06, + "loss": 0.67965466, + "num_input_tokens_seen": 184497045, + "step": 8583, + "time_per_iteration": 2.548562526702881 + }, + { + "auxiliary_loss_clip": 0.01108923, + "auxiliary_loss_mlp": 0.01037745, + "balance_loss_clip": 1.04267263, + "balance_loss_mlp": 1.02378535, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 1.4675508228576437, + "language_loss": 0.75739199, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.77885866, + "num_input_tokens_seen": 184517675, + "step": 8584, + "time_per_iteration": 2.4956626892089844 + }, + { + "auxiliary_loss_clip": 0.01091084, + "auxiliary_loss_mlp": 0.01045651, + "balance_loss_clip": 1.04490578, + "balance_loss_mlp": 1.03181648, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 3.1448325532382153, + "language_loss": 0.78881103, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.8101784, + "num_input_tokens_seen": 184537745, + "step": 8585, + "time_per_iteration": 2.548659086227417 + }, + { + "auxiliary_loss_clip": 0.01106024, + "auxiliary_loss_mlp": 0.01037828, + "balance_loss_clip": 1.04117298, + "balance_loss_mlp": 1.02503729, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.2624279082238505, + "language_loss": 0.81474686, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83618534, + "num_input_tokens_seen": 184553630, + "step": 8586, + "time_per_iteration": 2.450187921524048 + }, + { + "auxiliary_loss_clip": 0.01094833, + "auxiliary_loss_mlp": 0.01032462, + "balance_loss_clip": 1.04062903, + "balance_loss_mlp": 1.01984978, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 1.8519621160237456, + "language_loss": 0.71505368, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73632663, + "num_input_tokens_seen": 184573530, + "step": 8587, + "time_per_iteration": 2.5211684703826904 + }, + { + "auxiliary_loss_clip": 0.01033424, + "auxiliary_loss_mlp": 0.01005307, + "balance_loss_clip": 1.01680112, + "balance_loss_mlp": 1.00400209, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7337102037370647, + "language_loss": 0.57840645, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.5987938, + "num_input_tokens_seen": 184637875, + "step": 8588, + "time_per_iteration": 3.1115260124206543 + }, + { + "auxiliary_loss_clip": 0.01101441, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.04240263, + "balance_loss_mlp": 1.02286673, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.8306779288341224, + "language_loss": 0.75185496, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77324188, + "num_input_tokens_seen": 184656125, + "step": 8589, + "time_per_iteration": 2.508363723754883 + }, + { + "auxiliary_loss_clip": 0.01109791, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.04191113, + "balance_loss_mlp": 1.02086151, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 1.488653688351353, + "language_loss": 0.6752255, + "learning_rate": 1.990458225001627e-06, + "loss": 0.6966598, + "num_input_tokens_seen": 184675920, + "step": 8590, + "time_per_iteration": 2.5060672760009766 + }, + { + "auxiliary_loss_clip": 0.01031619, + "auxiliary_loss_mlp": 0.01002712, + "balance_loss_clip": 1.01607966, + "balance_loss_mlp": 1.00141859, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.8039291787768882, + "language_loss": 0.55860424, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57894754, + "num_input_tokens_seen": 184730520, + "step": 8591, + "time_per_iteration": 3.027583599090576 + }, + { + "auxiliary_loss_clip": 0.01089897, + "auxiliary_loss_mlp": 0.01026033, + "balance_loss_clip": 1.03914237, + "balance_loss_mlp": 1.01383185, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 1.5978202799351757, + "language_loss": 0.8138454, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83500463, + "num_input_tokens_seen": 184748340, + "step": 8592, + "time_per_iteration": 2.505753755569458 + }, + { + "auxiliary_loss_clip": 0.01101542, + "auxiliary_loss_mlp": 0.01029269, + "balance_loss_clip": 1.04052997, + "balance_loss_mlp": 1.01665676, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 1.7479290385938921, + "language_loss": 0.83339155, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85469967, + "num_input_tokens_seen": 184766615, + "step": 8593, + "time_per_iteration": 2.481851577758789 + }, + { + "auxiliary_loss_clip": 0.01092255, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.04125547, + "balance_loss_mlp": 1.02272058, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 1.6274928695054172, + "language_loss": 0.69300187, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.71429282, + "num_input_tokens_seen": 184788075, + "step": 8594, + "time_per_iteration": 2.589886426925659 + }, + { + "auxiliary_loss_clip": 0.01077325, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.04122496, + "balance_loss_mlp": 1.02050102, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.5075187437363977, + "language_loss": 0.77566659, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79678011, + "num_input_tokens_seen": 184808710, + "step": 8595, + "time_per_iteration": 2.5821850299835205 + }, + { + "auxiliary_loss_clip": 0.01117039, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.04245305, + "balance_loss_mlp": 1.01944256, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.4807585562488366, + "language_loss": 0.65330303, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67479944, + "num_input_tokens_seen": 184826475, + "step": 8596, + "time_per_iteration": 2.4479734897613525 + }, + { + "auxiliary_loss_clip": 0.01086051, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.04546177, + "balance_loss_mlp": 1.01673985, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 4.6756378246922425, + "language_loss": 0.75697803, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77815336, + "num_input_tokens_seen": 184845245, + "step": 8597, + "time_per_iteration": 2.575133800506592 + }, + { + "auxiliary_loss_clip": 0.01115765, + "auxiliary_loss_mlp": 0.01023144, + "balance_loss_clip": 1.04075348, + "balance_loss_mlp": 1.0104121, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 1.5264818311645665, + "language_loss": 0.81380886, + "learning_rate": 1.987342579847403e-06, + "loss": 0.83519793, + "num_input_tokens_seen": 184866605, + "step": 8598, + "time_per_iteration": 2.495180130004883 + }, + { + "auxiliary_loss_clip": 0.01069084, + "auxiliary_loss_mlp": 0.01036324, + "balance_loss_clip": 1.04006314, + "balance_loss_mlp": 1.02259731, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.6448997796989744, + "language_loss": 0.7534405, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77449453, + "num_input_tokens_seen": 184886945, + "step": 8599, + "time_per_iteration": 2.609812021255493 + }, + { + "auxiliary_loss_clip": 0.01097918, + "auxiliary_loss_mlp": 0.01033474, + "balance_loss_clip": 1.04172754, + "balance_loss_mlp": 1.02040255, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 2.1230286585075655, + "language_loss": 0.72361958, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.74493355, + "num_input_tokens_seen": 184905590, + "step": 8600, + "time_per_iteration": 2.5428106784820557 + }, + { + "auxiliary_loss_clip": 0.01083676, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.04336452, + "balance_loss_mlp": 1.02058148, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 2.180641977959101, + "language_loss": 0.74425685, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76543725, + "num_input_tokens_seen": 184925555, + "step": 8601, + "time_per_iteration": 2.566192626953125 + }, + { + "auxiliary_loss_clip": 0.01105722, + "auxiliary_loss_mlp": 0.01038287, + "balance_loss_clip": 1.04074609, + "balance_loss_mlp": 1.0245955, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 1.9881379230108247, + "language_loss": 0.83963472, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.86107481, + "num_input_tokens_seen": 184944490, + "step": 8602, + "time_per_iteration": 2.468792200088501 + }, + { + "auxiliary_loss_clip": 0.01116176, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.0411756, + "balance_loss_mlp": 1.01985264, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.8942001734722118, + "language_loss": 0.74595451, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76744837, + "num_input_tokens_seen": 184963190, + "step": 8603, + "time_per_iteration": 2.5143773555755615 + }, + { + "auxiliary_loss_clip": 0.01096605, + "auxiliary_loss_mlp": 0.01034888, + "balance_loss_clip": 1.04209101, + "balance_loss_mlp": 1.02173889, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 2.057429506120012, + "language_loss": 0.72203207, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74334705, + "num_input_tokens_seen": 184981220, + "step": 8604, + "time_per_iteration": 2.5089542865753174 + }, + { + "auxiliary_loss_clip": 0.01100328, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.04264891, + "balance_loss_mlp": 1.01988792, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 2.0697242504782287, + "language_loss": 0.84679824, + "learning_rate": 1.984616415277469e-06, + "loss": 0.86813885, + "num_input_tokens_seen": 184998810, + "step": 8605, + "time_per_iteration": 2.53499174118042 + }, + { + "auxiliary_loss_clip": 0.01101002, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.04162335, + "balance_loss_mlp": 1.01367247, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.4840250830994521, + "language_loss": 0.64516449, + "learning_rate": 1.984226965411294e-06, + "loss": 0.66643834, + "num_input_tokens_seen": 185021185, + "step": 8606, + "time_per_iteration": 2.551459789276123 + }, + { + "auxiliary_loss_clip": 0.01091644, + "auxiliary_loss_mlp": 0.01033346, + "balance_loss_clip": 1.04339194, + "balance_loss_mlp": 1.02017915, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.4977145397267289, + "language_loss": 0.77585161, + "learning_rate": 1.983837516143234e-06, + "loss": 0.79710144, + "num_input_tokens_seen": 185038465, + "step": 8607, + "time_per_iteration": 2.5023813247680664 + }, + { + "auxiliary_loss_clip": 0.01105354, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.04021573, + "balance_loss_mlp": 1.02462637, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 1.7001225941145333, + "language_loss": 0.71555322, + "learning_rate": 1.983448067488057e-06, + "loss": 0.73699367, + "num_input_tokens_seen": 185057340, + "step": 8608, + "time_per_iteration": 3.8431813716888428 + }, + { + "auxiliary_loss_clip": 0.01112579, + "auxiliary_loss_mlp": 0.01030899, + "balance_loss_clip": 1.04196024, + "balance_loss_mlp": 1.01688588, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 1.7990904336028233, + "language_loss": 0.8646729, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88610768, + "num_input_tokens_seen": 185074935, + "step": 8609, + "time_per_iteration": 2.5052738189697266 + }, + { + "auxiliary_loss_clip": 0.01102731, + "auxiliary_loss_mlp": 0.01032207, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.01988101, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 1.6359443929812598, + "language_loss": 0.73773992, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75908923, + "num_input_tokens_seen": 185095050, + "step": 8610, + "time_per_iteration": 2.5109024047851562 + }, + { + "auxiliary_loss_clip": 0.01120958, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.04194593, + "balance_loss_mlp": 1.01689672, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 1.8980612708239204, + "language_loss": 0.67072654, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69225138, + "num_input_tokens_seen": 185112275, + "step": 8611, + "time_per_iteration": 2.4561238288879395 + }, + { + "auxiliary_loss_clip": 0.01114688, + "auxiliary_loss_mlp": 0.01032243, + "balance_loss_clip": 1.0398258, + "balance_loss_mlp": 1.01887357, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 2.113023557949929, + "language_loss": 0.77132505, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79279435, + "num_input_tokens_seen": 185132165, + "step": 8612, + "time_per_iteration": 3.89871883392334 + }, + { + "auxiliary_loss_clip": 0.01105385, + "auxiliary_loss_mlp": 0.01037613, + "balance_loss_clip": 1.03916693, + "balance_loss_mlp": 1.02485716, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 1.9329634531389925, + "language_loss": 0.8221401, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84357005, + "num_input_tokens_seen": 185151025, + "step": 8613, + "time_per_iteration": 2.4571821689605713 + }, + { + "auxiliary_loss_clip": 0.01118652, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.04296279, + "balance_loss_mlp": 1.02273762, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.5347526307930255, + "language_loss": 0.66523302, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68678707, + "num_input_tokens_seen": 185168455, + "step": 8614, + "time_per_iteration": 3.8440372943878174 + }, + { + "auxiliary_loss_clip": 0.01098199, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.04061699, + "balance_loss_mlp": 1.01952744, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 1.9245565061443928, + "language_loss": 0.86345506, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.88477027, + "num_input_tokens_seen": 185184415, + "step": 8615, + "time_per_iteration": 2.4730215072631836 + }, + { + "auxiliary_loss_clip": 0.01103307, + "auxiliary_loss_mlp": 0.01040736, + "balance_loss_clip": 1.0403161, + "balance_loss_mlp": 1.02874982, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.457679491366675, + "language_loss": 0.80696118, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.82840163, + "num_input_tokens_seen": 185202910, + "step": 8616, + "time_per_iteration": 2.5070748329162598 + }, + { + "auxiliary_loss_clip": 0.01112415, + "auxiliary_loss_mlp": 0.00791932, + "balance_loss_clip": 1.04645741, + "balance_loss_mlp": 1.01826954, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 1.8703678131432038, + "language_loss": 0.75376189, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77280533, + "num_input_tokens_seen": 185223085, + "step": 8617, + "time_per_iteration": 2.5211021900177 + }, + { + "auxiliary_loss_clip": 0.01116094, + "auxiliary_loss_mlp": 0.01040142, + "balance_loss_clip": 1.04019082, + "balance_loss_mlp": 1.02600408, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 1.7310799716943774, + "language_loss": 0.70150149, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72306389, + "num_input_tokens_seen": 185241295, + "step": 8618, + "time_per_iteration": 2.4386653900146484 + }, + { + "auxiliary_loss_clip": 0.01027701, + "auxiliary_loss_mlp": 0.01004069, + "balance_loss_clip": 1.01135397, + "balance_loss_mlp": 1.00273943, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9552707444842252, + "language_loss": 0.67323828, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69355601, + "num_input_tokens_seen": 185298295, + "step": 8619, + "time_per_iteration": 4.444633483886719 + }, + { + "auxiliary_loss_clip": 0.01066565, + "auxiliary_loss_mlp": 0.01028074, + "balance_loss_clip": 1.03708029, + "balance_loss_mlp": 1.01578379, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 2.0407339789914674, + "language_loss": 0.79770744, + "learning_rate": 1.97877473680631e-06, + "loss": 0.81865382, + "num_input_tokens_seen": 185317000, + "step": 8620, + "time_per_iteration": 2.565626621246338 + }, + { + "auxiliary_loss_clip": 0.01052473, + "auxiliary_loss_mlp": 0.00789746, + "balance_loss_clip": 1.04146719, + "balance_loss_mlp": 1.01435924, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 2.144723481257175, + "language_loss": 0.82410729, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.84252942, + "num_input_tokens_seen": 185331185, + "step": 8621, + "time_per_iteration": 2.617462158203125 + }, + { + "auxiliary_loss_clip": 0.01089281, + "auxiliary_loss_mlp": 0.01036978, + "balance_loss_clip": 1.03642046, + "balance_loss_mlp": 1.02479458, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 2.6932757791805613, + "language_loss": 0.65788847, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.67915106, + "num_input_tokens_seen": 185348955, + "step": 8622, + "time_per_iteration": 2.542924404144287 + }, + { + "auxiliary_loss_clip": 0.01097311, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.04015231, + "balance_loss_mlp": 1.03045094, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 1.8790793169907474, + "language_loss": 0.60726023, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62867558, + "num_input_tokens_seen": 185367330, + "step": 8623, + "time_per_iteration": 2.5110342502593994 + }, + { + "auxiliary_loss_clip": 0.01115131, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.03985214, + "balance_loss_mlp": 1.01982045, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.8068808834345298, + "language_loss": 0.76377332, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78524697, + "num_input_tokens_seen": 185385060, + "step": 8624, + "time_per_iteration": 2.462740898132324 + }, + { + "auxiliary_loss_clip": 0.01070055, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_clip": 1.03627169, + "balance_loss_mlp": 1.03068078, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 1.9402870740706513, + "language_loss": 0.71271795, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73386759, + "num_input_tokens_seen": 185403745, + "step": 8625, + "time_per_iteration": 2.602905511856079 + }, + { + "auxiliary_loss_clip": 0.01092654, + "auxiliary_loss_mlp": 0.01032411, + "balance_loss_clip": 1.03752184, + "balance_loss_mlp": 1.01991749, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.8991767441278926, + "language_loss": 0.67653024, + "learning_rate": 1.976438113333184e-06, + "loss": 0.69778085, + "num_input_tokens_seen": 185422620, + "step": 8626, + "time_per_iteration": 2.5211689472198486 + }, + { + "auxiliary_loss_clip": 0.01101416, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.03826964, + "balance_loss_mlp": 1.01927876, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 2.262655165882253, + "language_loss": 0.70462686, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.7259587, + "num_input_tokens_seen": 185439380, + "step": 8627, + "time_per_iteration": 2.513781785964966 + }, + { + "auxiliary_loss_clip": 0.01119817, + "auxiliary_loss_mlp": 0.0079075, + "balance_loss_clip": 1.04184008, + "balance_loss_mlp": 1.01597798, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 1.732148273090294, + "language_loss": 0.73344553, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.75255126, + "num_input_tokens_seen": 185458830, + "step": 8628, + "time_per_iteration": 2.4724137783050537 + }, + { + "auxiliary_loss_clip": 0.01091875, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.04012024, + "balance_loss_mlp": 1.01974559, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 1.7421064604622019, + "language_loss": 0.77487451, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.79611254, + "num_input_tokens_seen": 185477270, + "step": 8629, + "time_per_iteration": 2.5529510974884033 + }, + { + "auxiliary_loss_clip": 0.011081, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.04129827, + "balance_loss_mlp": 1.01721287, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 2.0608653472631775, + "language_loss": 0.74582458, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.76721555, + "num_input_tokens_seen": 185495795, + "step": 8630, + "time_per_iteration": 2.4912517070770264 + }, + { + "auxiliary_loss_clip": 0.01104471, + "auxiliary_loss_mlp": 0.01037631, + "balance_loss_clip": 1.0394485, + "balance_loss_mlp": 1.02380884, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 1.6261331927624005, + "language_loss": 0.80480981, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82623088, + "num_input_tokens_seen": 185514885, + "step": 8631, + "time_per_iteration": 2.515448808670044 + }, + { + "auxiliary_loss_clip": 0.01105655, + "auxiliary_loss_mlp": 0.01030781, + "balance_loss_clip": 1.04018819, + "balance_loss_mlp": 1.01759684, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.5483085180614025, + "language_loss": 0.74489039, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76625472, + "num_input_tokens_seen": 185537155, + "step": 8632, + "time_per_iteration": 2.5398783683776855 + }, + { + "auxiliary_loss_clip": 0.01077034, + "auxiliary_loss_mlp": 0.01030477, + "balance_loss_clip": 1.03555846, + "balance_loss_mlp": 1.01798368, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 3.9692696883533416, + "language_loss": 0.78825802, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.80933315, + "num_input_tokens_seen": 185555520, + "step": 8633, + "time_per_iteration": 2.539912700653076 + }, + { + "auxiliary_loss_clip": 0.0110449, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.03854156, + "balance_loss_mlp": 1.01938832, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 1.6070408338019309, + "language_loss": 0.80434287, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82570845, + "num_input_tokens_seen": 185573855, + "step": 8634, + "time_per_iteration": 2.558678388595581 + }, + { + "auxiliary_loss_clip": 0.01110802, + "auxiliary_loss_mlp": 0.0103714, + "balance_loss_clip": 1.03786922, + "balance_loss_mlp": 1.02431917, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 2.503726346579817, + "language_loss": 0.68689042, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.70836985, + "num_input_tokens_seen": 185595145, + "step": 8635, + "time_per_iteration": 2.518061399459839 + }, + { + "auxiliary_loss_clip": 0.01091588, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.0377512, + "balance_loss_mlp": 1.02031398, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.7070705428814292, + "language_loss": 0.77734834, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79859668, + "num_input_tokens_seen": 185613320, + "step": 8636, + "time_per_iteration": 2.484654188156128 + }, + { + "auxiliary_loss_clip": 0.01116639, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.04002023, + "balance_loss_mlp": 1.01842952, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 2.2779517998232692, + "language_loss": 0.71702075, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.73850572, + "num_input_tokens_seen": 185630730, + "step": 8637, + "time_per_iteration": 2.4275684356689453 + }, + { + "auxiliary_loss_clip": 0.01077792, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.03835809, + "balance_loss_mlp": 1.0206871, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 1.882226768787918, + "language_loss": 0.75868416, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.77979374, + "num_input_tokens_seen": 185648515, + "step": 8638, + "time_per_iteration": 2.519571304321289 + }, + { + "auxiliary_loss_clip": 0.01081546, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.03955531, + "balance_loss_mlp": 1.01970518, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 1.8233617478808053, + "language_loss": 0.74827063, + "learning_rate": 1.971375543740272e-06, + "loss": 0.76941127, + "num_input_tokens_seen": 185665220, + "step": 8639, + "time_per_iteration": 2.5331456661224365 + }, + { + "auxiliary_loss_clip": 0.01112773, + "auxiliary_loss_mlp": 0.01027994, + "balance_loss_clip": 1.03929496, + "balance_loss_mlp": 1.01505983, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.8245252823827376, + "language_loss": 0.77578926, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79719698, + "num_input_tokens_seen": 185683750, + "step": 8640, + "time_per_iteration": 2.4831273555755615 + }, + { + "auxiliary_loss_clip": 0.0107576, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.04236495, + "balance_loss_mlp": 1.01864398, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 2.23274805898455, + "language_loss": 0.65969908, + "learning_rate": 1.97059670234927e-06, + "loss": 0.68076849, + "num_input_tokens_seen": 185700625, + "step": 8641, + "time_per_iteration": 2.511431932449341 + }, + { + "auxiliary_loss_clip": 0.01112971, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.0400697, + "balance_loss_mlp": 1.01720643, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.8754827005115735, + "language_loss": 0.76196408, + "learning_rate": 1.97020728331885e-06, + "loss": 0.7833904, + "num_input_tokens_seen": 185721155, + "step": 8642, + "time_per_iteration": 2.5133354663848877 + }, + { + "auxiliary_loss_clip": 0.01113205, + "auxiliary_loss_mlp": 0.01029929, + "balance_loss_clip": 1.04036272, + "balance_loss_mlp": 1.0173825, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.5287769252861678, + "language_loss": 0.82806933, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.84950066, + "num_input_tokens_seen": 185740990, + "step": 8643, + "time_per_iteration": 2.5063583850860596 + }, + { + "auxiliary_loss_clip": 0.01116197, + "auxiliary_loss_mlp": 0.01040539, + "balance_loss_clip": 1.03926909, + "balance_loss_mlp": 1.02707982, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.5872518823115098, + "language_loss": 0.70307505, + "learning_rate": 1.969428448662004e-06, + "loss": 0.7246424, + "num_input_tokens_seen": 185762235, + "step": 8644, + "time_per_iteration": 2.493152141571045 + }, + { + "auxiliary_loss_clip": 0.01102603, + "auxiliary_loss_mlp": 0.00791586, + "balance_loss_clip": 1.03869605, + "balance_loss_mlp": 1.01806927, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.5936948648610312, + "language_loss": 0.80137736, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82031929, + "num_input_tokens_seen": 185783415, + "step": 8645, + "time_per_iteration": 2.5556018352508545 + }, + { + "auxiliary_loss_clip": 0.01112383, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.03740573, + "balance_loss_mlp": 1.01390433, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.689974194669513, + "language_loss": 0.77839077, + "learning_rate": 1.968649618642264e-06, + "loss": 0.79978669, + "num_input_tokens_seen": 185801345, + "step": 8646, + "time_per_iteration": 2.451239585876465 + }, + { + "auxiliary_loss_clip": 0.01106524, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.04076004, + "balance_loss_mlp": 1.02287447, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.8369655537570007, + "language_loss": 0.65590304, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.67732447, + "num_input_tokens_seen": 185820815, + "step": 8647, + "time_per_iteration": 3.8405275344848633 + }, + { + "auxiliary_loss_clip": 0.01115774, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.03884268, + "balance_loss_mlp": 1.01750708, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 2.01948628128364, + "language_loss": 0.71659976, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73807681, + "num_input_tokens_seen": 185841450, + "step": 8648, + "time_per_iteration": 2.5002434253692627 + }, + { + "auxiliary_loss_clip": 0.01095778, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.04074311, + "balance_loss_mlp": 1.01601398, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 1.9758222854651328, + "language_loss": 0.64301622, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66426748, + "num_input_tokens_seen": 185859935, + "step": 8649, + "time_per_iteration": 2.542113780975342 + }, + { + "auxiliary_loss_clip": 0.01092584, + "auxiliary_loss_mlp": 0.01035441, + "balance_loss_clip": 1.0396595, + "balance_loss_mlp": 1.02071285, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 1.6925558475643878, + "language_loss": 0.70253748, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72381771, + "num_input_tokens_seen": 185876795, + "step": 8650, + "time_per_iteration": 2.4972283840179443 + }, + { + "auxiliary_loss_clip": 0.01112512, + "auxiliary_loss_mlp": 0.01028925, + "balance_loss_clip": 1.03820491, + "balance_loss_mlp": 1.01615739, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 1.777585827141164, + "language_loss": 0.78004038, + "learning_rate": 1.966702564655496e-06, + "loss": 0.80145478, + "num_input_tokens_seen": 185895570, + "step": 8651, + "time_per_iteration": 4.09261679649353 + }, + { + "auxiliary_loss_clip": 0.01061116, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_clip": 1.04038095, + "balance_loss_mlp": 1.02291834, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 2.7212502647484373, + "language_loss": 0.78867269, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80965173, + "num_input_tokens_seen": 185913700, + "step": 8652, + "time_per_iteration": 2.6391310691833496 + }, + { + "auxiliary_loss_clip": 0.01080732, + "auxiliary_loss_mlp": 0.01032611, + "balance_loss_clip": 1.04332149, + "balance_loss_mlp": 1.01735771, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 1.7378500904169487, + "language_loss": 0.70319384, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72432733, + "num_input_tokens_seen": 185932460, + "step": 8653, + "time_per_iteration": 3.9839365482330322 + }, + { + "auxiliary_loss_clip": 0.01083811, + "auxiliary_loss_mlp": 0.01040773, + "balance_loss_clip": 1.03956068, + "balance_loss_mlp": 1.02678931, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 2.465497332261721, + "language_loss": 0.78455937, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80580521, + "num_input_tokens_seen": 185952030, + "step": 8654, + "time_per_iteration": 2.5471973419189453 + }, + { + "auxiliary_loss_clip": 0.01108188, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.04179239, + "balance_loss_mlp": 1.02391601, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 2.0324611007248534, + "language_loss": 0.84102285, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86248076, + "num_input_tokens_seen": 185973130, + "step": 8655, + "time_per_iteration": 2.5385210514068604 + }, + { + "auxiliary_loss_clip": 0.0110487, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.04070377, + "balance_loss_mlp": 1.01955414, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 2.6022067176134818, + "language_loss": 0.65828109, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.67964566, + "num_input_tokens_seen": 185990200, + "step": 8656, + "time_per_iteration": 2.461646556854248 + }, + { + "auxiliary_loss_clip": 0.01076417, + "auxiliary_loss_mlp": 0.0104012, + "balance_loss_clip": 1.04244661, + "balance_loss_mlp": 1.02675664, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 1.842141207239616, + "language_loss": 0.73149371, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.75265908, + "num_input_tokens_seen": 186009880, + "step": 8657, + "time_per_iteration": 2.603379249572754 + }, + { + "auxiliary_loss_clip": 0.01081224, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.038468, + "balance_loss_mlp": 1.01732051, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 1.8091173280179478, + "language_loss": 0.7185241, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73964703, + "num_input_tokens_seen": 186026680, + "step": 8658, + "time_per_iteration": 4.023704290390015 + }, + { + "auxiliary_loss_clip": 0.0111374, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.03816843, + "balance_loss_mlp": 1.01890242, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 1.9619235705195328, + "language_loss": 0.83147341, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85292816, + "num_input_tokens_seen": 186046920, + "step": 8659, + "time_per_iteration": 2.4564363956451416 + }, + { + "auxiliary_loss_clip": 0.01095368, + "auxiliary_loss_mlp": 0.01039163, + "balance_loss_clip": 1.04005694, + "balance_loss_mlp": 1.02420807, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 1.9226158220637388, + "language_loss": 0.75475729, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.7761026, + "num_input_tokens_seen": 186062090, + "step": 8660, + "time_per_iteration": 2.490968704223633 + }, + { + "auxiliary_loss_clip": 0.01114008, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.03960741, + "balance_loss_mlp": 1.01958585, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 1.8905497208617705, + "language_loss": 0.77989209, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.80135185, + "num_input_tokens_seen": 186081135, + "step": 8661, + "time_per_iteration": 2.4487507343292236 + }, + { + "auxiliary_loss_clip": 0.01094765, + "auxiliary_loss_mlp": 0.01029644, + "balance_loss_clip": 1.03846431, + "balance_loss_mlp": 1.01718652, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 2.0435291822304946, + "language_loss": 0.70238805, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.7236321, + "num_input_tokens_seen": 186099700, + "step": 8662, + "time_per_iteration": 2.5561916828155518 + }, + { + "auxiliary_loss_clip": 0.01100255, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.03891993, + "balance_loss_mlp": 1.01496255, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.676964131272476, + "language_loss": 0.69494367, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71623433, + "num_input_tokens_seen": 186119740, + "step": 8663, + "time_per_iteration": 2.5034971237182617 + }, + { + "auxiliary_loss_clip": 0.01086297, + "auxiliary_loss_mlp": 0.00790036, + "balance_loss_clip": 1.03815198, + "balance_loss_mlp": 1.0140326, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 2.6085773381028927, + "language_loss": 0.76811326, + "learning_rate": 1.961640376626072e-06, + "loss": 0.78687662, + "num_input_tokens_seen": 186140645, + "step": 8664, + "time_per_iteration": 2.5453169345855713 + }, + { + "auxiliary_loss_clip": 0.01090071, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.0413065, + "balance_loss_mlp": 1.0195744, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 3.8763948052211767, + "language_loss": 0.76015508, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78138262, + "num_input_tokens_seen": 186160130, + "step": 8665, + "time_per_iteration": 2.52441143989563 + }, + { + "auxiliary_loss_clip": 0.01103812, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_clip": 1.04054928, + "balance_loss_mlp": 1.01809347, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.9239550526599731, + "language_loss": 0.72301877, + "learning_rate": 1.960861599474586e-06, + "loss": 0.74435914, + "num_input_tokens_seen": 186179485, + "step": 8666, + "time_per_iteration": 2.501018524169922 + }, + { + "auxiliary_loss_clip": 0.0110119, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.04129028, + "balance_loss_mlp": 1.01955462, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 2.3906737167742182, + "language_loss": 0.68475461, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.70611429, + "num_input_tokens_seen": 186197140, + "step": 8667, + "time_per_iteration": 2.507136583328247 + }, + { + "auxiliary_loss_clip": 0.01070726, + "auxiliary_loss_mlp": 0.01028167, + "balance_loss_clip": 1.03863716, + "balance_loss_mlp": 1.01545882, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.4432150085768403, + "language_loss": 0.80775458, + "learning_rate": 1.960082828259629e-06, + "loss": 0.82874346, + "num_input_tokens_seen": 186216800, + "step": 8668, + "time_per_iteration": 2.637395143508911 + }, + { + "auxiliary_loss_clip": 0.01094062, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.0389092, + "balance_loss_mlp": 1.01647377, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 1.9936162939812752, + "language_loss": 0.64025676, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.66149199, + "num_input_tokens_seen": 186235320, + "step": 8669, + "time_per_iteration": 2.5106399059295654 + }, + { + "auxiliary_loss_clip": 0.01092902, + "auxiliary_loss_mlp": 0.00791674, + "balance_loss_clip": 1.04062152, + "balance_loss_mlp": 1.01676881, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.5867963858312661, + "language_loss": 0.66485476, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68370044, + "num_input_tokens_seen": 186254460, + "step": 8670, + "time_per_iteration": 2.564114809036255 + }, + { + "auxiliary_loss_clip": 0.01075817, + "auxiliary_loss_mlp": 0.01030887, + "balance_loss_clip": 1.0377295, + "balance_loss_mlp": 1.01826823, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.0371900041078645, + "language_loss": 0.75988162, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78094864, + "num_input_tokens_seen": 186269465, + "step": 8671, + "time_per_iteration": 2.577138662338257 + }, + { + "auxiliary_loss_clip": 0.010869, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.04657304, + "balance_loss_mlp": 1.01929677, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 17.105786659557662, + "language_loss": 0.78165907, + "learning_rate": 1.958525304111796e-06, + "loss": 0.8028571, + "num_input_tokens_seen": 186288660, + "step": 8672, + "time_per_iteration": 2.5721752643585205 + }, + { + "auxiliary_loss_clip": 0.01074656, + "auxiliary_loss_mlp": 0.01027845, + "balance_loss_clip": 1.03516054, + "balance_loss_mlp": 1.01585841, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 2.0719204737226855, + "language_loss": 0.72157842, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74260342, + "num_input_tokens_seen": 186305760, + "step": 8673, + "time_per_iteration": 2.514864444732666 + }, + { + "auxiliary_loss_clip": 0.01099283, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.03796792, + "balance_loss_mlp": 1.01799107, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.6930346337464368, + "language_loss": 0.74821937, + "learning_rate": 1.957746551415166e-06, + "loss": 0.76952457, + "num_input_tokens_seen": 186324135, + "step": 8674, + "time_per_iteration": 2.4813055992126465 + }, + { + "auxiliary_loss_clip": 0.01088927, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.03878188, + "balance_loss_mlp": 1.0211786, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 1.959735143198719, + "language_loss": 0.8638131, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88506198, + "num_input_tokens_seen": 186340205, + "step": 8675, + "time_per_iteration": 2.48199462890625 + }, + { + "auxiliary_loss_clip": 0.01023717, + "auxiliary_loss_mlp": 0.01001321, + "balance_loss_clip": 1.02431059, + "balance_loss_mlp": 0.99999791, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.8727889866804611, + "language_loss": 0.63111597, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65136635, + "num_input_tokens_seen": 186396940, + "step": 8676, + "time_per_iteration": 3.087775945663452 + }, + { + "auxiliary_loss_clip": 0.01099878, + "auxiliary_loss_mlp": 0.01027862, + "balance_loss_clip": 1.03805399, + "balance_loss_mlp": 1.01529741, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.7445827327926693, + "language_loss": 0.69051492, + "learning_rate": 1.956578434424046e-06, + "loss": 0.71179235, + "num_input_tokens_seen": 186418680, + "step": 8677, + "time_per_iteration": 2.5275986194610596 + }, + { + "auxiliary_loss_clip": 0.01099798, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.03684926, + "balance_loss_mlp": 1.01825595, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 5.657907097070374, + "language_loss": 0.65031618, + "learning_rate": 1.956189065367086e-06, + "loss": 0.67162973, + "num_input_tokens_seen": 186438265, + "step": 8678, + "time_per_iteration": 2.539461135864258 + }, + { + "auxiliary_loss_clip": 0.01092376, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.03655982, + "balance_loss_mlp": 1.01836276, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 2.3486404898146778, + "language_loss": 0.68526196, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.70651019, + "num_input_tokens_seen": 186456870, + "step": 8679, + "time_per_iteration": 2.5256125926971436 + }, + { + "auxiliary_loss_clip": 0.01115563, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.04051638, + "balance_loss_mlp": 1.01897919, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.8897415674150861, + "language_loss": 0.6679877, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.68946618, + "num_input_tokens_seen": 186476425, + "step": 8680, + "time_per_iteration": 2.4672603607177734 + }, + { + "auxiliary_loss_clip": 0.01114402, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.03952765, + "balance_loss_mlp": 1.0153383, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 1.9409112336652046, + "language_loss": 0.83081436, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85224569, + "num_input_tokens_seen": 186492555, + "step": 8681, + "time_per_iteration": 2.444559335708618 + }, + { + "auxiliary_loss_clip": 0.01088146, + "auxiliary_loss_mlp": 0.01028997, + "balance_loss_clip": 1.03766418, + "balance_loss_mlp": 1.01644993, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 1.7068716944222566, + "language_loss": 0.77768308, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79885447, + "num_input_tokens_seen": 186513190, + "step": 8682, + "time_per_iteration": 2.5654208660125732 + }, + { + "auxiliary_loss_clip": 0.01077515, + "auxiliary_loss_mlp": 0.01039359, + "balance_loss_clip": 1.0367806, + "balance_loss_mlp": 1.026407, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.5186975305501975, + "language_loss": 0.69082403, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71199274, + "num_input_tokens_seen": 186534830, + "step": 8683, + "time_per_iteration": 2.6406726837158203 + }, + { + "auxiliary_loss_clip": 0.01081916, + "auxiliary_loss_mlp": 0.01035121, + "balance_loss_clip": 1.03840232, + "balance_loss_mlp": 1.02167392, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 1.624111477684696, + "language_loss": 0.76172948, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78289986, + "num_input_tokens_seen": 186554390, + "step": 8684, + "time_per_iteration": 2.5561280250549316 + }, + { + "auxiliary_loss_clip": 0.01093688, + "auxiliary_loss_mlp": 0.00788077, + "balance_loss_clip": 1.03638148, + "balance_loss_mlp": 1.01393366, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.649278347652008, + "language_loss": 0.75947666, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.77829432, + "num_input_tokens_seen": 186572360, + "step": 8685, + "time_per_iteration": 3.879373788833618 + }, + { + "auxiliary_loss_clip": 0.01090929, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.03983414, + "balance_loss_mlp": 1.02224922, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 1.8468944599401247, + "language_loss": 0.81012225, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83138001, + "num_input_tokens_seen": 186590655, + "step": 8686, + "time_per_iteration": 2.4995908737182617 + }, + { + "auxiliary_loss_clip": 0.01088291, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.03483987, + "balance_loss_mlp": 1.01803958, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 1.843240689994817, + "language_loss": 0.69451106, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.71569407, + "num_input_tokens_seen": 186610345, + "step": 8687, + "time_per_iteration": 2.579038143157959 + }, + { + "auxiliary_loss_clip": 0.01107926, + "auxiliary_loss_mlp": 0.01027485, + "balance_loss_clip": 1.03627372, + "balance_loss_mlp": 1.015499, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 2.0685933752717505, + "language_loss": 0.83317316, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85452724, + "num_input_tokens_seen": 186624360, + "step": 8688, + "time_per_iteration": 2.4149386882781982 + }, + { + "auxiliary_loss_clip": 0.01099284, + "auxiliary_loss_mlp": 0.00786259, + "balance_loss_clip": 1.03697848, + "balance_loss_mlp": 1.00750661, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 2.3979638977707998, + "language_loss": 0.73249626, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75135165, + "num_input_tokens_seen": 186638680, + "step": 8689, + "time_per_iteration": 3.857513189315796 + }, + { + "auxiliary_loss_clip": 0.01087374, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.03503776, + "balance_loss_mlp": 1.01392651, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 1.9199663752926093, + "language_loss": 0.82369959, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.84484208, + "num_input_tokens_seen": 186655840, + "step": 8690, + "time_per_iteration": 2.4782347679138184 + }, + { + "auxiliary_loss_clip": 0.01083907, + "auxiliary_loss_mlp": 0.01034509, + "balance_loss_clip": 1.03830159, + "balance_loss_mlp": 1.02096689, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 2.4013411171283674, + "language_loss": 0.79319954, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81438375, + "num_input_tokens_seen": 186674150, + "step": 8691, + "time_per_iteration": 2.57486629486084 + }, + { + "auxiliary_loss_clip": 0.0110675, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.03858805, + "balance_loss_mlp": 1.01758146, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 2.0922449570523143, + "language_loss": 0.77166033, + "learning_rate": 1.950738079725646e-06, + "loss": 0.79304582, + "num_input_tokens_seen": 186690675, + "step": 8692, + "time_per_iteration": 3.891012668609619 + }, + { + "auxiliary_loss_clip": 0.01097766, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.03742647, + "balance_loss_mlp": 1.01959348, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.6522018230322035, + "language_loss": 0.72648281, + "learning_rate": 1.950348737138691e-06, + "loss": 0.7477746, + "num_input_tokens_seen": 186710380, + "step": 8693, + "time_per_iteration": 2.5275096893310547 + }, + { + "auxiliary_loss_clip": 0.01116571, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.03856099, + "balance_loss_mlp": 1.02095747, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 1.9583509776823018, + "language_loss": 0.81809103, + "learning_rate": 1.949959396434517e-06, + "loss": 0.83961099, + "num_input_tokens_seen": 186729135, + "step": 8694, + "time_per_iteration": 2.4680869579315186 + }, + { + "auxiliary_loss_clip": 0.01010814, + "auxiliary_loss_mlp": 0.0100502, + "balance_loss_clip": 1.01996565, + "balance_loss_mlp": 1.00347078, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.7688733641910941, + "language_loss": 0.55761546, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57777381, + "num_input_tokens_seen": 186791115, + "step": 8695, + "time_per_iteration": 3.1782405376434326 + }, + { + "auxiliary_loss_clip": 0.01057112, + "auxiliary_loss_mlp": 0.01035315, + "balance_loss_clip": 1.03825879, + "balance_loss_mlp": 1.02232122, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 2.2608282030909495, + "language_loss": 0.73245674, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75338095, + "num_input_tokens_seen": 186808660, + "step": 8696, + "time_per_iteration": 4.035473823547363 + }, + { + "auxiliary_loss_clip": 0.01088869, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.03721988, + "balance_loss_mlp": 1.01854253, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.5287826612410875, + "language_loss": 0.71477211, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73597383, + "num_input_tokens_seen": 186825900, + "step": 8697, + "time_per_iteration": 2.5421485900878906 + }, + { + "auxiliary_loss_clip": 0.01085407, + "auxiliary_loss_mlp": 0.0103197, + "balance_loss_clip": 1.04171693, + "balance_loss_mlp": 1.01970959, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 1.804976649049219, + "language_loss": 0.81010628, + "learning_rate": 1.948402052740906e-06, + "loss": 0.83127999, + "num_input_tokens_seen": 186843735, + "step": 8698, + "time_per_iteration": 2.515723466873169 + }, + { + "auxiliary_loss_clip": 0.01101743, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.03705716, + "balance_loss_mlp": 1.02301955, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.6403860049947263, + "language_loss": 0.74162734, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76300633, + "num_input_tokens_seen": 186862440, + "step": 8699, + "time_per_iteration": 2.4957876205444336 + }, + { + "auxiliary_loss_clip": 0.01103536, + "auxiliary_loss_mlp": 0.00788205, + "balance_loss_clip": 1.03603351, + "balance_loss_mlp": 1.0092628, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 1.844928870266146, + "language_loss": 0.7343812, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75329852, + "num_input_tokens_seen": 186880940, + "step": 8700, + "time_per_iteration": 2.491260528564453 + }, + { + "auxiliary_loss_clip": 0.01090994, + "auxiliary_loss_mlp": 0.01037795, + "balance_loss_clip": 1.03999245, + "balance_loss_mlp": 1.02354348, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.70911873312012, + "language_loss": 0.6664986, + "learning_rate": 1.947234065463318e-06, + "loss": 0.68778646, + "num_input_tokens_seen": 186900785, + "step": 8701, + "time_per_iteration": 2.570749282836914 + }, + { + "auxiliary_loss_clip": 0.0109557, + "auxiliary_loss_mlp": 0.00788418, + "balance_loss_clip": 1.03925312, + "balance_loss_mlp": 1.01158309, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 2.007791589512374, + "language_loss": 0.66690004, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68573987, + "num_input_tokens_seen": 186920895, + "step": 8702, + "time_per_iteration": 2.542449474334717 + }, + { + "auxiliary_loss_clip": 0.01092835, + "auxiliary_loss_mlp": 0.01035997, + "balance_loss_clip": 1.03876638, + "balance_loss_mlp": 1.02218091, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 1.8321740174622712, + "language_loss": 0.76433766, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78562593, + "num_input_tokens_seen": 186940605, + "step": 8703, + "time_per_iteration": 2.5341320037841797 + }, + { + "auxiliary_loss_clip": 0.01107759, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_clip": 1.03887486, + "balance_loss_mlp": 1.02629948, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 2.0931583108628766, + "language_loss": 0.77200091, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.79349887, + "num_input_tokens_seen": 186960820, + "step": 8704, + "time_per_iteration": 2.5919010639190674 + }, + { + "auxiliary_loss_clip": 0.01093082, + "auxiliary_loss_mlp": 0.01041053, + "balance_loss_clip": 1.04044926, + "balance_loss_mlp": 1.02816629, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.7078220883326165, + "language_loss": 0.78048301, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80182433, + "num_input_tokens_seen": 186976240, + "step": 8705, + "time_per_iteration": 2.4931838512420654 + }, + { + "auxiliary_loss_clip": 0.01096726, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.0391897, + "balance_loss_mlp": 1.02304196, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 2.3698762430049376, + "language_loss": 0.69435143, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.71568966, + "num_input_tokens_seen": 186992855, + "step": 8706, + "time_per_iteration": 2.4978771209716797 + }, + { + "auxiliary_loss_clip": 0.01036206, + "auxiliary_loss_mlp": 0.01001807, + "balance_loss_clip": 1.0198164, + "balance_loss_mlp": 1.00045955, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6736277615222989, + "language_loss": 0.52423972, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54461986, + "num_input_tokens_seen": 187051205, + "step": 8707, + "time_per_iteration": 3.1839168071746826 + }, + { + "auxiliary_loss_clip": 0.01091507, + "auxiliary_loss_mlp": 0.01039825, + "balance_loss_clip": 1.0377363, + "balance_loss_mlp": 1.02686119, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.7655117076606108, + "language_loss": 0.74599135, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.76730466, + "num_input_tokens_seen": 187070540, + "step": 8708, + "time_per_iteration": 2.522873640060425 + }, + { + "auxiliary_loss_clip": 0.01089592, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.04245305, + "balance_loss_mlp": 1.01689911, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.5699490454702734, + "language_loss": 0.77213848, + "learning_rate": 1.944119521844849e-06, + "loss": 0.7933327, + "num_input_tokens_seen": 187089975, + "step": 8709, + "time_per_iteration": 2.5431578159332275 + }, + { + "auxiliary_loss_clip": 0.01068655, + "auxiliary_loss_mlp": 0.01042045, + "balance_loss_clip": 1.03533757, + "balance_loss_mlp": 1.02579033, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 1.9151045521292223, + "language_loss": 0.83609593, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85720295, + "num_input_tokens_seen": 187108775, + "step": 8710, + "time_per_iteration": 2.6103403568267822 + }, + { + "auxiliary_loss_clip": 0.01088739, + "auxiliary_loss_mlp": 0.01033079, + "balance_loss_clip": 1.03981674, + "balance_loss_mlp": 1.01955497, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 2.5695939917736883, + "language_loss": 0.69107461, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71229273, + "num_input_tokens_seen": 187128830, + "step": 8711, + "time_per_iteration": 2.592047691345215 + }, + { + "auxiliary_loss_clip": 0.01102208, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.03765416, + "balance_loss_mlp": 1.0216887, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 2.0123222747919955, + "language_loss": 0.83267754, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85404783, + "num_input_tokens_seen": 187149570, + "step": 8712, + "time_per_iteration": 2.488284111022949 + }, + { + "auxiliary_loss_clip": 0.01117119, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.03876901, + "balance_loss_mlp": 1.02411592, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.8564009908708676, + "language_loss": 0.69867665, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.72023302, + "num_input_tokens_seen": 187170575, + "step": 8713, + "time_per_iteration": 2.4852495193481445 + }, + { + "auxiliary_loss_clip": 0.01078974, + "auxiliary_loss_mlp": 0.01041201, + "balance_loss_clip": 1.03527677, + "balance_loss_mlp": 1.02442789, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.752794153358208, + "language_loss": 0.76825464, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.78945637, + "num_input_tokens_seen": 187187190, + "step": 8714, + "time_per_iteration": 2.5047988891601562 + }, + { + "auxiliary_loss_clip": 0.01079565, + "auxiliary_loss_mlp": 0.01038349, + "balance_loss_clip": 1.03809917, + "balance_loss_mlp": 1.02167177, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 1.8740416002762232, + "language_loss": 0.75704336, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.7782225, + "num_input_tokens_seen": 187204350, + "step": 8715, + "time_per_iteration": 2.549891710281372 + }, + { + "auxiliary_loss_clip": 0.01089722, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.03718758, + "balance_loss_mlp": 1.0165385, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.5661019729460361, + "language_loss": 0.71173388, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73292887, + "num_input_tokens_seen": 187225605, + "step": 8716, + "time_per_iteration": 2.587007522583008 + }, + { + "auxiliary_loss_clip": 0.01114639, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.03913188, + "balance_loss_mlp": 1.02388155, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 2.3391550836856854, + "language_loss": 0.86920726, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89071441, + "num_input_tokens_seen": 187241335, + "step": 8717, + "time_per_iteration": 2.4550609588623047 + }, + { + "auxiliary_loss_clip": 0.01101924, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.04214692, + "balance_loss_mlp": 1.02126169, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 1.9312547466147425, + "language_loss": 0.61441904, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.63578546, + "num_input_tokens_seen": 187259925, + "step": 8718, + "time_per_iteration": 2.513948917388916 + }, + { + "auxiliary_loss_clip": 0.01089298, + "auxiliary_loss_mlp": 0.01036237, + "balance_loss_clip": 1.04176033, + "balance_loss_mlp": 1.02199745, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 1.6803650926277252, + "language_loss": 0.71983242, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74108779, + "num_input_tokens_seen": 187279035, + "step": 8719, + "time_per_iteration": 2.560619354248047 + }, + { + "auxiliary_loss_clip": 0.01098694, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.03732991, + "balance_loss_mlp": 1.01532853, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 2.1605978537716424, + "language_loss": 0.73222649, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.75348473, + "num_input_tokens_seen": 187297555, + "step": 8720, + "time_per_iteration": 2.4765515327453613 + }, + { + "auxiliary_loss_clip": 0.01102629, + "auxiliary_loss_mlp": 0.01032963, + "balance_loss_clip": 1.03714252, + "balance_loss_mlp": 1.01958764, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.6013333602451, + "language_loss": 0.70355785, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72491378, + "num_input_tokens_seen": 187320265, + "step": 8721, + "time_per_iteration": 2.5593059062957764 + }, + { + "auxiliary_loss_clip": 0.0105838, + "auxiliary_loss_mlp": 0.0103393, + "balance_loss_clip": 1.03512335, + "balance_loss_mlp": 1.02016115, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 1.754915162271086, + "language_loss": 0.86436117, + "learning_rate": 1.939058681065813e-06, + "loss": 0.8852843, + "num_input_tokens_seen": 187338045, + "step": 8722, + "time_per_iteration": 2.58982253074646 + }, + { + "auxiliary_loss_clip": 0.01112868, + "auxiliary_loss_mlp": 0.0102969, + "balance_loss_clip": 1.03897548, + "balance_loss_mlp": 1.01555729, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.7893624686833824, + "language_loss": 0.79821312, + "learning_rate": 1.938669401384247e-06, + "loss": 0.81963867, + "num_input_tokens_seen": 187356040, + "step": 8723, + "time_per_iteration": 2.4377899169921875 + }, + { + "auxiliary_loss_clip": 0.01105652, + "auxiliary_loss_mlp": 0.01041454, + "balance_loss_clip": 1.04452538, + "balance_loss_mlp": 1.02679682, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 2.304215940609295, + "language_loss": 0.74813628, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.76960731, + "num_input_tokens_seen": 187374185, + "step": 8724, + "time_per_iteration": 3.865424394607544 + }, + { + "auxiliary_loss_clip": 0.01119721, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.03965557, + "balance_loss_mlp": 1.01779842, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 1.8266430287742712, + "language_loss": 0.7056551, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.7271837, + "num_input_tokens_seen": 187396640, + "step": 8725, + "time_per_iteration": 2.5272607803344727 + }, + { + "auxiliary_loss_clip": 0.01021145, + "auxiliary_loss_mlp": 0.01013254, + "balance_loss_clip": 1.02527869, + "balance_loss_mlp": 1.01144171, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7587613476094566, + "language_loss": 0.55685878, + "learning_rate": 1.937501576352568e-06, + "loss": 0.5772028, + "num_input_tokens_seen": 187455945, + "step": 8726, + "time_per_iteration": 3.165379762649536 + }, + { + "auxiliary_loss_clip": 0.0102999, + "auxiliary_loss_mlp": 0.01008475, + "balance_loss_clip": 1.031129, + "balance_loss_mlp": 1.00641251, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.8004615717218573, + "language_loss": 0.58393478, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60431945, + "num_input_tokens_seen": 187519975, + "step": 8727, + "time_per_iteration": 3.1170053482055664 + }, + { + "auxiliary_loss_clip": 0.01108464, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.0397141, + "balance_loss_mlp": 1.01727414, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.6500189483872674, + "language_loss": 0.70486724, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72626734, + "num_input_tokens_seen": 187541775, + "step": 8728, + "time_per_iteration": 3.8922770023345947 + }, + { + "auxiliary_loss_clip": 0.01102032, + "auxiliary_loss_mlp": 0.01028116, + "balance_loss_clip": 1.03779984, + "balance_loss_mlp": 1.01569438, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.472238882268332, + "language_loss": 0.69734502, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71864647, + "num_input_tokens_seen": 187560425, + "step": 8729, + "time_per_iteration": 2.5030877590179443 + }, + { + "auxiliary_loss_clip": 0.01078935, + "auxiliary_loss_mlp": 0.01031255, + "balance_loss_clip": 1.04031134, + "balance_loss_mlp": 1.01780224, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.773979241842481, + "language_loss": 0.83741182, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85851371, + "num_input_tokens_seen": 187579930, + "step": 8730, + "time_per_iteration": 2.5479469299316406 + }, + { + "auxiliary_loss_clip": 0.0107961, + "auxiliary_loss_mlp": 0.01032536, + "balance_loss_clip": 1.03739667, + "balance_loss_mlp": 1.01914883, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 15.874104710957344, + "language_loss": 0.79655683, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81767821, + "num_input_tokens_seen": 187595365, + "step": 8731, + "time_per_iteration": 3.8872251510620117 + }, + { + "auxiliary_loss_clip": 0.01097067, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.03607988, + "balance_loss_mlp": 1.01585007, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 1.679343717844904, + "language_loss": 0.83085221, + "learning_rate": 1.935165990676312e-06, + "loss": 0.85211599, + "num_input_tokens_seen": 187614715, + "step": 8732, + "time_per_iteration": 2.5287296772003174 + }, + { + "auxiliary_loss_clip": 0.0110179, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.03795397, + "balance_loss_mlp": 1.02053738, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.6752122474050988, + "language_loss": 0.77650261, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.79785228, + "num_input_tokens_seen": 187630745, + "step": 8733, + "time_per_iteration": 2.432487964630127 + }, + { + "auxiliary_loss_clip": 0.01118262, + "auxiliary_loss_mlp": 0.01038246, + "balance_loss_clip": 1.04037309, + "balance_loss_mlp": 1.02437568, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 2.4242306782419245, + "language_loss": 0.81816769, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83973283, + "num_input_tokens_seen": 187648200, + "step": 8734, + "time_per_iteration": 2.428380250930786 + }, + { + "auxiliary_loss_clip": 0.01086979, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.04016435, + "balance_loss_mlp": 1.01767647, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.431980013944061, + "language_loss": 0.76728791, + "learning_rate": 1.933998230828826e-06, + "loss": 0.78846788, + "num_input_tokens_seen": 187669205, + "step": 8735, + "time_per_iteration": 3.9099881649017334 + }, + { + "auxiliary_loss_clip": 0.01103534, + "auxiliary_loss_mlp": 0.0103034, + "balance_loss_clip": 1.03952146, + "balance_loss_mlp": 1.01832414, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 1.90823147654049, + "language_loss": 0.80534697, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.82668573, + "num_input_tokens_seen": 187690890, + "step": 8736, + "time_per_iteration": 2.521735191345215 + }, + { + "auxiliary_loss_clip": 0.01116154, + "auxiliary_loss_mlp": 0.01035315, + "balance_loss_clip": 1.03986359, + "balance_loss_mlp": 1.02193356, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 2.39095008799196, + "language_loss": 0.69893038, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.72044516, + "num_input_tokens_seen": 187713045, + "step": 8737, + "time_per_iteration": 2.529759407043457 + }, + { + "auxiliary_loss_clip": 0.01091121, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.03723693, + "balance_loss_mlp": 1.01913714, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.54763237067798, + "language_loss": 0.7729758, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79420674, + "num_input_tokens_seen": 187733640, + "step": 8738, + "time_per_iteration": 2.5520503520965576 + }, + { + "auxiliary_loss_clip": 0.01025362, + "auxiliary_loss_mlp": 0.00775365, + "balance_loss_clip": 1.01867914, + "balance_loss_mlp": 1.01960647, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7453024967392763, + "language_loss": 0.54465032, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56265759, + "num_input_tokens_seen": 187792930, + "step": 8739, + "time_per_iteration": 3.113673448562622 + }, + { + "auxiliary_loss_clip": 0.0108667, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.03758585, + "balance_loss_mlp": 1.01904488, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 1.6506604187133807, + "language_loss": 0.84365582, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.8648383, + "num_input_tokens_seen": 187812495, + "step": 8740, + "time_per_iteration": 2.629868745803833 + }, + { + "auxiliary_loss_clip": 0.01098376, + "auxiliary_loss_mlp": 0.00789305, + "balance_loss_clip": 1.03589976, + "balance_loss_mlp": 1.01393938, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 1.9490823115248268, + "language_loss": 0.69834214, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71721888, + "num_input_tokens_seen": 187829685, + "step": 8741, + "time_per_iteration": 2.4583370685577393 + }, + { + "auxiliary_loss_clip": 0.01094397, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.03901696, + "balance_loss_mlp": 1.01961875, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 1.8550474595218813, + "language_loss": 0.66424233, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68551564, + "num_input_tokens_seen": 187846495, + "step": 8742, + "time_per_iteration": 2.539607048034668 + }, + { + "auxiliary_loss_clip": 0.01080288, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.03840637, + "balance_loss_mlp": 1.02488041, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 2.2847074711141926, + "language_loss": 0.6357438, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.6569519, + "num_input_tokens_seen": 187862010, + "step": 8743, + "time_per_iteration": 2.506277084350586 + }, + { + "auxiliary_loss_clip": 0.0103348, + "auxiliary_loss_mlp": 0.01006182, + "balance_loss_clip": 1.01725364, + "balance_loss_mlp": 1.00473964, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7854302653966595, + "language_loss": 0.54115582, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56155246, + "num_input_tokens_seen": 187922730, + "step": 8744, + "time_per_iteration": 3.2051408290863037 + }, + { + "auxiliary_loss_clip": 0.01098493, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.04423082, + "balance_loss_mlp": 1.02048063, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.4300920843025735, + "language_loss": 0.75861394, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.77994657, + "num_input_tokens_seen": 187940160, + "step": 8745, + "time_per_iteration": 2.525797128677368 + }, + { + "auxiliary_loss_clip": 0.01102098, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.03777289, + "balance_loss_mlp": 1.02114165, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 1.8451093705348274, + "language_loss": 0.81816769, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.83952546, + "num_input_tokens_seen": 187958625, + "step": 8746, + "time_per_iteration": 2.4902639389038086 + }, + { + "auxiliary_loss_clip": 0.01100951, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.03706956, + "balance_loss_mlp": 1.01931751, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 1.8976963714524646, + "language_loss": 0.7579363, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77927542, + "num_input_tokens_seen": 187977575, + "step": 8747, + "time_per_iteration": 2.5029735565185547 + }, + { + "auxiliary_loss_clip": 0.01048828, + "auxiliary_loss_mlp": 0.01032271, + "balance_loss_clip": 1.03603172, + "balance_loss_mlp": 1.01900923, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 1.9619672488026192, + "language_loss": 0.830715, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.85152602, + "num_input_tokens_seen": 187996650, + "step": 8748, + "time_per_iteration": 2.6143271923065186 + }, + { + "auxiliary_loss_clip": 0.010925, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.03604746, + "balance_loss_mlp": 1.01839495, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 1.9865028714177118, + "language_loss": 0.80437124, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82562506, + "num_input_tokens_seen": 188013510, + "step": 8749, + "time_per_iteration": 2.5303120613098145 + }, + { + "auxiliary_loss_clip": 0.0110192, + "auxiliary_loss_mlp": 0.01035656, + "balance_loss_clip": 1.04081452, + "balance_loss_mlp": 1.02232218, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 1.9137734935401607, + "language_loss": 0.72589874, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74727452, + "num_input_tokens_seen": 188032085, + "step": 8750, + "time_per_iteration": 2.5260841846466064 + }, + { + "auxiliary_loss_clip": 0.01090016, + "auxiliary_loss_mlp": 0.0103127, + "balance_loss_clip": 1.03455818, + "balance_loss_mlp": 1.01832414, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 1.513532697986344, + "language_loss": 0.76519632, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.7864092, + "num_input_tokens_seen": 188050590, + "step": 8751, + "time_per_iteration": 2.5140843391418457 + }, + { + "auxiliary_loss_clip": 0.01110582, + "auxiliary_loss_mlp": 0.01037628, + "balance_loss_clip": 1.03830051, + "balance_loss_mlp": 1.02496815, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.37578034543914, + "language_loss": 0.75937521, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78085732, + "num_input_tokens_seen": 188071620, + "step": 8752, + "time_per_iteration": 2.473507881164551 + }, + { + "auxiliary_loss_clip": 0.01104805, + "auxiliary_loss_mlp": 0.0103585, + "balance_loss_clip": 1.03832579, + "balance_loss_mlp": 1.02160466, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.4313284955959715, + "language_loss": 0.67802441, + "learning_rate": 1.926992158720058e-06, + "loss": 0.69943094, + "num_input_tokens_seen": 188091740, + "step": 8753, + "time_per_iteration": 2.5171830654144287 + }, + { + "auxiliary_loss_clip": 0.01102388, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.03840876, + "balance_loss_mlp": 1.02174544, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.4857943818650503, + "language_loss": 0.83956176, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.8609308, + "num_input_tokens_seen": 188111165, + "step": 8754, + "time_per_iteration": 2.4751389026641846 + }, + { + "auxiliary_loss_clip": 0.0110078, + "auxiliary_loss_mlp": 0.01032398, + "balance_loss_clip": 1.03789735, + "balance_loss_mlp": 1.01862383, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.091364656377627, + "language_loss": 0.87477779, + "learning_rate": 1.926213760058522e-06, + "loss": 0.8961097, + "num_input_tokens_seen": 188127825, + "step": 8755, + "time_per_iteration": 2.4508447647094727 + }, + { + "auxiliary_loss_clip": 0.01015278, + "auxiliary_loss_mlp": 0.01016607, + "balance_loss_clip": 1.02509058, + "balance_loss_mlp": 1.01528955, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7252144031363951, + "language_loss": 0.58847034, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60878921, + "num_input_tokens_seen": 188194050, + "step": 8756, + "time_per_iteration": 3.2170541286468506 + }, + { + "auxiliary_loss_clip": 0.01083571, + "auxiliary_loss_mlp": 0.0102772, + "balance_loss_clip": 1.03610897, + "balance_loss_mlp": 1.01441038, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 1.6580312490106202, + "language_loss": 0.7033093, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72442222, + "num_input_tokens_seen": 188212565, + "step": 8757, + "time_per_iteration": 2.525977373123169 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01031387, + "balance_loss_clip": 1.03686535, + "balance_loss_mlp": 1.01815498, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 2.1628097329276317, + "language_loss": 0.87727249, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.89859641, + "num_input_tokens_seen": 188229505, + "step": 8758, + "time_per_iteration": 2.447530508041382 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.010353, + "balance_loss_clip": 1.03562498, + "balance_loss_mlp": 1.02101827, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 1.558003131295313, + "language_loss": 0.75944114, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78032458, + "num_input_tokens_seen": 188250395, + "step": 8759, + "time_per_iteration": 2.6150898933410645 + }, + { + "auxiliary_loss_clip": 0.01088366, + "auxiliary_loss_mlp": 0.01025233, + "balance_loss_clip": 1.03640616, + "balance_loss_mlp": 1.01219761, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 2.20014332190006, + "language_loss": 0.71758115, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.73871708, + "num_input_tokens_seen": 188266785, + "step": 8760, + "time_per_iteration": 2.4882659912109375 + }, + { + "auxiliary_loss_clip": 0.01093168, + "auxiliary_loss_mlp": 0.01032629, + "balance_loss_clip": 1.04041111, + "balance_loss_mlp": 1.0185864, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 2.5770813794599916, + "language_loss": 0.75720441, + "learning_rate": 1.923878631697736e-06, + "loss": 0.77846241, + "num_input_tokens_seen": 188282525, + "step": 8761, + "time_per_iteration": 2.5288515090942383 + }, + { + "auxiliary_loss_clip": 0.01097739, + "auxiliary_loss_mlp": 0.00799474, + "balance_loss_clip": 1.0361172, + "balance_loss_mlp": 1.03104079, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 1.6357661820363198, + "language_loss": 0.71001863, + "learning_rate": 1.923489453654373e-06, + "loss": 0.72899067, + "num_input_tokens_seen": 188301395, + "step": 8762, + "time_per_iteration": 2.4837486743927 + }, + { + "auxiliary_loss_clip": 0.01020802, + "auxiliary_loss_mlp": 0.01003086, + "balance_loss_clip": 1.02040768, + "balance_loss_mlp": 1.00191772, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9282053988036777, + "language_loss": 0.65442872, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.6746676, + "num_input_tokens_seen": 188357665, + "step": 8763, + "time_per_iteration": 4.388975143432617 + }, + { + "auxiliary_loss_clip": 0.01099311, + "auxiliary_loss_mlp": 0.01025882, + "balance_loss_clip": 1.0356431, + "balance_loss_mlp": 1.01282275, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 2.328471188473617, + "language_loss": 0.7106244, + "learning_rate": 1.922711106286265e-06, + "loss": 0.73187625, + "num_input_tokens_seen": 188376935, + "step": 8764, + "time_per_iteration": 2.4630911350250244 + }, + { + "auxiliary_loss_clip": 0.01072168, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.0352385, + "balance_loss_mlp": 1.01860321, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.6848563622492858, + "language_loss": 0.74117392, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76223177, + "num_input_tokens_seen": 188394995, + "step": 8765, + "time_per_iteration": 2.530003070831299 + }, + { + "auxiliary_loss_clip": 0.01090848, + "auxiliary_loss_mlp": 0.01031658, + "balance_loss_clip": 1.03428078, + "balance_loss_mlp": 1.0173645, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.4736679451753976, + "language_loss": 0.85367787, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87490296, + "num_input_tokens_seen": 188415475, + "step": 8766, + "time_per_iteration": 3.929028272628784 + }, + { + "auxiliary_loss_clip": 0.01116803, + "auxiliary_loss_mlp": 0.01034115, + "balance_loss_clip": 1.04054379, + "balance_loss_mlp": 1.01983964, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 2.1549988913092943, + "language_loss": 0.79211247, + "learning_rate": 1.921543607252017e-06, + "loss": 0.81362164, + "num_input_tokens_seen": 188435665, + "step": 8767, + "time_per_iteration": 2.4735889434814453 + }, + { + "auxiliary_loss_clip": 0.01104873, + "auxiliary_loss_mlp": 0.01031467, + "balance_loss_clip": 1.03779876, + "balance_loss_mlp": 1.01709056, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 1.8582806303516148, + "language_loss": 0.73821193, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75957531, + "num_input_tokens_seen": 188455405, + "step": 8768, + "time_per_iteration": 2.5003201961517334 + }, + { + "auxiliary_loss_clip": 0.01085458, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.0357368, + "balance_loss_mlp": 1.02784872, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 2.2739189662658954, + "language_loss": 0.74598145, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76724613, + "num_input_tokens_seen": 188472940, + "step": 8769, + "time_per_iteration": 3.878206253051758 + }, + { + "auxiliary_loss_clip": 0.01075737, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.0349617, + "balance_loss_mlp": 1.01696324, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 1.7114294198484459, + "language_loss": 0.74189192, + "learning_rate": 1.920376134993436e-06, + "loss": 0.7629512, + "num_input_tokens_seen": 188493035, + "step": 8770, + "time_per_iteration": 2.53420352935791 + }, + { + "auxiliary_loss_clip": 0.01113597, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.03863168, + "balance_loss_mlp": 1.01783741, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 1.9888282488461249, + "language_loss": 0.67910552, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.70055288, + "num_input_tokens_seen": 188513860, + "step": 8771, + "time_per_iteration": 2.506368637084961 + }, + { + "auxiliary_loss_clip": 0.01099512, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.03726149, + "balance_loss_mlp": 1.02508342, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.8280771152763378, + "language_loss": 0.7672298, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78860855, + "num_input_tokens_seen": 188533345, + "step": 8772, + "time_per_iteration": 2.479888916015625 + }, + { + "auxiliary_loss_clip": 0.01098336, + "auxiliary_loss_mlp": 0.01039468, + "balance_loss_clip": 1.0369966, + "balance_loss_mlp": 1.0252223, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 2.4591070126743797, + "language_loss": 0.6626935, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68407154, + "num_input_tokens_seen": 188551550, + "step": 8773, + "time_per_iteration": 3.850606679916382 + }, + { + "auxiliary_loss_clip": 0.01081411, + "auxiliary_loss_mlp": 0.01037131, + "balance_loss_clip": 1.03733945, + "balance_loss_mlp": 1.02506709, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 1.7402422253504257, + "language_loss": 0.85879868, + "learning_rate": 1.91881954765502e-06, + "loss": 0.87998414, + "num_input_tokens_seen": 188571615, + "step": 8774, + "time_per_iteration": 2.5890674591064453 + }, + { + "auxiliary_loss_clip": 0.01081116, + "auxiliary_loss_mlp": 0.01031287, + "balance_loss_clip": 1.03658891, + "balance_loss_mlp": 1.0184598, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.5031323413406452, + "language_loss": 0.80095267, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82207668, + "num_input_tokens_seen": 188591965, + "step": 8775, + "time_per_iteration": 2.5047855377197266 + }, + { + "auxiliary_loss_clip": 0.01087304, + "auxiliary_loss_mlp": 0.01035428, + "balance_loss_clip": 1.03458476, + "balance_loss_mlp": 1.02202332, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.7424391304097946, + "language_loss": 0.83600855, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85723585, + "num_input_tokens_seen": 188610675, + "step": 8776, + "time_per_iteration": 2.5054194927215576 + }, + { + "auxiliary_loss_clip": 0.01086547, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.0421344, + "balance_loss_mlp": 1.01652265, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 1.839343081238076, + "language_loss": 0.67329097, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.69445777, + "num_input_tokens_seen": 188628235, + "step": 8777, + "time_per_iteration": 2.48655104637146 + }, + { + "auxiliary_loss_clip": 0.01092961, + "auxiliary_loss_mlp": 0.01037147, + "balance_loss_clip": 1.03936529, + "balance_loss_mlp": 1.02461267, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 1.51519378765227, + "language_loss": 0.82578886, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84708989, + "num_input_tokens_seen": 188648925, + "step": 8778, + "time_per_iteration": 2.523207664489746 + }, + { + "auxiliary_loss_clip": 0.01103771, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.03966248, + "balance_loss_mlp": 1.0219084, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 1.9086379584411584, + "language_loss": 0.79408193, + "learning_rate": 1.916873882856013e-06, + "loss": 0.81547785, + "num_input_tokens_seen": 188668125, + "step": 8779, + "time_per_iteration": 2.499063014984131 + }, + { + "auxiliary_loss_clip": 0.01096369, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.03559589, + "balance_loss_mlp": 1.02033758, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.84478715134165, + "language_loss": 0.76659805, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78788233, + "num_input_tokens_seen": 188684410, + "step": 8780, + "time_per_iteration": 2.5104031562805176 + }, + { + "auxiliary_loss_clip": 0.01085147, + "auxiliary_loss_mlp": 0.01029641, + "balance_loss_clip": 1.03968799, + "balance_loss_mlp": 1.01595581, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 1.5504035762518522, + "language_loss": 0.69428086, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71542871, + "num_input_tokens_seen": 188706130, + "step": 8781, + "time_per_iteration": 2.643890142440796 + }, + { + "auxiliary_loss_clip": 0.01097896, + "auxiliary_loss_mlp": 0.0102834, + "balance_loss_clip": 1.03702712, + "balance_loss_mlp": 1.01668167, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.597486996682263, + "language_loss": 0.72057271, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74183506, + "num_input_tokens_seen": 188725030, + "step": 8782, + "time_per_iteration": 2.510983467102051 + }, + { + "auxiliary_loss_clip": 0.01086, + "auxiliary_loss_mlp": 0.01027642, + "balance_loss_clip": 1.03913343, + "balance_loss_mlp": 1.01461256, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.8543426203779696, + "language_loss": 0.68641186, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70754826, + "num_input_tokens_seen": 188744325, + "step": 8783, + "time_per_iteration": 2.5230393409729004 + }, + { + "auxiliary_loss_clip": 0.01110435, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_clip": 1.03967094, + "balance_loss_mlp": 1.02476656, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 1.9930674524024772, + "language_loss": 0.69487369, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71637988, + "num_input_tokens_seen": 188765100, + "step": 8784, + "time_per_iteration": 2.5743744373321533 + }, + { + "auxiliary_loss_clip": 0.01116069, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.03734517, + "balance_loss_mlp": 1.01619542, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 2.211887463883251, + "language_loss": 0.75191069, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77337247, + "num_input_tokens_seen": 188783995, + "step": 8785, + "time_per_iteration": 2.4650795459747314 + }, + { + "auxiliary_loss_clip": 0.01104204, + "auxiliary_loss_mlp": 0.0103318, + "balance_loss_clip": 1.04009986, + "balance_loss_mlp": 1.01934588, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 1.7297466688498235, + "language_loss": 0.83484375, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85621762, + "num_input_tokens_seen": 188803120, + "step": 8786, + "time_per_iteration": 2.4867002964019775 + }, + { + "auxiliary_loss_clip": 0.01073853, + "auxiliary_loss_mlp": 0.01026898, + "balance_loss_clip": 1.03518224, + "balance_loss_mlp": 1.01532888, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 2.2262280595857824, + "language_loss": 0.82892692, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.84993446, + "num_input_tokens_seen": 188820960, + "step": 8787, + "time_per_iteration": 2.5288782119750977 + }, + { + "auxiliary_loss_clip": 0.01060071, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.03881454, + "balance_loss_mlp": 1.01950264, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.8242972321961186, + "language_loss": 0.83473521, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85564852, + "num_input_tokens_seen": 188837165, + "step": 8788, + "time_per_iteration": 2.5809125900268555 + }, + { + "auxiliary_loss_clip": 0.01078238, + "auxiliary_loss_mlp": 0.01040578, + "balance_loss_clip": 1.04147923, + "balance_loss_mlp": 1.02698851, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 3.7437486723179845, + "language_loss": 0.7515614, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.7727496, + "num_input_tokens_seen": 188858555, + "step": 8789, + "time_per_iteration": 2.6311721801757812 + }, + { + "auxiliary_loss_clip": 0.01103336, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.03890955, + "balance_loss_mlp": 1.0206542, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.8307351550993258, + "language_loss": 0.69937754, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.7207458, + "num_input_tokens_seen": 188879050, + "step": 8790, + "time_per_iteration": 2.536754608154297 + }, + { + "auxiliary_loss_clip": 0.01109516, + "auxiliary_loss_mlp": 0.01026304, + "balance_loss_clip": 1.03735662, + "balance_loss_mlp": 1.01402557, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.5749588783401367, + "language_loss": 0.79040545, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81176364, + "num_input_tokens_seen": 188898885, + "step": 8791, + "time_per_iteration": 2.479295492172241 + }, + { + "auxiliary_loss_clip": 0.01066234, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.04116201, + "balance_loss_mlp": 1.01482677, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 2.0122397834961947, + "language_loss": 0.66251415, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68345416, + "num_input_tokens_seen": 188917225, + "step": 8792, + "time_per_iteration": 2.5758509635925293 + }, + { + "auxiliary_loss_clip": 0.01083508, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.03471971, + "balance_loss_mlp": 1.0281074, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 1.959065383644262, + "language_loss": 0.79847378, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.81972218, + "num_input_tokens_seen": 188936120, + "step": 8793, + "time_per_iteration": 2.5378692150115967 + }, + { + "auxiliary_loss_clip": 0.01112714, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.03875935, + "balance_loss_mlp": 1.02697515, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 2.011786586860156, + "language_loss": 0.85092175, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.87245166, + "num_input_tokens_seen": 188953405, + "step": 8794, + "time_per_iteration": 2.4437456130981445 + }, + { + "auxiliary_loss_clip": 0.01093681, + "auxiliary_loss_mlp": 0.01036395, + "balance_loss_clip": 1.04109693, + "balance_loss_mlp": 1.0221076, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 2.6654457031694587, + "language_loss": 0.68050051, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.70180124, + "num_input_tokens_seen": 188971150, + "step": 8795, + "time_per_iteration": 2.4985647201538086 + }, + { + "auxiliary_loss_clip": 0.01088587, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.03899491, + "balance_loss_mlp": 1.01879263, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 1.8338784703551305, + "language_loss": 0.80688709, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82808667, + "num_input_tokens_seen": 188989550, + "step": 8796, + "time_per_iteration": 2.5006144046783447 + }, + { + "auxiliary_loss_clip": 0.0107164, + "auxiliary_loss_mlp": 0.01043675, + "balance_loss_clip": 1.03526008, + "balance_loss_mlp": 1.02792752, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 1.7429985262340641, + "language_loss": 0.69441444, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71556759, + "num_input_tokens_seen": 189008795, + "step": 8797, + "time_per_iteration": 2.530583381652832 + }, + { + "auxiliary_loss_clip": 0.0109465, + "auxiliary_loss_mlp": 0.01036467, + "balance_loss_clip": 1.03958988, + "balance_loss_mlp": 1.02408719, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.5172498369020773, + "language_loss": 0.82386565, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84517688, + "num_input_tokens_seen": 189025540, + "step": 8798, + "time_per_iteration": 2.4581429958343506 + }, + { + "auxiliary_loss_clip": 0.01088611, + "auxiliary_loss_mlp": 0.00798745, + "balance_loss_clip": 1.03587651, + "balance_loss_mlp": 1.02617455, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 1.9671604210827982, + "language_loss": 0.70817506, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72704864, + "num_input_tokens_seen": 189044885, + "step": 8799, + "time_per_iteration": 2.49458384513855 + }, + { + "auxiliary_loss_clip": 0.01099038, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.03981745, + "balance_loss_mlp": 1.02289796, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 1.9078503403709233, + "language_loss": 0.68971741, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71105939, + "num_input_tokens_seen": 189061280, + "step": 8800, + "time_per_iteration": 2.460244655609131 + }, + { + "auxiliary_loss_clip": 0.0101171, + "auxiliary_loss_mlp": 0.01008655, + "balance_loss_clip": 1.01485229, + "balance_loss_mlp": 1.00731945, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 1.0095888968722198, + "language_loss": 0.57046759, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.5906713, + "num_input_tokens_seen": 189114775, + "step": 8801, + "time_per_iteration": 4.424926042556763 + }, + { + "auxiliary_loss_clip": 0.01097415, + "auxiliary_loss_mlp": 0.01035249, + "balance_loss_clip": 1.04018021, + "balance_loss_mlp": 1.02245808, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.505397784479068, + "language_loss": 0.63628584, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.65761244, + "num_input_tokens_seen": 189134700, + "step": 8802, + "time_per_iteration": 2.567531108856201 + }, + { + "auxiliary_loss_clip": 0.01092525, + "auxiliary_loss_mlp": 0.01027263, + "balance_loss_clip": 1.04003406, + "balance_loss_mlp": 1.01413834, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.5434640690065624, + "language_loss": 0.68880737, + "learning_rate": 1.907535821289003e-06, + "loss": 0.71000528, + "num_input_tokens_seen": 189155365, + "step": 8803, + "time_per_iteration": 2.634021043777466 + }, + { + "auxiliary_loss_clip": 0.01096368, + "auxiliary_loss_mlp": 0.00790468, + "balance_loss_clip": 1.0362643, + "balance_loss_mlp": 1.01746297, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.6597999254506213, + "language_loss": 0.7643441, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78321242, + "num_input_tokens_seen": 189173885, + "step": 8804, + "time_per_iteration": 2.4745709896087646 + }, + { + "auxiliary_loss_clip": 0.01031976, + "auxiliary_loss_mlp": 0.01003489, + "balance_loss_clip": 1.01580024, + "balance_loss_mlp": 1.00225496, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.7526647173236996, + "language_loss": 0.5297249, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55007958, + "num_input_tokens_seen": 189236515, + "step": 8805, + "time_per_iteration": 4.5177929401397705 + }, + { + "auxiliary_loss_clip": 0.01030523, + "auxiliary_loss_mlp": 0.01006132, + "balance_loss_clip": 1.01418293, + "balance_loss_mlp": 1.00459373, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7368576678339734, + "language_loss": 0.63772273, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65808928, + "num_input_tokens_seen": 189300500, + "step": 8806, + "time_per_iteration": 3.100186586380005 + }, + { + "auxiliary_loss_clip": 0.01104209, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.03837538, + "balance_loss_mlp": 1.01811016, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.6385880970395554, + "language_loss": 0.7227385, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74409056, + "num_input_tokens_seen": 189319745, + "step": 8807, + "time_per_iteration": 2.466193675994873 + }, + { + "auxiliary_loss_clip": 0.01080501, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.03908706, + "balance_loss_mlp": 1.01819766, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 2.2615758654520786, + "language_loss": 0.69373488, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71483874, + "num_input_tokens_seen": 189334550, + "step": 8808, + "time_per_iteration": 3.892692804336548 + }, + { + "auxiliary_loss_clip": 0.01100043, + "auxiliary_loss_mlp": 0.01031469, + "balance_loss_clip": 1.03770971, + "balance_loss_mlp": 1.01949441, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 2.026108909974527, + "language_loss": 0.86613929, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.88745439, + "num_input_tokens_seen": 189351735, + "step": 8809, + "time_per_iteration": 2.4426333904266357 + }, + { + "auxiliary_loss_clip": 0.01105817, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.03905237, + "balance_loss_mlp": 1.01865697, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.8689752856074047, + "language_loss": 0.64191604, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66330075, + "num_input_tokens_seen": 189373105, + "step": 8810, + "time_per_iteration": 2.6315553188323975 + }, + { + "auxiliary_loss_clip": 0.01109029, + "auxiliary_loss_mlp": 0.01030302, + "balance_loss_clip": 1.03765941, + "balance_loss_mlp": 1.01749945, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.60926233169571, + "language_loss": 0.68098927, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70238256, + "num_input_tokens_seen": 189394615, + "step": 8811, + "time_per_iteration": 2.4903066158294678 + }, + { + "auxiliary_loss_clip": 0.0100176, + "auxiliary_loss_mlp": 0.01006006, + "balance_loss_clip": 1.01468039, + "balance_loss_mlp": 1.00457561, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.6646458687708908, + "language_loss": 0.53352982, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55360746, + "num_input_tokens_seen": 189459750, + "step": 8812, + "time_per_iteration": 4.6251256465911865 + }, + { + "auxiliary_loss_clip": 0.0102073, + "auxiliary_loss_mlp": 0.01007061, + "balance_loss_clip": 1.0154233, + "balance_loss_mlp": 1.00551116, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7250707842165657, + "language_loss": 0.56333899, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58361697, + "num_input_tokens_seen": 189527540, + "step": 8813, + "time_per_iteration": 3.178156614303589 + }, + { + "auxiliary_loss_clip": 0.01060476, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.03429401, + "balance_loss_mlp": 1.01925218, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 1.5607167064876728, + "language_loss": 0.81642097, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.83734083, + "num_input_tokens_seen": 189546900, + "step": 8814, + "time_per_iteration": 2.544072151184082 + }, + { + "auxiliary_loss_clip": 0.01117883, + "auxiliary_loss_mlp": 0.01028198, + "balance_loss_clip": 1.0414778, + "balance_loss_mlp": 1.0158186, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.541376360201913, + "language_loss": 0.8487488, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.87020963, + "num_input_tokens_seen": 189566490, + "step": 8815, + "time_per_iteration": 2.4473845958709717 + }, + { + "auxiliary_loss_clip": 0.01108933, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.03812003, + "balance_loss_mlp": 1.01870692, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.052153190873808, + "language_loss": 0.6629948, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.68438971, + "num_input_tokens_seen": 189585580, + "step": 8816, + "time_per_iteration": 2.4568278789520264 + }, + { + "auxiliary_loss_clip": 0.01086469, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.03479123, + "balance_loss_mlp": 1.01998353, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.6192045599106437, + "language_loss": 0.72049272, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74168229, + "num_input_tokens_seen": 189608485, + "step": 8817, + "time_per_iteration": 2.7027342319488525 + }, + { + "auxiliary_loss_clip": 0.01083771, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.03864694, + "balance_loss_mlp": 1.01944351, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.6949087051134715, + "language_loss": 0.6534133, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67459118, + "num_input_tokens_seen": 189627815, + "step": 8818, + "time_per_iteration": 2.549274444580078 + }, + { + "auxiliary_loss_clip": 0.01063469, + "auxiliary_loss_mlp": 0.01029937, + "balance_loss_clip": 1.03526986, + "balance_loss_mlp": 1.0158112, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 1.9988119199409549, + "language_loss": 0.75040686, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77134085, + "num_input_tokens_seen": 189644850, + "step": 8819, + "time_per_iteration": 2.549574613571167 + }, + { + "auxiliary_loss_clip": 0.01078472, + "auxiliary_loss_mlp": 0.01042435, + "balance_loss_clip": 1.0372076, + "balance_loss_mlp": 1.02724767, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 1.7223021518799049, + "language_loss": 0.82230282, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84351188, + "num_input_tokens_seen": 189660945, + "step": 8820, + "time_per_iteration": 2.4999117851257324 + }, + { + "auxiliary_loss_clip": 0.0109066, + "auxiliary_loss_mlp": 0.0102789, + "balance_loss_clip": 1.03676426, + "balance_loss_mlp": 1.0159694, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 1.3701693847326617, + "language_loss": 0.72499669, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.7461822, + "num_input_tokens_seen": 189680425, + "step": 8821, + "time_per_iteration": 2.5279476642608643 + }, + { + "auxiliary_loss_clip": 0.01089517, + "auxiliary_loss_mlp": 0.01029088, + "balance_loss_clip": 1.03791213, + "balance_loss_mlp": 1.01773906, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.388243600782498, + "language_loss": 0.74370039, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76488644, + "num_input_tokens_seen": 189700375, + "step": 8822, + "time_per_iteration": 2.51324200630188 + }, + { + "auxiliary_loss_clip": 0.01077314, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.0345695, + "balance_loss_mlp": 1.01812339, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.6591147642005715, + "language_loss": 0.67588174, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69697094, + "num_input_tokens_seen": 189721225, + "step": 8823, + "time_per_iteration": 2.6033661365509033 + }, + { + "auxiliary_loss_clip": 0.01114512, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.03810596, + "balance_loss_mlp": 1.02057099, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.5888902252179407, + "language_loss": 0.6928134, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71431422, + "num_input_tokens_seen": 189740170, + "step": 8824, + "time_per_iteration": 2.451096773147583 + }, + { + "auxiliary_loss_clip": 0.01085273, + "auxiliary_loss_mlp": 0.0078509, + "balance_loss_clip": 1.03660226, + "balance_loss_mlp": 1.00846243, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 1.8468785738780347, + "language_loss": 0.76105809, + "learning_rate": 1.898977700702689e-06, + "loss": 0.77976173, + "num_input_tokens_seen": 189757890, + "step": 8825, + "time_per_iteration": 2.4923934936523438 + }, + { + "auxiliary_loss_clip": 0.01037063, + "auxiliary_loss_mlp": 0.01039817, + "balance_loss_clip": 1.03609324, + "balance_loss_mlp": 1.0268054, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 1.810733362323992, + "language_loss": 0.85685831, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87762713, + "num_input_tokens_seen": 189775390, + "step": 8826, + "time_per_iteration": 2.6935248374938965 + }, + { + "auxiliary_loss_clip": 0.01108904, + "auxiliary_loss_mlp": 0.0103208, + "balance_loss_clip": 1.03781676, + "balance_loss_mlp": 1.01948571, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.4344760742001632, + "language_loss": 0.64074767, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66215754, + "num_input_tokens_seen": 189793975, + "step": 8827, + "time_per_iteration": 2.5506958961486816 + }, + { + "auxiliary_loss_clip": 0.01092027, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.03888774, + "balance_loss_mlp": 1.02267981, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.5165095006892586, + "language_loss": 0.6009624, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62224919, + "num_input_tokens_seen": 189817870, + "step": 8828, + "time_per_iteration": 2.7139461040496826 + }, + { + "auxiliary_loss_clip": 0.01104112, + "auxiliary_loss_mlp": 0.01034345, + "balance_loss_clip": 1.03852534, + "balance_loss_mlp": 1.02090454, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.8206481503508534, + "language_loss": 0.81830728, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83969182, + "num_input_tokens_seen": 189837905, + "step": 8829, + "time_per_iteration": 2.475341796875 + }, + { + "auxiliary_loss_clip": 0.0109483, + "auxiliary_loss_mlp": 0.0103445, + "balance_loss_clip": 1.03921843, + "balance_loss_mlp": 1.02185559, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.5178068612446072, + "language_loss": 0.78223705, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80352986, + "num_input_tokens_seen": 189856970, + "step": 8830, + "time_per_iteration": 2.494155168533325 + }, + { + "auxiliary_loss_clip": 0.0109943, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.0378654, + "balance_loss_mlp": 1.0178982, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 1.9570317922740803, + "language_loss": 0.80517089, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.82646906, + "num_input_tokens_seen": 189872830, + "step": 8831, + "time_per_iteration": 2.431706428527832 + }, + { + "auxiliary_loss_clip": 0.01095836, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.03578544, + "balance_loss_mlp": 1.02115941, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 1.7306962771579704, + "language_loss": 0.73024249, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75153977, + "num_input_tokens_seen": 189891635, + "step": 8832, + "time_per_iteration": 2.46694016456604 + }, + { + "auxiliary_loss_clip": 0.01079498, + "auxiliary_loss_mlp": 0.01033476, + "balance_loss_clip": 1.03745055, + "balance_loss_mlp": 1.02007651, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 2.5453984931172355, + "language_loss": 0.75989699, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.78102678, + "num_input_tokens_seen": 189909050, + "step": 8833, + "time_per_iteration": 2.529231071472168 + }, + { + "auxiliary_loss_clip": 0.01077317, + "auxiliary_loss_mlp": 0.01029805, + "balance_loss_clip": 1.03443646, + "balance_loss_mlp": 1.01670957, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.8849927235719406, + "language_loss": 0.73522967, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.75630087, + "num_input_tokens_seen": 189927405, + "step": 8834, + "time_per_iteration": 2.5676491260528564 + }, + { + "auxiliary_loss_clip": 0.01115219, + "auxiliary_loss_mlp": 0.01039172, + "balance_loss_clip": 1.03777611, + "balance_loss_mlp": 1.02546263, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 1.7697506996630872, + "language_loss": 0.77522957, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79677343, + "num_input_tokens_seen": 189947740, + "step": 8835, + "time_per_iteration": 2.472043752670288 + }, + { + "auxiliary_loss_clip": 0.01085646, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.03710377, + "balance_loss_mlp": 1.02481627, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.6297169140076533, + "language_loss": 0.72666991, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74790967, + "num_input_tokens_seen": 189966495, + "step": 8836, + "time_per_iteration": 2.528639316558838 + }, + { + "auxiliary_loss_clip": 0.01089143, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.03573227, + "balance_loss_mlp": 1.02083778, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 1.6820242543645831, + "language_loss": 0.80414808, + "learning_rate": 1.894310406375987e-06, + "loss": 0.82538247, + "num_input_tokens_seen": 189985325, + "step": 8837, + "time_per_iteration": 2.481680393218994 + }, + { + "auxiliary_loss_clip": 0.01097307, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.04013777, + "balance_loss_mlp": 1.01520014, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 1.8454643863168314, + "language_loss": 0.85842234, + "learning_rate": 1.893921490881035e-06, + "loss": 0.87967891, + "num_input_tokens_seen": 190003290, + "step": 8838, + "time_per_iteration": 2.4948747158050537 + }, + { + "auxiliary_loss_clip": 0.01086514, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.03731215, + "balance_loss_mlp": 1.02058935, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.589437835727425, + "language_loss": 0.7282114, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.7493993, + "num_input_tokens_seen": 190023260, + "step": 8839, + "time_per_iteration": 2.5506763458251953 + }, + { + "auxiliary_loss_clip": 0.01089451, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.03523993, + "balance_loss_mlp": 1.02104795, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 2.3672513792432643, + "language_loss": 0.76840651, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.78963757, + "num_input_tokens_seen": 190042035, + "step": 8840, + "time_per_iteration": 3.9193923473358154 + }, + { + "auxiliary_loss_clip": 0.01075765, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.03995538, + "balance_loss_mlp": 1.0185616, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 2.114153460744273, + "language_loss": 0.77165067, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79273009, + "num_input_tokens_seen": 190057545, + "step": 8841, + "time_per_iteration": 2.4978842735290527 + }, + { + "auxiliary_loss_clip": 0.01023043, + "auxiliary_loss_mlp": 0.01011148, + "balance_loss_clip": 1.01688302, + "balance_loss_mlp": 1.00971115, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6964527354883857, + "language_loss": 0.56793249, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.58827436, + "num_input_tokens_seen": 190123800, + "step": 8842, + "time_per_iteration": 3.2480628490448 + }, + { + "auxiliary_loss_clip": 0.0108984, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.03701329, + "balance_loss_mlp": 1.02423465, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.7767282875113812, + "language_loss": 0.73261827, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.75390142, + "num_input_tokens_seen": 190141625, + "step": 8843, + "time_per_iteration": 2.5629043579101562 + }, + { + "auxiliary_loss_clip": 0.01021872, + "auxiliary_loss_mlp": 0.01008534, + "balance_loss_clip": 1.01825929, + "balance_loss_mlp": 1.00718737, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.884266741422586, + "language_loss": 0.61104798, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63135207, + "num_input_tokens_seen": 190198110, + "step": 8844, + "time_per_iteration": 4.508849143981934 + }, + { + "auxiliary_loss_clip": 0.01029252, + "auxiliary_loss_mlp": 0.01004391, + "balance_loss_clip": 1.01285982, + "balance_loss_mlp": 1.00306785, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.9214549893239615, + "language_loss": 0.62220001, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64253652, + "num_input_tokens_seen": 190259950, + "step": 8845, + "time_per_iteration": 3.0963025093078613 + }, + { + "auxiliary_loss_clip": 0.0107924, + "auxiliary_loss_mlp": 0.01036637, + "balance_loss_clip": 1.03666067, + "balance_loss_mlp": 1.02254677, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 1.9460863601061407, + "language_loss": 0.75377858, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77493727, + "num_input_tokens_seen": 190278265, + "step": 8846, + "time_per_iteration": 3.9222922325134277 + }, + { + "auxiliary_loss_clip": 0.01100045, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.03783047, + "balance_loss_mlp": 1.01860678, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 2.3332897113657602, + "language_loss": 0.75595665, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.77725989, + "num_input_tokens_seen": 190298400, + "step": 8847, + "time_per_iteration": 2.5144827365875244 + }, + { + "auxiliary_loss_clip": 0.01088401, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.03498816, + "balance_loss_mlp": 1.02192438, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.6118528524542113, + "language_loss": 0.87481391, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.8960346, + "num_input_tokens_seen": 190316235, + "step": 8848, + "time_per_iteration": 2.500892162322998 + }, + { + "auxiliary_loss_clip": 0.01073238, + "auxiliary_loss_mlp": 0.01042044, + "balance_loss_clip": 1.03651643, + "balance_loss_mlp": 1.02747679, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 1.9194592834757513, + "language_loss": 0.74341285, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76456559, + "num_input_tokens_seen": 190335060, + "step": 8849, + "time_per_iteration": 2.507690906524658 + }, + { + "auxiliary_loss_clip": 0.01098214, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.03563452, + "balance_loss_mlp": 1.01648879, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 2.0803518343236513, + "language_loss": 0.80034107, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.82162398, + "num_input_tokens_seen": 190353265, + "step": 8850, + "time_per_iteration": 3.8599796295166016 + }, + { + "auxiliary_loss_clip": 0.01109002, + "auxiliary_loss_mlp": 0.01031979, + "balance_loss_clip": 1.03503299, + "balance_loss_mlp": 1.01934302, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.452499425185094, + "language_loss": 0.5480473, + "learning_rate": 1.888865960862821e-06, + "loss": 0.56945717, + "num_input_tokens_seen": 190376575, + "step": 8851, + "time_per_iteration": 2.5714170932769775 + }, + { + "auxiliary_loss_clip": 0.01100574, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.03755212, + "balance_loss_mlp": 1.0185349, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 2.305627235795431, + "language_loss": 0.68762118, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70893961, + "num_input_tokens_seen": 190395185, + "step": 8852, + "time_per_iteration": 2.4640114307403564 + }, + { + "auxiliary_loss_clip": 0.01025032, + "auxiliary_loss_mlp": 0.00770314, + "balance_loss_clip": 1.0187819, + "balance_loss_mlp": 1.0128727, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8017353068623295, + "language_loss": 0.62855095, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.6465044, + "num_input_tokens_seen": 190452595, + "step": 8853, + "time_per_iteration": 3.0840625762939453 + }, + { + "auxiliary_loss_clip": 0.01102932, + "auxiliary_loss_mlp": 0.01030862, + "balance_loss_clip": 1.03599966, + "balance_loss_mlp": 1.017892, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.7146717863509395, + "language_loss": 0.7978189, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81915677, + "num_input_tokens_seen": 190469140, + "step": 8854, + "time_per_iteration": 2.446993589401245 + }, + { + "auxiliary_loss_clip": 0.01085087, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.03972244, + "balance_loss_mlp": 1.01891923, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1.6034142014212835, + "language_loss": 0.72991025, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75106871, + "num_input_tokens_seen": 190489015, + "step": 8855, + "time_per_iteration": 2.5314016342163086 + }, + { + "auxiliary_loss_clip": 0.01084784, + "auxiliary_loss_mlp": 0.00787678, + "balance_loss_clip": 1.0345422, + "balance_loss_mlp": 1.01163268, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 1.9676317023358667, + "language_loss": 0.65120047, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66992515, + "num_input_tokens_seen": 190508065, + "step": 8856, + "time_per_iteration": 2.54663348197937 + }, + { + "auxiliary_loss_clip": 0.01093704, + "auxiliary_loss_mlp": 0.01041999, + "balance_loss_clip": 1.04278493, + "balance_loss_mlp": 1.02801585, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 1.7394483018327733, + "language_loss": 0.77508807, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79644513, + "num_input_tokens_seen": 190527045, + "step": 8857, + "time_per_iteration": 2.5581252574920654 + }, + { + "auxiliary_loss_clip": 0.01078226, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.03519368, + "balance_loss_mlp": 1.02083063, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 3.300974127399562, + "language_loss": 0.71253586, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.733657, + "num_input_tokens_seen": 190544075, + "step": 8858, + "time_per_iteration": 2.570976734161377 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.01035948, + "balance_loss_clip": 1.03818583, + "balance_loss_mlp": 1.02226269, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.8036911421776591, + "language_loss": 0.69247031, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.7138328, + "num_input_tokens_seen": 190566030, + "step": 8859, + "time_per_iteration": 2.5476136207580566 + }, + { + "auxiliary_loss_clip": 0.01097535, + "auxiliary_loss_mlp": 0.01027081, + "balance_loss_clip": 1.03798366, + "balance_loss_mlp": 1.01573253, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.5253963137840456, + "language_loss": 0.69729698, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71854317, + "num_input_tokens_seen": 190585605, + "step": 8860, + "time_per_iteration": 2.4793801307678223 + }, + { + "auxiliary_loss_clip": 0.01090447, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.03846955, + "balance_loss_mlp": 1.02121651, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 1.8416910874366852, + "language_loss": 0.78061098, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80184948, + "num_input_tokens_seen": 190604625, + "step": 8861, + "time_per_iteration": 2.5385754108428955 + }, + { + "auxiliary_loss_clip": 0.01068515, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.03563857, + "balance_loss_mlp": 1.02446485, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 1.6168232085497585, + "language_loss": 0.85558975, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.87665498, + "num_input_tokens_seen": 190625060, + "step": 8862, + "time_per_iteration": 2.6035940647125244 + }, + { + "auxiliary_loss_clip": 0.01089523, + "auxiliary_loss_mlp": 0.01035891, + "balance_loss_clip": 1.03618598, + "balance_loss_mlp": 1.0208708, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 1.9317830966729932, + "language_loss": 0.61976731, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.64102149, + "num_input_tokens_seen": 190643150, + "step": 8863, + "time_per_iteration": 2.501035213470459 + }, + { + "auxiliary_loss_clip": 0.01084806, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.03646755, + "balance_loss_mlp": 1.02075517, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 2.1153957919630813, + "language_loss": 0.73506457, + "learning_rate": 1.883811143046377e-06, + "loss": 0.7562474, + "num_input_tokens_seen": 190662725, + "step": 8864, + "time_per_iteration": 2.559576988220215 + }, + { + "auxiliary_loss_clip": 0.01109811, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.03644633, + "balance_loss_mlp": 1.02090037, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.8087329027638728, + "language_loss": 0.64423448, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66566437, + "num_input_tokens_seen": 190683680, + "step": 8865, + "time_per_iteration": 2.5059685707092285 + }, + { + "auxiliary_loss_clip": 0.01103147, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.03757572, + "balance_loss_mlp": 1.01585007, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 1.7431726859571242, + "language_loss": 0.78397131, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80529237, + "num_input_tokens_seen": 190703350, + "step": 8866, + "time_per_iteration": 2.4838199615478516 + }, + { + "auxiliary_loss_clip": 0.01100124, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.0372231, + "balance_loss_mlp": 1.01787341, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 1.8639575183351227, + "language_loss": 0.73488742, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75618929, + "num_input_tokens_seen": 190721170, + "step": 8867, + "time_per_iteration": 2.47721529006958 + }, + { + "auxiliary_loss_clip": 0.01091277, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.03729224, + "balance_loss_mlp": 1.02189505, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.5235289337311557, + "language_loss": 0.71963638, + "learning_rate": 1.88225596278394e-06, + "loss": 0.7409085, + "num_input_tokens_seen": 190743795, + "step": 8868, + "time_per_iteration": 2.6715803146362305 + }, + { + "auxiliary_loss_clip": 0.01079845, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.03589606, + "balance_loss_mlp": 1.01900065, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 1.6987140598097066, + "language_loss": 0.78299797, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80411392, + "num_input_tokens_seen": 190761560, + "step": 8869, + "time_per_iteration": 2.55598521232605 + }, + { + "auxiliary_loss_clip": 0.01106015, + "auxiliary_loss_mlp": 0.01034442, + "balance_loss_clip": 1.03921533, + "balance_loss_mlp": 1.02135849, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 2.2120467209435355, + "language_loss": 0.76083565, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.78224027, + "num_input_tokens_seen": 190778875, + "step": 8870, + "time_per_iteration": 2.4554052352905273 + }, + { + "auxiliary_loss_clip": 0.01096725, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.04203832, + "balance_loss_mlp": 1.02241373, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 2.2588376931270484, + "language_loss": 0.75658548, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77792013, + "num_input_tokens_seen": 190799830, + "step": 8871, + "time_per_iteration": 2.550447940826416 + }, + { + "auxiliary_loss_clip": 0.01090881, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.03847063, + "balance_loss_mlp": 1.01709008, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 1.8282517877914424, + "language_loss": 0.71990031, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74110711, + "num_input_tokens_seen": 190817155, + "step": 8872, + "time_per_iteration": 2.4777228832244873 + }, + { + "auxiliary_loss_clip": 0.01096523, + "auxiliary_loss_mlp": 0.01040674, + "balance_loss_clip": 1.04248929, + "balance_loss_mlp": 1.02679777, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.7363853137474092, + "language_loss": 0.64858961, + "learning_rate": 1.880312088025936e-06, + "loss": 0.66996157, + "num_input_tokens_seen": 190835240, + "step": 8873, + "time_per_iteration": 2.5047240257263184 + }, + { + "auxiliary_loss_clip": 0.01089234, + "auxiliary_loss_mlp": 0.01040777, + "balance_loss_clip": 1.03890991, + "balance_loss_mlp": 1.02805746, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 2.230796992790766, + "language_loss": 0.79716361, + "learning_rate": 1.879923326631099e-06, + "loss": 0.81846368, + "num_input_tokens_seen": 190851620, + "step": 8874, + "time_per_iteration": 2.460770845413208 + }, + { + "auxiliary_loss_clip": 0.01101512, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.03743339, + "balance_loss_mlp": 1.01690519, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 1.797949795920948, + "language_loss": 0.69499087, + "learning_rate": 1.879534569789582e-06, + "loss": 0.71630675, + "num_input_tokens_seen": 190870545, + "step": 8875, + "time_per_iteration": 2.4655468463897705 + }, + { + "auxiliary_loss_clip": 0.01040372, + "auxiliary_loss_mlp": 0.01008783, + "balance_loss_clip": 1.01463079, + "balance_loss_mlp": 1.00745392, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7295463589188508, + "language_loss": 0.59673262, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61722422, + "num_input_tokens_seen": 190931995, + "step": 8876, + "time_per_iteration": 3.1551454067230225 + }, + { + "auxiliary_loss_clip": 0.01100858, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.03767931, + "balance_loss_mlp": 1.02388048, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.7616443016570276, + "language_loss": 0.74893641, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.77030718, + "num_input_tokens_seen": 190949890, + "step": 8877, + "time_per_iteration": 2.466925859451294 + }, + { + "auxiliary_loss_clip": 0.01027719, + "auxiliary_loss_mlp": 0.01006753, + "balance_loss_clip": 1.01745057, + "balance_loss_mlp": 1.00536382, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7557983227766831, + "language_loss": 0.57166171, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59200644, + "num_input_tokens_seen": 191008480, + "step": 8878, + "time_per_iteration": 4.393060684204102 + }, + { + "auxiliary_loss_clip": 0.01115291, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.03868067, + "balance_loss_mlp": 1.01957726, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 1.4166382808807116, + "language_loss": 0.72675014, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74823242, + "num_input_tokens_seen": 191028995, + "step": 8879, + "time_per_iteration": 2.4932034015655518 + }, + { + "auxiliary_loss_clip": 0.01116124, + "auxiliary_loss_mlp": 0.01032356, + "balance_loss_clip": 1.04079676, + "balance_loss_mlp": 1.01928496, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.7273697000534267, + "language_loss": 0.83437538, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85586023, + "num_input_tokens_seen": 191045285, + "step": 8880, + "time_per_iteration": 2.4517805576324463 + }, + { + "auxiliary_loss_clip": 0.01048233, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.04017198, + "balance_loss_mlp": 1.02097654, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.4839499407428323, + "language_loss": 0.79164863, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.8124662, + "num_input_tokens_seen": 191066105, + "step": 8881, + "time_per_iteration": 2.6266725063323975 + }, + { + "auxiliary_loss_clip": 0.01018255, + "auxiliary_loss_mlp": 0.0100028, + "balance_loss_clip": 1.01925111, + "balance_loss_mlp": 0.99889147, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7876625850245238, + "language_loss": 0.59219396, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61237931, + "num_input_tokens_seen": 191126315, + "step": 8882, + "time_per_iteration": 4.386980295181274 + }, + { + "auxiliary_loss_clip": 0.01017834, + "auxiliary_loss_mlp": 0.01004975, + "balance_loss_clip": 1.01094866, + "balance_loss_mlp": 1.00361562, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8822989567967736, + "language_loss": 0.63754034, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65776849, + "num_input_tokens_seen": 191174240, + "step": 8883, + "time_per_iteration": 2.942675828933716 + }, + { + "auxiliary_loss_clip": 0.01070417, + "auxiliary_loss_mlp": 0.01030394, + "balance_loss_clip": 1.03633618, + "balance_loss_mlp": 1.0168699, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 2.255749921925642, + "language_loss": 0.81706226, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.83807027, + "num_input_tokens_seen": 191193335, + "step": 8884, + "time_per_iteration": 4.074026346206665 + }, + { + "auxiliary_loss_clip": 0.01077506, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.03473306, + "balance_loss_mlp": 1.01964712, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.7164014047768612, + "language_loss": 0.71803379, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.73914146, + "num_input_tokens_seen": 191210900, + "step": 8885, + "time_per_iteration": 2.485318899154663 + }, + { + "auxiliary_loss_clip": 0.01093728, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.03687954, + "balance_loss_mlp": 1.01938748, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 1.9035852968634204, + "language_loss": 0.78585005, + "learning_rate": 1.87525854926798e-06, + "loss": 0.80711842, + "num_input_tokens_seen": 191226730, + "step": 8886, + "time_per_iteration": 2.507058620452881 + }, + { + "auxiliary_loss_clip": 0.01075181, + "auxiliary_loss_mlp": 0.00804615, + "balance_loss_clip": 1.03742695, + "balance_loss_mlp": 1.04334879, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.39343387785755, + "language_loss": 0.74625349, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76505142, + "num_input_tokens_seen": 191250435, + "step": 8887, + "time_per_iteration": 2.6480603218078613 + }, + { + "auxiliary_loss_clip": 0.01085944, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.03524911, + "balance_loss_mlp": 1.0169276, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 2.2899171369686306, + "language_loss": 0.69406605, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.71522117, + "num_input_tokens_seen": 191268315, + "step": 8888, + "time_per_iteration": 2.4928061962127686 + }, + { + "auxiliary_loss_clip": 0.0110891, + "auxiliary_loss_mlp": 0.0103624, + "balance_loss_clip": 1.03843808, + "balance_loss_mlp": 1.02288246, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 3.4626087682820854, + "language_loss": 0.77534282, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79679435, + "num_input_tokens_seen": 191287000, + "step": 8889, + "time_per_iteration": 3.863157033920288 + }, + { + "auxiliary_loss_clip": 0.01112028, + "auxiliary_loss_mlp": 0.01036966, + "balance_loss_clip": 1.03865254, + "balance_loss_mlp": 1.0237155, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 1.9650683719511481, + "language_loss": 0.69346285, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71495277, + "num_input_tokens_seen": 191304565, + "step": 8890, + "time_per_iteration": 2.4177639484405518 + }, + { + "auxiliary_loss_clip": 0.01116397, + "auxiliary_loss_mlp": 0.01038386, + "balance_loss_clip": 1.03845286, + "balance_loss_mlp": 1.02395535, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 3.135280956408226, + "language_loss": 0.76970375, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79125154, + "num_input_tokens_seen": 191318300, + "step": 8891, + "time_per_iteration": 2.386655330657959 + }, + { + "auxiliary_loss_clip": 0.01093729, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.03569019, + "balance_loss_mlp": 1.02445388, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.6681495886216764, + "language_loss": 0.73889321, + "learning_rate": 1.872926414425699e-06, + "loss": 0.76020855, + "num_input_tokens_seen": 191337925, + "step": 8892, + "time_per_iteration": 2.5085411071777344 + }, + { + "auxiliary_loss_clip": 0.01092789, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.0367012, + "balance_loss_mlp": 1.01797962, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.5279530923065847, + "language_loss": 0.87617207, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.89740443, + "num_input_tokens_seen": 191357120, + "step": 8893, + "time_per_iteration": 2.5016818046569824 + }, + { + "auxiliary_loss_clip": 0.01107833, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.03651381, + "balance_loss_mlp": 1.01875293, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.7576140555522919, + "language_loss": 0.7268635, + "learning_rate": 1.872149074536869e-06, + "loss": 0.74824643, + "num_input_tokens_seen": 191375395, + "step": 8894, + "time_per_iteration": 2.4415650367736816 + }, + { + "auxiliary_loss_clip": 0.01098356, + "auxiliary_loss_mlp": 0.01028788, + "balance_loss_clip": 1.03535032, + "balance_loss_mlp": 1.01590705, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 1.6736654905903399, + "language_loss": 0.74563682, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.76690823, + "num_input_tokens_seen": 191395595, + "step": 8895, + "time_per_iteration": 2.4711191654205322 + }, + { + "auxiliary_loss_clip": 0.01079011, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.03552985, + "balance_loss_mlp": 1.02138567, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.670656835252365, + "language_loss": 0.76953089, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79066622, + "num_input_tokens_seen": 191413730, + "step": 8896, + "time_per_iteration": 2.5109450817108154 + }, + { + "auxiliary_loss_clip": 0.01084855, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.0397799, + "balance_loss_mlp": 1.01608586, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.7839552913938588, + "language_loss": 0.78747451, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.80861616, + "num_input_tokens_seen": 191432400, + "step": 8897, + "time_per_iteration": 2.491626024246216 + }, + { + "auxiliary_loss_clip": 0.01100499, + "auxiliary_loss_mlp": 0.01030194, + "balance_loss_clip": 1.03750265, + "balance_loss_mlp": 1.0171349, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 1.9061309050400779, + "language_loss": 0.75872719, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.78003418, + "num_input_tokens_seen": 191448855, + "step": 8898, + "time_per_iteration": 2.445260524749756 + }, + { + "auxiliary_loss_clip": 0.01026779, + "auxiliary_loss_mlp": 0.01006708, + "balance_loss_clip": 1.01092434, + "balance_loss_mlp": 1.00555217, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 1.094603363731391, + "language_loss": 0.57986009, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60019499, + "num_input_tokens_seen": 191519690, + "step": 8899, + "time_per_iteration": 3.2612180709838867 + }, + { + "auxiliary_loss_clip": 0.01086621, + "auxiliary_loss_mlp": 0.01027841, + "balance_loss_clip": 1.03631639, + "balance_loss_mlp": 1.01577699, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.704524789822995, + "language_loss": 0.69910288, + "learning_rate": 1.869817171696868e-06, + "loss": 0.72024751, + "num_input_tokens_seen": 191539380, + "step": 8900, + "time_per_iteration": 2.5682921409606934 + }, + { + "auxiliary_loss_clip": 0.01090842, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.03661263, + "balance_loss_mlp": 1.01713467, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 1.7186931972719326, + "language_loss": 0.71464384, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73585254, + "num_input_tokens_seen": 191557400, + "step": 8901, + "time_per_iteration": 2.5040676593780518 + }, + { + "auxiliary_loss_clip": 0.01078472, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.03499532, + "balance_loss_mlp": 1.01540565, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 1.9175726107131226, + "language_loss": 0.77480114, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79587305, + "num_input_tokens_seen": 191575860, + "step": 8902, + "time_per_iteration": 2.533740282058716 + }, + { + "auxiliary_loss_clip": 0.01078443, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.03658271, + "balance_loss_mlp": 1.02166653, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.551114097450694, + "language_loss": 0.69892013, + "learning_rate": 1.868651286721281e-06, + "loss": 0.72004777, + "num_input_tokens_seen": 191595775, + "step": 8903, + "time_per_iteration": 2.534207820892334 + }, + { + "auxiliary_loss_clip": 0.01102388, + "auxiliary_loss_mlp": 0.00786981, + "balance_loss_clip": 1.03719354, + "balance_loss_mlp": 1.01051903, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.4992524107329255, + "language_loss": 0.72362369, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74251741, + "num_input_tokens_seen": 191617785, + "step": 8904, + "time_per_iteration": 2.523803234100342 + }, + { + "auxiliary_loss_clip": 0.01089668, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.04012585, + "balance_loss_mlp": 1.02200258, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.7563683257851448, + "language_loss": 0.73378301, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.7550323, + "num_input_tokens_seen": 191636900, + "step": 8905, + "time_per_iteration": 2.5006916522979736 + }, + { + "auxiliary_loss_clip": 0.01095148, + "auxiliary_loss_mlp": 0.01032679, + "balance_loss_clip": 1.03652143, + "balance_loss_mlp": 1.0216403, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.7235856618897354, + "language_loss": 0.83773911, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85901737, + "num_input_tokens_seen": 191656720, + "step": 8906, + "time_per_iteration": 2.4851813316345215 + }, + { + "auxiliary_loss_clip": 0.01102375, + "auxiliary_loss_mlp": 0.00786295, + "balance_loss_clip": 1.03774607, + "balance_loss_mlp": 1.00998139, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 2.0021584411697786, + "language_loss": 0.73802584, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.75691259, + "num_input_tokens_seen": 191674445, + "step": 8907, + "time_per_iteration": 2.487630844116211 + }, + { + "auxiliary_loss_clip": 0.01095163, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.03531194, + "balance_loss_mlp": 1.01875019, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 1.7719611686255785, + "language_loss": 0.76476085, + "learning_rate": 1.866708244906912e-06, + "loss": 0.78604186, + "num_input_tokens_seen": 191695000, + "step": 8908, + "time_per_iteration": 2.4954416751861572 + }, + { + "auxiliary_loss_clip": 0.01084163, + "auxiliary_loss_mlp": 0.00787081, + "balance_loss_clip": 1.03497124, + "balance_loss_mlp": 1.00915468, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 1.9714029672802387, + "language_loss": 0.7414428, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.7601552, + "num_input_tokens_seen": 191713295, + "step": 8909, + "time_per_iteration": 2.5071089267730713 + }, + { + "auxiliary_loss_clip": 0.01078037, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.03829002, + "balance_loss_mlp": 1.02163029, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 11.121529690526431, + "language_loss": 0.84191227, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.8630327, + "num_input_tokens_seen": 191732725, + "step": 8910, + "time_per_iteration": 2.543142318725586 + }, + { + "auxiliary_loss_clip": 0.01091764, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.0372839, + "balance_loss_mlp": 1.01377547, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.5760762893812508, + "language_loss": 0.81649995, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.8376894, + "num_input_tokens_seen": 191753765, + "step": 8911, + "time_per_iteration": 2.5574283599853516 + }, + { + "auxiliary_loss_clip": 0.01072073, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.03520608, + "balance_loss_mlp": 1.0267725, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 1.6406136273204073, + "language_loss": 0.68994147, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.7110557, + "num_input_tokens_seen": 191773560, + "step": 8912, + "time_per_iteration": 2.535024404525757 + }, + { + "auxiliary_loss_clip": 0.01088705, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.03659451, + "balance_loss_mlp": 1.02258408, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 2.178268828648602, + "language_loss": 0.71479493, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73603773, + "num_input_tokens_seen": 191791255, + "step": 8913, + "time_per_iteration": 2.4686648845672607 + }, + { + "auxiliary_loss_clip": 0.01083739, + "auxiliary_loss_mlp": 0.01036224, + "balance_loss_clip": 1.03838789, + "balance_loss_mlp": 1.02324235, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.6163191546918634, + "language_loss": 0.72359121, + "learning_rate": 1.864376761688156e-06, + "loss": 0.74479079, + "num_input_tokens_seen": 191809325, + "step": 8914, + "time_per_iteration": 2.5103631019592285 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.01034842, + "balance_loss_clip": 1.04048657, + "balance_loss_mlp": 1.02001882, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 1.7922389264544722, + "language_loss": 0.70658588, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72792959, + "num_input_tokens_seen": 191829795, + "step": 8915, + "time_per_iteration": 2.550107002258301 + }, + { + "auxiliary_loss_clip": 0.01084687, + "auxiliary_loss_mlp": 0.01037855, + "balance_loss_clip": 1.03698206, + "balance_loss_mlp": 1.02413952, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 1.7221890696256141, + "language_loss": 0.75130999, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77253544, + "num_input_tokens_seen": 191850840, + "step": 8916, + "time_per_iteration": 2.5234532356262207 + }, + { + "auxiliary_loss_clip": 0.0107029, + "auxiliary_loss_mlp": 0.00785513, + "balance_loss_clip": 1.03715968, + "balance_loss_mlp": 1.00818968, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 2.3221406271133045, + "language_loss": 0.72271752, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74127561, + "num_input_tokens_seen": 191869520, + "step": 8917, + "time_per_iteration": 2.6759450435638428 + }, + { + "auxiliary_loss_clip": 0.01088855, + "auxiliary_loss_mlp": 0.0103816, + "balance_loss_clip": 1.03693211, + "balance_loss_mlp": 1.02473116, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 1.981894412743698, + "language_loss": 0.71170914, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.7329793, + "num_input_tokens_seen": 191887240, + "step": 8918, + "time_per_iteration": 3.9495198726654053 + }, + { + "auxiliary_loss_clip": 0.01091629, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.03959763, + "balance_loss_mlp": 1.02194178, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 1.710703532081005, + "language_loss": 0.75058365, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77184486, + "num_input_tokens_seen": 191905690, + "step": 8919, + "time_per_iteration": 2.532991886138916 + }, + { + "auxiliary_loss_clip": 0.0108537, + "auxiliary_loss_mlp": 0.01029967, + "balance_loss_clip": 1.04037786, + "balance_loss_mlp": 1.01765823, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 2.237403090290034, + "language_loss": 0.71441686, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73557019, + "num_input_tokens_seen": 191920725, + "step": 8920, + "time_per_iteration": 2.48478102684021 + }, + { + "auxiliary_loss_clip": 0.01095063, + "auxiliary_loss_mlp": 0.01035092, + "balance_loss_clip": 1.03514338, + "balance_loss_mlp": 1.02069736, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.7922836310646222, + "language_loss": 0.68832541, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.70962697, + "num_input_tokens_seen": 191944645, + "step": 8921, + "time_per_iteration": 4.064924001693726 + }, + { + "auxiliary_loss_clip": 0.01099985, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.03775787, + "balance_loss_mlp": 1.01771116, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 2.1305176704732234, + "language_loss": 0.81924653, + "learning_rate": 1.86126840594594e-06, + "loss": 0.84055364, + "num_input_tokens_seen": 191962265, + "step": 8922, + "time_per_iteration": 2.4761569499969482 + }, + { + "auxiliary_loss_clip": 0.01101068, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.03643286, + "balance_loss_mlp": 1.01461208, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 1.9525577281782869, + "language_loss": 0.76908201, + "learning_rate": 1.860879884996686e-06, + "loss": 0.79035968, + "num_input_tokens_seen": 191978850, + "step": 8923, + "time_per_iteration": 3.8821146488189697 + }, + { + "auxiliary_loss_clip": 0.010887, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.03849816, + "balance_loss_mlp": 1.02098143, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.3904465728104847, + "language_loss": 0.70315063, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72438478, + "num_input_tokens_seen": 192002000, + "step": 8924, + "time_per_iteration": 2.585658311843872 + }, + { + "auxiliary_loss_clip": 0.01082502, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.03798163, + "balance_loss_mlp": 1.01800585, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 1.8689126593599472, + "language_loss": 0.87042952, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.89157534, + "num_input_tokens_seen": 192019100, + "step": 8925, + "time_per_iteration": 2.5954110622406006 + }, + { + "auxiliary_loss_clip": 0.01112053, + "auxiliary_loss_mlp": 0.01033105, + "balance_loss_clip": 1.03596377, + "balance_loss_mlp": 1.02000391, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 1.7866558047221521, + "language_loss": 0.77989936, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80135095, + "num_input_tokens_seen": 192041660, + "step": 8926, + "time_per_iteration": 2.524592399597168 + }, + { + "auxiliary_loss_clip": 0.01077005, + "auxiliary_loss_mlp": 0.01027443, + "balance_loss_clip": 1.04053354, + "balance_loss_mlp": 1.01607013, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 1.425367494953728, + "language_loss": 0.66990697, + "learning_rate": 1.85932585410148e-06, + "loss": 0.69095147, + "num_input_tokens_seen": 192063540, + "step": 8927, + "time_per_iteration": 2.6046059131622314 + }, + { + "auxiliary_loss_clip": 0.0110115, + "auxiliary_loss_mlp": 0.01028776, + "balance_loss_clip": 1.03526187, + "balance_loss_mlp": 1.01570511, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.8071236903875367, + "language_loss": 0.73617846, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.75747776, + "num_input_tokens_seen": 192081760, + "step": 8928, + "time_per_iteration": 3.858457565307617 + }, + { + "auxiliary_loss_clip": 0.01087394, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.03588283, + "balance_loss_mlp": 1.01641083, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 1.757375477986784, + "language_loss": 0.6329931, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.6541543, + "num_input_tokens_seen": 192101620, + "step": 8929, + "time_per_iteration": 2.6298763751983643 + }, + { + "auxiliary_loss_clip": 0.01100704, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.03688645, + "balance_loss_mlp": 1.01741052, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 1.7773420048192499, + "language_loss": 0.66246402, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68376672, + "num_input_tokens_seen": 192121805, + "step": 8930, + "time_per_iteration": 2.553950309753418 + }, + { + "auxiliary_loss_clip": 0.01063294, + "auxiliary_loss_mlp": 0.01029219, + "balance_loss_clip": 1.03907895, + "balance_loss_mlp": 1.01625443, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.5126561385389086, + "language_loss": 0.67342651, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69435161, + "num_input_tokens_seen": 192141765, + "step": 8931, + "time_per_iteration": 2.626486301422119 + }, + { + "auxiliary_loss_clip": 0.01062837, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.03529286, + "balance_loss_mlp": 1.01915264, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.7147059236887303, + "language_loss": 0.75593829, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.7768994, + "num_input_tokens_seen": 192161560, + "step": 8932, + "time_per_iteration": 2.622781753540039 + }, + { + "auxiliary_loss_clip": 0.01086127, + "auxiliary_loss_mlp": 0.01036937, + "balance_loss_clip": 1.03999209, + "balance_loss_mlp": 1.02280521, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 2.1614863671277074, + "language_loss": 0.65837419, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.67960489, + "num_input_tokens_seen": 192180190, + "step": 8933, + "time_per_iteration": 2.6252777576446533 + }, + { + "auxiliary_loss_clip": 0.01096774, + "auxiliary_loss_mlp": 0.00787517, + "balance_loss_clip": 1.04176331, + "balance_loss_mlp": 1.01287794, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.623774217188273, + "language_loss": 0.8308084, + "learning_rate": 1.856606505975565e-06, + "loss": 0.84965134, + "num_input_tokens_seen": 192198855, + "step": 8934, + "time_per_iteration": 2.532641649246216 + }, + { + "auxiliary_loss_clip": 0.01074536, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.0359751, + "balance_loss_mlp": 1.01999605, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 1.7227510801660562, + "language_loss": 0.79553771, + "learning_rate": 1.856218049303999e-06, + "loss": 0.81662011, + "num_input_tokens_seen": 192216555, + "step": 8935, + "time_per_iteration": 2.5425782203674316 + }, + { + "auxiliary_loss_clip": 0.01100116, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.03709674, + "balance_loss_mlp": 1.02776098, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.7740954502005102, + "language_loss": 0.8393265, + "learning_rate": 1.855829598084659e-06, + "loss": 0.86073202, + "num_input_tokens_seen": 192236910, + "step": 8936, + "time_per_iteration": 2.5221385955810547 + }, + { + "auxiliary_loss_clip": 0.01078764, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.03781295, + "balance_loss_mlp": 1.01848817, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.4968376646867219, + "language_loss": 0.72594219, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74703932, + "num_input_tokens_seen": 192260790, + "step": 8937, + "time_per_iteration": 2.733687162399292 + }, + { + "auxiliary_loss_clip": 0.01086074, + "auxiliary_loss_mlp": 0.01036449, + "balance_loss_clip": 1.03420961, + "balance_loss_mlp": 1.02166748, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.9523899072059896, + "language_loss": 0.81532574, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83655095, + "num_input_tokens_seen": 192277230, + "step": 8938, + "time_per_iteration": 2.5704145431518555 + }, + { + "auxiliary_loss_clip": 0.0111633, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.03881454, + "balance_loss_mlp": 1.02070522, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.581111830647663, + "language_loss": 0.80771393, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.82920647, + "num_input_tokens_seen": 192292840, + "step": 8939, + "time_per_iteration": 2.4279019832611084 + }, + { + "auxiliary_loss_clip": 0.0100938, + "auxiliary_loss_mlp": 0.0100379, + "balance_loss_clip": 1.01319385, + "balance_loss_mlp": 1.00222874, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.8217244339751164, + "language_loss": 0.524607, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54473871, + "num_input_tokens_seen": 192358240, + "step": 8940, + "time_per_iteration": 3.160722255706787 + }, + { + "auxiliary_loss_clip": 0.01072641, + "auxiliary_loss_mlp": 0.01027153, + "balance_loss_clip": 1.03677011, + "balance_loss_mlp": 1.01455832, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 2.1731706115368032, + "language_loss": 0.71649551, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73749346, + "num_input_tokens_seen": 192377370, + "step": 8941, + "time_per_iteration": 2.5893988609313965 + }, + { + "auxiliary_loss_clip": 0.0108598, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.03660941, + "balance_loss_mlp": 1.01600993, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.6273562073356052, + "language_loss": 0.79512596, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81627119, + "num_input_tokens_seen": 192396450, + "step": 8942, + "time_per_iteration": 2.558892250061035 + }, + { + "auxiliary_loss_clip": 0.01115638, + "auxiliary_loss_mlp": 0.010362, + "balance_loss_clip": 1.03928542, + "balance_loss_mlp": 1.02289057, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 1.9206287849499029, + "language_loss": 0.6984508, + "learning_rate": 1.853110593448911e-06, + "loss": 0.71996915, + "num_input_tokens_seen": 192417390, + "step": 8943, + "time_per_iteration": 2.5439293384552 + }, + { + "auxiliary_loss_clip": 0.01026858, + "auxiliary_loss_mlp": 0.01002389, + "balance_loss_clip": 1.01146913, + "balance_loss_mlp": 1.00103045, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.7956089372104103, + "language_loss": 0.59615719, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61644965, + "num_input_tokens_seen": 192478060, + "step": 8944, + "time_per_iteration": 3.1171228885650635 + }, + { + "auxiliary_loss_clip": 0.01069861, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.04065657, + "balance_loss_mlp": 1.01824462, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 1.9195335770468946, + "language_loss": 0.7720226, + "learning_rate": 1.852333784891169e-06, + "loss": 0.79304832, + "num_input_tokens_seen": 192495985, + "step": 8945, + "time_per_iteration": 2.5904903411865234 + }, + { + "auxiliary_loss_clip": 0.01102723, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.0365026, + "balance_loss_mlp": 1.02037346, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 1.6852921736213922, + "language_loss": 0.68432856, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70568597, + "num_input_tokens_seen": 192515445, + "step": 8946, + "time_per_iteration": 2.5133869647979736 + }, + { + "auxiliary_loss_clip": 0.01068995, + "auxiliary_loss_mlp": 0.01036557, + "balance_loss_clip": 1.04022777, + "balance_loss_mlp": 1.02364707, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.550835129867525, + "language_loss": 0.76972997, + "learning_rate": 1.851556998731498e-06, + "loss": 0.79078549, + "num_input_tokens_seen": 192536530, + "step": 8947, + "time_per_iteration": 2.627469778060913 + }, + { + "auxiliary_loss_clip": 0.01101992, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.03842509, + "balance_loss_mlp": 1.01878965, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.5815137505445422, + "language_loss": 0.60249668, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62382895, + "num_input_tokens_seen": 192556075, + "step": 8948, + "time_per_iteration": 2.554457426071167 + }, + { + "auxiliary_loss_clip": 0.01075094, + "auxiliary_loss_mlp": 0.01032037, + "balance_loss_clip": 1.03831923, + "balance_loss_mlp": 1.01931107, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.857736213339742, + "language_loss": 0.79527497, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.81634629, + "num_input_tokens_seen": 192575535, + "step": 8949, + "time_per_iteration": 2.564959764480591 + }, + { + "auxiliary_loss_clip": 0.01071397, + "auxiliary_loss_mlp": 0.01041463, + "balance_loss_clip": 1.03570783, + "balance_loss_mlp": 1.02657342, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 1.6886043704165576, + "language_loss": 0.77723074, + "learning_rate": 1.850391861746111e-06, + "loss": 0.79835927, + "num_input_tokens_seen": 192594490, + "step": 8950, + "time_per_iteration": 2.597773790359497 + }, + { + "auxiliary_loss_clip": 0.0109415, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.03973591, + "balance_loss_mlp": 1.01767325, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.6693721306578446, + "language_loss": 0.72677785, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.74801248, + "num_input_tokens_seen": 192615650, + "step": 8951, + "time_per_iteration": 2.5851144790649414 + }, + { + "auxiliary_loss_clip": 0.0111338, + "auxiliary_loss_mlp": 0.00788854, + "balance_loss_clip": 1.03822589, + "balance_loss_mlp": 1.01364625, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 2.0379043118044056, + "language_loss": 0.74970996, + "learning_rate": 1.849615132097085e-06, + "loss": 0.76873231, + "num_input_tokens_seen": 192633840, + "step": 8952, + "time_per_iteration": 2.4450502395629883 + }, + { + "auxiliary_loss_clip": 0.01091225, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.04296994, + "balance_loss_mlp": 1.01632094, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.7912685242630695, + "language_loss": 0.79508352, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81629038, + "num_input_tokens_seen": 192655890, + "step": 8953, + "time_per_iteration": 2.5817384719848633 + }, + { + "auxiliary_loss_clip": 0.01076072, + "auxiliary_loss_mlp": 0.01032894, + "balance_loss_clip": 1.0358057, + "balance_loss_mlp": 1.01847577, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 1.9080768860268316, + "language_loss": 0.80637932, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82746893, + "num_input_tokens_seen": 192673025, + "step": 8954, + "time_per_iteration": 2.5152320861816406 + }, + { + "auxiliary_loss_clip": 0.01114978, + "auxiliary_loss_mlp": 0.01032032, + "balance_loss_clip": 1.04075968, + "balance_loss_mlp": 1.01862144, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 1.847672985210783, + "language_loss": 0.76295984, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.78442997, + "num_input_tokens_seen": 192692190, + "step": 8955, + "time_per_iteration": 2.487030029296875 + }, + { + "auxiliary_loss_clip": 0.01088854, + "auxiliary_loss_mlp": 0.01035328, + "balance_loss_clip": 1.03847671, + "balance_loss_mlp": 1.02229285, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.6215195242212326, + "language_loss": 0.78587997, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80712181, + "num_input_tokens_seen": 192710380, + "step": 8956, + "time_per_iteration": 3.9153616428375244 + }, + { + "auxiliary_loss_clip": 0.01020436, + "auxiliary_loss_mlp": 0.01001077, + "balance_loss_clip": 1.01385522, + "balance_loss_mlp": 0.99981892, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8608245383155181, + "language_loss": 0.63458884, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65480399, + "num_input_tokens_seen": 192768995, + "step": 8957, + "time_per_iteration": 3.069251537322998 + }, + { + "auxiliary_loss_clip": 0.01004868, + "auxiliary_loss_mlp": 0.01001626, + "balance_loss_clip": 1.02332425, + "balance_loss_mlp": 1.00021935, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7004818267151707, + "language_loss": 0.51620746, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53627241, + "num_input_tokens_seen": 192825585, + "step": 8958, + "time_per_iteration": 3.2046284675598145 + }, + { + "auxiliary_loss_clip": 0.01107496, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.04311562, + "balance_loss_mlp": 1.01651788, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 2.7832927916946906, + "language_loss": 0.77180469, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79318607, + "num_input_tokens_seen": 192847335, + "step": 8959, + "time_per_iteration": 3.943035840988159 + }, + { + "auxiliary_loss_clip": 0.0106982, + "auxiliary_loss_mlp": 0.01026804, + "balance_loss_clip": 1.04172206, + "balance_loss_mlp": 1.01399469, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 1.9450873835299438, + "language_loss": 0.83387446, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.85484076, + "num_input_tokens_seen": 192862205, + "step": 8960, + "time_per_iteration": 2.5648298263549805 + }, + { + "auxiliary_loss_clip": 0.0110176, + "auxiliary_loss_mlp": 0.01032006, + "balance_loss_clip": 1.03888476, + "balance_loss_mlp": 1.01944757, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.5138238534573276, + "language_loss": 0.78271234, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.80404997, + "num_input_tokens_seen": 192883695, + "step": 8961, + "time_per_iteration": 2.5684401988983154 + }, + { + "auxiliary_loss_clip": 0.01079737, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.03799152, + "balance_loss_mlp": 1.01956749, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 1.6529682027253239, + "language_loss": 0.84323907, + "learning_rate": 1.845731828364681e-06, + "loss": 0.8643558, + "num_input_tokens_seen": 192900190, + "step": 8962, + "time_per_iteration": 2.539980173110962 + }, + { + "auxiliary_loss_clip": 0.01014657, + "auxiliary_loss_mlp": 0.01004993, + "balance_loss_clip": 1.01684284, + "balance_loss_mlp": 1.00387287, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7302228009693614, + "language_loss": 0.54185164, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56204808, + "num_input_tokens_seen": 192958675, + "step": 8963, + "time_per_iteration": 4.494645118713379 + }, + { + "auxiliary_loss_clip": 0.01016213, + "auxiliary_loss_mlp": 0.01005063, + "balance_loss_clip": 1.01070714, + "balance_loss_mlp": 1.00386524, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8032540861171646, + "language_loss": 0.63396126, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65417403, + "num_input_tokens_seen": 193033135, + "step": 8964, + "time_per_iteration": 3.189657688140869 + }, + { + "auxiliary_loss_clip": 0.01054799, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.03819358, + "balance_loss_mlp": 1.01501417, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.5310846577892065, + "language_loss": 0.70003909, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72086692, + "num_input_tokens_seen": 193055570, + "step": 8965, + "time_per_iteration": 2.7072057723999023 + }, + { + "auxiliary_loss_clip": 0.01086453, + "auxiliary_loss_mlp": 0.00788173, + "balance_loss_clip": 1.03756237, + "balance_loss_mlp": 1.01005554, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 2.0430118499214935, + "language_loss": 0.82317597, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.84192222, + "num_input_tokens_seen": 193073120, + "step": 8966, + "time_per_iteration": 2.5024240016937256 + }, + { + "auxiliary_loss_clip": 0.01112712, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.03995252, + "balance_loss_mlp": 1.01779699, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 3.2694762089253273, + "language_loss": 0.72555935, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74699432, + "num_input_tokens_seen": 193090105, + "step": 8967, + "time_per_iteration": 3.8562052249908447 + }, + { + "auxiliary_loss_clip": 0.01086516, + "auxiliary_loss_mlp": 0.01028724, + "balance_loss_clip": 1.03353548, + "balance_loss_mlp": 1.01681542, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 2.4174754686683264, + "language_loss": 0.81935549, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.84050786, + "num_input_tokens_seen": 193109325, + "step": 8968, + "time_per_iteration": 2.5234062671661377 + }, + { + "auxiliary_loss_clip": 0.01083209, + "auxiliary_loss_mlp": 0.01030566, + "balance_loss_clip": 1.03888309, + "balance_loss_mlp": 1.01700568, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.4498448325163416, + "language_loss": 0.73872691, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.75986463, + "num_input_tokens_seen": 193130595, + "step": 8969, + "time_per_iteration": 2.5730488300323486 + }, + { + "auxiliary_loss_clip": 0.01078993, + "auxiliary_loss_mlp": 0.00787802, + "balance_loss_clip": 1.03503621, + "balance_loss_mlp": 1.01004243, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 1.8559793787074723, + "language_loss": 0.82414252, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84281051, + "num_input_tokens_seen": 193148930, + "step": 8970, + "time_per_iteration": 2.5401108264923096 + }, + { + "auxiliary_loss_clip": 0.01087227, + "auxiliary_loss_mlp": 0.01026178, + "balance_loss_clip": 1.03720665, + "balance_loss_mlp": 1.01387584, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.3538298827353374, + "language_loss": 0.75321865, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77435267, + "num_input_tokens_seen": 193170140, + "step": 8971, + "time_per_iteration": 2.599991798400879 + }, + { + "auxiliary_loss_clip": 0.0102668, + "auxiliary_loss_mlp": 0.01022115, + "balance_loss_clip": 1.01067686, + "balance_loss_mlp": 1.02049339, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8980780205792912, + "language_loss": 0.60345495, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62394285, + "num_input_tokens_seen": 193227235, + "step": 8972, + "time_per_iteration": 3.108747959136963 + }, + { + "auxiliary_loss_clip": 0.01102131, + "auxiliary_loss_mlp": 0.01039127, + "balance_loss_clip": 1.03776753, + "balance_loss_mlp": 1.02593076, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.389896566125859, + "language_loss": 0.78473043, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80614305, + "num_input_tokens_seen": 193248435, + "step": 8973, + "time_per_iteration": 2.5342893600463867 + }, + { + "auxiliary_loss_clip": 0.0110821, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.03819692, + "balance_loss_mlp": 1.02122617, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 2.037819006284775, + "language_loss": 0.7387737, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.7602123, + "num_input_tokens_seen": 193267490, + "step": 8974, + "time_per_iteration": 2.525157928466797 + }, + { + "auxiliary_loss_clip": 0.01034237, + "auxiliary_loss_mlp": 0.01005025, + "balance_loss_clip": 1.00874305, + "balance_loss_mlp": 1.00373173, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7354506955113285, + "language_loss": 0.51044875, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53084135, + "num_input_tokens_seen": 193326050, + "step": 8975, + "time_per_iteration": 3.0601558685302734 + }, + { + "auxiliary_loss_clip": 0.01100065, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.03755164, + "balance_loss_mlp": 1.02275801, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.4713408842978375, + "language_loss": 0.72080517, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74216658, + "num_input_tokens_seen": 193348785, + "step": 8976, + "time_per_iteration": 2.56437611579895 + }, + { + "auxiliary_loss_clip": 0.0109804, + "auxiliary_loss_mlp": 0.00790177, + "balance_loss_clip": 1.03591943, + "balance_loss_mlp": 1.01650476, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 1.5898124373749882, + "language_loss": 0.69634992, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.71523207, + "num_input_tokens_seen": 193367080, + "step": 8977, + "time_per_iteration": 2.518348217010498 + }, + { + "auxiliary_loss_clip": 0.01046584, + "auxiliary_loss_mlp": 0.01033166, + "balance_loss_clip": 1.0418191, + "balance_loss_mlp": 1.02044618, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 1.6349730671587264, + "language_loss": 0.7273187, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.74811614, + "num_input_tokens_seen": 193383715, + "step": 8978, + "time_per_iteration": 2.6040353775024414 + }, + { + "auxiliary_loss_clip": 0.01085298, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.04603219, + "balance_loss_mlp": 1.02139211, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 2.116793839666026, + "language_loss": 0.74471521, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76592731, + "num_input_tokens_seen": 193400560, + "step": 8979, + "time_per_iteration": 2.5447163581848145 + }, + { + "auxiliary_loss_clip": 0.01062467, + "auxiliary_loss_mlp": 0.01044959, + "balance_loss_clip": 1.03942513, + "balance_loss_mlp": 1.0299449, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 2.019653663028868, + "language_loss": 0.76644987, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.7875241, + "num_input_tokens_seen": 193418680, + "step": 8980, + "time_per_iteration": 2.597350597381592 + }, + { + "auxiliary_loss_clip": 0.0111169, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.03664529, + "balance_loss_mlp": 1.0199033, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.7517860604137983, + "language_loss": 0.81925678, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.84069514, + "num_input_tokens_seen": 193439310, + "step": 8981, + "time_per_iteration": 2.5185766220092773 + }, + { + "auxiliary_loss_clip": 0.01103233, + "auxiliary_loss_mlp": 0.01032914, + "balance_loss_clip": 1.03661942, + "balance_loss_mlp": 1.01953864, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.8610331352040506, + "language_loss": 0.66786838, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.68922985, + "num_input_tokens_seen": 193458115, + "step": 8982, + "time_per_iteration": 2.489445209503174 + }, + { + "auxiliary_loss_clip": 0.0107952, + "auxiliary_loss_mlp": 0.00786378, + "balance_loss_clip": 1.04083228, + "balance_loss_mlp": 1.01105237, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.516081190680007, + "language_loss": 0.82631624, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84497523, + "num_input_tokens_seen": 193477365, + "step": 8983, + "time_per_iteration": 2.5980191230773926 + }, + { + "auxiliary_loss_clip": 0.01063665, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.03467512, + "balance_loss_mlp": 1.02483034, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 2.0185699101337375, + "language_loss": 0.70384634, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72486347, + "num_input_tokens_seen": 193495595, + "step": 8984, + "time_per_iteration": 2.561084270477295 + }, + { + "auxiliary_loss_clip": 0.01117203, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.04021561, + "balance_loss_mlp": 1.01883459, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.6983648034322454, + "language_loss": 0.8010385, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82253391, + "num_input_tokens_seen": 193514035, + "step": 8985, + "time_per_iteration": 2.474837303161621 + }, + { + "auxiliary_loss_clip": 0.01067908, + "auxiliary_loss_mlp": 0.0102828, + "balance_loss_clip": 1.03586555, + "balance_loss_mlp": 1.01517916, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 1.476038245299173, + "language_loss": 0.78848535, + "learning_rate": 1.83641431418363e-06, + "loss": 0.80944729, + "num_input_tokens_seen": 193535445, + "step": 8986, + "time_per_iteration": 2.598332643508911 + }, + { + "auxiliary_loss_clip": 0.01095431, + "auxiliary_loss_mlp": 0.01035798, + "balance_loss_clip": 1.03484893, + "balance_loss_mlp": 1.0224824, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.665564339634343, + "language_loss": 0.76875085, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.79006314, + "num_input_tokens_seen": 193554780, + "step": 8987, + "time_per_iteration": 2.5021634101867676 + }, + { + "auxiliary_loss_clip": 0.01087319, + "auxiliary_loss_mlp": 0.01032048, + "balance_loss_clip": 1.03954387, + "balance_loss_mlp": 1.01957834, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 1.806909544124405, + "language_loss": 0.70866454, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.72985822, + "num_input_tokens_seen": 193573580, + "step": 8988, + "time_per_iteration": 2.5186872482299805 + }, + { + "auxiliary_loss_clip": 0.01067522, + "auxiliary_loss_mlp": 0.01036822, + "balance_loss_clip": 1.04112542, + "balance_loss_mlp": 1.02333939, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.3624582471215256, + "language_loss": 0.67279983, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69384325, + "num_input_tokens_seen": 193590490, + "step": 8989, + "time_per_iteration": 2.670578956604004 + }, + { + "auxiliary_loss_clip": 0.0109984, + "auxiliary_loss_mlp": 0.01042541, + "balance_loss_clip": 1.03620303, + "balance_loss_mlp": 1.02913547, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.5372061292660113, + "language_loss": 0.77815855, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.7995823, + "num_input_tokens_seen": 193609900, + "step": 8990, + "time_per_iteration": 2.5062739849090576 + }, + { + "auxiliary_loss_clip": 0.01096009, + "auxiliary_loss_mlp": 0.01027886, + "balance_loss_clip": 1.03445649, + "balance_loss_mlp": 1.0161978, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.6943494911565302, + "language_loss": 0.69358146, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71482038, + "num_input_tokens_seen": 193629775, + "step": 8991, + "time_per_iteration": 2.4924204349517822 + }, + { + "auxiliary_loss_clip": 0.0105727, + "auxiliary_loss_mlp": 0.01037648, + "balance_loss_clip": 1.0374999, + "balance_loss_mlp": 1.02374196, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.734297729509414, + "language_loss": 0.7639643, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78491354, + "num_input_tokens_seen": 193648070, + "step": 8992, + "time_per_iteration": 2.6205179691314697 + }, + { + "auxiliary_loss_clip": 0.01086617, + "auxiliary_loss_mlp": 0.01039318, + "balance_loss_clip": 1.03492975, + "balance_loss_mlp": 1.02512634, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 2.8342517261033646, + "language_loss": 0.7647841, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78604352, + "num_input_tokens_seen": 193665060, + "step": 8993, + "time_per_iteration": 2.5117013454437256 + }, + { + "auxiliary_loss_clip": 0.01097053, + "auxiliary_loss_mlp": 0.01033006, + "balance_loss_clip": 1.03522456, + "balance_loss_mlp": 1.02035832, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.6092732413014166, + "language_loss": 0.70641994, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72772062, + "num_input_tokens_seen": 193683620, + "step": 8994, + "time_per_iteration": 3.8526992797851562 + }, + { + "auxiliary_loss_clip": 0.01098969, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.03746009, + "balance_loss_mlp": 1.01695561, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 1.887956651489494, + "language_loss": 0.74918735, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.77048802, + "num_input_tokens_seen": 193702990, + "step": 8995, + "time_per_iteration": 2.503073215484619 + }, + { + "auxiliary_loss_clip": 0.01096647, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.03555882, + "balance_loss_mlp": 1.019894, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.9124453428725674, + "language_loss": 0.73430395, + "learning_rate": 1.832533059471282e-06, + "loss": 0.75558984, + "num_input_tokens_seen": 193721785, + "step": 8996, + "time_per_iteration": 2.481541633605957 + }, + { + "auxiliary_loss_clip": 0.01065822, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.03668904, + "balance_loss_mlp": 1.02109778, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 5.093146298279868, + "language_loss": 0.72760212, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.74859452, + "num_input_tokens_seen": 193740315, + "step": 8997, + "time_per_iteration": 2.5911202430725098 + }, + { + "auxiliary_loss_clip": 0.01111267, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.03773379, + "balance_loss_mlp": 1.01890802, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 2.205689543848071, + "language_loss": 0.71490097, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.73633206, + "num_input_tokens_seen": 193757580, + "step": 8998, + "time_per_iteration": 3.8622753620147705 + }, + { + "auxiliary_loss_clip": 0.01077485, + "auxiliary_loss_mlp": 0.01037496, + "balance_loss_clip": 1.03660285, + "balance_loss_mlp": 1.02493167, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.5353029117854466, + "language_loss": 0.70503473, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72618449, + "num_input_tokens_seen": 193780965, + "step": 8999, + "time_per_iteration": 2.7941699028015137 + }, + { + "auxiliary_loss_clip": 0.01087633, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.04038405, + "balance_loss_mlp": 1.01855409, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.296375129540677, + "language_loss": 0.80612016, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.82731259, + "num_input_tokens_seen": 193797855, + "step": 9000, + "time_per_iteration": 2.5019028186798096 + }, + { + "auxiliary_loss_clip": 0.01064081, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.03634465, + "balance_loss_mlp": 1.01900506, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 1.9877262960966249, + "language_loss": 0.72775459, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.74871582, + "num_input_tokens_seen": 193817375, + "step": 9001, + "time_per_iteration": 3.960714340209961 + }, + { + "auxiliary_loss_clip": 0.01079827, + "auxiliary_loss_mlp": 0.01030612, + "balance_loss_clip": 1.03557348, + "balance_loss_mlp": 1.01694453, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.328365229761933, + "language_loss": 0.85296762, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87407196, + "num_input_tokens_seen": 193832205, + "step": 9002, + "time_per_iteration": 2.533674716949463 + }, + { + "auxiliary_loss_clip": 0.01063926, + "auxiliary_loss_mlp": 0.01027372, + "balance_loss_clip": 1.03707337, + "balance_loss_mlp": 1.01615477, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 1.7180419713044937, + "language_loss": 0.77926314, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.80017614, + "num_input_tokens_seen": 193849830, + "step": 9003, + "time_per_iteration": 2.571176767349243 + }, + { + "auxiliary_loss_clip": 0.01099996, + "auxiliary_loss_mlp": 0.01030045, + "balance_loss_clip": 1.03824914, + "balance_loss_mlp": 1.0170455, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 1.8245218035359125, + "language_loss": 0.69655013, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.71785051, + "num_input_tokens_seen": 193869945, + "step": 9004, + "time_per_iteration": 2.5167720317840576 + }, + { + "auxiliary_loss_clip": 0.01026332, + "auxiliary_loss_mlp": 0.0100922, + "balance_loss_clip": 1.01177895, + "balance_loss_mlp": 1.00788534, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9744414470977771, + "language_loss": 0.59128577, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61164129, + "num_input_tokens_seen": 193930860, + "step": 9005, + "time_per_iteration": 4.561850070953369 + }, + { + "auxiliary_loss_clip": 0.0111435, + "auxiliary_loss_mlp": 0.00787489, + "balance_loss_clip": 1.03940248, + "balance_loss_mlp": 1.01237082, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 1.829621112750986, + "language_loss": 0.77954936, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.79856777, + "num_input_tokens_seen": 193949075, + "step": 9006, + "time_per_iteration": 2.490750551223755 + }, + { + "auxiliary_loss_clip": 0.01088761, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.03726792, + "balance_loss_mlp": 1.0201267, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.7304624427593889, + "language_loss": 0.8300035, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.85120308, + "num_input_tokens_seen": 193967630, + "step": 9007, + "time_per_iteration": 2.5465402603149414 + }, + { + "auxiliary_loss_clip": 0.01099296, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.03999591, + "balance_loss_mlp": 1.01652646, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 2.164805792944478, + "language_loss": 0.67006731, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69135141, + "num_input_tokens_seen": 193988730, + "step": 9008, + "time_per_iteration": 2.5700058937072754 + }, + { + "auxiliary_loss_clip": 0.01117051, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.03925014, + "balance_loss_mlp": 1.01607442, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 1.96650324802902, + "language_loss": 0.73738062, + "learning_rate": 1.827488379924234e-06, + "loss": 0.75885254, + "num_input_tokens_seen": 194005160, + "step": 9009, + "time_per_iteration": 2.4481890201568604 + }, + { + "auxiliary_loss_clip": 0.01066669, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.04091513, + "balance_loss_mlp": 1.02398372, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.481950048039745, + "language_loss": 0.87103724, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.89208078, + "num_input_tokens_seen": 194021700, + "step": 9010, + "time_per_iteration": 2.567962884902954 + }, + { + "auxiliary_loss_clip": 0.0111018, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.03703988, + "balance_loss_mlp": 1.02060246, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 2.0290939267670964, + "language_loss": 0.6419155, + "learning_rate": 1.826712372694122e-06, + "loss": 0.66334653, + "num_input_tokens_seen": 194042620, + "step": 9011, + "time_per_iteration": 2.5366978645324707 + }, + { + "auxiliary_loss_clip": 0.01101676, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.03946483, + "balance_loss_mlp": 1.01969087, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 2.4660140687050025, + "language_loss": 0.79273152, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81406832, + "num_input_tokens_seen": 194061800, + "step": 9012, + "time_per_iteration": 2.5396668910980225 + }, + { + "auxiliary_loss_clip": 0.01110597, + "auxiliary_loss_mlp": 0.0102844, + "balance_loss_clip": 1.03703058, + "balance_loss_mlp": 1.01566112, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 1.9026068004204453, + "language_loss": 0.74460232, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76599264, + "num_input_tokens_seen": 194079890, + "step": 9013, + "time_per_iteration": 2.440497636795044 + }, + { + "auxiliary_loss_clip": 0.010836, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.03814483, + "balance_loss_mlp": 1.01754344, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 1.797670527425618, + "language_loss": 0.72167557, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74281752, + "num_input_tokens_seen": 194097625, + "step": 9014, + "time_per_iteration": 2.540179967880249 + }, + { + "auxiliary_loss_clip": 0.01091472, + "auxiliary_loss_mlp": 0.01032746, + "balance_loss_clip": 1.03869987, + "balance_loss_mlp": 1.01978159, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.5851409787372044, + "language_loss": 0.80587494, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82711709, + "num_input_tokens_seen": 194116055, + "step": 9015, + "time_per_iteration": 2.4949209690093994 + }, + { + "auxiliary_loss_clip": 0.01105458, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_clip": 1.0389322, + "balance_loss_mlp": 1.02438939, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 2.1876972861777606, + "language_loss": 0.81075823, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83218789, + "num_input_tokens_seen": 194130365, + "step": 9016, + "time_per_iteration": 2.496673345565796 + }, + { + "auxiliary_loss_clip": 0.01112133, + "auxiliary_loss_mlp": 0.01028427, + "balance_loss_clip": 1.03839231, + "balance_loss_mlp": 1.01629138, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 1.5799604000639118, + "language_loss": 0.81194806, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.8333537, + "num_input_tokens_seen": 194148975, + "step": 9017, + "time_per_iteration": 2.4462225437164307 + }, + { + "auxiliary_loss_clip": 0.01110453, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.0386529, + "balance_loss_mlp": 1.02053595, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 1.626466044312829, + "language_loss": 0.77270234, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79413986, + "num_input_tokens_seen": 194167185, + "step": 9018, + "time_per_iteration": 2.445436477661133 + }, + { + "auxiliary_loss_clip": 0.0111323, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.03649855, + "balance_loss_mlp": 1.02199888, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.4598934170250157, + "language_loss": 0.66739309, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.6888752, + "num_input_tokens_seen": 194192840, + "step": 9019, + "time_per_iteration": 2.716231346130371 + }, + { + "auxiliary_loss_clip": 0.0109325, + "auxiliary_loss_mlp": 0.01028046, + "balance_loss_clip": 1.03508842, + "balance_loss_mlp": 1.01579106, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 2.0181835289150447, + "language_loss": 0.69360793, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.71482092, + "num_input_tokens_seen": 194213150, + "step": 9020, + "time_per_iteration": 2.5642106533050537 + }, + { + "auxiliary_loss_clip": 0.01074971, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.03558731, + "balance_loss_mlp": 1.02238464, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.439624582948833, + "language_loss": 0.80503327, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82613081, + "num_input_tokens_seen": 194234665, + "step": 9021, + "time_per_iteration": 2.6191725730895996 + }, + { + "auxiliary_loss_clip": 0.01064879, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_clip": 1.03731728, + "balance_loss_mlp": 1.0250442, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 1.6671805105907298, + "language_loss": 0.78964758, + "learning_rate": 1.822444805916788e-06, + "loss": 0.81067944, + "num_input_tokens_seen": 194253790, + "step": 9022, + "time_per_iteration": 2.5976905822753906 + }, + { + "auxiliary_loss_clip": 0.01082774, + "auxiliary_loss_mlp": 0.00787268, + "balance_loss_clip": 1.03949833, + "balance_loss_mlp": 1.00947332, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 1.5419661356733698, + "language_loss": 0.81814778, + "learning_rate": 1.822056885403915e-06, + "loss": 0.83684826, + "num_input_tokens_seen": 194274950, + "step": 9023, + "time_per_iteration": 2.5814690589904785 + }, + { + "auxiliary_loss_clip": 0.01098211, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.0401324, + "balance_loss_mlp": 1.01627421, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 3.2072712072118685, + "language_loss": 0.71036136, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.7316308, + "num_input_tokens_seen": 194296155, + "step": 9024, + "time_per_iteration": 2.526933431625366 + }, + { + "auxiliary_loss_clip": 0.01100473, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.03571081, + "balance_loss_mlp": 1.01740086, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.6955890752835932, + "language_loss": 0.65148836, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67279065, + "num_input_tokens_seen": 194318025, + "step": 9025, + "time_per_iteration": 2.567335844039917 + }, + { + "auxiliary_loss_clip": 0.010888, + "auxiliary_loss_mlp": 0.00785393, + "balance_loss_clip": 1.04107428, + "balance_loss_mlp": 1.00912964, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 1.8892355541168753, + "language_loss": 0.73817015, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.75691211, + "num_input_tokens_seen": 194336150, + "step": 9026, + "time_per_iteration": 2.5372366905212402 + }, + { + "auxiliary_loss_clip": 0.01088316, + "auxiliary_loss_mlp": 0.0104074, + "balance_loss_clip": 1.03508484, + "balance_loss_mlp": 1.02624416, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 1.6224016608846457, + "language_loss": 0.78815281, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80944335, + "num_input_tokens_seen": 194355980, + "step": 9027, + "time_per_iteration": 2.5523788928985596 + }, + { + "auxiliary_loss_clip": 0.0101166, + "auxiliary_loss_mlp": 0.01000587, + "balance_loss_clip": 1.02074337, + "balance_loss_mlp": 0.99937135, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7448524556632176, + "language_loss": 0.56509846, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58522093, + "num_input_tokens_seen": 194422660, + "step": 9028, + "time_per_iteration": 3.2044901847839355 + }, + { + "auxiliary_loss_clip": 0.01070536, + "auxiliary_loss_mlp": 0.01028012, + "balance_loss_clip": 1.04061913, + "balance_loss_mlp": 1.01450586, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 2.9108428548857646, + "language_loss": 0.77966297, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80064845, + "num_input_tokens_seen": 194438545, + "step": 9029, + "time_per_iteration": 2.590405225753784 + }, + { + "auxiliary_loss_clip": 0.01068568, + "auxiliary_loss_mlp": 0.01028015, + "balance_loss_clip": 1.03545821, + "balance_loss_mlp": 1.01465774, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.4987413113833492, + "language_loss": 0.83415121, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85511708, + "num_input_tokens_seen": 194458060, + "step": 9030, + "time_per_iteration": 2.6201014518737793 + }, + { + "auxiliary_loss_clip": 0.01111003, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.03846836, + "balance_loss_mlp": 1.01998472, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.7448337417438335, + "language_loss": 0.7505796, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.77201003, + "num_input_tokens_seen": 194477405, + "step": 9031, + "time_per_iteration": 2.5539753437042236 + }, + { + "auxiliary_loss_clip": 0.01093674, + "auxiliary_loss_mlp": 0.01035064, + "balance_loss_clip": 1.03664088, + "balance_loss_mlp": 1.0224452, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 1.649333596332213, + "language_loss": 0.85120881, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87249625, + "num_input_tokens_seen": 194497085, + "step": 9032, + "time_per_iteration": 2.531602382659912 + }, + { + "auxiliary_loss_clip": 0.01094836, + "auxiliary_loss_mlp": 0.01035454, + "balance_loss_clip": 1.03813934, + "balance_loss_mlp": 1.02276397, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 1.6691807673468801, + "language_loss": 0.73818803, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.75949097, + "num_input_tokens_seen": 194516785, + "step": 9033, + "time_per_iteration": 4.019034147262573 + }, + { + "auxiliary_loss_clip": 0.01076812, + "auxiliary_loss_mlp": 0.01039901, + "balance_loss_clip": 1.04054165, + "balance_loss_mlp": 1.02657318, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 2.177149551833534, + "language_loss": 0.75962365, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.78079081, + "num_input_tokens_seen": 194536475, + "step": 9034, + "time_per_iteration": 2.608349323272705 + }, + { + "auxiliary_loss_clip": 0.01073814, + "auxiliary_loss_mlp": 0.01033606, + "balance_loss_clip": 1.04040384, + "balance_loss_mlp": 1.02194715, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.7314573153893202, + "language_loss": 0.84264791, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86372221, + "num_input_tokens_seen": 194554495, + "step": 9035, + "time_per_iteration": 2.597259521484375 + }, + { + "auxiliary_loss_clip": 0.01009304, + "auxiliary_loss_mlp": 0.0100764, + "balance_loss_clip": 1.01642275, + "balance_loss_mlp": 1.00638235, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7232411195063265, + "language_loss": 0.55878568, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57895517, + "num_input_tokens_seen": 194617620, + "step": 9036, + "time_per_iteration": 3.1327948570251465 + }, + { + "auxiliary_loss_clip": 0.01057429, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.0377841, + "balance_loss_mlp": 1.02063608, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.6156970382036415, + "language_loss": 0.75243801, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77334487, + "num_input_tokens_seen": 194637690, + "step": 9037, + "time_per_iteration": 3.9420387744903564 + }, + { + "auxiliary_loss_clip": 0.01091466, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.04248655, + "balance_loss_mlp": 1.02242565, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.846956549857097, + "language_loss": 0.66237342, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.68363965, + "num_input_tokens_seen": 194659520, + "step": 9038, + "time_per_iteration": 2.6579747200012207 + }, + { + "auxiliary_loss_clip": 0.01097383, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.03508306, + "balance_loss_mlp": 1.01845479, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.8885040058958604, + "language_loss": 0.78409743, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80536872, + "num_input_tokens_seen": 194677645, + "step": 9039, + "time_per_iteration": 2.491894483566284 + }, + { + "auxiliary_loss_clip": 0.01078009, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.03773236, + "balance_loss_mlp": 1.02285385, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.7568204511025118, + "language_loss": 0.76658249, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78771925, + "num_input_tokens_seen": 194697400, + "step": 9040, + "time_per_iteration": 3.959312677383423 + }, + { + "auxiliary_loss_clip": 0.01018391, + "auxiliary_loss_mlp": 0.01005324, + "balance_loss_clip": 1.01505589, + "balance_loss_mlp": 1.00409627, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6675623494882927, + "language_loss": 0.52433085, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54456806, + "num_input_tokens_seen": 194761205, + "step": 9041, + "time_per_iteration": 3.1104893684387207 + }, + { + "auxiliary_loss_clip": 0.01087213, + "auxiliary_loss_mlp": 0.01037125, + "balance_loss_clip": 1.03828955, + "balance_loss_mlp": 1.02455389, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.582556449652193, + "language_loss": 0.76215678, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78340018, + "num_input_tokens_seen": 194782445, + "step": 9042, + "time_per_iteration": 2.550203323364258 + }, + { + "auxiliary_loss_clip": 0.01074291, + "auxiliary_loss_mlp": 0.01028118, + "balance_loss_clip": 1.0361079, + "balance_loss_mlp": 1.01644766, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.6194063047556018, + "language_loss": 0.67279786, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69382197, + "num_input_tokens_seen": 194800325, + "step": 9043, + "time_per_iteration": 2.5432426929473877 + }, + { + "auxiliary_loss_clip": 0.01073419, + "auxiliary_loss_mlp": 0.01027417, + "balance_loss_clip": 1.03991759, + "balance_loss_mlp": 1.01541281, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.7008843979854196, + "language_loss": 0.84020931, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86121762, + "num_input_tokens_seen": 194818675, + "step": 9044, + "time_per_iteration": 4.035784721374512 + }, + { + "auxiliary_loss_clip": 0.01115294, + "auxiliary_loss_mlp": 0.01027595, + "balance_loss_clip": 1.03845739, + "balance_loss_mlp": 1.01467872, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 1.6251611139407796, + "language_loss": 0.62207472, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.64350361, + "num_input_tokens_seen": 194836595, + "step": 9045, + "time_per_iteration": 2.480785369873047 + }, + { + "auxiliary_loss_clip": 0.01112877, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.03929281, + "balance_loss_mlp": 1.02018631, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.506588028997212, + "language_loss": 0.70023298, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72168541, + "num_input_tokens_seen": 194857520, + "step": 9046, + "time_per_iteration": 2.4960861206054688 + }, + { + "auxiliary_loss_clip": 0.01107879, + "auxiliary_loss_mlp": 0.01029237, + "balance_loss_clip": 1.03652823, + "balance_loss_mlp": 1.01769757, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 1.6363992819873512, + "language_loss": 0.76999432, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.7913655, + "num_input_tokens_seen": 194876020, + "step": 9047, + "time_per_iteration": 2.4601383209228516 + }, + { + "auxiliary_loss_clip": 0.01086822, + "auxiliary_loss_mlp": 0.01039641, + "balance_loss_clip": 1.03717864, + "balance_loss_mlp": 1.02594948, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.7753024210787685, + "language_loss": 0.73034161, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.75160623, + "num_input_tokens_seen": 194894650, + "step": 9048, + "time_per_iteration": 2.4942758083343506 + }, + { + "auxiliary_loss_clip": 0.01064068, + "auxiliary_loss_mlp": 0.01039026, + "balance_loss_clip": 1.03591108, + "balance_loss_mlp": 1.02441108, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 2.1115039067885095, + "language_loss": 0.93189508, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95292598, + "num_input_tokens_seen": 194911935, + "step": 9049, + "time_per_iteration": 2.5587680339813232 + }, + { + "auxiliary_loss_clip": 0.0109937, + "auxiliary_loss_mlp": 0.01030508, + "balance_loss_clip": 1.03740788, + "balance_loss_mlp": 1.01909339, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.779089241638626, + "language_loss": 0.74020743, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.76150626, + "num_input_tokens_seen": 194931620, + "step": 9050, + "time_per_iteration": 2.522371530532837 + }, + { + "auxiliary_loss_clip": 0.01102681, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.03944254, + "balance_loss_mlp": 1.0156126, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 1.811214799253383, + "language_loss": 0.66952026, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69082248, + "num_input_tokens_seen": 194952560, + "step": 9051, + "time_per_iteration": 2.526942253112793 + }, + { + "auxiliary_loss_clip": 0.01073427, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.03525507, + "balance_loss_mlp": 1.02013171, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 1.7337497313243895, + "language_loss": 0.67624438, + "learning_rate": 1.810810185460011e-06, + "loss": 0.69729871, + "num_input_tokens_seen": 194973915, + "step": 9052, + "time_per_iteration": 2.628087043762207 + }, + { + "auxiliary_loss_clip": 0.01112482, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.03801537, + "balance_loss_mlp": 1.02061915, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.8082724514407518, + "language_loss": 0.9288038, + "learning_rate": 1.810422473773436e-06, + "loss": 0.95026153, + "num_input_tokens_seen": 194990170, + "step": 9053, + "time_per_iteration": 2.4998860359191895 + }, + { + "auxiliary_loss_clip": 0.0109381, + "auxiliary_loss_mlp": 0.01034754, + "balance_loss_clip": 1.03827691, + "balance_loss_mlp": 1.02208853, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 8.638178631778084, + "language_loss": 0.83121467, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85250038, + "num_input_tokens_seen": 195006395, + "step": 9054, + "time_per_iteration": 2.5083248615264893 + }, + { + "auxiliary_loss_clip": 0.01090456, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.0386256, + "balance_loss_mlp": 1.01908398, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 1.9685317471784334, + "language_loss": 0.68473017, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.70595455, + "num_input_tokens_seen": 195025080, + "step": 9055, + "time_per_iteration": 2.5200605392456055 + }, + { + "auxiliary_loss_clip": 0.0100702, + "auxiliary_loss_mlp": 0.01004894, + "balance_loss_clip": 1.01095891, + "balance_loss_mlp": 1.00358891, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.725917733276565, + "language_loss": 0.57704818, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59716731, + "num_input_tokens_seen": 195085725, + "step": 9056, + "time_per_iteration": 3.1435201168060303 + }, + { + "auxiliary_loss_clip": 0.01076636, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.04070544, + "balance_loss_mlp": 1.01867795, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 1.8917457824076311, + "language_loss": 0.69350058, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71457767, + "num_input_tokens_seen": 195102585, + "step": 9057, + "time_per_iteration": 2.520031690597534 + }, + { + "auxiliary_loss_clip": 0.01099148, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.03764343, + "balance_loss_mlp": 1.01890731, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 2.155498460602353, + "language_loss": 0.75044382, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77174878, + "num_input_tokens_seen": 195120055, + "step": 9058, + "time_per_iteration": 2.553114891052246 + }, + { + "auxiliary_loss_clip": 0.0101676, + "auxiliary_loss_mlp": 0.01009968, + "balance_loss_clip": 1.02650714, + "balance_loss_mlp": 1.00858498, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.8048550723931328, + "language_loss": 0.62723708, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64750433, + "num_input_tokens_seen": 195181045, + "step": 9059, + "time_per_iteration": 3.257617473602295 + }, + { + "auxiliary_loss_clip": 0.0109825, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.03770018, + "balance_loss_mlp": 1.02048886, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.7657786315333013, + "language_loss": 0.7965287, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81783557, + "num_input_tokens_seen": 195198840, + "step": 9060, + "time_per_iteration": 2.473949432373047 + }, + { + "auxiliary_loss_clip": 0.01101269, + "auxiliary_loss_mlp": 0.01033198, + "balance_loss_clip": 1.03676617, + "balance_loss_mlp": 1.02067566, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.5700885608539095, + "language_loss": 0.7956872, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.81703186, + "num_input_tokens_seen": 195218720, + "step": 9061, + "time_per_iteration": 2.528481960296631 + }, + { + "auxiliary_loss_clip": 0.01097665, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.03706169, + "balance_loss_mlp": 1.01525819, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.6945780076340198, + "language_loss": 0.87041277, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89166129, + "num_input_tokens_seen": 195235770, + "step": 9062, + "time_per_iteration": 2.4849300384521484 + }, + { + "auxiliary_loss_clip": 0.01091394, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.03740311, + "balance_loss_mlp": 1.01781321, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.71200442947359, + "language_loss": 0.82353669, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84477818, + "num_input_tokens_seen": 195254870, + "step": 9063, + "time_per_iteration": 2.5113160610198975 + }, + { + "auxiliary_loss_clip": 0.01109921, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.0360496, + "balance_loss_mlp": 1.01675189, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 1.720453524271078, + "language_loss": 0.63998973, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.66138816, + "num_input_tokens_seen": 195273390, + "step": 9064, + "time_per_iteration": 2.4803221225738525 + }, + { + "auxiliary_loss_clip": 0.01111928, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.03676772, + "balance_loss_mlp": 1.01988244, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.837266156051042, + "language_loss": 0.79722536, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.81867349, + "num_input_tokens_seen": 195295635, + "step": 9065, + "time_per_iteration": 2.5089051723480225 + }, + { + "auxiliary_loss_clip": 0.01073946, + "auxiliary_loss_mlp": 0.01028961, + "balance_loss_clip": 1.03595042, + "balance_loss_mlp": 1.01814842, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 2.1605137285326292, + "language_loss": 0.78284389, + "learning_rate": 1.805382881379827e-06, + "loss": 0.803873, + "num_input_tokens_seen": 195312545, + "step": 9066, + "time_per_iteration": 2.5532727241516113 + }, + { + "auxiliary_loss_clip": 0.01104484, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.03657341, + "balance_loss_mlp": 1.01893413, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 1.776655665004028, + "language_loss": 0.75673556, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.77810019, + "num_input_tokens_seen": 195332955, + "step": 9067, + "time_per_iteration": 2.5351815223693848 + }, + { + "auxiliary_loss_clip": 0.01081281, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.04158652, + "balance_loss_mlp": 1.01923537, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 2.12782606443098, + "language_loss": 0.63100278, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65215826, + "num_input_tokens_seen": 195355930, + "step": 9068, + "time_per_iteration": 2.7010996341705322 + }, + { + "auxiliary_loss_clip": 0.01077001, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.04055059, + "balance_loss_mlp": 1.02307236, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.5083720289963207, + "language_loss": 0.72082126, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74194419, + "num_input_tokens_seen": 195376445, + "step": 9069, + "time_per_iteration": 2.5882680416107178 + }, + { + "auxiliary_loss_clip": 0.01108921, + "auxiliary_loss_mlp": 0.01025729, + "balance_loss_clip": 1.03889608, + "balance_loss_mlp": 1.01461256, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 1.6896226298518662, + "language_loss": 0.73785514, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.75920165, + "num_input_tokens_seen": 195393725, + "step": 9070, + "time_per_iteration": 2.45109486579895 + }, + { + "auxiliary_loss_clip": 0.01098038, + "auxiliary_loss_mlp": 0.01028184, + "balance_loss_clip": 1.04171908, + "balance_loss_mlp": 1.01594758, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 1.9704054612314836, + "language_loss": 0.60602748, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.62728965, + "num_input_tokens_seen": 195411380, + "step": 9071, + "time_per_iteration": 3.912318229675293 + }, + { + "auxiliary_loss_clip": 0.01033132, + "auxiliary_loss_mlp": 0.01004421, + "balance_loss_clip": 1.00809467, + "balance_loss_mlp": 1.00327063, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.7015668975774855, + "language_loss": 0.57133102, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59170663, + "num_input_tokens_seen": 195482015, + "step": 9072, + "time_per_iteration": 3.173682689666748 + }, + { + "auxiliary_loss_clip": 0.01081162, + "auxiliary_loss_mlp": 0.01033772, + "balance_loss_clip": 1.0347743, + "balance_loss_mlp": 1.02060509, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1.655506888137964, + "language_loss": 0.70027244, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72142178, + "num_input_tokens_seen": 195500440, + "step": 9073, + "time_per_iteration": 2.527085304260254 + }, + { + "auxiliary_loss_clip": 0.01087558, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.03680432, + "balance_loss_mlp": 1.01874352, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 2.767510351363698, + "language_loss": 0.71530104, + "learning_rate": 1.802282211606627e-06, + "loss": 0.7364797, + "num_input_tokens_seen": 195520860, + "step": 9074, + "time_per_iteration": 2.5396437644958496 + }, + { + "auxiliary_loss_clip": 0.01096946, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.03580821, + "balance_loss_mlp": 1.02647316, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 3.732793104052465, + "language_loss": 0.68383515, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70518792, + "num_input_tokens_seen": 195538615, + "step": 9075, + "time_per_iteration": 2.484367847442627 + }, + { + "auxiliary_loss_clip": 0.01097827, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.03685379, + "balance_loss_mlp": 1.01943028, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 1.6545992355371972, + "language_loss": 0.81085992, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.83214217, + "num_input_tokens_seen": 195557460, + "step": 9076, + "time_per_iteration": 3.837646007537842 + }, + { + "auxiliary_loss_clip": 0.011008, + "auxiliary_loss_mlp": 0.01029899, + "balance_loss_clip": 1.03743911, + "balance_loss_mlp": 1.01769781, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.7367395153774532, + "language_loss": 0.80189443, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82320142, + "num_input_tokens_seen": 195577985, + "step": 9077, + "time_per_iteration": 2.5006985664367676 + }, + { + "auxiliary_loss_clip": 0.01101663, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.03645563, + "balance_loss_mlp": 1.0189817, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 1.8416388927923766, + "language_loss": 0.67578876, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.69711298, + "num_input_tokens_seen": 195597620, + "step": 9078, + "time_per_iteration": 2.526109218597412 + }, + { + "auxiliary_loss_clip": 0.01103343, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.03761232, + "balance_loss_mlp": 1.02015388, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 1.7697433132597218, + "language_loss": 0.81552184, + "learning_rate": 1.800344536188764e-06, + "loss": 0.83688533, + "num_input_tokens_seen": 195615910, + "step": 9079, + "time_per_iteration": 3.893725872039795 + }, + { + "auxiliary_loss_clip": 0.01116028, + "auxiliary_loss_mlp": 0.01031469, + "balance_loss_clip": 1.03879833, + "balance_loss_mlp": 1.0177722, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 1.5955631509530896, + "language_loss": 0.75598013, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77745509, + "num_input_tokens_seen": 195635620, + "step": 9080, + "time_per_iteration": 2.4910190105438232 + }, + { + "auxiliary_loss_clip": 0.01075057, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.03797495, + "balance_loss_mlp": 1.01907086, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 2.4386449623215505, + "language_loss": 0.83679944, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85787123, + "num_input_tokens_seen": 195652495, + "step": 9081, + "time_per_iteration": 2.5730223655700684 + }, + { + "auxiliary_loss_clip": 0.01115211, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.03886223, + "balance_loss_mlp": 1.01730824, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.6754877599203015, + "language_loss": 0.70361197, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.72506714, + "num_input_tokens_seen": 195671965, + "step": 9082, + "time_per_iteration": 2.4609599113464355 + }, + { + "auxiliary_loss_clip": 0.01107181, + "auxiliary_loss_mlp": 0.01025787, + "balance_loss_clip": 1.03598082, + "balance_loss_mlp": 1.01358604, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.607551248324354, + "language_loss": 0.66585368, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68718338, + "num_input_tokens_seen": 195694725, + "step": 9083, + "time_per_iteration": 3.99273943901062 + }, + { + "auxiliary_loss_clip": 0.01088033, + "auxiliary_loss_mlp": 0.01027548, + "balance_loss_clip": 1.04325294, + "balance_loss_mlp": 1.01595497, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.6296280274723032, + "language_loss": 0.78946543, + "learning_rate": 1.798407050044766e-06, + "loss": 0.81062132, + "num_input_tokens_seen": 195714090, + "step": 9084, + "time_per_iteration": 2.571420907974243 + }, + { + "auxiliary_loss_clip": 0.01104108, + "auxiliary_loss_mlp": 0.01029718, + "balance_loss_clip": 1.04017723, + "balance_loss_mlp": 1.01782095, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 1.728308910760671, + "language_loss": 0.7531783, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77451652, + "num_input_tokens_seen": 195733585, + "step": 9085, + "time_per_iteration": 2.507934093475342 + }, + { + "auxiliary_loss_clip": 0.01088993, + "auxiliary_loss_mlp": 0.01029241, + "balance_loss_clip": 1.03604543, + "balance_loss_mlp": 1.01679015, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 2.5779307180765554, + "language_loss": 0.74743629, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.76861864, + "num_input_tokens_seen": 195752820, + "step": 9086, + "time_per_iteration": 2.5553877353668213 + }, + { + "auxiliary_loss_clip": 0.01099068, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.03991938, + "balance_loss_mlp": 1.0166111, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.558258750295256, + "language_loss": 0.7683959, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.78967798, + "num_input_tokens_seen": 195773740, + "step": 9087, + "time_per_iteration": 2.548981189727783 + }, + { + "auxiliary_loss_clip": 0.01104792, + "auxiliary_loss_mlp": 0.01038875, + "balance_loss_clip": 1.03964388, + "balance_loss_mlp": 1.02513599, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.6925034344298395, + "language_loss": 0.77573633, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79717302, + "num_input_tokens_seen": 195792125, + "step": 9088, + "time_per_iteration": 2.4666686058044434 + }, + { + "auxiliary_loss_clip": 0.00990068, + "auxiliary_loss_mlp": 0.01001345, + "balance_loss_clip": 1.02576923, + "balance_loss_mlp": 1.00003326, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 1.0024909420216783, + "language_loss": 0.57715523, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59706938, + "num_input_tokens_seen": 195854935, + "step": 9089, + "time_per_iteration": 3.314000368118286 + }, + { + "auxiliary_loss_clip": 0.01077028, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.03647375, + "balance_loss_mlp": 1.01991498, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.8946113531166529, + "language_loss": 0.76899409, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.79009664, + "num_input_tokens_seen": 195874715, + "step": 9090, + "time_per_iteration": 2.8511955738067627 + }, + { + "auxiliary_loss_clip": 0.01097919, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.03663933, + "balance_loss_mlp": 1.0203495, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 1.89859236567917, + "language_loss": 0.73386395, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.75518638, + "num_input_tokens_seen": 195892610, + "step": 9091, + "time_per_iteration": 2.535501480102539 + }, + { + "auxiliary_loss_clip": 0.0109239, + "auxiliary_loss_mlp": 0.01033449, + "balance_loss_clip": 1.03855824, + "balance_loss_mlp": 1.02080703, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 2.2432237533814745, + "language_loss": 0.77975792, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.80101633, + "num_input_tokens_seen": 195911085, + "step": 9092, + "time_per_iteration": 2.524381160736084 + }, + { + "auxiliary_loss_clip": 0.01114126, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.03911471, + "balance_loss_mlp": 1.01678646, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 2.5560811533909646, + "language_loss": 0.74815691, + "learning_rate": 1.794920057818476e-06, + "loss": 0.76959455, + "num_input_tokens_seen": 195929845, + "step": 9093, + "time_per_iteration": 2.4311556816101074 + }, + { + "auxiliary_loss_clip": 0.01102788, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.03676414, + "balance_loss_mlp": 1.02067304, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 4.632504497341471, + "language_loss": 0.69262433, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.71400213, + "num_input_tokens_seen": 195946350, + "step": 9094, + "time_per_iteration": 2.457397699356079 + }, + { + "auxiliary_loss_clip": 0.01090582, + "auxiliary_loss_mlp": 0.01029615, + "balance_loss_clip": 1.0390532, + "balance_loss_mlp": 1.01824832, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 3.5106181196736723, + "language_loss": 0.67821211, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.69941413, + "num_input_tokens_seen": 195959840, + "step": 9095, + "time_per_iteration": 2.5091392993927 + }, + { + "auxiliary_loss_clip": 0.01080213, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.04425609, + "balance_loss_mlp": 1.01983821, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.5026796906758875, + "language_loss": 0.66413796, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.68525803, + "num_input_tokens_seen": 195981125, + "step": 9096, + "time_per_iteration": 2.6219098567962646 + }, + { + "auxiliary_loss_clip": 0.01010715, + "auxiliary_loss_mlp": 0.01003331, + "balance_loss_clip": 1.01650739, + "balance_loss_mlp": 1.0020256, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7377101599122364, + "language_loss": 0.57526577, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59540617, + "num_input_tokens_seen": 196038880, + "step": 9097, + "time_per_iteration": 3.251030921936035 + }, + { + "auxiliary_loss_clip": 0.01040347, + "auxiliary_loss_mlp": 0.01004357, + "balance_loss_clip": 1.02954769, + "balance_loss_mlp": 1.00291431, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9562805267724301, + "language_loss": 0.64785743, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66830444, + "num_input_tokens_seen": 196099215, + "step": 9098, + "time_per_iteration": 3.0809147357940674 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01036154, + "balance_loss_clip": 1.03881073, + "balance_loss_mlp": 1.02352405, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 1.7879110172769614, + "language_loss": 0.73245716, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75386918, + "num_input_tokens_seen": 196120370, + "step": 9099, + "time_per_iteration": 2.523378610610962 + }, + { + "auxiliary_loss_clip": 0.01086199, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.0370183, + "balance_loss_mlp": 1.01882052, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.9865718273102215, + "language_loss": 0.72587025, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74702871, + "num_input_tokens_seen": 196139075, + "step": 9100, + "time_per_iteration": 2.5985329151153564 + }, + { + "auxiliary_loss_clip": 0.01098067, + "auxiliary_loss_mlp": 0.00786434, + "balance_loss_clip": 1.04210234, + "balance_loss_mlp": 1.01205826, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 1.8013064486118635, + "language_loss": 0.67988044, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.69872546, + "num_input_tokens_seen": 196159990, + "step": 9101, + "time_per_iteration": 2.642435073852539 + }, + { + "auxiliary_loss_clip": 0.0110972, + "auxiliary_loss_mlp": 0.01027901, + "balance_loss_clip": 1.03669512, + "balance_loss_mlp": 1.0157299, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 3.139420862311445, + "language_loss": 0.77886522, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80024147, + "num_input_tokens_seen": 196180570, + "step": 9102, + "time_per_iteration": 2.508272647857666 + }, + { + "auxiliary_loss_clip": 0.010769, + "auxiliary_loss_mlp": 0.01031768, + "balance_loss_clip": 1.03672791, + "balance_loss_mlp": 1.01903653, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.556925876717397, + "language_loss": 0.72307342, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74416006, + "num_input_tokens_seen": 196200300, + "step": 9103, + "time_per_iteration": 2.5973784923553467 + }, + { + "auxiliary_loss_clip": 0.01086247, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.03858757, + "balance_loss_mlp": 1.01597059, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.292142755368098, + "language_loss": 0.65295285, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.67409533, + "num_input_tokens_seen": 196228525, + "step": 9104, + "time_per_iteration": 2.8923470973968506 + }, + { + "auxiliary_loss_clip": 0.0109803, + "auxiliary_loss_mlp": 0.0102948, + "balance_loss_clip": 1.03963041, + "balance_loss_mlp": 1.01650369, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 3.1867569241726272, + "language_loss": 0.81474429, + "learning_rate": 1.790271716558888e-06, + "loss": 0.8360194, + "num_input_tokens_seen": 196247690, + "step": 9105, + "time_per_iteration": 2.485377788543701 + }, + { + "auxiliary_loss_clip": 0.01108327, + "auxiliary_loss_mlp": 0.01027042, + "balance_loss_clip": 1.03625906, + "balance_loss_mlp": 1.01566982, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.4552425807266383, + "language_loss": 0.8021878, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.8235414, + "num_input_tokens_seen": 196268555, + "step": 9106, + "time_per_iteration": 2.5222840309143066 + }, + { + "auxiliary_loss_clip": 0.01100481, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.0372858, + "balance_loss_mlp": 1.02020562, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 1.7849866643417949, + "language_loss": 0.69666106, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71798426, + "num_input_tokens_seen": 196285585, + "step": 9107, + "time_per_iteration": 2.4588370323181152 + }, + { + "auxiliary_loss_clip": 0.01101865, + "auxiliary_loss_mlp": 0.01028987, + "balance_loss_clip": 1.03543878, + "balance_loss_mlp": 1.01682186, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 1.6988179954113647, + "language_loss": 0.63101363, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65232205, + "num_input_tokens_seen": 196305085, + "step": 9108, + "time_per_iteration": 2.4863169193267822 + }, + { + "auxiliary_loss_clip": 0.01107632, + "auxiliary_loss_mlp": 0.01027908, + "balance_loss_clip": 1.03548121, + "balance_loss_mlp": 1.01653576, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.6618998228949053, + "language_loss": 0.75062859, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77198404, + "num_input_tokens_seen": 196323945, + "step": 9109, + "time_per_iteration": 3.852219343185425 + }, + { + "auxiliary_loss_clip": 0.01089465, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.03771591, + "balance_loss_mlp": 1.01852608, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.988479209665081, + "language_loss": 0.77645838, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.797665, + "num_input_tokens_seen": 196342200, + "step": 9110, + "time_per_iteration": 2.497479200363159 + }, + { + "auxiliary_loss_clip": 0.01094623, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.03592563, + "balance_loss_mlp": 1.01904297, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.4745311331084272, + "language_loss": 0.71200025, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73324811, + "num_input_tokens_seen": 196362940, + "step": 9111, + "time_per_iteration": 2.5354013442993164 + }, + { + "auxiliary_loss_clip": 0.010992, + "auxiliary_loss_mlp": 0.01034125, + "balance_loss_clip": 1.03639507, + "balance_loss_mlp": 1.02201366, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 1.5779325335810526, + "language_loss": 0.71172303, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73305631, + "num_input_tokens_seen": 196383070, + "step": 9112, + "time_per_iteration": 2.4987633228302 + }, + { + "auxiliary_loss_clip": 0.0106972, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.04203367, + "balance_loss_mlp": 1.01595712, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 2.201731698176773, + "language_loss": 0.88258183, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.90355986, + "num_input_tokens_seen": 196398485, + "step": 9113, + "time_per_iteration": 2.5496573448181152 + }, + { + "auxiliary_loss_clip": 0.01061169, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.03865457, + "balance_loss_mlp": 1.01780725, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 1.7496574520780568, + "language_loss": 0.72767347, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.74858958, + "num_input_tokens_seen": 196417725, + "step": 9114, + "time_per_iteration": 2.633417844772339 + }, + { + "auxiliary_loss_clip": 0.01084555, + "auxiliary_loss_mlp": 0.00786272, + "balance_loss_clip": 1.03569877, + "balance_loss_mlp": 1.01198792, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 1.6240438191481192, + "language_loss": 0.72103494, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.73974323, + "num_input_tokens_seen": 196437840, + "step": 9115, + "time_per_iteration": 3.8979034423828125 + }, + { + "auxiliary_loss_clip": 0.01075949, + "auxiliary_loss_mlp": 0.00789128, + "balance_loss_clip": 1.0352149, + "balance_loss_mlp": 1.01042271, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 1.630516091270635, + "language_loss": 0.72263342, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.74128413, + "num_input_tokens_seen": 196457300, + "step": 9116, + "time_per_iteration": 2.5679996013641357 + }, + { + "auxiliary_loss_clip": 0.01088092, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.04064107, + "balance_loss_mlp": 1.02082181, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 1.7591338751320926, + "language_loss": 0.76449931, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78571141, + "num_input_tokens_seen": 196476720, + "step": 9117, + "time_per_iteration": 2.5712339878082275 + }, + { + "auxiliary_loss_clip": 0.0106678, + "auxiliary_loss_mlp": 0.01026374, + "balance_loss_clip": 1.03618455, + "balance_loss_mlp": 1.01461434, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.7219317281507678, + "language_loss": 0.62723464, + "learning_rate": 1.785237306671674e-06, + "loss": 0.64816618, + "num_input_tokens_seen": 196496765, + "step": 9118, + "time_per_iteration": 4.037683010101318 + }, + { + "auxiliary_loss_clip": 0.01114479, + "auxiliary_loss_mlp": 0.01028156, + "balance_loss_clip": 1.0398767, + "balance_loss_mlp": 1.01498902, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 1.9021889402381704, + "language_loss": 0.79078496, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81221128, + "num_input_tokens_seen": 196516220, + "step": 9119, + "time_per_iteration": 2.4755282402038574 + }, + { + "auxiliary_loss_clip": 0.01086226, + "auxiliary_loss_mlp": 0.00783113, + "balance_loss_clip": 1.04251456, + "balance_loss_mlp": 1.00897145, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.952237986788984, + "language_loss": 0.82475519, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84344864, + "num_input_tokens_seen": 196533860, + "step": 9120, + "time_per_iteration": 2.570537567138672 + }, + { + "auxiliary_loss_clip": 0.01078023, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.0377388, + "balance_loss_mlp": 1.0195483, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 3.214409804996618, + "language_loss": 0.80613893, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82724071, + "num_input_tokens_seen": 196551305, + "step": 9121, + "time_per_iteration": 4.053008794784546 + }, + { + "auxiliary_loss_clip": 0.0106819, + "auxiliary_loss_mlp": 0.0103326, + "balance_loss_clip": 1.03818846, + "balance_loss_mlp": 1.02015328, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 1.720306074841834, + "language_loss": 0.60944057, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63045508, + "num_input_tokens_seen": 196569420, + "step": 9122, + "time_per_iteration": 2.6311779022216797 + }, + { + "auxiliary_loss_clip": 0.01090232, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.03752279, + "balance_loss_mlp": 1.02196038, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.7375714350613267, + "language_loss": 0.71759999, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.7388289, + "num_input_tokens_seen": 196590610, + "step": 9123, + "time_per_iteration": 2.5749564170837402 + }, + { + "auxiliary_loss_clip": 0.0110989, + "auxiliary_loss_mlp": 0.01028374, + "balance_loss_clip": 1.0371964, + "balance_loss_mlp": 1.01691175, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 2.9465734215715575, + "language_loss": 0.83138907, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.85277164, + "num_input_tokens_seen": 196606495, + "step": 9124, + "time_per_iteration": 2.4362239837646484 + }, + { + "auxiliary_loss_clip": 0.01087859, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.04143906, + "balance_loss_mlp": 1.01805115, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.5270193027120782, + "language_loss": 0.80174333, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82291889, + "num_input_tokens_seen": 196626365, + "step": 9125, + "time_per_iteration": 2.6047277450561523 + }, + { + "auxiliary_loss_clip": 0.01100805, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.03758359, + "balance_loss_mlp": 1.01884103, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 2.016757199992043, + "language_loss": 0.74316865, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.76448607, + "num_input_tokens_seen": 196644465, + "step": 9126, + "time_per_iteration": 2.4601564407348633 + }, + { + "auxiliary_loss_clip": 0.01099153, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.03665328, + "balance_loss_mlp": 1.0192014, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.2354961476678583, + "language_loss": 0.66772574, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.68904322, + "num_input_tokens_seen": 196659160, + "step": 9127, + "time_per_iteration": 2.470860004425049 + }, + { + "auxiliary_loss_clip": 0.01078002, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.03862715, + "balance_loss_mlp": 1.01978862, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.749484320955835, + "language_loss": 0.83326924, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85437846, + "num_input_tokens_seen": 196677410, + "step": 9128, + "time_per_iteration": 2.560222625732422 + }, + { + "auxiliary_loss_clip": 0.01071128, + "auxiliary_loss_mlp": 0.01029936, + "balance_loss_clip": 1.0409596, + "balance_loss_mlp": 1.01756835, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 2.1339805237501666, + "language_loss": 0.74420315, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76521379, + "num_input_tokens_seen": 196696765, + "step": 9129, + "time_per_iteration": 2.591024875640869 + }, + { + "auxiliary_loss_clip": 0.01073561, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.03966689, + "balance_loss_mlp": 1.01664734, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 2.98776978728015, + "language_loss": 0.6269443, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.64798003, + "num_input_tokens_seen": 196714895, + "step": 9130, + "time_per_iteration": 2.529120683670044 + }, + { + "auxiliary_loss_clip": 0.01112856, + "auxiliary_loss_mlp": 0.00786545, + "balance_loss_clip": 1.03765893, + "balance_loss_mlp": 1.01075208, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 1.7818070292069321, + "language_loss": 0.62715018, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.64614415, + "num_input_tokens_seen": 196735510, + "step": 9131, + "time_per_iteration": 2.5174167156219482 + }, + { + "auxiliary_loss_clip": 0.01101985, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.03668141, + "balance_loss_mlp": 1.01893258, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 2.164578285563406, + "language_loss": 0.7472018, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76854986, + "num_input_tokens_seen": 196752855, + "step": 9132, + "time_per_iteration": 2.4597842693328857 + }, + { + "auxiliary_loss_clip": 0.01097992, + "auxiliary_loss_mlp": 0.01026874, + "balance_loss_clip": 1.03401458, + "balance_loss_mlp": 1.0151732, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.555481337925201, + "language_loss": 0.8123821, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83363074, + "num_input_tokens_seen": 196772230, + "step": 9133, + "time_per_iteration": 2.5265960693359375 + }, + { + "auxiliary_loss_clip": 0.01086135, + "auxiliary_loss_mlp": 0.00784822, + "balance_loss_clip": 1.03546023, + "balance_loss_mlp": 1.00997472, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 2.032429761792675, + "language_loss": 0.7009443, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.71965384, + "num_input_tokens_seen": 196790405, + "step": 9134, + "time_per_iteration": 2.509330987930298 + }, + { + "auxiliary_loss_clip": 0.01083527, + "auxiliary_loss_mlp": 0.01031705, + "balance_loss_clip": 1.03638935, + "balance_loss_mlp": 1.0198679, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 2.3434157482377524, + "language_loss": 0.61746079, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.6386131, + "num_input_tokens_seen": 196813785, + "step": 9135, + "time_per_iteration": 2.8207311630249023 + }, + { + "auxiliary_loss_clip": 0.01101433, + "auxiliary_loss_mlp": 0.0102837, + "balance_loss_clip": 1.03707528, + "balance_loss_mlp": 1.01509643, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 1.9605175986547425, + "language_loss": 0.7257266, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74702466, + "num_input_tokens_seen": 196834390, + "step": 9136, + "time_per_iteration": 2.5177831649780273 + }, + { + "auxiliary_loss_clip": 0.01052819, + "auxiliary_loss_mlp": 0.01033173, + "balance_loss_clip": 1.04112053, + "balance_loss_mlp": 1.0194577, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 2.1261136285034543, + "language_loss": 0.67374837, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.69460833, + "num_input_tokens_seen": 196853290, + "step": 9137, + "time_per_iteration": 2.640841245651245 + }, + { + "auxiliary_loss_clip": 0.01034908, + "auxiliary_loss_mlp": 0.01002308, + "balance_loss_clip": 1.01946437, + "balance_loss_mlp": 1.00115192, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.740875386583104, + "language_loss": 0.65232223, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67269439, + "num_input_tokens_seen": 196913120, + "step": 9138, + "time_per_iteration": 3.1284584999084473 + }, + { + "auxiliary_loss_clip": 0.01100065, + "auxiliary_loss_mlp": 0.0102861, + "balance_loss_clip": 1.03799915, + "balance_loss_mlp": 1.01606345, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 2.3287675391998355, + "language_loss": 0.74886763, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.77015442, + "num_input_tokens_seen": 196931530, + "step": 9139, + "time_per_iteration": 2.500924825668335 + }, + { + "auxiliary_loss_clip": 0.01100264, + "auxiliary_loss_mlp": 0.01026412, + "balance_loss_clip": 1.0370723, + "balance_loss_mlp": 1.01418126, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.7920903742566054, + "language_loss": 0.71093786, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73220462, + "num_input_tokens_seen": 196949430, + "step": 9140, + "time_per_iteration": 2.4827287197113037 + }, + { + "auxiliary_loss_clip": 0.01083584, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.03614902, + "balance_loss_mlp": 1.02045166, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 1.6577703854096209, + "language_loss": 0.76387066, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78503931, + "num_input_tokens_seen": 196968265, + "step": 9141, + "time_per_iteration": 2.5839693546295166 + }, + { + "auxiliary_loss_clip": 0.01074439, + "auxiliary_loss_mlp": 0.01028393, + "balance_loss_clip": 1.03862727, + "balance_loss_mlp": 1.01675797, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 1.9563277771403476, + "language_loss": 0.75034821, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.77137649, + "num_input_tokens_seen": 196984930, + "step": 9142, + "time_per_iteration": 2.5676512718200684 + }, + { + "auxiliary_loss_clip": 0.01095419, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.03950453, + "balance_loss_mlp": 1.01739311, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 1.9379801582030713, + "language_loss": 0.76630896, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78757054, + "num_input_tokens_seen": 197002320, + "step": 9143, + "time_per_iteration": 2.544015407562256 + }, + { + "auxiliary_loss_clip": 0.01081911, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.03471541, + "balance_loss_mlp": 1.02033782, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 4.099580023777893, + "language_loss": 0.80330348, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.82445389, + "num_input_tokens_seen": 197020825, + "step": 9144, + "time_per_iteration": 2.489915370941162 + }, + { + "auxiliary_loss_clip": 0.01101648, + "auxiliary_loss_mlp": 0.01027099, + "balance_loss_clip": 1.03811932, + "balance_loss_mlp": 1.01513064, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 1.8185254524841954, + "language_loss": 0.71269548, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73398298, + "num_input_tokens_seen": 197040450, + "step": 9145, + "time_per_iteration": 2.560147523880005 + }, + { + "auxiliary_loss_clip": 0.01100681, + "auxiliary_loss_mlp": 0.01027319, + "balance_loss_clip": 1.03704405, + "balance_loss_mlp": 1.01570237, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.760744688100453, + "language_loss": 0.7038452, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72512519, + "num_input_tokens_seen": 197063930, + "step": 9146, + "time_per_iteration": 2.6060314178466797 + }, + { + "auxiliary_loss_clip": 0.01084148, + "auxiliary_loss_mlp": 0.0102818, + "balance_loss_clip": 1.03428125, + "balance_loss_mlp": 1.01687932, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.824634066142312, + "language_loss": 0.64010239, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66122568, + "num_input_tokens_seen": 197082660, + "step": 9147, + "time_per_iteration": 3.919830083847046 + }, + { + "auxiliary_loss_clip": 0.01111746, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.03857374, + "balance_loss_mlp": 1.01932681, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 2.0296121353461873, + "language_loss": 0.8074317, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.82886255, + "num_input_tokens_seen": 197100675, + "step": 9148, + "time_per_iteration": 2.473344087600708 + }, + { + "auxiliary_loss_clip": 0.01089832, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.03807521, + "balance_loss_mlp": 1.02421677, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.7689018046858889, + "language_loss": 0.79149961, + "learning_rate": 1.773237789559453e-06, + "loss": 0.81275916, + "num_input_tokens_seen": 197121320, + "step": 9149, + "time_per_iteration": 2.5751101970672607 + }, + { + "auxiliary_loss_clip": 0.01075527, + "auxiliary_loss_mlp": 0.01027371, + "balance_loss_clip": 1.03975415, + "balance_loss_mlp": 1.0151639, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 2.0292330639041136, + "language_loss": 0.71273017, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.73375916, + "num_input_tokens_seen": 197138965, + "step": 9150, + "time_per_iteration": 2.559328317642212 + }, + { + "auxiliary_loss_clip": 0.01090551, + "auxiliary_loss_mlp": 0.01027186, + "balance_loss_clip": 1.03465724, + "balance_loss_mlp": 1.01414502, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 2.1607337095503167, + "language_loss": 0.74963742, + "learning_rate": 1.772463906245477e-06, + "loss": 0.77081478, + "num_input_tokens_seen": 197156460, + "step": 9151, + "time_per_iteration": 2.5070278644561768 + }, + { + "auxiliary_loss_clip": 0.0108567, + "auxiliary_loss_mlp": 0.01026133, + "balance_loss_clip": 1.03946686, + "balance_loss_mlp": 1.01476669, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 2.319543398417275, + "language_loss": 0.76117802, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78229606, + "num_input_tokens_seen": 197175140, + "step": 9152, + "time_per_iteration": 2.5324277877807617 + }, + { + "auxiliary_loss_clip": 0.0108517, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.03517222, + "balance_loss_mlp": 1.02073014, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 2.486708010974758, + "language_loss": 0.82483357, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.8460077, + "num_input_tokens_seen": 197194345, + "step": 9153, + "time_per_iteration": 3.968703031539917 + }, + { + "auxiliary_loss_clip": 0.01097479, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.03706956, + "balance_loss_mlp": 1.01588094, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 1.815863827952672, + "language_loss": 0.74294901, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.7642051, + "num_input_tokens_seen": 197215535, + "step": 9154, + "time_per_iteration": 2.558206081390381 + }, + { + "auxiliary_loss_clip": 0.01093477, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.03595996, + "balance_loss_mlp": 1.0196718, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.6870966290325398, + "language_loss": 0.727575, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74883944, + "num_input_tokens_seen": 197234945, + "step": 9155, + "time_per_iteration": 2.5434789657592773 + }, + { + "auxiliary_loss_clip": 0.01028537, + "auxiliary_loss_mlp": 0.01006263, + "balance_loss_clip": 1.01329494, + "balance_loss_mlp": 1.00500488, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7559758096156651, + "language_loss": 0.55377138, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57411933, + "num_input_tokens_seen": 197302285, + "step": 9156, + "time_per_iteration": 3.228513717651367 + }, + { + "auxiliary_loss_clip": 0.01098842, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.03650784, + "balance_loss_mlp": 1.02044892, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.7116578908025295, + "language_loss": 0.82887399, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.85018468, + "num_input_tokens_seen": 197321575, + "step": 9157, + "time_per_iteration": 4.127148151397705 + }, + { + "auxiliary_loss_clip": 0.01115824, + "auxiliary_loss_mlp": 0.01031216, + "balance_loss_clip": 1.03888226, + "balance_loss_mlp": 1.01785862, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 3.8994073894539625, + "language_loss": 0.76044083, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.78191119, + "num_input_tokens_seen": 197340255, + "step": 9158, + "time_per_iteration": 2.5094587802886963 + }, + { + "auxiliary_loss_clip": 0.01068013, + "auxiliary_loss_mlp": 0.01026332, + "balance_loss_clip": 1.03470111, + "balance_loss_mlp": 1.01466155, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.8901565007315186, + "language_loss": 0.69496763, + "learning_rate": 1.769368719290979e-06, + "loss": 0.71591109, + "num_input_tokens_seen": 197360360, + "step": 9159, + "time_per_iteration": 2.62975811958313 + }, + { + "auxiliary_loss_clip": 0.01065368, + "auxiliary_loss_mlp": 0.00786873, + "balance_loss_clip": 1.03515744, + "balance_loss_mlp": 1.0104928, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 2.1811235040565333, + "language_loss": 0.67977452, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.6982969, + "num_input_tokens_seen": 197381905, + "step": 9160, + "time_per_iteration": 4.135736703872681 + }, + { + "auxiliary_loss_clip": 0.01108592, + "auxiliary_loss_mlp": 0.01027575, + "balance_loss_clip": 1.0374161, + "balance_loss_mlp": 1.01608956, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 1.895495249725565, + "language_loss": 0.71360892, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.73497057, + "num_input_tokens_seen": 197398555, + "step": 9161, + "time_per_iteration": 2.4529480934143066 + }, + { + "auxiliary_loss_clip": 0.01099521, + "auxiliary_loss_mlp": 0.01035047, + "balance_loss_clip": 1.03763628, + "balance_loss_mlp": 1.02306628, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 1.73705114642714, + "language_loss": 0.69468468, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71603036, + "num_input_tokens_seen": 197419630, + "step": 9162, + "time_per_iteration": 2.531546115875244 + }, + { + "auxiliary_loss_clip": 0.01108823, + "auxiliary_loss_mlp": 0.01032896, + "balance_loss_clip": 1.03627169, + "balance_loss_mlp": 1.02101684, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.6804727551668313, + "language_loss": 0.85837239, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87978959, + "num_input_tokens_seen": 197438480, + "step": 9163, + "time_per_iteration": 2.486949920654297 + }, + { + "auxiliary_loss_clip": 0.01079671, + "auxiliary_loss_mlp": 0.01029241, + "balance_loss_clip": 1.03816891, + "balance_loss_mlp": 1.01718926, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 1.7238811900965705, + "language_loss": 0.80695719, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.82804632, + "num_input_tokens_seen": 197456755, + "step": 9164, + "time_per_iteration": 2.5517404079437256 + }, + { + "auxiliary_loss_clip": 0.01093426, + "auxiliary_loss_mlp": 0.01025818, + "balance_loss_clip": 1.0404768, + "balance_loss_mlp": 1.01342618, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 2.25742835822819, + "language_loss": 0.73406327, + "learning_rate": 1.767047695977863e-06, + "loss": 0.7552557, + "num_input_tokens_seen": 197475530, + "step": 9165, + "time_per_iteration": 2.557344675064087 + }, + { + "auxiliary_loss_clip": 0.01092023, + "auxiliary_loss_mlp": 0.01031013, + "balance_loss_clip": 1.03421283, + "balance_loss_mlp": 1.01835263, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 2.151564440507071, + "language_loss": 0.7892493, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81047964, + "num_input_tokens_seen": 197490835, + "step": 9166, + "time_per_iteration": 2.480553388595581 + }, + { + "auxiliary_loss_clip": 0.01079158, + "auxiliary_loss_mlp": 0.01025847, + "balance_loss_clip": 1.03541327, + "balance_loss_mlp": 1.01371157, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.216688604592005, + "language_loss": 0.76407039, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78512049, + "num_input_tokens_seen": 197508770, + "step": 9167, + "time_per_iteration": 2.5474066734313965 + }, + { + "auxiliary_loss_clip": 0.01098354, + "auxiliary_loss_mlp": 0.01027353, + "balance_loss_clip": 1.03647351, + "balance_loss_mlp": 1.0146811, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 2.207078315857769, + "language_loss": 0.80298448, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.82424152, + "num_input_tokens_seen": 197527340, + "step": 9168, + "time_per_iteration": 2.477987289428711 + }, + { + "auxiliary_loss_clip": 0.01101744, + "auxiliary_loss_mlp": 0.01033624, + "balance_loss_clip": 1.03748608, + "balance_loss_mlp": 1.0216316, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.7487305953637575, + "language_loss": 0.69116163, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.71251535, + "num_input_tokens_seen": 197547280, + "step": 9169, + "time_per_iteration": 2.5438039302825928 + }, + { + "auxiliary_loss_clip": 0.01092903, + "auxiliary_loss_mlp": 0.01023757, + "balance_loss_clip": 1.03344059, + "balance_loss_mlp": 1.01267052, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 2.223150093606078, + "language_loss": 0.85670787, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87787449, + "num_input_tokens_seen": 197565045, + "step": 9170, + "time_per_iteration": 2.4929566383361816 + }, + { + "auxiliary_loss_clip": 0.01025906, + "auxiliary_loss_mlp": 0.01006178, + "balance_loss_clip": 1.02775908, + "balance_loss_mlp": 1.0045805, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7844996675351996, + "language_loss": 0.59902245, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.61934328, + "num_input_tokens_seen": 197625005, + "step": 9171, + "time_per_iteration": 3.170624017715454 + }, + { + "auxiliary_loss_clip": 0.01074488, + "auxiliary_loss_mlp": 0.01033903, + "balance_loss_clip": 1.03444338, + "balance_loss_mlp": 1.02138019, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.438863379179184, + "language_loss": 0.70347774, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72456169, + "num_input_tokens_seen": 197645050, + "step": 9172, + "time_per_iteration": 2.5470964908599854 + }, + { + "auxiliary_loss_clip": 0.0110833, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.0356425, + "balance_loss_mlp": 1.02043366, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 3.0715803079433814, + "language_loss": 0.76043999, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.7818498, + "num_input_tokens_seen": 197663910, + "step": 9173, + "time_per_iteration": 2.489335536956787 + }, + { + "auxiliary_loss_clip": 0.010726, + "auxiliary_loss_mlp": 0.01029931, + "balance_loss_clip": 1.03572893, + "balance_loss_mlp": 1.01759887, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 4.629307836348251, + "language_loss": 0.75141406, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.77243936, + "num_input_tokens_seen": 197681580, + "step": 9174, + "time_per_iteration": 2.5552353858947754 + }, + { + "auxiliary_loss_clip": 0.01089819, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.03563261, + "balance_loss_mlp": 1.01952362, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.7588985192034923, + "language_loss": 0.72754312, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74875867, + "num_input_tokens_seen": 197702095, + "step": 9175, + "time_per_iteration": 2.579268217086792 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.0371691, + "balance_loss_mlp": 1.02079654, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.759549723090395, + "language_loss": 0.69160211, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71292484, + "num_input_tokens_seen": 197720720, + "step": 9176, + "time_per_iteration": 2.4819114208221436 + }, + { + "auxiliary_loss_clip": 0.01098268, + "auxiliary_loss_mlp": 0.01030195, + "balance_loss_clip": 1.03603411, + "balance_loss_mlp": 1.01827407, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.7950917465037164, + "language_loss": 0.71046102, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.73174566, + "num_input_tokens_seen": 197741820, + "step": 9177, + "time_per_iteration": 2.5527186393737793 + }, + { + "auxiliary_loss_clip": 0.01100365, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.0382092, + "balance_loss_mlp": 1.01703501, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 1.522700806431929, + "language_loss": 0.80141199, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82270551, + "num_input_tokens_seen": 197759160, + "step": 9178, + "time_per_iteration": 2.473494291305542 + }, + { + "auxiliary_loss_clip": 0.0105906, + "auxiliary_loss_mlp": 0.01042042, + "balance_loss_clip": 1.03941488, + "balance_loss_mlp": 1.02830958, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.958444900132511, + "language_loss": 0.74800748, + "learning_rate": 1.761633217089826e-06, + "loss": 0.76901847, + "num_input_tokens_seen": 197779760, + "step": 9179, + "time_per_iteration": 2.6429245471954346 + }, + { + "auxiliary_loss_clip": 0.01100626, + "auxiliary_loss_mlp": 0.01035327, + "balance_loss_clip": 1.03754091, + "balance_loss_mlp": 1.02354264, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.941967734829439, + "language_loss": 0.70055342, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72191298, + "num_input_tokens_seen": 197801545, + "step": 9180, + "time_per_iteration": 2.6099414825439453 + }, + { + "auxiliary_loss_clip": 0.01097539, + "auxiliary_loss_mlp": 0.01041077, + "balance_loss_clip": 1.03737533, + "balance_loss_mlp": 1.0280714, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 1.8775381149689536, + "language_loss": 0.66910481, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69049102, + "num_input_tokens_seen": 197820760, + "step": 9181, + "time_per_iteration": 2.484083652496338 + }, + { + "auxiliary_loss_clip": 0.01112912, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.03732347, + "balance_loss_mlp": 1.01865244, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 1.8881515107107425, + "language_loss": 0.79315281, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.81459486, + "num_input_tokens_seen": 197840195, + "step": 9182, + "time_per_iteration": 2.461601734161377 + }, + { + "auxiliary_loss_clip": 0.01076978, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.03972721, + "balance_loss_mlp": 1.01741982, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 1.9807399050987984, + "language_loss": 0.8253656, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.84642851, + "num_input_tokens_seen": 197859475, + "step": 9183, + "time_per_iteration": 2.577390432357788 + }, + { + "auxiliary_loss_clip": 0.01087516, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.03545189, + "balance_loss_mlp": 1.01515436, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.3691207812979886, + "language_loss": 0.67124712, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.69239521, + "num_input_tokens_seen": 197879395, + "step": 9184, + "time_per_iteration": 2.5361216068267822 + }, + { + "auxiliary_loss_clip": 0.01099133, + "auxiliary_loss_mlp": 0.01027031, + "balance_loss_clip": 1.03682065, + "balance_loss_mlp": 1.01372719, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.5643196818495981, + "language_loss": 0.76349288, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78475451, + "num_input_tokens_seen": 197900815, + "step": 9185, + "time_per_iteration": 2.5326552391052246 + }, + { + "auxiliary_loss_clip": 0.01074204, + "auxiliary_loss_mlp": 0.01036946, + "balance_loss_clip": 1.03681254, + "balance_loss_mlp": 1.02404797, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.656327483033755, + "language_loss": 0.73878682, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.75989836, + "num_input_tokens_seen": 197918985, + "step": 9186, + "time_per_iteration": 3.9809212684631348 + }, + { + "auxiliary_loss_clip": 0.0108285, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.03874826, + "balance_loss_mlp": 1.0276866, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 4.473836625042513, + "language_loss": 0.66705084, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68827415, + "num_input_tokens_seen": 197937725, + "step": 9187, + "time_per_iteration": 2.562554359436035 + }, + { + "auxiliary_loss_clip": 0.01089216, + "auxiliary_loss_mlp": 0.01028824, + "balance_loss_clip": 1.03812981, + "balance_loss_mlp": 1.01703405, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 4.89989645539051, + "language_loss": 0.77515376, + "learning_rate": 1.758153413657318e-06, + "loss": 0.79633415, + "num_input_tokens_seen": 197955635, + "step": 9188, + "time_per_iteration": 2.511859655380249 + }, + { + "auxiliary_loss_clip": 0.01085198, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.03534079, + "balance_loss_mlp": 1.01836526, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 1.731031988092773, + "language_loss": 0.81078517, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83195031, + "num_input_tokens_seen": 197974490, + "step": 9189, + "time_per_iteration": 2.5119194984436035 + }, + { + "auxiliary_loss_clip": 0.01096036, + "auxiliary_loss_mlp": 0.00786532, + "balance_loss_clip": 1.03920031, + "balance_loss_mlp": 1.01278746, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.548034912015803, + "language_loss": 0.76317739, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78200305, + "num_input_tokens_seen": 197995735, + "step": 9190, + "time_per_iteration": 2.5424177646636963 + }, + { + "auxiliary_loss_clip": 0.01113975, + "auxiliary_loss_mlp": 0.0103527, + "balance_loss_clip": 1.03742433, + "balance_loss_mlp": 1.02150106, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 2.3840229457497153, + "language_loss": 0.78950357, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81099606, + "num_input_tokens_seen": 198009685, + "step": 9191, + "time_per_iteration": 2.4241771697998047 + }, + { + "auxiliary_loss_clip": 0.0104746, + "auxiliary_loss_mlp": 0.0103459, + "balance_loss_clip": 1.04087448, + "balance_loss_mlp": 1.02188194, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 2.0273667058435345, + "language_loss": 0.68775868, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.70857918, + "num_input_tokens_seen": 198026845, + "step": 9192, + "time_per_iteration": 2.592529773712158 + }, + { + "auxiliary_loss_clip": 0.01097099, + "auxiliary_loss_mlp": 0.01029051, + "balance_loss_clip": 1.0364418, + "balance_loss_mlp": 1.0181495, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.6134222717304798, + "language_loss": 0.77370894, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79497045, + "num_input_tokens_seen": 198045275, + "step": 9193, + "time_per_iteration": 3.849308967590332 + }, + { + "auxiliary_loss_clip": 0.01074553, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.03490388, + "balance_loss_mlp": 1.02011919, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.549976468829453, + "language_loss": 0.78372353, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80479479, + "num_input_tokens_seen": 198065760, + "step": 9194, + "time_per_iteration": 2.5448360443115234 + }, + { + "auxiliary_loss_clip": 0.01083978, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.03762555, + "balance_loss_mlp": 1.01803517, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 2.241780925080222, + "language_loss": 0.69620025, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.71734935, + "num_input_tokens_seen": 198087595, + "step": 9195, + "time_per_iteration": 4.048158168792725 + }, + { + "auxiliary_loss_clip": 0.01096879, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.03946328, + "balance_loss_mlp": 1.01920366, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 2.3022190335135977, + "language_loss": 0.7398985, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76119357, + "num_input_tokens_seen": 198104620, + "step": 9196, + "time_per_iteration": 2.483397960662842 + }, + { + "auxiliary_loss_clip": 0.01094451, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.03725684, + "balance_loss_mlp": 1.01969433, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.5651130025717144, + "language_loss": 0.76774412, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.78901124, + "num_input_tokens_seen": 198123565, + "step": 9197, + "time_per_iteration": 2.484361410140991 + }, + { + "auxiliary_loss_clip": 0.01087518, + "auxiliary_loss_mlp": 0.01026839, + "balance_loss_clip": 1.03640032, + "balance_loss_mlp": 1.01523972, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.628978080815795, + "language_loss": 0.76238602, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78352964, + "num_input_tokens_seen": 198148270, + "step": 9198, + "time_per_iteration": 2.73512601852417 + }, + { + "auxiliary_loss_clip": 0.0110815, + "auxiliary_loss_mlp": 0.01025125, + "balance_loss_clip": 1.03572226, + "balance_loss_mlp": 1.01363301, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.4706444323606764, + "language_loss": 0.79431868, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81565142, + "num_input_tokens_seen": 198168810, + "step": 9199, + "time_per_iteration": 3.9382777214050293 + }, + { + "auxiliary_loss_clip": 0.01071513, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.03774548, + "balance_loss_mlp": 1.02050972, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.8433772572409615, + "language_loss": 0.64030671, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66135925, + "num_input_tokens_seen": 198186200, + "step": 9200, + "time_per_iteration": 2.521204948425293 + }, + { + "auxiliary_loss_clip": 0.01092547, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.03884149, + "balance_loss_mlp": 1.01372004, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.9050925246039874, + "language_loss": 0.66281343, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68401545, + "num_input_tokens_seen": 198207050, + "step": 9201, + "time_per_iteration": 2.5391438007354736 + }, + { + "auxiliary_loss_clip": 0.01099361, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.03848493, + "balance_loss_mlp": 1.02215385, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 2.068933342127184, + "language_loss": 0.6073693, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.62871611, + "num_input_tokens_seen": 198224565, + "step": 9202, + "time_per_iteration": 2.4795942306518555 + }, + { + "auxiliary_loss_clip": 0.01098169, + "auxiliary_loss_mlp": 0.00785787, + "balance_loss_clip": 1.03810966, + "balance_loss_mlp": 1.01087368, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 1.7308005648452462, + "language_loss": 0.6456387, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.6644783, + "num_input_tokens_seen": 198244790, + "step": 9203, + "time_per_iteration": 2.5000290870666504 + }, + { + "auxiliary_loss_clip": 0.01098529, + "auxiliary_loss_mlp": 0.01029641, + "balance_loss_clip": 1.03688419, + "balance_loss_mlp": 1.01777339, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 2.0225689797249826, + "language_loss": 0.63412708, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.65540874, + "num_input_tokens_seen": 198264375, + "step": 9204, + "time_per_iteration": 2.5506935119628906 + }, + { + "auxiliary_loss_clip": 0.01096509, + "auxiliary_loss_mlp": 0.01025634, + "balance_loss_clip": 1.03670585, + "balance_loss_mlp": 1.01412439, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 1.4923392811623066, + "language_loss": 0.77339876, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79462016, + "num_input_tokens_seen": 198283895, + "step": 9205, + "time_per_iteration": 2.5277438163757324 + }, + { + "auxiliary_loss_clip": 0.01060611, + "auxiliary_loss_mlp": 0.01035673, + "balance_loss_clip": 1.03573966, + "balance_loss_mlp": 1.02306056, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.4455693743846116, + "language_loss": 0.72512615, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74608898, + "num_input_tokens_seen": 198310035, + "step": 9206, + "time_per_iteration": 2.7150843143463135 + }, + { + "auxiliary_loss_clip": 0.01064224, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.03492141, + "balance_loss_mlp": 1.01953542, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 2.0217546338335213, + "language_loss": 0.75455046, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77551174, + "num_input_tokens_seen": 198327810, + "step": 9207, + "time_per_iteration": 2.5404369831085205 + }, + { + "auxiliary_loss_clip": 0.01079858, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.03802347, + "balance_loss_mlp": 1.01948071, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 2.4183595719787694, + "language_loss": 0.61700737, + "learning_rate": 1.750423192272189e-06, + "loss": 0.6381253, + "num_input_tokens_seen": 198343150, + "step": 9208, + "time_per_iteration": 2.4944663047790527 + }, + { + "auxiliary_loss_clip": 0.01112705, + "auxiliary_loss_mlp": 0.01032597, + "balance_loss_clip": 1.03887272, + "balance_loss_mlp": 1.02068794, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.0926900898326872, + "language_loss": 0.64433318, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66578621, + "num_input_tokens_seen": 198360925, + "step": 9209, + "time_per_iteration": 2.4638783931732178 + }, + { + "auxiliary_loss_clip": 0.01075837, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.03729177, + "balance_loss_mlp": 1.01952219, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 2.004884717263606, + "language_loss": 0.82772017, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.84881061, + "num_input_tokens_seen": 198379265, + "step": 9210, + "time_per_iteration": 2.5391480922698975 + }, + { + "auxiliary_loss_clip": 0.0108742, + "auxiliary_loss_mlp": 0.01028596, + "balance_loss_clip": 1.03505921, + "balance_loss_mlp": 1.01711667, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 2.039373873493051, + "language_loss": 0.72720921, + "learning_rate": 1.74926398270663e-06, + "loss": 0.74836934, + "num_input_tokens_seen": 198399490, + "step": 9211, + "time_per_iteration": 2.5721867084503174 + }, + { + "auxiliary_loss_clip": 0.01082191, + "auxiliary_loss_mlp": 0.01032535, + "balance_loss_clip": 1.03763032, + "balance_loss_mlp": 1.01871872, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 1.8696746983669041, + "language_loss": 0.66595, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68709725, + "num_input_tokens_seen": 198419110, + "step": 9212, + "time_per_iteration": 2.5349819660186768 + }, + { + "auxiliary_loss_clip": 0.01079438, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.03466356, + "balance_loss_mlp": 1.01408374, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.4324252264799842, + "language_loss": 0.51715839, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53823388, + "num_input_tokens_seen": 198441360, + "step": 9213, + "time_per_iteration": 2.6196541786193848 + }, + { + "auxiliary_loss_clip": 0.01084608, + "auxiliary_loss_mlp": 0.01035347, + "balance_loss_clip": 1.04235852, + "balance_loss_mlp": 1.02248406, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 1.8636886443264649, + "language_loss": 0.85641068, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.87761021, + "num_input_tokens_seen": 198459835, + "step": 9214, + "time_per_iteration": 2.5426511764526367 + }, + { + "auxiliary_loss_clip": 0.01101033, + "auxiliary_loss_mlp": 0.01027522, + "balance_loss_clip": 1.03871202, + "balance_loss_mlp": 1.01545835, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.5922579801598906, + "language_loss": 0.69933975, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72062528, + "num_input_tokens_seen": 198478955, + "step": 9215, + "time_per_iteration": 2.502953290939331 + }, + { + "auxiliary_loss_clip": 0.01088703, + "auxiliary_loss_mlp": 0.01031955, + "balance_loss_clip": 1.04404366, + "balance_loss_mlp": 1.01903844, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.6079157501598225, + "language_loss": 0.73063791, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.75184447, + "num_input_tokens_seen": 198499030, + "step": 9216, + "time_per_iteration": 2.5603458881378174 + }, + { + "auxiliary_loss_clip": 0.01087715, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.03921628, + "balance_loss_mlp": 1.01943338, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 1.8843916401385659, + "language_loss": 0.7158742, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.73707145, + "num_input_tokens_seen": 198520265, + "step": 9217, + "time_per_iteration": 2.5675241947174072 + }, + { + "auxiliary_loss_clip": 0.01092956, + "auxiliary_loss_mlp": 0.0102795, + "balance_loss_clip": 1.03819513, + "balance_loss_mlp": 1.01590335, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.6876168922932369, + "language_loss": 0.7828809, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80408996, + "num_input_tokens_seen": 198539645, + "step": 9218, + "time_per_iteration": 2.4953677654266357 + }, + { + "auxiliary_loss_clip": 0.0107282, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.03889871, + "balance_loss_mlp": 1.01717627, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 1.5336018196184211, + "language_loss": 0.72026777, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74130964, + "num_input_tokens_seen": 198558710, + "step": 9219, + "time_per_iteration": 2.5532217025756836 + }, + { + "auxiliary_loss_clip": 0.01101989, + "auxiliary_loss_mlp": 0.01039675, + "balance_loss_clip": 1.04388523, + "balance_loss_mlp": 1.02644277, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.5884246347454203, + "language_loss": 0.71396136, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73537797, + "num_input_tokens_seen": 198577050, + "step": 9220, + "time_per_iteration": 2.496856927871704 + }, + { + "auxiliary_loss_clip": 0.01110228, + "auxiliary_loss_mlp": 0.01024064, + "balance_loss_clip": 1.03937697, + "balance_loss_mlp": 1.01284659, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.6781320916828435, + "language_loss": 0.7919178, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81326067, + "num_input_tokens_seen": 198595290, + "step": 9221, + "time_per_iteration": 2.456348419189453 + }, + { + "auxiliary_loss_clip": 0.0108488, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.03992224, + "balance_loss_mlp": 1.0182631, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.8844998607941885, + "language_loss": 0.83779639, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.85895479, + "num_input_tokens_seen": 198614110, + "step": 9222, + "time_per_iteration": 2.58412504196167 + }, + { + "auxiliary_loss_clip": 0.010885, + "auxiliary_loss_mlp": 0.00789209, + "balance_loss_clip": 1.04125893, + "balance_loss_mlp": 1.01626444, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 2.454960872768613, + "language_loss": 0.75720191, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.77597904, + "num_input_tokens_seen": 198633880, + "step": 9223, + "time_per_iteration": 2.6636600494384766 + }, + { + "auxiliary_loss_clip": 0.01084014, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.03745711, + "balance_loss_mlp": 1.01845324, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.729386844148969, + "language_loss": 0.81540793, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.83656657, + "num_input_tokens_seen": 198653505, + "step": 9224, + "time_per_iteration": 3.961113452911377 + }, + { + "auxiliary_loss_clip": 0.01099412, + "auxiliary_loss_mlp": 0.010383, + "balance_loss_clip": 1.041448, + "balance_loss_mlp": 1.02575362, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 2.3427705376369072, + "language_loss": 0.57385319, + "learning_rate": 1.743855475904141e-06, + "loss": 0.5952304, + "num_input_tokens_seen": 198671890, + "step": 9225, + "time_per_iteration": 2.4746692180633545 + }, + { + "auxiliary_loss_clip": 0.01103413, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.03905272, + "balance_loss_mlp": 1.02282381, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.5754252047546868, + "language_loss": 0.67601967, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.6974104, + "num_input_tokens_seen": 198691995, + "step": 9226, + "time_per_iteration": 2.529801368713379 + }, + { + "auxiliary_loss_clip": 0.0107589, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.03553343, + "balance_loss_mlp": 1.02216959, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.4452483372481744, + "language_loss": 0.7447269, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76583433, + "num_input_tokens_seen": 198712440, + "step": 9227, + "time_per_iteration": 2.543351888656616 + }, + { + "auxiliary_loss_clip": 0.0108192, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.04099202, + "balance_loss_mlp": 1.01774037, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 1.5549632243988263, + "language_loss": 0.73412877, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.7552619, + "num_input_tokens_seen": 198731515, + "step": 9228, + "time_per_iteration": 2.5747568607330322 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.04031098, + "balance_loss_mlp": 1.01914263, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 1.7079664703445376, + "language_loss": 0.75590831, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.77735907, + "num_input_tokens_seen": 198749750, + "step": 9229, + "time_per_iteration": 2.4259326457977295 + }, + { + "auxiliary_loss_clip": 0.01103036, + "auxiliary_loss_mlp": 0.00787872, + "balance_loss_clip": 1.04107666, + "balance_loss_mlp": 1.01292682, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.392236585985139, + "language_loss": 0.68602949, + "learning_rate": 1.741924325613172e-06, + "loss": 0.70493865, + "num_input_tokens_seen": 198768320, + "step": 9230, + "time_per_iteration": 2.484239101409912 + }, + { + "auxiliary_loss_clip": 0.01066671, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.03852582, + "balance_loss_mlp": 1.01848459, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.4816291506755324, + "language_loss": 0.67803216, + "learning_rate": 1.741538124855163e-06, + "loss": 0.69901991, + "num_input_tokens_seen": 198787230, + "step": 9231, + "time_per_iteration": 3.9619076251983643 + }, + { + "auxiliary_loss_clip": 0.01116943, + "auxiliary_loss_mlp": 0.01030176, + "balance_loss_clip": 1.0403595, + "balance_loss_mlp": 1.01626396, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.9456566159495485, + "language_loss": 0.78373992, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80521107, + "num_input_tokens_seen": 198806720, + "step": 9232, + "time_per_iteration": 2.4839911460876465 + }, + { + "auxiliary_loss_clip": 0.01070521, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.03725934, + "balance_loss_mlp": 1.01886761, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.6714053460051095, + "language_loss": 0.82636631, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84737879, + "num_input_tokens_seen": 198826235, + "step": 9233, + "time_per_iteration": 2.595015048980713 + }, + { + "auxiliary_loss_clip": 0.0109895, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.0364517, + "balance_loss_mlp": 1.02114701, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 2.570636962837449, + "language_loss": 0.74818951, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.76951969, + "num_input_tokens_seen": 198842655, + "step": 9234, + "time_per_iteration": 3.8458969593048096 + }, + { + "auxiliary_loss_clip": 0.01085751, + "auxiliary_loss_mlp": 0.01025802, + "balance_loss_clip": 1.03575516, + "balance_loss_mlp": 1.01410782, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 2.4216947749386666, + "language_loss": 0.64437759, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.66549313, + "num_input_tokens_seen": 198861210, + "step": 9235, + "time_per_iteration": 2.515397071838379 + }, + { + "auxiliary_loss_clip": 0.0106001, + "auxiliary_loss_mlp": 0.01029439, + "balance_loss_clip": 1.03462648, + "balance_loss_mlp": 1.01630831, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.8377859500445055, + "language_loss": 0.67924416, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.70013869, + "num_input_tokens_seen": 198880045, + "step": 9236, + "time_per_iteration": 2.559159517288208 + }, + { + "auxiliary_loss_clip": 0.01106953, + "auxiliary_loss_mlp": 0.01024875, + "balance_loss_clip": 1.03716922, + "balance_loss_mlp": 1.01251328, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 2.607257502786614, + "language_loss": 0.86658078, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.8878991, + "num_input_tokens_seen": 198900210, + "step": 9237, + "time_per_iteration": 2.4844696521759033 + }, + { + "auxiliary_loss_clip": 0.01098543, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.03731704, + "balance_loss_mlp": 1.0177933, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 2.1201127491948086, + "language_loss": 0.73213482, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.7534219, + "num_input_tokens_seen": 198919055, + "step": 9238, + "time_per_iteration": 3.860952854156494 + }, + { + "auxiliary_loss_clip": 0.01098973, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.03707111, + "balance_loss_mlp": 1.01879978, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 1.5986924904293494, + "language_loss": 0.78359497, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80490386, + "num_input_tokens_seen": 198943505, + "step": 9239, + "time_per_iteration": 2.7362375259399414 + }, + { + "auxiliary_loss_clip": 0.01091596, + "auxiliary_loss_mlp": 0.01026957, + "balance_loss_clip": 1.03996217, + "balance_loss_mlp": 1.01471376, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.5797567868410776, + "language_loss": 0.79935038, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82053584, + "num_input_tokens_seen": 198963590, + "step": 9240, + "time_per_iteration": 2.5304806232452393 + }, + { + "auxiliary_loss_clip": 0.010884, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.03815746, + "balance_loss_mlp": 1.01851535, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 1.6418642776552745, + "language_loss": 0.65455174, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67574281, + "num_input_tokens_seen": 198982680, + "step": 9241, + "time_per_iteration": 2.541163206100464 + }, + { + "auxiliary_loss_clip": 0.01101333, + "auxiliary_loss_mlp": 0.00785443, + "balance_loss_clip": 1.03913236, + "balance_loss_mlp": 1.00852275, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 1.9522455324501808, + "language_loss": 0.72838306, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.74725085, + "num_input_tokens_seen": 199000185, + "step": 9242, + "time_per_iteration": 2.5102603435516357 + }, + { + "auxiliary_loss_clip": 0.01094133, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.03840399, + "balance_loss_mlp": 1.01502872, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 1.7030309253366334, + "language_loss": 0.64064467, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.6618734, + "num_input_tokens_seen": 199018380, + "step": 9243, + "time_per_iteration": 2.515479803085327 + }, + { + "auxiliary_loss_clip": 0.01085555, + "auxiliary_loss_mlp": 0.00784378, + "balance_loss_clip": 1.04136014, + "balance_loss_mlp": 1.00763202, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 1.7897037553694843, + "language_loss": 0.75145388, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77015316, + "num_input_tokens_seen": 199037115, + "step": 9244, + "time_per_iteration": 2.554372549057007 + }, + { + "auxiliary_loss_clip": 0.01083768, + "auxiliary_loss_mlp": 0.01028383, + "balance_loss_clip": 1.03616667, + "balance_loss_mlp": 1.01679039, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.1110454118428024, + "language_loss": 0.75248206, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.77360356, + "num_input_tokens_seen": 199053375, + "step": 9245, + "time_per_iteration": 2.544123649597168 + }, + { + "auxiliary_loss_clip": 0.01097791, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.04095864, + "balance_loss_mlp": 1.01776338, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 2.2690338716604055, + "language_loss": 0.79468119, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.81597322, + "num_input_tokens_seen": 199070930, + "step": 9246, + "time_per_iteration": 2.5477354526519775 + }, + { + "auxiliary_loss_clip": 0.01111665, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.03897953, + "balance_loss_mlp": 1.01997995, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 1.7955239648050072, + "language_loss": 0.73729455, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.75873446, + "num_input_tokens_seen": 199088675, + "step": 9247, + "time_per_iteration": 2.4549548625946045 + }, + { + "auxiliary_loss_clip": 0.0108993, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.04747367, + "balance_loss_mlp": 1.01933992, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 3.3036057233172698, + "language_loss": 0.75539708, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.77662396, + "num_input_tokens_seen": 199103075, + "step": 9248, + "time_per_iteration": 2.4962079524993896 + }, + { + "auxiliary_loss_clip": 0.01007114, + "auxiliary_loss_mlp": 0.01009328, + "balance_loss_clip": 1.02209187, + "balance_loss_mlp": 1.00769448, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8645574125346883, + "language_loss": 0.59445953, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61462402, + "num_input_tokens_seen": 199160325, + "step": 9249, + "time_per_iteration": 3.2806990146636963 + }, + { + "auxiliary_loss_clip": 0.01108776, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.0365293, + "balance_loss_mlp": 1.01473963, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 2.1369669708512524, + "language_loss": 0.79854095, + "learning_rate": 1.734202189316832e-06, + "loss": 0.81990415, + "num_input_tokens_seen": 199179760, + "step": 9250, + "time_per_iteration": 2.4590866565704346 + }, + { + "auxiliary_loss_clip": 0.01091024, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.03848529, + "balance_loss_mlp": 1.01745033, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 3.3083841367021454, + "language_loss": 0.68958294, + "learning_rate": 1.733816187358836e-06, + "loss": 0.71080279, + "num_input_tokens_seen": 199196695, + "step": 9251, + "time_per_iteration": 2.488954782485962 + }, + { + "auxiliary_loss_clip": 0.01100085, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.03843224, + "balance_loss_mlp": 1.01951051, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.5191850042723756, + "language_loss": 0.75476271, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77608383, + "num_input_tokens_seen": 199217845, + "step": 9252, + "time_per_iteration": 2.5424764156341553 + }, + { + "auxiliary_loss_clip": 0.01098183, + "auxiliary_loss_mlp": 0.01038468, + "balance_loss_clip": 1.03679466, + "balance_loss_mlp": 1.02427006, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.4535720262549348, + "language_loss": 0.7262845, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.74765104, + "num_input_tokens_seen": 199239250, + "step": 9253, + "time_per_iteration": 2.56081485748291 + }, + { + "auxiliary_loss_clip": 0.01083012, + "auxiliary_loss_mlp": 0.01030001, + "balance_loss_clip": 1.03931928, + "balance_loss_mlp": 1.01789546, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 6.315474985076652, + "language_loss": 0.83348107, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85461122, + "num_input_tokens_seen": 199258320, + "step": 9254, + "time_per_iteration": 2.544121026992798 + }, + { + "auxiliary_loss_clip": 0.01027132, + "auxiliary_loss_mlp": 0.01003389, + "balance_loss_clip": 1.02202535, + "balance_loss_mlp": 1.00204742, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.863284816924183, + "language_loss": 0.64806914, + "learning_rate": 1.732272280610387e-06, + "loss": 0.6683743, + "num_input_tokens_seen": 199314840, + "step": 9255, + "time_per_iteration": 2.9900217056274414 + }, + { + "auxiliary_loss_clip": 0.01103088, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.04153883, + "balance_loss_mlp": 1.02152228, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.8659135847542294, + "language_loss": 0.69254965, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71391451, + "num_input_tokens_seen": 199335405, + "step": 9256, + "time_per_iteration": 2.502626895904541 + }, + { + "auxiliary_loss_clip": 0.01074155, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.03710318, + "balance_loss_mlp": 1.01768374, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.6561048640173295, + "language_loss": 0.75781006, + "learning_rate": 1.73150038809119e-06, + "loss": 0.7788384, + "num_input_tokens_seen": 199354345, + "step": 9257, + "time_per_iteration": 2.5551674365997314 + }, + { + "auxiliary_loss_clip": 0.01063328, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.03519392, + "balance_loss_mlp": 1.02099192, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 1.9506178880043559, + "language_loss": 0.61418414, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63515127, + "num_input_tokens_seen": 199372250, + "step": 9258, + "time_per_iteration": 2.5554490089416504 + }, + { + "auxiliary_loss_clip": 0.01076318, + "auxiliary_loss_mlp": 0.0103137, + "balance_loss_clip": 1.03725195, + "balance_loss_mlp": 1.0173986, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.715468866147634, + "language_loss": 0.79034162, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.81141853, + "num_input_tokens_seen": 199392815, + "step": 9259, + "time_per_iteration": 2.5837697982788086 + }, + { + "auxiliary_loss_clip": 0.01081378, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.03923571, + "balance_loss_mlp": 1.01847744, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 2.276803107768523, + "language_loss": 0.8167277, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.83785748, + "num_input_tokens_seen": 199412375, + "step": 9260, + "time_per_iteration": 2.5871708393096924 + }, + { + "auxiliary_loss_clip": 0.01109911, + "auxiliary_loss_mlp": 0.01037219, + "balance_loss_clip": 1.03747904, + "balance_loss_mlp": 1.02433896, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.5841920486549808, + "language_loss": 0.68875349, + "learning_rate": 1.729956725348256e-06, + "loss": 0.71022481, + "num_input_tokens_seen": 199431490, + "step": 9261, + "time_per_iteration": 2.4769859313964844 + }, + { + "auxiliary_loss_clip": 0.01012354, + "auxiliary_loss_mlp": 0.0100232, + "balance_loss_clip": 1.01759315, + "balance_loss_mlp": 1.0009371, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7310752990794981, + "language_loss": 0.6110152, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63116193, + "num_input_tokens_seen": 199495855, + "step": 9262, + "time_per_iteration": 3.1564443111419678 + }, + { + "auxiliary_loss_clip": 0.0110256, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.03790021, + "balance_loss_mlp": 1.02101254, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.536958729900559, + "language_loss": 0.6440993, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.6654582, + "num_input_tokens_seen": 199515870, + "step": 9263, + "time_per_iteration": 3.9006993770599365 + }, + { + "auxiliary_loss_clip": 0.01088302, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.03710258, + "balance_loss_mlp": 1.01881146, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 1.8950747552676095, + "language_loss": 0.73224789, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75344837, + "num_input_tokens_seen": 199535745, + "step": 9264, + "time_per_iteration": 2.5430479049682617 + }, + { + "auxiliary_loss_clip": 0.01083097, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.03940511, + "balance_loss_mlp": 1.01657999, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 1.8429761803584588, + "language_loss": 0.76437515, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78549516, + "num_input_tokens_seen": 199554035, + "step": 9265, + "time_per_iteration": 2.5609729290008545 + }, + { + "auxiliary_loss_clip": 0.01083715, + "auxiliary_loss_mlp": 0.01029671, + "balance_loss_clip": 1.04055107, + "balance_loss_mlp": 1.01880479, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.296277318398208, + "language_loss": 0.70678025, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.72791415, + "num_input_tokens_seen": 199576120, + "step": 9266, + "time_per_iteration": 2.5526628494262695 + }, + { + "auxiliary_loss_clip": 0.01085415, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.03604507, + "balance_loss_mlp": 1.02006817, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.9024782108254799, + "language_loss": 0.67872924, + "learning_rate": 1.727641538728533e-06, + "loss": 0.69990587, + "num_input_tokens_seen": 199593780, + "step": 9267, + "time_per_iteration": 2.5226635932922363 + }, + { + "auxiliary_loss_clip": 0.01097601, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.03778744, + "balance_loss_mlp": 1.02626181, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 2.132692049370671, + "language_loss": 0.74250698, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76385808, + "num_input_tokens_seen": 199613220, + "step": 9268, + "time_per_iteration": 2.502830743789673 + }, + { + "auxiliary_loss_clip": 0.01093548, + "auxiliary_loss_mlp": 0.00784376, + "balance_loss_clip": 1.03892636, + "balance_loss_mlp": 1.00959504, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 1.8846523052749902, + "language_loss": 0.75372112, + "learning_rate": 1.726869892322104e-06, + "loss": 0.7725004, + "num_input_tokens_seen": 199632085, + "step": 9269, + "time_per_iteration": 2.5183188915252686 + }, + { + "auxiliary_loss_clip": 0.01074248, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_clip": 1.035537, + "balance_loss_mlp": 1.02220976, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 10.65934501529905, + "language_loss": 0.8284595, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84955174, + "num_input_tokens_seen": 199649295, + "step": 9270, + "time_per_iteration": 3.9425487518310547 + }, + { + "auxiliary_loss_clip": 0.01075268, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.03774118, + "balance_loss_mlp": 1.02107692, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 2.1630774781823874, + "language_loss": 0.79608065, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.81717253, + "num_input_tokens_seen": 199668870, + "step": 9271, + "time_per_iteration": 2.600764513015747 + }, + { + "auxiliary_loss_clip": 0.01089546, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.03716731, + "balance_loss_mlp": 1.01576471, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 1.7868484966728213, + "language_loss": 0.90281993, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92399788, + "num_input_tokens_seen": 199684870, + "step": 9272, + "time_per_iteration": 2.524334669113159 + }, + { + "auxiliary_loss_clip": 0.01076301, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.03709054, + "balance_loss_mlp": 1.01690078, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 2.2198055549630262, + "language_loss": 0.8413626, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.862418, + "num_input_tokens_seen": 199701975, + "step": 9273, + "time_per_iteration": 3.9291181564331055 + }, + { + "auxiliary_loss_clip": 0.01102316, + "auxiliary_loss_mlp": 0.010364, + "balance_loss_clip": 1.03910935, + "balance_loss_mlp": 1.02295899, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 2.257955534193018, + "language_loss": 0.74082726, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76221436, + "num_input_tokens_seen": 199721865, + "step": 9274, + "time_per_iteration": 2.534677505493164 + }, + { + "auxiliary_loss_clip": 0.01091298, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.04075003, + "balance_loss_mlp": 1.01977277, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 3.0099792473739604, + "language_loss": 0.77474451, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.79599822, + "num_input_tokens_seen": 199736455, + "step": 9275, + "time_per_iteration": 2.514914035797119 + }, + { + "auxiliary_loss_clip": 0.01086796, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.04049301, + "balance_loss_mlp": 1.01711702, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 1.722481453024456, + "language_loss": 0.74795926, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.76912212, + "num_input_tokens_seen": 199753125, + "step": 9276, + "time_per_iteration": 2.4797325134277344 + }, + { + "auxiliary_loss_clip": 0.01088014, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.03590655, + "balance_loss_mlp": 1.02236557, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.7235889009155767, + "language_loss": 0.75455391, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77578342, + "num_input_tokens_seen": 199771365, + "step": 9277, + "time_per_iteration": 3.9213991165161133 + }, + { + "auxiliary_loss_clip": 0.01106342, + "auxiliary_loss_mlp": 0.01031913, + "balance_loss_clip": 1.03589463, + "balance_loss_mlp": 1.02005219, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.6654597259938995, + "language_loss": 0.7135663, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.73494887, + "num_input_tokens_seen": 199790035, + "step": 9278, + "time_per_iteration": 2.4813168048858643 + }, + { + "auxiliary_loss_clip": 0.01076452, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.04011393, + "balance_loss_mlp": 1.02012682, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.942565106206994, + "language_loss": 0.75457817, + "learning_rate": 1.723012284057868e-06, + "loss": 0.77567679, + "num_input_tokens_seen": 199811125, + "step": 9279, + "time_per_iteration": 2.621128559112549 + }, + { + "auxiliary_loss_clip": 0.01084726, + "auxiliary_loss_mlp": 0.01027221, + "balance_loss_clip": 1.03383732, + "balance_loss_mlp": 1.01465642, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 1.5344578561257778, + "language_loss": 0.67429984, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69541931, + "num_input_tokens_seen": 199829915, + "step": 9280, + "time_per_iteration": 2.5274171829223633 + }, + { + "auxiliary_loss_clip": 0.01097536, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.03527164, + "balance_loss_mlp": 1.02207041, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.5339149355155148, + "language_loss": 0.73460191, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75592601, + "num_input_tokens_seen": 199850670, + "step": 9281, + "time_per_iteration": 2.5548722743988037 + }, + { + "auxiliary_loss_clip": 0.01077825, + "auxiliary_loss_mlp": 0.00786272, + "balance_loss_clip": 1.03608155, + "balance_loss_mlp": 1.00974751, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 4.047925833270152, + "language_loss": 0.74718153, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.76582253, + "num_input_tokens_seen": 199867645, + "step": 9282, + "time_per_iteration": 2.5195586681365967 + }, + { + "auxiliary_loss_clip": 0.0104781, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.03452599, + "balance_loss_mlp": 1.01514149, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 1.8039225396648588, + "language_loss": 0.66199958, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68275821, + "num_input_tokens_seen": 199886320, + "step": 9283, + "time_per_iteration": 2.611476421356201 + }, + { + "auxiliary_loss_clip": 0.01075334, + "auxiliary_loss_mlp": 0.01028203, + "balance_loss_clip": 1.03736901, + "balance_loss_mlp": 1.01681268, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 1.7907966232406831, + "language_loss": 0.82992196, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85095739, + "num_input_tokens_seen": 199904895, + "step": 9284, + "time_per_iteration": 2.538071632385254 + }, + { + "auxiliary_loss_clip": 0.01088708, + "auxiliary_loss_mlp": 0.01030789, + "balance_loss_clip": 1.03762078, + "balance_loss_mlp": 1.01840281, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.831500349052182, + "language_loss": 0.85048562, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87168062, + "num_input_tokens_seen": 199921090, + "step": 9285, + "time_per_iteration": 2.531393051147461 + }, + { + "auxiliary_loss_clip": 0.01086254, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.0395143, + "balance_loss_mlp": 1.01937127, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 3.0833788721320388, + "language_loss": 0.73827493, + "learning_rate": 1.720312582354912e-06, + "loss": 0.7594499, + "num_input_tokens_seen": 199939925, + "step": 9286, + "time_per_iteration": 2.528172254562378 + }, + { + "auxiliary_loss_clip": 0.0111022, + "auxiliary_loss_mlp": 0.01030184, + "balance_loss_clip": 1.03732789, + "balance_loss_mlp": 1.01804852, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 1.6421252882918709, + "language_loss": 0.74206048, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76346451, + "num_input_tokens_seen": 199960015, + "step": 9287, + "time_per_iteration": 2.5323171615600586 + }, + { + "auxiliary_loss_clip": 0.01076446, + "auxiliary_loss_mlp": 0.01032983, + "balance_loss_clip": 1.03623533, + "balance_loss_mlp": 1.01917291, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 1.5950398165563044, + "language_loss": 0.75009918, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77119344, + "num_input_tokens_seen": 199980505, + "step": 9288, + "time_per_iteration": 2.5779786109924316 + }, + { + "auxiliary_loss_clip": 0.01090862, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.03878117, + "balance_loss_mlp": 1.02319956, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 2.6200809035972528, + "language_loss": 0.77538633, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79666245, + "num_input_tokens_seen": 199999020, + "step": 9289, + "time_per_iteration": 2.5084128379821777 + }, + { + "auxiliary_loss_clip": 0.01084042, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.03829741, + "balance_loss_mlp": 1.01974702, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 1.8442385577077245, + "language_loss": 0.61099917, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63216835, + "num_input_tokens_seen": 200019020, + "step": 9290, + "time_per_iteration": 2.584376573562622 + }, + { + "auxiliary_loss_clip": 0.01064133, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.03695512, + "balance_loss_mlp": 1.01891482, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 2.177509334865663, + "language_loss": 0.67519283, + "learning_rate": 1.7183845418764e-06, + "loss": 0.69615275, + "num_input_tokens_seen": 200038110, + "step": 9291, + "time_per_iteration": 2.613873243331909 + }, + { + "auxiliary_loss_clip": 0.01081186, + "auxiliary_loss_mlp": 0.01036134, + "balance_loss_clip": 1.0370661, + "balance_loss_mlp": 1.02299106, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 2.057270516585976, + "language_loss": 0.84080648, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.86197972, + "num_input_tokens_seen": 200056210, + "step": 9292, + "time_per_iteration": 2.5488624572753906 + }, + { + "auxiliary_loss_clip": 0.01082483, + "auxiliary_loss_mlp": 0.01040436, + "balance_loss_clip": 1.03684318, + "balance_loss_mlp": 1.02787685, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 1.9812537329909878, + "language_loss": 0.73783636, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.75906563, + "num_input_tokens_seen": 200075620, + "step": 9293, + "time_per_iteration": 2.565099000930786 + }, + { + "auxiliary_loss_clip": 0.01085241, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.03964758, + "balance_loss_mlp": 1.01985919, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 1.7155395037241226, + "language_loss": 0.72868657, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.74985611, + "num_input_tokens_seen": 200095945, + "step": 9294, + "time_per_iteration": 2.5708858966827393 + }, + { + "auxiliary_loss_clip": 0.01088022, + "auxiliary_loss_mlp": 0.00784022, + "balance_loss_clip": 1.03776479, + "balance_loss_mlp": 1.00640321, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 2.115245257803104, + "language_loss": 0.68389428, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70261478, + "num_input_tokens_seen": 200114185, + "step": 9295, + "time_per_iteration": 2.5001771450042725 + }, + { + "auxiliary_loss_clip": 0.01111829, + "auxiliary_loss_mlp": 0.01035651, + "balance_loss_clip": 1.03927064, + "balance_loss_mlp": 1.02317619, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.4579667802869498, + "language_loss": 0.80545008, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.82692486, + "num_input_tokens_seen": 200135030, + "step": 9296, + "time_per_iteration": 2.493241548538208 + }, + { + "auxiliary_loss_clip": 0.01100381, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.0391562, + "balance_loss_mlp": 1.01910448, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.5628938698633716, + "language_loss": 0.65356159, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.6748786, + "num_input_tokens_seen": 200154290, + "step": 9297, + "time_per_iteration": 2.485689401626587 + }, + { + "auxiliary_loss_clip": 0.01079895, + "auxiliary_loss_mlp": 0.01038539, + "balance_loss_clip": 1.03919578, + "balance_loss_mlp": 1.02547407, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 1.7148020006594242, + "language_loss": 0.75172293, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77290732, + "num_input_tokens_seen": 200171555, + "step": 9298, + "time_per_iteration": 2.5237314701080322 + }, + { + "auxiliary_loss_clip": 0.01029385, + "auxiliary_loss_mlp": 0.01020868, + "balance_loss_clip": 1.02465034, + "balance_loss_mlp": 1.01938391, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.6889700247642193, + "language_loss": 0.52480936, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54531187, + "num_input_tokens_seen": 200237010, + "step": 9299, + "time_per_iteration": 3.1776397228240967 + }, + { + "auxiliary_loss_clip": 0.01094724, + "auxiliary_loss_mlp": 0.01029909, + "balance_loss_clip": 1.036973, + "balance_loss_mlp": 1.01793456, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 2.081541097560223, + "language_loss": 0.68651921, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.70776552, + "num_input_tokens_seen": 200260820, + "step": 9300, + "time_per_iteration": 2.6008825302124023 + }, + { + "auxiliary_loss_clip": 0.01060439, + "auxiliary_loss_mlp": 0.01048505, + "balance_loss_clip": 1.03524494, + "balance_loss_mlp": 1.0330677, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 1.7535674574501428, + "language_loss": 0.81873721, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.83982658, + "num_input_tokens_seen": 200278035, + "step": 9301, + "time_per_iteration": 3.9318954944610596 + }, + { + "auxiliary_loss_clip": 0.01108909, + "auxiliary_loss_mlp": 0.01028666, + "balance_loss_clip": 1.03641939, + "balance_loss_mlp": 1.01595795, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 1.8715383153947094, + "language_loss": 0.67560685, + "learning_rate": 1.714143795138756e-06, + "loss": 0.69698262, + "num_input_tokens_seen": 200297255, + "step": 9302, + "time_per_iteration": 2.4877126216888428 + }, + { + "auxiliary_loss_clip": 0.01079143, + "auxiliary_loss_mlp": 0.01026456, + "balance_loss_clip": 1.03685999, + "balance_loss_mlp": 1.01280057, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 1.6409799682839519, + "language_loss": 0.71072584, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73178184, + "num_input_tokens_seen": 200317505, + "step": 9303, + "time_per_iteration": 2.5849597454071045 + }, + { + "auxiliary_loss_clip": 0.0105093, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.04224896, + "balance_loss_mlp": 1.02343917, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.6142673580676925, + "language_loss": 0.72790986, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74879122, + "num_input_tokens_seen": 200338350, + "step": 9304, + "time_per_iteration": 2.6511220932006836 + }, + { + "auxiliary_loss_clip": 0.01099198, + "auxiliary_loss_mlp": 0.01026887, + "balance_loss_clip": 1.03736722, + "balance_loss_mlp": 1.01496601, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 1.874079009152534, + "language_loss": 0.77963239, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80089325, + "num_input_tokens_seen": 200353965, + "step": 9305, + "time_per_iteration": 2.4741809368133545 + }, + { + "auxiliary_loss_clip": 0.01065702, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.04197383, + "balance_loss_mlp": 1.01681852, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.676513155394174, + "language_loss": 0.69372118, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.71466255, + "num_input_tokens_seen": 200373595, + "step": 9306, + "time_per_iteration": 2.5882318019866943 + }, + { + "auxiliary_loss_clip": 0.01032163, + "auxiliary_loss_mlp": 0.01006556, + "balance_loss_clip": 1.03052533, + "balance_loss_mlp": 1.00542974, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9517494553785063, + "language_loss": 0.60333657, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62372375, + "num_input_tokens_seen": 200429155, + "step": 9307, + "time_per_iteration": 3.2533724308013916 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.0103418, + "balance_loss_clip": 1.03889966, + "balance_loss_mlp": 1.02252722, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.662121201733906, + "language_loss": 0.74144435, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76280272, + "num_input_tokens_seen": 200448290, + "step": 9308, + "time_per_iteration": 3.8773386478424072 + }, + { + "auxiliary_loss_clip": 0.01045679, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.03381205, + "balance_loss_mlp": 1.02021313, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 1.8882836832514818, + "language_loss": 0.69511026, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71590877, + "num_input_tokens_seen": 200466555, + "step": 9309, + "time_per_iteration": 2.6559715270996094 + }, + { + "auxiliary_loss_clip": 0.01090759, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.04302597, + "balance_loss_mlp": 1.01877916, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 1.9948411755617754, + "language_loss": 0.75446773, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77570587, + "num_input_tokens_seen": 200485980, + "step": 9310, + "time_per_iteration": 2.58152174949646 + }, + { + "auxiliary_loss_clip": 0.01103228, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.03931928, + "balance_loss_mlp": 1.01886201, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 2.541352849066168, + "language_loss": 0.69325423, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71460629, + "num_input_tokens_seen": 200504555, + "step": 9311, + "time_per_iteration": 2.52984619140625 + }, + { + "auxiliary_loss_clip": 0.01098545, + "auxiliary_loss_mlp": 0.01030531, + "balance_loss_clip": 1.03770733, + "balance_loss_mlp": 1.01866984, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.8758990424242923, + "language_loss": 0.72236401, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74365479, + "num_input_tokens_seen": 200522700, + "step": 9312, + "time_per_iteration": 3.8581783771514893 + }, + { + "auxiliary_loss_clip": 0.01076877, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.04378629, + "balance_loss_mlp": 1.01862895, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 1.7961384951975135, + "language_loss": 0.8943485, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91543126, + "num_input_tokens_seen": 200541910, + "step": 9313, + "time_per_iteration": 2.5529215335845947 + }, + { + "auxiliary_loss_clip": 0.01075227, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.03923512, + "balance_loss_mlp": 1.02487743, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.4886928521994263, + "language_loss": 0.7788862, + "learning_rate": 1.709519022520204e-06, + "loss": 0.80002207, + "num_input_tokens_seen": 200562600, + "step": 9314, + "time_per_iteration": 2.591083526611328 + }, + { + "auxiliary_loss_clip": 0.01076787, + "auxiliary_loss_mlp": 0.01028373, + "balance_loss_clip": 1.04045606, + "balance_loss_mlp": 1.01643372, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 2.105222841137118, + "language_loss": 0.70438743, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72543907, + "num_input_tokens_seen": 200584795, + "step": 9315, + "time_per_iteration": 4.026345491409302 + }, + { + "auxiliary_loss_clip": 0.01092855, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.03894186, + "balance_loss_mlp": 1.02163553, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 2.211467999961726, + "language_loss": 0.66429168, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.68556535, + "num_input_tokens_seen": 200606945, + "step": 9316, + "time_per_iteration": 2.569499969482422 + }, + { + "auxiliary_loss_clip": 0.01076533, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.03805971, + "balance_loss_mlp": 1.01884258, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 2.0741026604657997, + "language_loss": 0.86383349, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.88493657, + "num_input_tokens_seen": 200626340, + "step": 9317, + "time_per_iteration": 2.570748805999756 + }, + { + "auxiliary_loss_clip": 0.01102515, + "auxiliary_loss_mlp": 0.0103889, + "balance_loss_clip": 1.03849125, + "balance_loss_mlp": 1.02474022, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.7243457283227637, + "language_loss": 0.77314579, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79455984, + "num_input_tokens_seen": 200644520, + "step": 9318, + "time_per_iteration": 2.5484185218811035 + }, + { + "auxiliary_loss_clip": 0.0109766, + "auxiliary_loss_mlp": 0.01038962, + "balance_loss_clip": 1.03782666, + "balance_loss_mlp": 1.02745795, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 1.7943379673439557, + "language_loss": 0.76230872, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78367496, + "num_input_tokens_seen": 200664845, + "step": 9319, + "time_per_iteration": 2.56019926071167 + }, + { + "auxiliary_loss_clip": 0.01099232, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.03906703, + "balance_loss_mlp": 1.02097988, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.5092001707115357, + "language_loss": 0.85344785, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87476474, + "num_input_tokens_seen": 200686535, + "step": 9320, + "time_per_iteration": 2.5362508296966553 + }, + { + "auxiliary_loss_clip": 0.01041524, + "auxiliary_loss_mlp": 0.01008643, + "balance_loss_clip": 1.02621961, + "balance_loss_mlp": 1.00734413, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7458539124914149, + "language_loss": 0.5254609, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54596257, + "num_input_tokens_seen": 200736965, + "step": 9321, + "time_per_iteration": 2.924009084701538 + }, + { + "auxiliary_loss_clip": 0.01091628, + "auxiliary_loss_mlp": 0.01032986, + "balance_loss_clip": 1.04160309, + "balance_loss_mlp": 1.02081442, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.3940571223144915, + "language_loss": 0.74636042, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76760662, + "num_input_tokens_seen": 200757420, + "step": 9322, + "time_per_iteration": 2.5459938049316406 + }, + { + "auxiliary_loss_clip": 0.01111726, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.03866124, + "balance_loss_mlp": 1.01843858, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.54102256411904, + "language_loss": 0.73883295, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.76027024, + "num_input_tokens_seen": 200779520, + "step": 9323, + "time_per_iteration": 2.5787808895111084 + }, + { + "auxiliary_loss_clip": 0.01090983, + "auxiliary_loss_mlp": 0.01030303, + "balance_loss_clip": 1.04297519, + "balance_loss_mlp": 1.01696992, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.876011712378322, + "language_loss": 0.61724555, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.63845837, + "num_input_tokens_seen": 200799485, + "step": 9324, + "time_per_iteration": 2.535074472427368 + }, + { + "auxiliary_loss_clip": 0.01071107, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.0370568, + "balance_loss_mlp": 1.01840377, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 1.8382714956922233, + "language_loss": 0.87792736, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89895636, + "num_input_tokens_seen": 200817540, + "step": 9325, + "time_per_iteration": 2.560593366622925 + }, + { + "auxiliary_loss_clip": 0.01089722, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.03756285, + "balance_loss_mlp": 1.01653099, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 1.6562763509642444, + "language_loss": 0.74071145, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.76190615, + "num_input_tokens_seen": 200838380, + "step": 9326, + "time_per_iteration": 2.5099828243255615 + }, + { + "auxiliary_loss_clip": 0.01091892, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.03790474, + "balance_loss_mlp": 1.01628172, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 1.9911858593542273, + "language_loss": 0.78044271, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80166054, + "num_input_tokens_seen": 200855640, + "step": 9327, + "time_per_iteration": 2.510442018508911 + }, + { + "auxiliary_loss_clip": 0.01097905, + "auxiliary_loss_mlp": 0.01029824, + "balance_loss_clip": 1.04228163, + "balance_loss_mlp": 1.01640153, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 1.4493306223439848, + "language_loss": 0.78121883, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80249619, + "num_input_tokens_seen": 200876585, + "step": 9328, + "time_per_iteration": 2.5305118560791016 + }, + { + "auxiliary_loss_clip": 0.01110225, + "auxiliary_loss_mlp": 0.01029719, + "balance_loss_clip": 1.0378145, + "balance_loss_mlp": 1.01698709, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.5745731126480518, + "language_loss": 0.73454034, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.75593972, + "num_input_tokens_seen": 200898175, + "step": 9329, + "time_per_iteration": 2.4786343574523926 + }, + { + "auxiliary_loss_clip": 0.01090978, + "auxiliary_loss_mlp": 0.00789981, + "balance_loss_clip": 1.03804386, + "balance_loss_mlp": 1.01403785, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.5866826951013648, + "language_loss": 0.83143938, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.85024893, + "num_input_tokens_seen": 200917515, + "step": 9330, + "time_per_iteration": 2.5354905128479004 + }, + { + "auxiliary_loss_clip": 0.01042815, + "auxiliary_loss_mlp": 0.01003857, + "balance_loss_clip": 1.01775885, + "balance_loss_mlp": 1.00229573, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7142013859279007, + "language_loss": 0.57880241, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.59926909, + "num_input_tokens_seen": 200978615, + "step": 9331, + "time_per_iteration": 3.087099313735962 + }, + { + "auxiliary_loss_clip": 0.0106843, + "auxiliary_loss_mlp": 0.01027695, + "balance_loss_clip": 1.03785789, + "balance_loss_mlp": 1.01510596, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 1.879815463438497, + "language_loss": 0.8186813, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.83964252, + "num_input_tokens_seen": 200997745, + "step": 9332, + "time_per_iteration": 2.597041606903076 + }, + { + "auxiliary_loss_clip": 0.01100431, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.03918087, + "balance_loss_mlp": 1.02059186, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 2.0775373317690713, + "language_loss": 0.8164537, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.83780968, + "num_input_tokens_seen": 201016370, + "step": 9333, + "time_per_iteration": 2.4655468463897705 + }, + { + "auxiliary_loss_clip": 0.01111977, + "auxiliary_loss_mlp": 0.01023601, + "balance_loss_clip": 1.03824282, + "balance_loss_mlp": 1.01164436, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.8453909505917674, + "language_loss": 0.72838748, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.74974322, + "num_input_tokens_seen": 201034310, + "step": 9334, + "time_per_iteration": 2.473379373550415 + }, + { + "auxiliary_loss_clip": 0.01096113, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.0409447, + "balance_loss_mlp": 1.02095532, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 1.8849672621860833, + "language_loss": 0.71426183, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73556507, + "num_input_tokens_seen": 201052030, + "step": 9335, + "time_per_iteration": 2.4770567417144775 + }, + { + "auxiliary_loss_clip": 0.01090733, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.03755212, + "balance_loss_mlp": 1.01572657, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.8156922276970768, + "language_loss": 0.76860809, + "learning_rate": 1.701044410566205e-06, + "loss": 0.78980076, + "num_input_tokens_seen": 201068445, + "step": 9336, + "time_per_iteration": 2.5070266723632812 + }, + { + "auxiliary_loss_clip": 0.01101448, + "auxiliary_loss_mlp": 0.01032146, + "balance_loss_clip": 1.03978574, + "balance_loss_mlp": 1.01964128, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.2375545258636174, + "language_loss": 0.64473438, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66607034, + "num_input_tokens_seen": 201082140, + "step": 9337, + "time_per_iteration": 2.4856998920440674 + }, + { + "auxiliary_loss_clip": 0.01024051, + "auxiliary_loss_mlp": 0.01001836, + "balance_loss_clip": 1.01985121, + "balance_loss_mlp": 1.00033951, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.8868348179993262, + "language_loss": 0.62574863, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64600754, + "num_input_tokens_seen": 201137245, + "step": 9338, + "time_per_iteration": 3.0840704441070557 + }, + { + "auxiliary_loss_clip": 0.01082776, + "auxiliary_loss_mlp": 0.01031496, + "balance_loss_clip": 1.03797936, + "balance_loss_mlp": 1.01868713, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 3.279197656943634, + "language_loss": 0.65949774, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.68064046, + "num_input_tokens_seen": 201157270, + "step": 9339, + "time_per_iteration": 2.6490259170532227 + }, + { + "auxiliary_loss_clip": 0.01098018, + "auxiliary_loss_mlp": 0.01039473, + "balance_loss_clip": 1.03838491, + "balance_loss_mlp": 1.02504897, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 2.303453153399956, + "language_loss": 0.70590496, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.7272799, + "num_input_tokens_seen": 201174530, + "step": 9340, + "time_per_iteration": 3.883044958114624 + }, + { + "auxiliary_loss_clip": 0.01071681, + "auxiliary_loss_mlp": 0.01028288, + "balance_loss_clip": 1.04391074, + "balance_loss_mlp": 1.01590264, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.638559826412057, + "language_loss": 0.76829135, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.78929102, + "num_input_tokens_seen": 201194905, + "step": 9341, + "time_per_iteration": 2.601888656616211 + }, + { + "auxiliary_loss_clip": 0.01064228, + "auxiliary_loss_mlp": 0.01031881, + "balance_loss_clip": 1.0363977, + "balance_loss_mlp": 1.01804662, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.6571228345986255, + "language_loss": 0.79375285, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81471395, + "num_input_tokens_seen": 201213715, + "step": 9342, + "time_per_iteration": 2.568110704421997 + }, + { + "auxiliary_loss_clip": 0.01085196, + "auxiliary_loss_mlp": 0.01030522, + "balance_loss_clip": 1.03859007, + "balance_loss_mlp": 1.01674139, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 2.0344669972784053, + "language_loss": 0.76482189, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78597909, + "num_input_tokens_seen": 201231415, + "step": 9343, + "time_per_iteration": 2.5452048778533936 + }, + { + "auxiliary_loss_clip": 0.0107111, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.04025972, + "balance_loss_mlp": 1.02182865, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 1.8572183807417713, + "language_loss": 0.68752396, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.70858657, + "num_input_tokens_seen": 201249625, + "step": 9344, + "time_per_iteration": 2.564502239227295 + }, + { + "auxiliary_loss_clip": 0.01113618, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.03929281, + "balance_loss_mlp": 1.02095318, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.100795304990162, + "language_loss": 0.66206849, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68355346, + "num_input_tokens_seen": 201271205, + "step": 9345, + "time_per_iteration": 2.5180368423461914 + }, + { + "auxiliary_loss_clip": 0.01092563, + "auxiliary_loss_mlp": 0.01030571, + "balance_loss_clip": 1.04067087, + "balance_loss_mlp": 1.0179702, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 1.9679433082372297, + "language_loss": 0.87439847, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89562976, + "num_input_tokens_seen": 201287700, + "step": 9346, + "time_per_iteration": 2.476707935333252 + }, + { + "auxiliary_loss_clip": 0.01093545, + "auxiliary_loss_mlp": 0.01031666, + "balance_loss_clip": 1.03859186, + "balance_loss_mlp": 1.01818347, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.215882990486965, + "language_loss": 0.5927698, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.6140219, + "num_input_tokens_seen": 201307530, + "step": 9347, + "time_per_iteration": 3.933993101119995 + }, + { + "auxiliary_loss_clip": 0.01104372, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.03979254, + "balance_loss_mlp": 1.01818812, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.304931331570922, + "language_loss": 0.69189036, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.71325308, + "num_input_tokens_seen": 201326210, + "step": 9348, + "time_per_iteration": 2.451066493988037 + }, + { + "auxiliary_loss_clip": 0.01068484, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.04046798, + "balance_loss_mlp": 1.01471531, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 1.7982387438774174, + "language_loss": 0.79603934, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81701553, + "num_input_tokens_seen": 201346120, + "step": 9349, + "time_per_iteration": 2.5846493244171143 + }, + { + "auxiliary_loss_clip": 0.01062185, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.03984249, + "balance_loss_mlp": 1.01785052, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 2.016058167229315, + "language_loss": 0.66746902, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.68840313, + "num_input_tokens_seen": 201365700, + "step": 9350, + "time_per_iteration": 4.067665338516235 + }, + { + "auxiliary_loss_clip": 0.01073501, + "auxiliary_loss_mlp": 0.01038237, + "balance_loss_clip": 1.04419971, + "balance_loss_mlp": 1.02432501, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 2.0500690679256444, + "language_loss": 0.78735316, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80847055, + "num_input_tokens_seen": 201382795, + "step": 9351, + "time_per_iteration": 2.5577759742736816 + }, + { + "auxiliary_loss_clip": 0.01093227, + "auxiliary_loss_mlp": 0.00789144, + "balance_loss_clip": 1.0379926, + "balance_loss_mlp": 1.01164389, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.6073788900393462, + "language_loss": 0.59213662, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.61096025, + "num_input_tokens_seen": 201402780, + "step": 9352, + "time_per_iteration": 2.5616302490234375 + }, + { + "auxiliary_loss_clip": 0.01099843, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.03955722, + "balance_loss_mlp": 1.01988459, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.3064064455625075, + "language_loss": 0.72121274, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.74253297, + "num_input_tokens_seen": 201424140, + "step": 9353, + "time_per_iteration": 2.549245834350586 + }, + { + "auxiliary_loss_clip": 0.01091485, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.03886986, + "balance_loss_mlp": 1.01443863, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 2.5594044568081027, + "language_loss": 0.76142114, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.78261262, + "num_input_tokens_seen": 201439645, + "step": 9354, + "time_per_iteration": 3.862574338912964 + }, + { + "auxiliary_loss_clip": 0.01082287, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.03830123, + "balance_loss_mlp": 1.02163625, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 1.9631790305060648, + "language_loss": 0.72727287, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.74843955, + "num_input_tokens_seen": 201459970, + "step": 9355, + "time_per_iteration": 2.565553903579712 + }, + { + "auxiliary_loss_clip": 0.01100663, + "auxiliary_loss_mlp": 0.01030477, + "balance_loss_clip": 1.04190707, + "balance_loss_mlp": 1.01770973, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 1.5587871446400752, + "language_loss": 0.73628783, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75759917, + "num_input_tokens_seen": 201480055, + "step": 9356, + "time_per_iteration": 2.501264810562134 + }, + { + "auxiliary_loss_clip": 0.01111598, + "auxiliary_loss_mlp": 0.01034447, + "balance_loss_clip": 1.03874242, + "balance_loss_mlp": 1.02170348, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 1.807961351399199, + "language_loss": 0.83234429, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85380483, + "num_input_tokens_seen": 201497645, + "step": 9357, + "time_per_iteration": 2.454166889190674 + }, + { + "auxiliary_loss_clip": 0.01100978, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.03882813, + "balance_loss_mlp": 1.01958418, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 1.9660649183852326, + "language_loss": 0.72328264, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74461091, + "num_input_tokens_seen": 201515455, + "step": 9358, + "time_per_iteration": 2.4750874042510986 + }, + { + "auxiliary_loss_clip": 0.01110108, + "auxiliary_loss_mlp": 0.01041243, + "balance_loss_clip": 1.03749764, + "balance_loss_mlp": 1.02810621, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 1.6024787990662106, + "language_loss": 0.77394766, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79546118, + "num_input_tokens_seen": 201534500, + "step": 9359, + "time_per_iteration": 2.4744019508361816 + }, + { + "auxiliary_loss_clip": 0.01090111, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.03858745, + "balance_loss_mlp": 1.0175674, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 1.9846689235623096, + "language_loss": 0.70311201, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.72430944, + "num_input_tokens_seen": 201553280, + "step": 9360, + "time_per_iteration": 2.5659847259521484 + }, + { + "auxiliary_loss_clip": 0.01006198, + "auxiliary_loss_mlp": 0.01004226, + "balance_loss_clip": 1.02138138, + "balance_loss_mlp": 1.00301039, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7739893344841449, + "language_loss": 0.55600226, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57610649, + "num_input_tokens_seen": 201610030, + "step": 9361, + "time_per_iteration": 3.1143572330474854 + }, + { + "auxiliary_loss_clip": 0.01088486, + "auxiliary_loss_mlp": 0.01032795, + "balance_loss_clip": 1.04013407, + "balance_loss_mlp": 1.02079654, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.488539213613743, + "language_loss": 0.81743741, + "learning_rate": 1.691036046141018e-06, + "loss": 0.83865023, + "num_input_tokens_seen": 201628370, + "step": 9362, + "time_per_iteration": 2.5324044227600098 + }, + { + "auxiliary_loss_clip": 0.01077023, + "auxiliary_loss_mlp": 0.00784802, + "balance_loss_clip": 1.03751123, + "balance_loss_mlp": 1.00905442, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.6304881807604725, + "language_loss": 0.74809235, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.76671058, + "num_input_tokens_seen": 201649790, + "step": 9363, + "time_per_iteration": 2.703882932662964 + }, + { + "auxiliary_loss_clip": 0.01103976, + "auxiliary_loss_mlp": 0.01033823, + "balance_loss_clip": 1.03822982, + "balance_loss_mlp": 1.02025056, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.710527994379688, + "language_loss": 0.83184212, + "learning_rate": 1.690266496731839e-06, + "loss": 0.85322011, + "num_input_tokens_seen": 201669175, + "step": 9364, + "time_per_iteration": 2.554075002670288 + }, + { + "auxiliary_loss_clip": 0.01079624, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.03799689, + "balance_loss_mlp": 1.02106071, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.0800502094097832, + "language_loss": 0.64866316, + "learning_rate": 1.689881739637642e-06, + "loss": 0.66978908, + "num_input_tokens_seen": 201687000, + "step": 9365, + "time_per_iteration": 2.525122880935669 + }, + { + "auxiliary_loss_clip": 0.01091095, + "auxiliary_loss_mlp": 0.01034414, + "balance_loss_clip": 1.03925359, + "balance_loss_mlp": 1.02105093, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 3.453108107726314, + "language_loss": 0.81807137, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.8393265, + "num_input_tokens_seen": 201703335, + "step": 9366, + "time_per_iteration": 2.5021493434906006 + }, + { + "auxiliary_loss_clip": 0.01109566, + "auxiliary_loss_mlp": 0.01027982, + "balance_loss_clip": 1.03839612, + "balance_loss_mlp": 1.01622832, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.5515847825215352, + "language_loss": 0.73240149, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75377691, + "num_input_tokens_seen": 201723495, + "step": 9367, + "time_per_iteration": 2.487976312637329 + }, + { + "auxiliary_loss_clip": 0.01021211, + "auxiliary_loss_mlp": 0.01007049, + "balance_loss_clip": 1.01759744, + "balance_loss_mlp": 1.00563645, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6263592657517756, + "language_loss": 0.53490776, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55519038, + "num_input_tokens_seen": 201792615, + "step": 9368, + "time_per_iteration": 3.229602813720703 + }, + { + "auxiliary_loss_clip": 0.01113177, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.04071581, + "balance_loss_mlp": 1.02124834, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.8234014189515582, + "language_loss": 0.69244248, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71391279, + "num_input_tokens_seen": 201812520, + "step": 9369, + "time_per_iteration": 2.480339765548706 + }, + { + "auxiliary_loss_clip": 0.01075581, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.0343945, + "balance_loss_mlp": 1.02131617, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 1.825745429638711, + "language_loss": 0.76096964, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.78206879, + "num_input_tokens_seen": 201834185, + "step": 9370, + "time_per_iteration": 2.6140341758728027 + }, + { + "auxiliary_loss_clip": 0.01095187, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.03973377, + "balance_loss_mlp": 1.01877999, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 2.5909808535340755, + "language_loss": 0.75996423, + "learning_rate": 1.687573444537108e-06, + "loss": 0.78124309, + "num_input_tokens_seen": 201851305, + "step": 9371, + "time_per_iteration": 2.530202865600586 + }, + { + "auxiliary_loss_clip": 0.01098329, + "auxiliary_loss_mlp": 0.01034136, + "balance_loss_clip": 1.03762794, + "balance_loss_mlp": 1.0225842, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 1.8071363711463713, + "language_loss": 0.76317263, + "learning_rate": 1.687188770067285e-06, + "loss": 0.78449726, + "num_input_tokens_seen": 201870350, + "step": 9372, + "time_per_iteration": 2.4666340351104736 + }, + { + "auxiliary_loss_clip": 0.01084013, + "auxiliary_loss_mlp": 0.01029356, + "balance_loss_clip": 1.03692687, + "balance_loss_mlp": 1.01659465, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 2.2339747876177327, + "language_loss": 0.72041327, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.74154699, + "num_input_tokens_seen": 201886800, + "step": 9373, + "time_per_iteration": 2.500312566757202 + }, + { + "auxiliary_loss_clip": 0.01079994, + "auxiliary_loss_mlp": 0.01029511, + "balance_loss_clip": 1.03900743, + "balance_loss_mlp": 1.01548052, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.169217842665235, + "language_loss": 0.82631791, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.847413, + "num_input_tokens_seen": 201904730, + "step": 9374, + "time_per_iteration": 2.559629201889038 + }, + { + "auxiliary_loss_clip": 0.01096293, + "auxiliary_loss_mlp": 0.01026069, + "balance_loss_clip": 1.03486478, + "balance_loss_mlp": 1.01389194, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.8846048261226038, + "language_loss": 0.66332394, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68454754, + "num_input_tokens_seen": 201924850, + "step": 9375, + "time_per_iteration": 2.5452280044555664 + }, + { + "auxiliary_loss_clip": 0.01076355, + "auxiliary_loss_mlp": 0.00785803, + "balance_loss_clip": 1.03947616, + "balance_loss_mlp": 1.01079333, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 2.3234475342398073, + "language_loss": 0.81036025, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.82898188, + "num_input_tokens_seen": 201939500, + "step": 9376, + "time_per_iteration": 2.535027265548706 + }, + { + "auxiliary_loss_clip": 0.01092978, + "auxiliary_loss_mlp": 0.0103248, + "balance_loss_clip": 1.03780413, + "balance_loss_mlp": 1.01926529, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 2.890655240608413, + "language_loss": 0.69575053, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71700513, + "num_input_tokens_seen": 201963000, + "step": 9377, + "time_per_iteration": 2.7293825149536133 + }, + { + "auxiliary_loss_clip": 0.01074432, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.03781152, + "balance_loss_mlp": 1.01694465, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.3939822536844346, + "language_loss": 0.74550712, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76654267, + "num_input_tokens_seen": 201983145, + "step": 9378, + "time_per_iteration": 3.932915687561035 + }, + { + "auxiliary_loss_clip": 0.01116313, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.03789353, + "balance_loss_mlp": 1.01710629, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 2.634038597833133, + "language_loss": 0.81880987, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84027803, + "num_input_tokens_seen": 202000335, + "step": 9379, + "time_per_iteration": 2.470479726791382 + }, + { + "auxiliary_loss_clip": 0.01087478, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.03610086, + "balance_loss_mlp": 1.01983869, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 2.004599446517043, + "language_loss": 0.72453254, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.74573326, + "num_input_tokens_seen": 202018275, + "step": 9380, + "time_per_iteration": 2.5774669647216797 + }, + { + "auxiliary_loss_clip": 0.01076245, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.03908956, + "balance_loss_mlp": 1.01909125, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 2.1803062428701487, + "language_loss": 0.74085033, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.76194501, + "num_input_tokens_seen": 202034330, + "step": 9381, + "time_per_iteration": 2.563936948776245 + }, + { + "auxiliary_loss_clip": 0.01063693, + "auxiliary_loss_mlp": 0.0103206, + "balance_loss_clip": 1.03514504, + "balance_loss_mlp": 1.01887584, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 1.9192443092313876, + "language_loss": 0.71914607, + "learning_rate": 1.683342680176499e-06, + "loss": 0.7401036, + "num_input_tokens_seen": 202053100, + "step": 9382, + "time_per_iteration": 2.5952281951904297 + }, + { + "auxiliary_loss_clip": 0.0104088, + "auxiliary_loss_mlp": 0.01003718, + "balance_loss_clip": 1.01596165, + "balance_loss_mlp": 1.00258541, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7348883059868159, + "language_loss": 0.5439325, + "learning_rate": 1.682958136989022e-06, + "loss": 0.5643785, + "num_input_tokens_seen": 202120125, + "step": 9383, + "time_per_iteration": 3.1972410678863525 + }, + { + "auxiliary_loss_clip": 0.01106318, + "auxiliary_loss_mlp": 0.01025779, + "balance_loss_clip": 1.03903937, + "balance_loss_mlp": 1.01225507, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 1.8086907017111076, + "language_loss": 0.70529813, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.72661906, + "num_input_tokens_seen": 202138030, + "step": 9384, + "time_per_iteration": 2.4926223754882812 + }, + { + "auxiliary_loss_clip": 0.01088484, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.03737712, + "balance_loss_mlp": 1.01758409, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 1.9390452495247288, + "language_loss": 0.75992256, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78111982, + "num_input_tokens_seen": 202155580, + "step": 9385, + "time_per_iteration": 2.538809061050415 + }, + { + "auxiliary_loss_clip": 0.01097528, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.03587413, + "balance_loss_mlp": 1.01850152, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 1.8727301872221094, + "language_loss": 0.82192922, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84321809, + "num_input_tokens_seen": 202170365, + "step": 9386, + "time_per_iteration": 3.8266384601593018 + }, + { + "auxiliary_loss_clip": 0.01109576, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.04208469, + "balance_loss_mlp": 1.01995897, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 1.8946487539682784, + "language_loss": 0.69898617, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72041881, + "num_input_tokens_seen": 202189095, + "step": 9387, + "time_per_iteration": 2.4678027629852295 + }, + { + "auxiliary_loss_clip": 0.01103875, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.0389564, + "balance_loss_mlp": 1.01866698, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.6670007568389555, + "language_loss": 0.74526334, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76661396, + "num_input_tokens_seen": 202213500, + "step": 9388, + "time_per_iteration": 2.626058578491211 + }, + { + "auxiliary_loss_clip": 0.01096248, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.03724921, + "balance_loss_mlp": 1.0195055, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.6060694701436016, + "language_loss": 0.82418311, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.84545338, + "num_input_tokens_seen": 202231920, + "step": 9389, + "time_per_iteration": 3.9387266635894775 + }, + { + "auxiliary_loss_clip": 0.01078195, + "auxiliary_loss_mlp": 0.01031208, + "balance_loss_clip": 1.03638661, + "balance_loss_mlp": 1.01712966, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 2.1402179283191947, + "language_loss": 0.63467348, + "learning_rate": 1.680266672116467e-06, + "loss": 0.6557675, + "num_input_tokens_seen": 202247600, + "step": 9390, + "time_per_iteration": 2.530947685241699 + }, + { + "auxiliary_loss_clip": 0.01090565, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.04131854, + "balance_loss_mlp": 1.01739788, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.698357523672355, + "language_loss": 0.92235613, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94354439, + "num_input_tokens_seen": 202265350, + "step": 9391, + "time_per_iteration": 2.5351412296295166 + }, + { + "auxiliary_loss_clip": 0.0110622, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.03977036, + "balance_loss_mlp": 1.02008104, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 2.506781699755024, + "language_loss": 0.59922493, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62062299, + "num_input_tokens_seen": 202284285, + "step": 9392, + "time_per_iteration": 2.5862443447113037 + }, + { + "auxiliary_loss_clip": 0.01066851, + "auxiliary_loss_mlp": 0.01028316, + "balance_loss_clip": 1.03738284, + "balance_loss_mlp": 1.01454687, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 2.235141862550795, + "language_loss": 0.8123157, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83326733, + "num_input_tokens_seen": 202303450, + "step": 9393, + "time_per_iteration": 3.9696593284606934 + }, + { + "auxiliary_loss_clip": 0.0108975, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.03925169, + "balance_loss_mlp": 1.01563156, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.6813817087844667, + "language_loss": 0.87495577, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.8961333, + "num_input_tokens_seen": 202322315, + "step": 9394, + "time_per_iteration": 2.5262956619262695 + }, + { + "auxiliary_loss_clip": 0.01101296, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.03990197, + "balance_loss_mlp": 1.01884079, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 1.8154715138474475, + "language_loss": 0.84764075, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.8689642, + "num_input_tokens_seen": 202339905, + "step": 9395, + "time_per_iteration": 2.4802451133728027 + }, + { + "auxiliary_loss_clip": 0.01033372, + "auxiliary_loss_mlp": 0.01006163, + "balance_loss_clip": 1.0189352, + "balance_loss_mlp": 1.00503635, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.8120434171049438, + "language_loss": 0.58315897, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60355425, + "num_input_tokens_seen": 202397320, + "step": 9396, + "time_per_iteration": 3.101322889328003 + }, + { + "auxiliary_loss_clip": 0.01092862, + "auxiliary_loss_mlp": 0.01027955, + "balance_loss_clip": 1.03923392, + "balance_loss_mlp": 1.01577806, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 2.1540038890366477, + "language_loss": 0.69734752, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.71855569, + "num_input_tokens_seen": 202416865, + "step": 9397, + "time_per_iteration": 2.5832560062408447 + }, + { + "auxiliary_loss_clip": 0.01080149, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.04243374, + "balance_loss_mlp": 1.02063322, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 1.888326422181968, + "language_loss": 0.67385888, + "learning_rate": 1.67719144001275e-06, + "loss": 0.69498515, + "num_input_tokens_seen": 202436210, + "step": 9398, + "time_per_iteration": 2.598712205886841 + }, + { + "auxiliary_loss_clip": 0.01025956, + "auxiliary_loss_mlp": 0.01000848, + "balance_loss_clip": 1.01994824, + "balance_loss_mlp": 0.99962026, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.8333174124160037, + "language_loss": 0.58152932, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60179734, + "num_input_tokens_seen": 202492925, + "step": 9399, + "time_per_iteration": 3.1023948192596436 + }, + { + "auxiliary_loss_clip": 0.01069043, + "auxiliary_loss_mlp": 0.01036029, + "balance_loss_clip": 1.03508413, + "balance_loss_mlp": 1.0211997, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 1.8590439226234374, + "language_loss": 0.73216635, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.7532171, + "num_input_tokens_seen": 202511905, + "step": 9400, + "time_per_iteration": 2.5995612144470215 + }, + { + "auxiliary_loss_clip": 0.01085246, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.0402267, + "balance_loss_mlp": 1.01838279, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 1.9318558765516585, + "language_loss": 0.61068463, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63186181, + "num_input_tokens_seen": 202529815, + "step": 9401, + "time_per_iteration": 2.5785653591156006 + }, + { + "auxiliary_loss_clip": 0.01080643, + "auxiliary_loss_mlp": 0.01028243, + "balance_loss_clip": 1.03759658, + "balance_loss_mlp": 1.01608348, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 1.8467456792970585, + "language_loss": 0.81327617, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83436501, + "num_input_tokens_seen": 202547710, + "step": 9402, + "time_per_iteration": 2.533280849456787 + }, + { + "auxiliary_loss_clip": 0.010595, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_clip": 1.03430557, + "balance_loss_mlp": 1.02911067, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.4848761393433714, + "language_loss": 0.77594614, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.7969678, + "num_input_tokens_seen": 202568835, + "step": 9403, + "time_per_iteration": 2.6822876930236816 + }, + { + "auxiliary_loss_clip": 0.01063241, + "auxiliary_loss_mlp": 0.0104845, + "balance_loss_clip": 1.03883731, + "balance_loss_mlp": 1.03394222, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.64258243294836, + "language_loss": 0.69148064, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.71259749, + "num_input_tokens_seen": 202587385, + "step": 9404, + "time_per_iteration": 2.5738272666931152 + }, + { + "auxiliary_loss_clip": 0.01083386, + "auxiliary_loss_mlp": 0.01030198, + "balance_loss_clip": 1.0379405, + "balance_loss_mlp": 1.01867676, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 1.7369339932280703, + "language_loss": 0.6721428, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.69327861, + "num_input_tokens_seen": 202604815, + "step": 9405, + "time_per_iteration": 2.515717029571533 + }, + { + "auxiliary_loss_clip": 0.01081215, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.03832436, + "balance_loss_mlp": 1.01945019, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.7241675671362502, + "language_loss": 0.7436446, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76477093, + "num_input_tokens_seen": 202623775, + "step": 9406, + "time_per_iteration": 2.5901730060577393 + }, + { + "auxiliary_loss_clip": 0.01062365, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.03849387, + "balance_loss_mlp": 1.02903008, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 1.7474346239927856, + "language_loss": 0.79793251, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81899858, + "num_input_tokens_seen": 202643375, + "step": 9407, + "time_per_iteration": 2.654787063598633 + }, + { + "auxiliary_loss_clip": 0.01073196, + "auxiliary_loss_mlp": 0.01037575, + "balance_loss_clip": 1.03745532, + "balance_loss_mlp": 1.0241226, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.5671701454727633, + "language_loss": 0.70797014, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.72907782, + "num_input_tokens_seen": 202668400, + "step": 9408, + "time_per_iteration": 2.706003189086914 + }, + { + "auxiliary_loss_clip": 0.01063863, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.04368174, + "balance_loss_mlp": 1.02167296, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 2.1246550367021406, + "language_loss": 0.81068408, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83166522, + "num_input_tokens_seen": 202685125, + "step": 9409, + "time_per_iteration": 2.608349084854126 + }, + { + "auxiliary_loss_clip": 0.01075747, + "auxiliary_loss_mlp": 0.01029962, + "balance_loss_clip": 1.03635073, + "balance_loss_mlp": 1.0173254, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.5779337799422444, + "language_loss": 0.7848115, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80586857, + "num_input_tokens_seen": 202703830, + "step": 9410, + "time_per_iteration": 2.580579996109009 + }, + { + "auxiliary_loss_clip": 0.01110768, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.03760624, + "balance_loss_mlp": 1.02123713, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 2.361251989439924, + "language_loss": 0.83583331, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85727394, + "num_input_tokens_seen": 202719835, + "step": 9411, + "time_per_iteration": 2.4839742183685303 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01029918, + "balance_loss_clip": 1.0400666, + "balance_loss_mlp": 1.01675749, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 3.0903679575153706, + "language_loss": 0.67093891, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69229603, + "num_input_tokens_seen": 202736795, + "step": 9412, + "time_per_iteration": 2.4792423248291016 + }, + { + "auxiliary_loss_clip": 0.01097257, + "auxiliary_loss_mlp": 0.010279, + "balance_loss_clip": 1.03848267, + "balance_loss_mlp": 1.01712906, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 1.583659610135019, + "language_loss": 0.58127111, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60252273, + "num_input_tokens_seen": 202756900, + "step": 9413, + "time_per_iteration": 2.5727343559265137 + }, + { + "auxiliary_loss_clip": 0.01032565, + "auxiliary_loss_mlp": 0.01035711, + "balance_loss_clip": 1.03311539, + "balance_loss_mlp": 1.02232337, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.724829667822143, + "language_loss": 0.69248837, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71317112, + "num_input_tokens_seen": 202775145, + "step": 9414, + "time_per_iteration": 2.668618679046631 + }, + { + "auxiliary_loss_clip": 0.01039584, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.03917158, + "balance_loss_mlp": 1.01689112, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 1.4720773893730983, + "language_loss": 0.78094149, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80162078, + "num_input_tokens_seen": 202794505, + "step": 9415, + "time_per_iteration": 2.933361291885376 + }, + { + "auxiliary_loss_clip": 0.01030256, + "auxiliary_loss_mlp": 0.01002366, + "balance_loss_clip": 1.02621818, + "balance_loss_mlp": 1.00137091, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.6851654552910538, + "language_loss": 0.49239028, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51271647, + "num_input_tokens_seen": 202858580, + "step": 9416, + "time_per_iteration": 3.2647109031677246 + }, + { + "auxiliary_loss_clip": 0.01100318, + "auxiliary_loss_mlp": 0.00785275, + "balance_loss_clip": 1.03867745, + "balance_loss_mlp": 1.00980496, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 1.781036512704545, + "language_loss": 0.62796152, + "learning_rate": 1.6698909172706e-06, + "loss": 0.64681745, + "num_input_tokens_seen": 202878565, + "step": 9417, + "time_per_iteration": 4.056067705154419 + }, + { + "auxiliary_loss_clip": 0.0109187, + "auxiliary_loss_mlp": 0.01026912, + "balance_loss_clip": 1.03865957, + "balance_loss_mlp": 1.0144608, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.816116606688781, + "language_loss": 0.69057071, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71175849, + "num_input_tokens_seen": 202897350, + "step": 9418, + "time_per_iteration": 2.544419050216675 + }, + { + "auxiliary_loss_clip": 0.01098202, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.03552067, + "balance_loss_mlp": 1.01804137, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 2.353023749423087, + "language_loss": 0.6496402, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.67094249, + "num_input_tokens_seen": 202916745, + "step": 9419, + "time_per_iteration": 2.54282546043396 + }, + { + "auxiliary_loss_clip": 0.00983641, + "auxiliary_loss_mlp": 0.010054, + "balance_loss_clip": 1.02711225, + "balance_loss_mlp": 1.00399327, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7393120236411663, + "language_loss": 0.59719312, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61708355, + "num_input_tokens_seen": 202982375, + "step": 9420, + "time_per_iteration": 3.463653564453125 + }, + { + "auxiliary_loss_clip": 0.01088325, + "auxiliary_loss_mlp": 0.00784966, + "balance_loss_clip": 1.03516269, + "balance_loss_mlp": 1.00989616, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.8085022441401146, + "language_loss": 0.74569303, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76442587, + "num_input_tokens_seen": 203002430, + "step": 9421, + "time_per_iteration": 2.815002679824829 + }, + { + "auxiliary_loss_clip": 0.01078119, + "auxiliary_loss_mlp": 0.01033662, + "balance_loss_clip": 1.03704166, + "balance_loss_mlp": 1.02069223, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 2.096999602224946, + "language_loss": 0.72561389, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.74673176, + "num_input_tokens_seen": 203019425, + "step": 9422, + "time_per_iteration": 2.505920171737671 + }, + { + "auxiliary_loss_clip": 0.01097897, + "auxiliary_loss_mlp": 0.01035055, + "balance_loss_clip": 1.03904164, + "balance_loss_mlp": 1.02351534, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 2.0138486613147832, + "language_loss": 0.81824577, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.83957529, + "num_input_tokens_seen": 203039035, + "step": 9423, + "time_per_iteration": 2.5557520389556885 + }, + { + "auxiliary_loss_clip": 0.01090302, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.03805995, + "balance_loss_mlp": 1.02187681, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 1.529964361287081, + "language_loss": 0.80773771, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82898235, + "num_input_tokens_seen": 203059320, + "step": 9424, + "time_per_iteration": 4.007587432861328 + }, + { + "auxiliary_loss_clip": 0.01116572, + "auxiliary_loss_mlp": 0.00787162, + "balance_loss_clip": 1.04077363, + "balance_loss_mlp": 1.01129794, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 1.8408710483586221, + "language_loss": 0.78992331, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80896062, + "num_input_tokens_seen": 203078490, + "step": 9425, + "time_per_iteration": 2.5645346641540527 + }, + { + "auxiliary_loss_clip": 0.01084399, + "auxiliary_loss_mlp": 0.01033393, + "balance_loss_clip": 1.04023814, + "balance_loss_mlp": 1.02083397, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 1.8594613756821663, + "language_loss": 0.59248328, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61366117, + "num_input_tokens_seen": 203096065, + "step": 9426, + "time_per_iteration": 2.4931955337524414 + }, + { + "auxiliary_loss_clip": 0.01104383, + "auxiliary_loss_mlp": 0.01031612, + "balance_loss_clip": 1.03974771, + "balance_loss_mlp": 1.01938677, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 1.7592573971758756, + "language_loss": 0.82018334, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.84154332, + "num_input_tokens_seen": 203115270, + "step": 9427, + "time_per_iteration": 4.71933126449585 + }, + { + "auxiliary_loss_clip": 0.01110346, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.04027009, + "balance_loss_mlp": 1.01978755, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 2.569835256152958, + "language_loss": 0.86509335, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88651848, + "num_input_tokens_seen": 203134290, + "step": 9428, + "time_per_iteration": 2.5907247066497803 + }, + { + "auxiliary_loss_clip": 0.01093258, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.03980136, + "balance_loss_mlp": 1.02258348, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 2.1620256535281994, + "language_loss": 0.73413855, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.75543004, + "num_input_tokens_seen": 203152935, + "step": 9429, + "time_per_iteration": 2.6047463417053223 + }, + { + "auxiliary_loss_clip": 0.01091, + "auxiliary_loss_mlp": 0.00786316, + "balance_loss_clip": 1.0375818, + "balance_loss_mlp": 1.00933361, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 1.6780703664676782, + "language_loss": 0.7547344, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77350754, + "num_input_tokens_seen": 203170110, + "step": 9430, + "time_per_iteration": 2.5320253372192383 + }, + { + "auxiliary_loss_clip": 0.01112957, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.03937054, + "balance_loss_mlp": 1.01711845, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 2.0890869452883902, + "language_loss": 0.73027909, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.75170767, + "num_input_tokens_seen": 203188825, + "step": 9431, + "time_per_iteration": 3.854848861694336 + }, + { + "auxiliary_loss_clip": 0.01062869, + "auxiliary_loss_mlp": 0.01026345, + "balance_loss_clip": 1.03701806, + "balance_loss_mlp": 1.01512742, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.710588049403222, + "language_loss": 0.73312503, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75401717, + "num_input_tokens_seen": 203206860, + "step": 9432, + "time_per_iteration": 2.565703868865967 + }, + { + "auxiliary_loss_clip": 0.0106692, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.03643513, + "balance_loss_mlp": 1.01936889, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.504580750906571, + "language_loss": 0.77700514, + "learning_rate": 1.663746609539197e-06, + "loss": 0.79798996, + "num_input_tokens_seen": 203225625, + "step": 9433, + "time_per_iteration": 2.612353563308716 + }, + { + "auxiliary_loss_clip": 0.01116609, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.03993678, + "balance_loss_mlp": 1.02098393, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 2.2646590216597997, + "language_loss": 0.63676155, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65828598, + "num_input_tokens_seen": 203242920, + "step": 9434, + "time_per_iteration": 2.4881749153137207 + }, + { + "auxiliary_loss_clip": 0.01098462, + "auxiliary_loss_mlp": 0.0102465, + "balance_loss_clip": 1.03732324, + "balance_loss_mlp": 1.01260996, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 1.6937135340170024, + "language_loss": 0.66725165, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68848276, + "num_input_tokens_seen": 203261995, + "step": 9435, + "time_per_iteration": 2.563514471054077 + }, + { + "auxiliary_loss_clip": 0.01085727, + "auxiliary_loss_mlp": 0.00784651, + "balance_loss_clip": 1.0356406, + "balance_loss_mlp": 1.00972307, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.5282927868031748, + "language_loss": 0.71763206, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73633587, + "num_input_tokens_seen": 203280670, + "step": 9436, + "time_per_iteration": 2.5691051483154297 + }, + { + "auxiliary_loss_clip": 0.01113266, + "auxiliary_loss_mlp": 0.01029383, + "balance_loss_clip": 1.03857851, + "balance_loss_mlp": 1.01644313, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 1.5229693757185925, + "language_loss": 0.74188656, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76331306, + "num_input_tokens_seen": 203304800, + "step": 9437, + "time_per_iteration": 2.5707600116729736 + }, + { + "auxiliary_loss_clip": 0.01106628, + "auxiliary_loss_mlp": 0.01031466, + "balance_loss_clip": 1.04255581, + "balance_loss_mlp": 1.01792395, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 1.7392912999870531, + "language_loss": 0.61368614, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63506705, + "num_input_tokens_seen": 203324060, + "step": 9438, + "time_per_iteration": 2.5404555797576904 + }, + { + "auxiliary_loss_clip": 0.01091258, + "auxiliary_loss_mlp": 0.0102871, + "balance_loss_clip": 1.03732967, + "balance_loss_mlp": 1.01585364, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.5045269495473483, + "language_loss": 0.75023627, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77143598, + "num_input_tokens_seen": 203344360, + "step": 9439, + "time_per_iteration": 2.584634304046631 + }, + { + "auxiliary_loss_clip": 0.01090335, + "auxiliary_loss_mlp": 0.01029376, + "balance_loss_clip": 1.04464197, + "balance_loss_mlp": 1.01566684, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 1.9179427770924722, + "language_loss": 0.84025669, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.86145377, + "num_input_tokens_seen": 203362115, + "step": 9440, + "time_per_iteration": 2.508071184158325 + }, + { + "auxiliary_loss_clip": 0.01081371, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.03790474, + "balance_loss_mlp": 1.01960206, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 2.0220374187629355, + "language_loss": 0.74992871, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77107644, + "num_input_tokens_seen": 203380550, + "step": 9441, + "time_per_iteration": 2.5473110675811768 + }, + { + "auxiliary_loss_clip": 0.01063378, + "auxiliary_loss_mlp": 0.01037608, + "balance_loss_clip": 1.03607392, + "balance_loss_mlp": 1.02425611, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 1.7791149116592928, + "language_loss": 0.83313572, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85414553, + "num_input_tokens_seen": 203396590, + "step": 9442, + "time_per_iteration": 2.543396472930908 + }, + { + "auxiliary_loss_clip": 0.01076146, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.03780532, + "balance_loss_mlp": 1.01625848, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 1.827091324933523, + "language_loss": 0.74132037, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76236618, + "num_input_tokens_seen": 203414280, + "step": 9443, + "time_per_iteration": 2.5442094802856445 + }, + { + "auxiliary_loss_clip": 0.01092193, + "auxiliary_loss_mlp": 0.01030371, + "balance_loss_clip": 1.03975224, + "balance_loss_mlp": 1.01769304, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 2.0645873240618826, + "language_loss": 0.77802598, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79925168, + "num_input_tokens_seen": 203433280, + "step": 9444, + "time_per_iteration": 2.4990310668945312 + }, + { + "auxiliary_loss_clip": 0.01074801, + "auxiliary_loss_mlp": 0.0103886, + "balance_loss_clip": 1.03873658, + "balance_loss_mlp": 1.02617657, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.7144500893703216, + "language_loss": 0.80599928, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.82713592, + "num_input_tokens_seen": 203449935, + "step": 9445, + "time_per_iteration": 2.556428909301758 + }, + { + "auxiliary_loss_clip": 0.01109444, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.03605199, + "balance_loss_mlp": 1.01514792, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.7663567069093988, + "language_loss": 0.71003497, + "learning_rate": 1.658756760280259e-06, + "loss": 0.73140597, + "num_input_tokens_seen": 203473025, + "step": 9446, + "time_per_iteration": 2.5397725105285645 + }, + { + "auxiliary_loss_clip": 0.0107626, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.03566146, + "balance_loss_mlp": 1.01879573, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 2.467073147527505, + "language_loss": 0.73228145, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75336379, + "num_input_tokens_seen": 203492895, + "step": 9447, + "time_per_iteration": 2.5875515937805176 + }, + { + "auxiliary_loss_clip": 0.01090615, + "auxiliary_loss_mlp": 0.01033431, + "balance_loss_clip": 1.03765309, + "balance_loss_mlp": 1.02071118, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 1.9268829056879138, + "language_loss": 0.74979532, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77103579, + "num_input_tokens_seen": 203513710, + "step": 9448, + "time_per_iteration": 2.565929889678955 + }, + { + "auxiliary_loss_clip": 0.01070782, + "auxiliary_loss_mlp": 0.01044614, + "balance_loss_clip": 1.03884685, + "balance_loss_mlp": 1.02953434, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 2.486389283656631, + "language_loss": 0.760584, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78173792, + "num_input_tokens_seen": 203531630, + "step": 9449, + "time_per_iteration": 2.626469135284424 + }, + { + "auxiliary_loss_clip": 0.0108577, + "auxiliary_loss_mlp": 0.01039751, + "balance_loss_clip": 1.03516948, + "balance_loss_mlp": 1.02529049, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.632940063822298, + "language_loss": 0.75036335, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.77161849, + "num_input_tokens_seen": 203551885, + "step": 9450, + "time_per_iteration": 2.5862135887145996 + }, + { + "auxiliary_loss_clip": 0.01095114, + "auxiliary_loss_mlp": 0.01034721, + "balance_loss_clip": 1.03866935, + "balance_loss_mlp": 1.02158999, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 1.7075379180463586, + "language_loss": 0.66782826, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.68912661, + "num_input_tokens_seen": 203572250, + "step": 9451, + "time_per_iteration": 2.552676200866699 + }, + { + "auxiliary_loss_clip": 0.01091081, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.03900123, + "balance_loss_mlp": 1.01767564, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 2.591651548254325, + "language_loss": 0.71823847, + "learning_rate": 1.656454488573026e-06, + "loss": 0.73947865, + "num_input_tokens_seen": 203590605, + "step": 9452, + "time_per_iteration": 2.5411341190338135 + }, + { + "auxiliary_loss_clip": 0.0106842, + "auxiliary_loss_mlp": 0.01031153, + "balance_loss_clip": 1.03632891, + "balance_loss_mlp": 1.01897609, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.408418980924042, + "language_loss": 0.70049727, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72149301, + "num_input_tokens_seen": 203610080, + "step": 9453, + "time_per_iteration": 2.568737506866455 + }, + { + "auxiliary_loss_clip": 0.01070748, + "auxiliary_loss_mlp": 0.00784549, + "balance_loss_clip": 1.0384481, + "balance_loss_mlp": 1.01025033, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 1.6873294828023975, + "language_loss": 0.70155478, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.72010773, + "num_input_tokens_seen": 203630060, + "step": 9454, + "time_per_iteration": 2.6179285049438477 + }, + { + "auxiliary_loss_clip": 0.01087093, + "auxiliary_loss_mlp": 0.01031171, + "balance_loss_clip": 1.03575301, + "balance_loss_mlp": 1.01926827, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 2.127529145871682, + "language_loss": 0.60131872, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.62250137, + "num_input_tokens_seen": 203649065, + "step": 9455, + "time_per_iteration": 2.5475995540618896 + }, + { + "auxiliary_loss_clip": 0.01072543, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.04012346, + "balance_loss_mlp": 1.01646531, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 1.8054933074336006, + "language_loss": 0.73351681, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75453651, + "num_input_tokens_seen": 203667545, + "step": 9456, + "time_per_iteration": 3.944758892059326 + }, + { + "auxiliary_loss_clip": 0.01090795, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.03727818, + "balance_loss_mlp": 1.0184269, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.5783313473980736, + "language_loss": 0.7643187, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.78552735, + "num_input_tokens_seen": 203686025, + "step": 9457, + "time_per_iteration": 2.5339467525482178 + }, + { + "auxiliary_loss_clip": 0.01101802, + "auxiliary_loss_mlp": 0.01033221, + "balance_loss_clip": 1.037799, + "balance_loss_mlp": 1.01950622, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 1.7771198828565364, + "language_loss": 0.66468215, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.6860323, + "num_input_tokens_seen": 203705540, + "step": 9458, + "time_per_iteration": 2.5632290840148926 + }, + { + "auxiliary_loss_clip": 0.01101051, + "auxiliary_loss_mlp": 0.01027337, + "balance_loss_clip": 1.03715098, + "balance_loss_mlp": 1.01404488, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.090457539303114, + "language_loss": 0.68473279, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.70601666, + "num_input_tokens_seen": 203723670, + "step": 9459, + "time_per_iteration": 2.5082497596740723 + }, + { + "auxiliary_loss_clip": 0.01084902, + "auxiliary_loss_mlp": 0.01031621, + "balance_loss_clip": 1.03917813, + "balance_loss_mlp": 1.01847839, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 2.1690679099230734, + "language_loss": 0.76732177, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.78848696, + "num_input_tokens_seen": 203739705, + "step": 9460, + "time_per_iteration": 2.5181405544281006 + }, + { + "auxiliary_loss_clip": 0.01055513, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.03686357, + "balance_loss_mlp": 1.01860452, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.673635462474603, + "language_loss": 0.71674144, + "learning_rate": 1.65300196133547e-06, + "loss": 0.73760849, + "num_input_tokens_seen": 203759000, + "step": 9461, + "time_per_iteration": 2.676396608352661 + }, + { + "auxiliary_loss_clip": 0.01100836, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.03674638, + "balance_loss_mlp": 1.01722503, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 2.1587466833064477, + "language_loss": 0.73147863, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.752792, + "num_input_tokens_seen": 203774295, + "step": 9462, + "time_per_iteration": 2.464184284210205 + }, + { + "auxiliary_loss_clip": 0.01098623, + "auxiliary_loss_mlp": 0.01027506, + "balance_loss_clip": 1.03695846, + "balance_loss_mlp": 1.0157994, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 2.0342704236367255, + "language_loss": 0.72991014, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.75117147, + "num_input_tokens_seen": 203792710, + "step": 9463, + "time_per_iteration": 3.860083818435669 + }, + { + "auxiliary_loss_clip": 0.01099876, + "auxiliary_loss_mlp": 0.01032294, + "balance_loss_clip": 1.03736341, + "balance_loss_mlp": 1.01937807, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 1.7213839344008175, + "language_loss": 0.7408691, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76219082, + "num_input_tokens_seen": 203811645, + "step": 9464, + "time_per_iteration": 2.469008207321167 + }, + { + "auxiliary_loss_clip": 0.01103381, + "auxiliary_loss_mlp": 0.00786281, + "balance_loss_clip": 1.03821933, + "balance_loss_mlp": 1.00935578, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 1.6464597927038038, + "language_loss": 0.84073055, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.85962713, + "num_input_tokens_seen": 203830040, + "step": 9465, + "time_per_iteration": 2.521355390548706 + }, + { + "auxiliary_loss_clip": 0.0108818, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.03558838, + "balance_loss_mlp": 1.01678944, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 1.61043418137058, + "language_loss": 0.72300458, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74417818, + "num_input_tokens_seen": 203851245, + "step": 9466, + "time_per_iteration": 3.9728055000305176 + }, + { + "auxiliary_loss_clip": 0.01020244, + "auxiliary_loss_mlp": 0.01007428, + "balance_loss_clip": 1.02555883, + "balance_loss_mlp": 1.0062356, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7186943324151508, + "language_loss": 0.55366957, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57394636, + "num_input_tokens_seen": 203916400, + "step": 9467, + "time_per_iteration": 3.2471742630004883 + }, + { + "auxiliary_loss_clip": 0.01096585, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.03627002, + "balance_loss_mlp": 1.01959753, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 2.083688115989741, + "language_loss": 0.63707232, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.65837789, + "num_input_tokens_seen": 203935870, + "step": 9468, + "time_per_iteration": 2.5051703453063965 + }, + { + "auxiliary_loss_clip": 0.0106662, + "auxiliary_loss_mlp": 0.01034962, + "balance_loss_clip": 1.04002345, + "balance_loss_mlp": 1.0203706, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 1.7097172990780025, + "language_loss": 0.79330862, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81432438, + "num_input_tokens_seen": 203954950, + "step": 9469, + "time_per_iteration": 2.6063969135284424 + }, + { + "auxiliary_loss_clip": 0.01074256, + "auxiliary_loss_mlp": 0.01041037, + "balance_loss_clip": 1.03514552, + "balance_loss_mlp": 1.02636826, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 2.0801613930733276, + "language_loss": 0.69057661, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71172953, + "num_input_tokens_seen": 203972715, + "step": 9470, + "time_per_iteration": 3.8974034786224365 + }, + { + "auxiliary_loss_clip": 0.01087557, + "auxiliary_loss_mlp": 0.01033091, + "balance_loss_clip": 1.04060674, + "balance_loss_mlp": 1.01995432, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.6127834131511065, + "language_loss": 0.74745941, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.76866591, + "num_input_tokens_seen": 203990775, + "step": 9471, + "time_per_iteration": 2.530219078063965 + }, + { + "auxiliary_loss_clip": 0.01069863, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_clip": 1.03857756, + "balance_loss_mlp": 1.02144027, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.8550542983181562, + "language_loss": 0.57301068, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59406447, + "num_input_tokens_seen": 204008845, + "step": 9472, + "time_per_iteration": 2.570781946182251 + }, + { + "auxiliary_loss_clip": 0.01074903, + "auxiliary_loss_mlp": 0.01027963, + "balance_loss_clip": 1.03609252, + "balance_loss_mlp": 1.01506495, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 1.796439276664439, + "language_loss": 0.73667431, + "learning_rate": 1.648400251450638e-06, + "loss": 0.75770295, + "num_input_tokens_seen": 204023755, + "step": 9473, + "time_per_iteration": 2.5423378944396973 + }, + { + "auxiliary_loss_clip": 0.01020307, + "auxiliary_loss_mlp": 0.01005212, + "balance_loss_clip": 1.02489138, + "balance_loss_mlp": 1.00396013, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6567724433936057, + "language_loss": 0.57614493, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59640014, + "num_input_tokens_seen": 204091255, + "step": 9474, + "time_per_iteration": 3.2049617767333984 + }, + { + "auxiliary_loss_clip": 0.01098285, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.03776395, + "balance_loss_mlp": 1.01587653, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 1.7969436915155912, + "language_loss": 0.53792381, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.5592038, + "num_input_tokens_seen": 204113285, + "step": 9475, + "time_per_iteration": 2.6284215450286865 + }, + { + "auxiliary_loss_clip": 0.01113868, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.03971708, + "balance_loss_mlp": 1.02190375, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.5480221995339087, + "language_loss": 0.79586828, + "learning_rate": 1.647250122983675e-06, + "loss": 0.81735909, + "num_input_tokens_seen": 204133045, + "step": 9476, + "time_per_iteration": 2.482653856277466 + }, + { + "auxiliary_loss_clip": 0.01092047, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.04360485, + "balance_loss_mlp": 1.02069688, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 2.3836782465357693, + "language_loss": 0.66246665, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.68372118, + "num_input_tokens_seen": 204152590, + "step": 9477, + "time_per_iteration": 2.5630757808685303 + }, + { + "auxiliary_loss_clip": 0.01078296, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.03743505, + "balance_loss_mlp": 1.01606703, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.714710410538919, + "language_loss": 0.70881665, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.72989029, + "num_input_tokens_seen": 204171815, + "step": 9478, + "time_per_iteration": 2.5945138931274414 + }, + { + "auxiliary_loss_clip": 0.01079669, + "auxiliary_loss_mlp": 0.01024578, + "balance_loss_clip": 1.03807688, + "balance_loss_mlp": 1.01318753, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 1.566459101634958, + "language_loss": 0.69044161, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71148407, + "num_input_tokens_seen": 204188535, + "step": 9479, + "time_per_iteration": 2.5524508953094482 + }, + { + "auxiliary_loss_clip": 0.01069883, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.03769422, + "balance_loss_mlp": 1.01777482, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.385054220675366, + "language_loss": 0.71479565, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.73579121, + "num_input_tokens_seen": 204208365, + "step": 9480, + "time_per_iteration": 2.5460689067840576 + }, + { + "auxiliary_loss_clip": 0.01082401, + "auxiliary_loss_mlp": 0.00785777, + "balance_loss_clip": 1.03889263, + "balance_loss_mlp": 1.00795007, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 4.027757456211928, + "language_loss": 0.72264338, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.74132514, + "num_input_tokens_seen": 204226560, + "step": 9481, + "time_per_iteration": 2.559252977371216 + }, + { + "auxiliary_loss_clip": 0.01102826, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.0395689, + "balance_loss_mlp": 1.02197456, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 1.6869627984187903, + "language_loss": 0.78536201, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80674189, + "num_input_tokens_seen": 204245410, + "step": 9482, + "time_per_iteration": 2.4823200702667236 + }, + { + "auxiliary_loss_clip": 0.01087469, + "auxiliary_loss_mlp": 0.01028214, + "balance_loss_clip": 1.03766799, + "balance_loss_mlp": 1.01589954, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 1.5037004707024952, + "language_loss": 0.77988756, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.8010444, + "num_input_tokens_seen": 204264840, + "step": 9483, + "time_per_iteration": 2.5463674068450928 + }, + { + "auxiliary_loss_clip": 0.01088439, + "auxiliary_loss_mlp": 0.01035682, + "balance_loss_clip": 1.03827941, + "balance_loss_mlp": 1.023278, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 1.6172058110188359, + "language_loss": 0.80973971, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.8309809, + "num_input_tokens_seen": 204284335, + "step": 9484, + "time_per_iteration": 2.5354485511779785 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.00786989, + "balance_loss_clip": 1.03757882, + "balance_loss_mlp": 1.0116365, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 2.8551639768353785, + "language_loss": 0.60914063, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.62811947, + "num_input_tokens_seen": 204302590, + "step": 9485, + "time_per_iteration": 2.5341286659240723 + }, + { + "auxiliary_loss_clip": 0.01099886, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.0382452, + "balance_loss_mlp": 1.01859605, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 2.1961525566407714, + "language_loss": 0.65175825, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67307377, + "num_input_tokens_seen": 204323055, + "step": 9486, + "time_per_iteration": 2.505262851715088 + }, + { + "auxiliary_loss_clip": 0.01029934, + "auxiliary_loss_mlp": 0.01020796, + "balance_loss_clip": 1.02298546, + "balance_loss_mlp": 1.01963377, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6696989982409408, + "language_loss": 0.47989193, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50039923, + "num_input_tokens_seen": 204386160, + "step": 9487, + "time_per_iteration": 3.252068519592285 + }, + { + "auxiliary_loss_clip": 0.01076471, + "auxiliary_loss_mlp": 0.00785945, + "balance_loss_clip": 1.0373621, + "balance_loss_mlp": 1.01003265, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 1.7061662664711827, + "language_loss": 0.8588841, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.87750828, + "num_input_tokens_seen": 204406315, + "step": 9488, + "time_per_iteration": 2.575137138366699 + }, + { + "auxiliary_loss_clip": 0.01077628, + "auxiliary_loss_mlp": 0.01034673, + "balance_loss_clip": 1.0379169, + "balance_loss_mlp": 1.02144718, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 9.053896645043254, + "language_loss": 0.79399848, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81512153, + "num_input_tokens_seen": 204427645, + "step": 9489, + "time_per_iteration": 2.613002061843872 + }, + { + "auxiliary_loss_clip": 0.01090082, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.03820372, + "balance_loss_mlp": 1.01949286, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.800161456174711, + "language_loss": 0.7021488, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72336054, + "num_input_tokens_seen": 204445910, + "step": 9490, + "time_per_iteration": 2.5316083431243896 + }, + { + "auxiliary_loss_clip": 0.01078642, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.0384903, + "balance_loss_mlp": 1.02069473, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.5502703082637892, + "language_loss": 0.76322162, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78433859, + "num_input_tokens_seen": 204464680, + "step": 9491, + "time_per_iteration": 2.5991523265838623 + }, + { + "auxiliary_loss_clip": 0.01016325, + "auxiliary_loss_mlp": 0.00798094, + "balance_loss_clip": 1.0230701, + "balance_loss_mlp": 1.06960046, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.8034113437131128, + "language_loss": 0.5734486, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59159279, + "num_input_tokens_seen": 204525580, + "step": 9492, + "time_per_iteration": 3.1516993045806885 + }, + { + "auxiliary_loss_clip": 0.01086539, + "auxiliary_loss_mlp": 0.00786372, + "balance_loss_clip": 1.04094994, + "balance_loss_mlp": 1.012802, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 1.8700851173636142, + "language_loss": 0.71845615, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73718536, + "num_input_tokens_seen": 204541320, + "step": 9493, + "time_per_iteration": 2.5684752464294434 + }, + { + "auxiliary_loss_clip": 0.01115669, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.04044378, + "balance_loss_mlp": 1.01914227, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.7727857290087572, + "language_loss": 0.78454, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.80601609, + "num_input_tokens_seen": 204560275, + "step": 9494, + "time_per_iteration": 3.8410916328430176 + }, + { + "auxiliary_loss_clip": 0.011148, + "auxiliary_loss_mlp": 0.01038202, + "balance_loss_clip": 1.03880918, + "balance_loss_mlp": 1.02446353, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.579301509376585, + "language_loss": 0.80801511, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82954514, + "num_input_tokens_seen": 204579430, + "step": 9495, + "time_per_iteration": 2.513277053833008 + }, + { + "auxiliary_loss_clip": 0.01079157, + "auxiliary_loss_mlp": 0.0104062, + "balance_loss_clip": 1.038077, + "balance_loss_mlp": 1.02490783, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 2.14819061834764, + "language_loss": 0.65971398, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68091172, + "num_input_tokens_seen": 204597710, + "step": 9496, + "time_per_iteration": 2.5825417041778564 + }, + { + "auxiliary_loss_clip": 0.01115401, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.04022908, + "balance_loss_mlp": 1.0235852, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 2.1603963563875297, + "language_loss": 0.69408298, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71560991, + "num_input_tokens_seen": 204616140, + "step": 9497, + "time_per_iteration": 2.456901788711548 + }, + { + "auxiliary_loss_clip": 0.01097515, + "auxiliary_loss_mlp": 0.00797183, + "balance_loss_clip": 1.03938162, + "balance_loss_mlp": 1.02839005, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 1.856306117199049, + "language_loss": 0.8126272, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83157414, + "num_input_tokens_seen": 204636470, + "step": 9498, + "time_per_iteration": 2.519699811935425 + }, + { + "auxiliary_loss_clip": 0.01114113, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.04001558, + "balance_loss_mlp": 1.02022386, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 3.0399086489373386, + "language_loss": 0.6640833, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68556827, + "num_input_tokens_seen": 204656640, + "step": 9499, + "time_per_iteration": 2.4740893840789795 + }, + { + "auxiliary_loss_clip": 0.01083928, + "auxiliary_loss_mlp": 0.01035625, + "balance_loss_clip": 1.04174042, + "balance_loss_mlp": 1.02272081, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 1.6450544273909862, + "language_loss": 0.71376085, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.73495638, + "num_input_tokens_seen": 204675475, + "step": 9500, + "time_per_iteration": 2.5214314460754395 + }, + { + "auxiliary_loss_clip": 0.01086509, + "auxiliary_loss_mlp": 0.01036318, + "balance_loss_clip": 1.03938651, + "balance_loss_mlp": 1.02260911, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 1.7203672939306964, + "language_loss": 0.7567029, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.77793121, + "num_input_tokens_seen": 204695385, + "step": 9501, + "time_per_iteration": 3.9852135181427 + }, + { + "auxiliary_loss_clip": 0.01093509, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.03907716, + "balance_loss_mlp": 1.02219605, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.6642240527826175, + "language_loss": 0.7493223, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.77060366, + "num_input_tokens_seen": 204714730, + "step": 9502, + "time_per_iteration": 2.512223720550537 + }, + { + "auxiliary_loss_clip": 0.01086754, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.04020965, + "balance_loss_mlp": 1.01454008, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 1.818740889834848, + "language_loss": 0.82179761, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84293383, + "num_input_tokens_seen": 204735025, + "step": 9503, + "time_per_iteration": 2.5661613941192627 + }, + { + "auxiliary_loss_clip": 0.01077436, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.03812718, + "balance_loss_mlp": 1.02017009, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.209171273062203, + "language_loss": 0.85906088, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.88015425, + "num_input_tokens_seen": 204751365, + "step": 9504, + "time_per_iteration": 3.9453797340393066 + }, + { + "auxiliary_loss_clip": 0.0107023, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.04096985, + "balance_loss_mlp": 1.01927614, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.7724107493948658, + "language_loss": 0.75532007, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.77634317, + "num_input_tokens_seen": 204768980, + "step": 9505, + "time_per_iteration": 2.5709176063537598 + }, + { + "auxiliary_loss_clip": 0.0111023, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.03836262, + "balance_loss_mlp": 1.02120447, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 1.4318363449775744, + "language_loss": 0.81548148, + "learning_rate": 1.635755524332509e-06, + "loss": 0.83691084, + "num_input_tokens_seen": 204788110, + "step": 9506, + "time_per_iteration": 2.4674127101898193 + }, + { + "auxiliary_loss_clip": 0.01072317, + "auxiliary_loss_mlp": 0.00802353, + "balance_loss_clip": 1.03636277, + "balance_loss_mlp": 1.04051137, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.896509368230024, + "language_loss": 0.77432179, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79306853, + "num_input_tokens_seen": 204807240, + "step": 9507, + "time_per_iteration": 2.5694875717163086 + }, + { + "auxiliary_loss_clip": 0.01092036, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.03982234, + "balance_loss_mlp": 1.02262592, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 1.4251877427544266, + "language_loss": 0.68302393, + "learning_rate": 1.63498965540751e-06, + "loss": 0.70430744, + "num_input_tokens_seen": 204826415, + "step": 9508, + "time_per_iteration": 2.550493001937866 + }, + { + "auxiliary_loss_clip": 0.01111714, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.03777885, + "balance_loss_mlp": 1.01777518, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.013557444144772, + "language_loss": 0.79321647, + "learning_rate": 1.634606741699593e-06, + "loss": 0.81463903, + "num_input_tokens_seen": 204844305, + "step": 9509, + "time_per_iteration": 3.924833059310913 + }, + { + "auxiliary_loss_clip": 0.01094957, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.03725672, + "balance_loss_mlp": 1.02118492, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 1.8201422809928949, + "language_loss": 0.72045708, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74174279, + "num_input_tokens_seen": 204861765, + "step": 9510, + "time_per_iteration": 2.4938831329345703 + }, + { + "auxiliary_loss_clip": 0.01088147, + "auxiliary_loss_mlp": 0.01024815, + "balance_loss_clip": 1.03722477, + "balance_loss_mlp": 1.0132463, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.4977403682916084, + "language_loss": 0.69609344, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71722305, + "num_input_tokens_seen": 204882505, + "step": 9511, + "time_per_iteration": 2.5952227115631104 + }, + { + "auxiliary_loss_clip": 0.01090976, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.04039609, + "balance_loss_mlp": 1.02191842, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 2.1236188820867494, + "language_loss": 0.61620033, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63745046, + "num_input_tokens_seen": 204899830, + "step": 9512, + "time_per_iteration": 2.5052785873413086 + }, + { + "auxiliary_loss_clip": 0.01086958, + "auxiliary_loss_mlp": 0.01025913, + "balance_loss_clip": 1.03767371, + "balance_loss_mlp": 1.01407504, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 2.247684883837382, + "language_loss": 0.7584554, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.77958411, + "num_input_tokens_seen": 204918100, + "step": 9513, + "time_per_iteration": 2.5619709491729736 + }, + { + "auxiliary_loss_clip": 0.01043548, + "auxiliary_loss_mlp": 0.01004773, + "balance_loss_clip": 1.02763939, + "balance_loss_mlp": 1.00355721, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8927797788488366, + "language_loss": 0.66823095, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68871415, + "num_input_tokens_seen": 204972925, + "step": 9514, + "time_per_iteration": 3.097731590270996 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01038213, + "balance_loss_clip": 1.04164267, + "balance_loss_mlp": 1.02489722, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 2.0966821645380884, + "language_loss": 0.81354249, + "learning_rate": 1.63230955093099e-06, + "loss": 0.83499551, + "num_input_tokens_seen": 204990910, + "step": 9515, + "time_per_iteration": 2.524500608444214 + }, + { + "auxiliary_loss_clip": 0.01090439, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.03541255, + "balance_loss_mlp": 1.01575601, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 1.6819670190450924, + "language_loss": 0.8565501, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.8777371, + "num_input_tokens_seen": 205010500, + "step": 9516, + "time_per_iteration": 2.5218660831451416 + }, + { + "auxiliary_loss_clip": 0.01079447, + "auxiliary_loss_mlp": 0.01028215, + "balance_loss_clip": 1.03960609, + "balance_loss_mlp": 1.01551962, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.8982365547288318, + "language_loss": 0.87668842, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.89776504, + "num_input_tokens_seen": 205028560, + "step": 9517, + "time_per_iteration": 2.5268032550811768 + }, + { + "auxiliary_loss_clip": 0.01062084, + "auxiliary_loss_mlp": 0.01026157, + "balance_loss_clip": 1.03693318, + "balance_loss_mlp": 1.01369381, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.7285483319456438, + "language_loss": 0.85271597, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.8735984, + "num_input_tokens_seen": 205048650, + "step": 9518, + "time_per_iteration": 2.62981915473938 + }, + { + "auxiliary_loss_clip": 0.01096559, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.03863716, + "balance_loss_mlp": 1.01575518, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 1.7223778037151694, + "language_loss": 0.78654826, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.80778533, + "num_input_tokens_seen": 205066480, + "step": 9519, + "time_per_iteration": 2.499634265899658 + }, + { + "auxiliary_loss_clip": 0.01108861, + "auxiliary_loss_mlp": 0.01027137, + "balance_loss_clip": 1.03815114, + "balance_loss_mlp": 1.01516831, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.7617794677634286, + "language_loss": 0.82896066, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85032058, + "num_input_tokens_seen": 205087475, + "step": 9520, + "time_per_iteration": 2.5477538108825684 + }, + { + "auxiliary_loss_clip": 0.01091186, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.03956175, + "balance_loss_mlp": 1.02417064, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 2.1708767714932202, + "language_loss": 0.72239113, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74367344, + "num_input_tokens_seen": 205106495, + "step": 9521, + "time_per_iteration": 2.495607852935791 + }, + { + "auxiliary_loss_clip": 0.01109241, + "auxiliary_loss_mlp": 0.00797837, + "balance_loss_clip": 1.03825366, + "balance_loss_mlp": 1.03448629, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 1.8031782177854123, + "language_loss": 0.78143996, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.80051076, + "num_input_tokens_seen": 205128285, + "step": 9522, + "time_per_iteration": 2.54917573928833 + }, + { + "auxiliary_loss_clip": 0.01082542, + "auxiliary_loss_mlp": 0.010259, + "balance_loss_clip": 1.03938484, + "balance_loss_mlp": 1.01508784, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.7727585244931998, + "language_loss": 0.71645367, + "learning_rate": 1.629247411248102e-06, + "loss": 0.7375381, + "num_input_tokens_seen": 205146595, + "step": 9523, + "time_per_iteration": 2.5215957164764404 + }, + { + "auxiliary_loss_clip": 0.01086657, + "auxiliary_loss_mlp": 0.01025677, + "balance_loss_clip": 1.03703606, + "balance_loss_mlp": 1.01430416, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 1.8085329766047524, + "language_loss": 0.70161301, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72273636, + "num_input_tokens_seen": 205164295, + "step": 9524, + "time_per_iteration": 2.532341718673706 + }, + { + "auxiliary_loss_clip": 0.01101731, + "auxiliary_loss_mlp": 0.01026577, + "balance_loss_clip": 1.0407536, + "balance_loss_mlp": 1.01506162, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.6096829344576329, + "language_loss": 0.65060532, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67188841, + "num_input_tokens_seen": 205185380, + "step": 9525, + "time_per_iteration": 2.594330310821533 + }, + { + "auxiliary_loss_clip": 0.01083082, + "auxiliary_loss_mlp": 0.01025402, + "balance_loss_clip": 1.03504348, + "balance_loss_mlp": 1.01423824, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 1.6166338521519648, + "language_loss": 0.72365582, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74474066, + "num_input_tokens_seen": 205204895, + "step": 9526, + "time_per_iteration": 2.5427823066711426 + }, + { + "auxiliary_loss_clip": 0.01097089, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.03804088, + "balance_loss_mlp": 1.01923609, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.6831639797099494, + "language_loss": 0.79868984, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.819969, + "num_input_tokens_seen": 205223440, + "step": 9527, + "time_per_iteration": 2.551806688308716 + }, + { + "auxiliary_loss_clip": 0.01097417, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.03760695, + "balance_loss_mlp": 1.01979947, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 1.6517106805543817, + "language_loss": 0.72146332, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.74276388, + "num_input_tokens_seen": 205242800, + "step": 9528, + "time_per_iteration": 2.492645025253296 + }, + { + "auxiliary_loss_clip": 0.01106807, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.03640521, + "balance_loss_mlp": 1.02024162, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 1.9677750259052675, + "language_loss": 0.86092019, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.88231575, + "num_input_tokens_seen": 205259465, + "step": 9529, + "time_per_iteration": 2.477224826812744 + }, + { + "auxiliary_loss_clip": 0.01033155, + "auxiliary_loss_mlp": 0.01001732, + "balance_loss_clip": 1.02842808, + "balance_loss_mlp": 1.0005753, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7630517016599336, + "language_loss": 0.5611006, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58144951, + "num_input_tokens_seen": 205314100, + "step": 9530, + "time_per_iteration": 3.0021424293518066 + }, + { + "auxiliary_loss_clip": 0.01092804, + "auxiliary_loss_mlp": 0.01027348, + "balance_loss_clip": 1.0413326, + "balance_loss_mlp": 1.01554012, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 1.778605134057174, + "language_loss": 0.66374612, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68494761, + "num_input_tokens_seen": 205333420, + "step": 9531, + "time_per_iteration": 2.5145435333251953 + }, + { + "auxiliary_loss_clip": 0.0109431, + "auxiliary_loss_mlp": 0.01038, + "balance_loss_clip": 1.03529692, + "balance_loss_mlp": 1.02353382, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 2.150210760011325, + "language_loss": 0.75336659, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.77468973, + "num_input_tokens_seen": 205350995, + "step": 9532, + "time_per_iteration": 2.6048686504364014 + }, + { + "auxiliary_loss_clip": 0.01108589, + "auxiliary_loss_mlp": 0.01027962, + "balance_loss_clip": 1.0378356, + "balance_loss_mlp": 1.01571977, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 1.33373848369291, + "language_loss": 0.78671956, + "learning_rate": 1.625421002822686e-06, + "loss": 0.80808502, + "num_input_tokens_seen": 205372675, + "step": 9533, + "time_per_iteration": 3.8980746269226074 + }, + { + "auxiliary_loss_clip": 0.01098039, + "auxiliary_loss_mlp": 0.01026549, + "balance_loss_clip": 1.03936124, + "balance_loss_mlp": 1.01529002, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 1.5860104457102466, + "language_loss": 0.85742164, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87866759, + "num_input_tokens_seen": 205392590, + "step": 9534, + "time_per_iteration": 2.5236244201660156 + }, + { + "auxiliary_loss_clip": 0.01089552, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.03856468, + "balance_loss_mlp": 1.01801145, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 1.6351604502384107, + "language_loss": 0.75038755, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77159131, + "num_input_tokens_seen": 205414885, + "step": 9535, + "time_per_iteration": 2.592090606689453 + }, + { + "auxiliary_loss_clip": 0.01093295, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.03809488, + "balance_loss_mlp": 1.01915288, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.6562353931487452, + "language_loss": 0.70850229, + "learning_rate": 1.624273356614346e-06, + "loss": 0.72975361, + "num_input_tokens_seen": 205434440, + "step": 9536, + "time_per_iteration": 2.567009925842285 + }, + { + "auxiliary_loss_clip": 0.01067973, + "auxiliary_loss_mlp": 0.0103559, + "balance_loss_clip": 1.03442121, + "balance_loss_mlp": 1.02162492, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 1.8241620583052136, + "language_loss": 0.69811648, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.71915209, + "num_input_tokens_seen": 205454225, + "step": 9537, + "time_per_iteration": 2.606750726699829 + }, + { + "auxiliary_loss_clip": 0.01110012, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.03860426, + "balance_loss_mlp": 1.02011752, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.8480729092766628, + "language_loss": 0.62788153, + "learning_rate": 1.623508330355902e-06, + "loss": 0.64930809, + "num_input_tokens_seen": 205474750, + "step": 9538, + "time_per_iteration": 2.5225141048431396 + }, + { + "auxiliary_loss_clip": 0.01097506, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.03817534, + "balance_loss_mlp": 1.02024531, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.7368566504944476, + "language_loss": 0.8321678, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85346937, + "num_input_tokens_seen": 205495495, + "step": 9539, + "time_per_iteration": 2.540273427963257 + }, + { + "auxiliary_loss_clip": 0.01073486, + "auxiliary_loss_mlp": 0.01034016, + "balance_loss_clip": 1.03858471, + "balance_loss_mlp": 1.02130246, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 1.9165004901199791, + "language_loss": 0.73458493, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75565988, + "num_input_tokens_seen": 205510070, + "step": 9540, + "time_per_iteration": 3.978902578353882 + }, + { + "auxiliary_loss_clip": 0.01094561, + "auxiliary_loss_mlp": 0.00791357, + "balance_loss_clip": 1.03992331, + "balance_loss_mlp": 1.02316213, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 1.8703763574794876, + "language_loss": 0.80427146, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82313067, + "num_input_tokens_seen": 205530190, + "step": 9541, + "time_per_iteration": 2.5595176219940186 + }, + { + "auxiliary_loss_clip": 0.01091424, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.03888714, + "balance_loss_mlp": 1.02083206, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.510515481482136, + "language_loss": 0.64581394, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.66706622, + "num_input_tokens_seen": 205547380, + "step": 9542, + "time_per_iteration": 2.511321783065796 + }, + { + "auxiliary_loss_clip": 0.01087974, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.03642023, + "balance_loss_mlp": 1.01929522, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 2.3422373565128427, + "language_loss": 0.82524121, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.84643137, + "num_input_tokens_seen": 205566540, + "step": 9543, + "time_per_iteration": 3.890683174133301 + }, + { + "auxiliary_loss_clip": 0.01074786, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.04066706, + "balance_loss_mlp": 1.01816535, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 2.0063288820610983, + "language_loss": 0.73534238, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75641048, + "num_input_tokens_seen": 205584200, + "step": 9544, + "time_per_iteration": 2.5836920738220215 + }, + { + "auxiliary_loss_clip": 0.01060159, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.03603303, + "balance_loss_mlp": 1.02042317, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 1.611398487346533, + "language_loss": 0.76189983, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78284693, + "num_input_tokens_seen": 205604675, + "step": 9545, + "time_per_iteration": 2.6100571155548096 + }, + { + "auxiliary_loss_clip": 0.0108825, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.0384413, + "balance_loss_mlp": 1.0210638, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 2.1403588151374495, + "language_loss": 0.56709427, + "learning_rate": 1.620448797546459e-06, + "loss": 0.58831292, + "num_input_tokens_seen": 205624680, + "step": 9546, + "time_per_iteration": 2.593029260635376 + }, + { + "auxiliary_loss_clip": 0.01083136, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.034724, + "balance_loss_mlp": 1.02137363, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.1381204111938703, + "language_loss": 0.76421195, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78538418, + "num_input_tokens_seen": 205641950, + "step": 9547, + "time_per_iteration": 2.507086992263794 + }, + { + "auxiliary_loss_clip": 0.01097711, + "auxiliary_loss_mlp": 0.01034414, + "balance_loss_clip": 1.03584099, + "balance_loss_mlp": 1.02099085, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 1.896861626753832, + "language_loss": 0.7394681, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76078933, + "num_input_tokens_seen": 205660130, + "step": 9548, + "time_per_iteration": 3.8896377086639404 + }, + { + "auxiliary_loss_clip": 0.01080759, + "auxiliary_loss_mlp": 0.01040283, + "balance_loss_clip": 1.03564858, + "balance_loss_mlp": 1.02707505, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 3.1162462266455933, + "language_loss": 0.69195485, + "learning_rate": 1.619301709822355e-06, + "loss": 0.71316522, + "num_input_tokens_seen": 205678895, + "step": 9549, + "time_per_iteration": 2.5695111751556396 + }, + { + "auxiliary_loss_clip": 0.01070365, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.04490864, + "balance_loss_mlp": 1.0167284, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.543047171172197, + "language_loss": 0.79638988, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81737864, + "num_input_tokens_seen": 205698450, + "step": 9550, + "time_per_iteration": 2.617417573928833 + }, + { + "auxiliary_loss_clip": 0.01084074, + "auxiliary_loss_mlp": 0.01040287, + "balance_loss_clip": 1.03985322, + "balance_loss_mlp": 1.02609527, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 1.9132755642380894, + "language_loss": 0.68039703, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70164061, + "num_input_tokens_seen": 205714870, + "step": 9551, + "time_per_iteration": 2.5182394981384277 + }, + { + "auxiliary_loss_clip": 0.01076337, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.04079342, + "balance_loss_mlp": 1.02067876, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 1.6332408902198359, + "language_loss": 0.71926105, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.74035591, + "num_input_tokens_seen": 205736045, + "step": 9552, + "time_per_iteration": 2.6009294986724854 + }, + { + "auxiliary_loss_clip": 0.01096652, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.04038477, + "balance_loss_mlp": 1.01824081, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 2.2882464147097545, + "language_loss": 0.79886734, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82014537, + "num_input_tokens_seen": 205754445, + "step": 9553, + "time_per_iteration": 2.5065300464630127 + }, + { + "auxiliary_loss_clip": 0.01101528, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.03676379, + "balance_loss_mlp": 1.01701379, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 2.191253223324492, + "language_loss": 0.83503735, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.8563509, + "num_input_tokens_seen": 205770595, + "step": 9554, + "time_per_iteration": 2.491607904434204 + }, + { + "auxiliary_loss_clip": 0.01103781, + "auxiliary_loss_mlp": 0.00789142, + "balance_loss_clip": 1.03803802, + "balance_loss_mlp": 1.01378846, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 1.3239887581741785, + "language_loss": 0.70904803, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.72797728, + "num_input_tokens_seen": 205791935, + "step": 9555, + "time_per_iteration": 2.5555660724639893 + }, + { + "auxiliary_loss_clip": 0.0108746, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.03735936, + "balance_loss_mlp": 1.01377928, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.1372378962410665, + "language_loss": 0.72535646, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.74650109, + "num_input_tokens_seen": 205807260, + "step": 9556, + "time_per_iteration": 2.4919373989105225 + }, + { + "auxiliary_loss_clip": 0.01101088, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.03875434, + "balance_loss_mlp": 1.01873934, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 1.5507228973860683, + "language_loss": 0.7394014, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76072729, + "num_input_tokens_seen": 205826885, + "step": 9557, + "time_per_iteration": 2.5309243202209473 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.01033608, + "balance_loss_clip": 1.03812909, + "balance_loss_mlp": 1.0213176, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.511129056785785, + "language_loss": 0.67315263, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.69450295, + "num_input_tokens_seen": 205844630, + "step": 9558, + "time_per_iteration": 2.47845721244812 + }, + { + "auxiliary_loss_clip": 0.01085446, + "auxiliary_loss_mlp": 0.01049259, + "balance_loss_clip": 1.03800106, + "balance_loss_mlp": 1.03150845, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 1.9897538431465565, + "language_loss": 0.70817542, + "learning_rate": 1.615479024621659e-06, + "loss": 0.72952253, + "num_input_tokens_seen": 205860960, + "step": 9559, + "time_per_iteration": 2.483955144882202 + }, + { + "auxiliary_loss_clip": 0.01090683, + "auxiliary_loss_mlp": 0.00786203, + "balance_loss_clip": 1.04041719, + "balance_loss_mlp": 1.01151371, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 1.611522133177141, + "language_loss": 0.7929498, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.8117187, + "num_input_tokens_seen": 205880675, + "step": 9560, + "time_per_iteration": 2.576812267303467 + }, + { + "auxiliary_loss_clip": 0.01045484, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.03717983, + "balance_loss_mlp": 1.01734078, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 3.0295522807320423, + "language_loss": 0.64246464, + "learning_rate": 1.614714662090588e-06, + "loss": 0.66322428, + "num_input_tokens_seen": 205900050, + "step": 9561, + "time_per_iteration": 2.643965244293213 + }, + { + "auxiliary_loss_clip": 0.01109067, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.04039204, + "balance_loss_mlp": 1.02645993, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 1.850556924646519, + "language_loss": 0.71675783, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73825264, + "num_input_tokens_seen": 205918855, + "step": 9562, + "time_per_iteration": 2.50701642036438 + }, + { + "auxiliary_loss_clip": 0.01063053, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.0414021, + "balance_loss_mlp": 1.02338362, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.4968871289394081, + "language_loss": 0.84102875, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86201334, + "num_input_tokens_seen": 205936970, + "step": 9563, + "time_per_iteration": 2.5932610034942627 + }, + { + "auxiliary_loss_clip": 0.01064101, + "auxiliary_loss_mlp": 0.01036531, + "balance_loss_clip": 1.03741908, + "balance_loss_mlp": 1.02323937, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 1.8234061124145455, + "language_loss": 0.56915355, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59015989, + "num_input_tokens_seen": 205954630, + "step": 9564, + "time_per_iteration": 2.6312255859375 + }, + { + "auxiliary_loss_clip": 0.01082275, + "auxiliary_loss_mlp": 0.01036811, + "balance_loss_clip": 1.03487062, + "balance_loss_mlp": 1.02307844, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.762261285838027, + "language_loss": 0.76083028, + "learning_rate": 1.613186112465078e-06, + "loss": 0.78202111, + "num_input_tokens_seen": 205971510, + "step": 9565, + "time_per_iteration": 2.5008556842803955 + }, + { + "auxiliary_loss_clip": 0.01013251, + "auxiliary_loss_mlp": 0.01005743, + "balance_loss_clip": 1.02556431, + "balance_loss_mlp": 1.004354, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7416702879767103, + "language_loss": 0.60841537, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62860537, + "num_input_tokens_seen": 206035125, + "step": 9566, + "time_per_iteration": 3.2971808910369873 + }, + { + "auxiliary_loss_clip": 0.01085744, + "auxiliary_loss_mlp": 0.01030164, + "balance_loss_clip": 1.03921092, + "balance_loss_mlp": 1.01796889, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 1.7064161138435505, + "language_loss": 0.75460857, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77576768, + "num_input_tokens_seen": 206052075, + "step": 9567, + "time_per_iteration": 2.557204008102417 + }, + { + "auxiliary_loss_clip": 0.01099941, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.03807843, + "balance_loss_mlp": 1.01970434, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.4999982983897755, + "language_loss": 0.74601126, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.76733649, + "num_input_tokens_seen": 206069970, + "step": 9568, + "time_per_iteration": 2.516064405441284 + }, + { + "auxiliary_loss_clip": 0.01113469, + "auxiliary_loss_mlp": 0.01030367, + "balance_loss_clip": 1.03943384, + "balance_loss_mlp": 1.01774919, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.6085225091542386, + "language_loss": 0.7107749, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73221332, + "num_input_tokens_seen": 206088950, + "step": 9569, + "time_per_iteration": 2.586909532546997 + }, + { + "auxiliary_loss_clip": 0.0110229, + "auxiliary_loss_mlp": 0.01041371, + "balance_loss_clip": 1.03902614, + "balance_loss_mlp": 1.02807927, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 2.363356065798469, + "language_loss": 0.55784857, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.5792852, + "num_input_tokens_seen": 206107780, + "step": 9570, + "time_per_iteration": 2.4974775314331055 + }, + { + "auxiliary_loss_clip": 0.01110224, + "auxiliary_loss_mlp": 0.01035424, + "balance_loss_clip": 1.03806901, + "balance_loss_mlp": 1.02342534, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.6164740166745846, + "language_loss": 0.64654905, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.66800547, + "num_input_tokens_seen": 206127445, + "step": 9571, + "time_per_iteration": 3.870680093765259 + }, + { + "auxiliary_loss_clip": 0.01102108, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.03762579, + "balance_loss_mlp": 1.01991665, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.5836331463291646, + "language_loss": 0.66833389, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.6896807, + "num_input_tokens_seen": 206152005, + "step": 9572, + "time_per_iteration": 2.7728612422943115 + }, + { + "auxiliary_loss_clip": 0.01088745, + "auxiliary_loss_mlp": 0.01033571, + "balance_loss_clip": 1.03803968, + "balance_loss_mlp": 1.0209825, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 2.519247914246552, + "language_loss": 0.72137564, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74259883, + "num_input_tokens_seen": 206169875, + "step": 9573, + "time_per_iteration": 2.536163806915283 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.03982091, + "balance_loss_mlp": 1.01814842, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.7326197357493094, + "language_loss": 0.76557893, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78695875, + "num_input_tokens_seen": 206192635, + "step": 9574, + "time_per_iteration": 2.5983545780181885 + }, + { + "auxiliary_loss_clip": 0.01061326, + "auxiliary_loss_mlp": 0.0103832, + "balance_loss_clip": 1.03499579, + "balance_loss_mlp": 1.02400911, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 2.77648120227331, + "language_loss": 0.6636247, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.68462121, + "num_input_tokens_seen": 206211485, + "step": 9575, + "time_per_iteration": 2.6172397136688232 + }, + { + "auxiliary_loss_clip": 0.01088042, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.0380342, + "balance_loss_mlp": 1.01893449, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.5558898645892454, + "language_loss": 0.79777193, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.81896049, + "num_input_tokens_seen": 206231740, + "step": 9576, + "time_per_iteration": 2.535367727279663 + }, + { + "auxiliary_loss_clip": 0.01088145, + "auxiliary_loss_mlp": 0.01026765, + "balance_loss_clip": 1.03839636, + "balance_loss_mlp": 1.01539254, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.6535958508242505, + "language_loss": 0.693295, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.7144441, + "num_input_tokens_seen": 206250975, + "step": 9577, + "time_per_iteration": 2.526874303817749 + }, + { + "auxiliary_loss_clip": 0.01102996, + "auxiliary_loss_mlp": 0.01034222, + "balance_loss_clip": 1.03848052, + "balance_loss_mlp": 1.02199745, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 1.6495591472385231, + "language_loss": 0.66861677, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68998897, + "num_input_tokens_seen": 206268800, + "step": 9578, + "time_per_iteration": 2.470482587814331 + }, + { + "auxiliary_loss_clip": 0.01089002, + "auxiliary_loss_mlp": 0.01026933, + "balance_loss_clip": 1.03784275, + "balance_loss_mlp": 1.01522708, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.6553460605654928, + "language_loss": 0.72552121, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.74668056, + "num_input_tokens_seen": 206287190, + "step": 9579, + "time_per_iteration": 3.8875722885131836 + }, + { + "auxiliary_loss_clip": 0.01095066, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.04029238, + "balance_loss_mlp": 1.01738548, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 2.318208160183359, + "language_loss": 0.63953638, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66079837, + "num_input_tokens_seen": 206307020, + "step": 9580, + "time_per_iteration": 2.5736711025238037 + }, + { + "auxiliary_loss_clip": 0.01083778, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.03510988, + "balance_loss_mlp": 1.01860678, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.8295650273476893, + "language_loss": 0.85351741, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87467706, + "num_input_tokens_seen": 206324095, + "step": 9581, + "time_per_iteration": 2.509777784347534 + }, + { + "auxiliary_loss_clip": 0.01119343, + "auxiliary_loss_mlp": 0.01041945, + "balance_loss_clip": 1.04206812, + "balance_loss_mlp": 1.02830756, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 3.2848204941500736, + "language_loss": 0.66850209, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.69011497, + "num_input_tokens_seen": 206343210, + "step": 9582, + "time_per_iteration": 3.871877431869507 + }, + { + "auxiliary_loss_clip": 0.0103245, + "auxiliary_loss_mlp": 0.01003135, + "balance_loss_clip": 1.02614343, + "balance_loss_mlp": 1.00197911, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6714170727724333, + "language_loss": 0.5725106, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59286642, + "num_input_tokens_seen": 206415935, + "step": 9583, + "time_per_iteration": 3.314527988433838 + }, + { + "auxiliary_loss_clip": 0.01085306, + "auxiliary_loss_mlp": 0.01028457, + "balance_loss_clip": 1.03902435, + "balance_loss_mlp": 1.01620865, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 1.7737378394620331, + "language_loss": 0.82577968, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84691727, + "num_input_tokens_seen": 206431900, + "step": 9584, + "time_per_iteration": 2.503521203994751 + }, + { + "auxiliary_loss_clip": 0.01047451, + "auxiliary_loss_mlp": 0.01003377, + "balance_loss_clip": 1.02265346, + "balance_loss_mlp": 1.00222707, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6332332230034082, + "language_loss": 0.49561346, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51612175, + "num_input_tokens_seen": 206501200, + "step": 9585, + "time_per_iteration": 3.142387628555298 + }, + { + "auxiliary_loss_clip": 0.01083443, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.03562474, + "balance_loss_mlp": 1.01515877, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 1.6885465176305137, + "language_loss": 0.84566736, + "learning_rate": 1.605165098835465e-06, + "loss": 0.86676997, + "num_input_tokens_seen": 206520575, + "step": 9586, + "time_per_iteration": 2.5307562351226807 + }, + { + "auxiliary_loss_clip": 0.01098699, + "auxiliary_loss_mlp": 0.01030525, + "balance_loss_clip": 1.04122639, + "balance_loss_mlp": 1.01747131, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 1.7563100313009583, + "language_loss": 0.79709017, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.8183825, + "num_input_tokens_seen": 206538060, + "step": 9587, + "time_per_iteration": 3.8719122409820557 + }, + { + "auxiliary_loss_clip": 0.01080611, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.03551686, + "balance_loss_mlp": 1.0208571, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.803565523749224, + "language_loss": 0.66185069, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68300003, + "num_input_tokens_seen": 206557320, + "step": 9588, + "time_per_iteration": 2.5080082416534424 + }, + { + "auxiliary_loss_clip": 0.01084625, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.03616285, + "balance_loss_mlp": 1.02128196, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 1.9641677735767007, + "language_loss": 0.78744984, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.80864656, + "num_input_tokens_seen": 206575780, + "step": 9589, + "time_per_iteration": 2.5306313037872314 + }, + { + "auxiliary_loss_clip": 0.01106804, + "auxiliary_loss_mlp": 0.01026467, + "balance_loss_clip": 1.03625321, + "balance_loss_mlp": 1.01465368, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 1.9597697625278394, + "language_loss": 0.79119581, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81252855, + "num_input_tokens_seen": 206594100, + "step": 9590, + "time_per_iteration": 2.4647674560546875 + }, + { + "auxiliary_loss_clip": 0.0105259, + "auxiliary_loss_mlp": 0.00784155, + "balance_loss_clip": 1.03951335, + "balance_loss_mlp": 1.0084399, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 1.6967300517918502, + "language_loss": 0.62544715, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.64381462, + "num_input_tokens_seen": 206613325, + "step": 9591, + "time_per_iteration": 2.6545443534851074 + }, + { + "auxiliary_loss_clip": 0.01112376, + "auxiliary_loss_mlp": 0.00789919, + "balance_loss_clip": 1.03913271, + "balance_loss_mlp": 1.01460314, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 1.6436047468632307, + "language_loss": 0.77798533, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.79700828, + "num_input_tokens_seen": 206634265, + "step": 9592, + "time_per_iteration": 2.5054421424865723 + }, + { + "auxiliary_loss_clip": 0.01002907, + "auxiliary_loss_mlp": 0.01009004, + "balance_loss_clip": 1.0270834, + "balance_loss_mlp": 1.00735891, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7332544973540513, + "language_loss": 0.59644496, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.6165641, + "num_input_tokens_seen": 206696990, + "step": 9593, + "time_per_iteration": 3.35587477684021 + }, + { + "auxiliary_loss_clip": 0.01103429, + "auxiliary_loss_mlp": 0.01045106, + "balance_loss_clip": 1.03781521, + "balance_loss_mlp": 1.03147435, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 1.5638496278570684, + "language_loss": 0.71214902, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73363435, + "num_input_tokens_seen": 206717815, + "step": 9594, + "time_per_iteration": 2.58227276802063 + }, + { + "auxiliary_loss_clip": 0.01077126, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.03752565, + "balance_loss_mlp": 1.01966286, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1.7154469790847517, + "language_loss": 0.70838284, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.72946179, + "num_input_tokens_seen": 206735985, + "step": 9595, + "time_per_iteration": 2.5414834022521973 + }, + { + "auxiliary_loss_clip": 0.01111601, + "auxiliary_loss_mlp": 0.01029945, + "balance_loss_clip": 1.03897047, + "balance_loss_mlp": 1.01726127, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.293411051975021, + "language_loss": 0.70005077, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.7214663, + "num_input_tokens_seen": 206753370, + "step": 9596, + "time_per_iteration": 2.4497220516204834 + }, + { + "auxiliary_loss_clip": 0.01090222, + "auxiliary_loss_mlp": 0.01037031, + "balance_loss_clip": 1.04040253, + "balance_loss_mlp": 1.022493, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 4.511767085649352, + "language_loss": 0.67250514, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69377768, + "num_input_tokens_seen": 206777645, + "step": 9597, + "time_per_iteration": 2.679090976715088 + }, + { + "auxiliary_loss_clip": 0.01089452, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.03847456, + "balance_loss_mlp": 1.02099228, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 1.7757863049632339, + "language_loss": 0.81311107, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83433688, + "num_input_tokens_seen": 206794865, + "step": 9598, + "time_per_iteration": 2.546495199203491 + }, + { + "auxiliary_loss_clip": 0.01068511, + "auxiliary_loss_mlp": 0.01044445, + "balance_loss_clip": 1.03362823, + "balance_loss_mlp": 1.02923369, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.4492360448841175, + "language_loss": 0.7266261, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74775565, + "num_input_tokens_seen": 206814095, + "step": 9599, + "time_per_iteration": 2.5570120811462402 + }, + { + "auxiliary_loss_clip": 0.01108364, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.03841829, + "balance_loss_mlp": 1.02113485, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 33.99850999422814, + "language_loss": 0.77838981, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.79980201, + "num_input_tokens_seen": 206832245, + "step": 9600, + "time_per_iteration": 2.451108455657959 + }, + { + "auxiliary_loss_clip": 0.01106277, + "auxiliary_loss_mlp": 0.00787037, + "balance_loss_clip": 1.04156065, + "balance_loss_mlp": 1.00862527, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 3.651324503435479, + "language_loss": 0.72531855, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74425173, + "num_input_tokens_seen": 206851535, + "step": 9601, + "time_per_iteration": 2.5447776317596436 + }, + { + "auxiliary_loss_clip": 0.01054195, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.03777623, + "balance_loss_mlp": 1.02346158, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.9632505663559048, + "language_loss": 0.6870755, + "learning_rate": 1.599058274973348e-06, + "loss": 0.70797926, + "num_input_tokens_seen": 206870595, + "step": 9602, + "time_per_iteration": 2.6276421546936035 + }, + { + "auxiliary_loss_clip": 0.01081105, + "auxiliary_loss_mlp": 0.01039592, + "balance_loss_clip": 1.03620386, + "balance_loss_mlp": 1.02668142, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.456868008043915, + "language_loss": 0.72893697, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75014395, + "num_input_tokens_seen": 206892320, + "step": 9603, + "time_per_iteration": 2.551957607269287 + }, + { + "auxiliary_loss_clip": 0.01098239, + "auxiliary_loss_mlp": 0.01031097, + "balance_loss_clip": 1.03829718, + "balance_loss_mlp": 1.01883674, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.637205433736546, + "language_loss": 0.76486218, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78615558, + "num_input_tokens_seen": 206912485, + "step": 9604, + "time_per_iteration": 2.5517444610595703 + }, + { + "auxiliary_loss_clip": 0.01078623, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.03819728, + "balance_loss_mlp": 1.02109146, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 1.5914744355961994, + "language_loss": 0.83448887, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85562682, + "num_input_tokens_seen": 206929100, + "step": 9605, + "time_per_iteration": 2.5371854305267334 + }, + { + "auxiliary_loss_clip": 0.01087309, + "auxiliary_loss_mlp": 0.01033881, + "balance_loss_clip": 1.04086876, + "balance_loss_mlp": 1.01918876, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.6257056546838746, + "language_loss": 0.78168225, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80289418, + "num_input_tokens_seen": 206947020, + "step": 9606, + "time_per_iteration": 2.572598695755005 + }, + { + "auxiliary_loss_clip": 0.01110487, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.037907, + "balance_loss_mlp": 1.02723527, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.7001614488418964, + "language_loss": 0.73802781, + "learning_rate": 1.597150687927619e-06, + "loss": 0.7595259, + "num_input_tokens_seen": 206964065, + "step": 9607, + "time_per_iteration": 2.438049077987671 + }, + { + "auxiliary_loss_clip": 0.01072783, + "auxiliary_loss_mlp": 0.01034188, + "balance_loss_clip": 1.04144335, + "balance_loss_mlp": 1.02112854, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.621530558822664, + "language_loss": 0.69327891, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71434867, + "num_input_tokens_seen": 206981940, + "step": 9608, + "time_per_iteration": 2.536414384841919 + }, + { + "auxiliary_loss_clip": 0.01079845, + "auxiliary_loss_mlp": 0.01039604, + "balance_loss_clip": 1.03722501, + "balance_loss_mlp": 1.02625895, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 1.9686946279303033, + "language_loss": 0.76697063, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78816509, + "num_input_tokens_seen": 207002365, + "step": 9609, + "time_per_iteration": 2.6088550090789795 + }, + { + "auxiliary_loss_clip": 0.01079729, + "auxiliary_loss_mlp": 0.01033535, + "balance_loss_clip": 1.03666258, + "balance_loss_mlp": 1.02106571, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.6330184431273478, + "language_loss": 0.77313435, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79426706, + "num_input_tokens_seen": 207021195, + "step": 9610, + "time_per_iteration": 3.98441481590271 + }, + { + "auxiliary_loss_clip": 0.01076136, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.03542721, + "balance_loss_mlp": 1.01751316, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.220790911156004, + "language_loss": 0.69114554, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.71221697, + "num_input_tokens_seen": 207037465, + "step": 9611, + "time_per_iteration": 2.5201942920684814 + }, + { + "auxiliary_loss_clip": 0.01097783, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.03599942, + "balance_loss_mlp": 1.01685715, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 1.8832902346325437, + "language_loss": 0.83548117, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.85675824, + "num_input_tokens_seen": 207054230, + "step": 9612, + "time_per_iteration": 2.481104850769043 + }, + { + "auxiliary_loss_clip": 0.01111463, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.03877521, + "balance_loss_mlp": 1.01763272, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.5968871680160088, + "language_loss": 0.79444182, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81586266, + "num_input_tokens_seen": 207073150, + "step": 9613, + "time_per_iteration": 2.4771249294281006 + }, + { + "auxiliary_loss_clip": 0.01096945, + "auxiliary_loss_mlp": 0.01035991, + "balance_loss_clip": 1.03515565, + "balance_loss_mlp": 1.02439821, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.964052428129157, + "language_loss": 0.77522904, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.79655844, + "num_input_tokens_seen": 207090375, + "step": 9614, + "time_per_iteration": 2.4691452980041504 + }, + { + "auxiliary_loss_clip": 0.01078888, + "auxiliary_loss_mlp": 0.01035095, + "balance_loss_clip": 1.03708076, + "balance_loss_mlp": 1.02248836, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.6195108466702175, + "language_loss": 0.80592608, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.82706589, + "num_input_tokens_seen": 207106030, + "step": 9615, + "time_per_iteration": 2.52592396736145 + }, + { + "auxiliary_loss_clip": 0.01098756, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.03605521, + "balance_loss_mlp": 1.01991487, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.62400853154449, + "language_loss": 0.6682651, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.68958116, + "num_input_tokens_seen": 207125435, + "step": 9616, + "time_per_iteration": 2.516204833984375 + }, + { + "auxiliary_loss_clip": 0.01099607, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.03791416, + "balance_loss_mlp": 1.018399, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.6368851459517004, + "language_loss": 0.77510601, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.79640758, + "num_input_tokens_seen": 207145095, + "step": 9617, + "time_per_iteration": 3.8840296268463135 + }, + { + "auxiliary_loss_clip": 0.0108614, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.03626132, + "balance_loss_mlp": 1.0189209, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 1.487659265962285, + "language_loss": 0.74895203, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77013755, + "num_input_tokens_seen": 207166045, + "step": 9618, + "time_per_iteration": 2.6071295738220215 + }, + { + "auxiliary_loss_clip": 0.0111016, + "auxiliary_loss_mlp": 0.01030254, + "balance_loss_clip": 1.03822553, + "balance_loss_mlp": 1.01807642, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 1.5122850014564808, + "language_loss": 0.81224424, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.83364838, + "num_input_tokens_seen": 207185290, + "step": 9619, + "time_per_iteration": 2.4854722023010254 + }, + { + "auxiliary_loss_clip": 0.01090868, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.03743315, + "balance_loss_mlp": 1.01924777, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 1.749753163619696, + "language_loss": 0.72332227, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.74454761, + "num_input_tokens_seen": 207205505, + "step": 9620, + "time_per_iteration": 3.9409501552581787 + }, + { + "auxiliary_loss_clip": 0.01091337, + "auxiliary_loss_mlp": 0.01029432, + "balance_loss_clip": 1.03855944, + "balance_loss_mlp": 1.01671791, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 1.677801573517177, + "language_loss": 0.77153313, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79274088, + "num_input_tokens_seen": 207225315, + "step": 9621, + "time_per_iteration": 2.5292625427246094 + }, + { + "auxiliary_loss_clip": 0.01051488, + "auxiliary_loss_mlp": 0.01040284, + "balance_loss_clip": 1.03520775, + "balance_loss_mlp": 1.02531695, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.4256362301776235, + "language_loss": 0.70454353, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72546124, + "num_input_tokens_seen": 207247690, + "step": 9622, + "time_per_iteration": 2.6778180599212646 + }, + { + "auxiliary_loss_clip": 0.01022642, + "auxiliary_loss_mlp": 0.0101059, + "balance_loss_clip": 1.01941776, + "balance_loss_mlp": 1.00925529, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7716990871375858, + "language_loss": 0.55968612, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.58001846, + "num_input_tokens_seen": 207301735, + "step": 9623, + "time_per_iteration": 3.1689460277557373 + }, + { + "auxiliary_loss_clip": 0.01075807, + "auxiliary_loss_mlp": 0.01035289, + "balance_loss_clip": 1.03954959, + "balance_loss_mlp": 1.0218842, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 1.7579473328603779, + "language_loss": 0.71149564, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73260659, + "num_input_tokens_seen": 207321240, + "step": 9624, + "time_per_iteration": 2.6557812690734863 + }, + { + "auxiliary_loss_clip": 0.01077434, + "auxiliary_loss_mlp": 0.01035714, + "balance_loss_clip": 1.03694963, + "balance_loss_mlp": 1.02247608, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 1.9196384659944488, + "language_loss": 0.82300937, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84414083, + "num_input_tokens_seen": 207339540, + "step": 9625, + "time_per_iteration": 4.006484746932983 + }, + { + "auxiliary_loss_clip": 0.01108705, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.03711653, + "balance_loss_mlp": 1.02035403, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.4302865888315015, + "language_loss": 0.7019484, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72337854, + "num_input_tokens_seen": 207360470, + "step": 9626, + "time_per_iteration": 2.5032944679260254 + }, + { + "auxiliary_loss_clip": 0.01086055, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.03581452, + "balance_loss_mlp": 1.02001071, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.4808204878485407, + "language_loss": 0.71533668, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.73651826, + "num_input_tokens_seen": 207383080, + "step": 9627, + "time_per_iteration": 2.610231399536133 + }, + { + "auxiliary_loss_clip": 0.01098718, + "auxiliary_loss_mlp": 0.01024868, + "balance_loss_clip": 1.03699946, + "balance_loss_mlp": 1.01254725, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 2.0279055591584787, + "language_loss": 0.83884835, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86008424, + "num_input_tokens_seen": 207401000, + "step": 9628, + "time_per_iteration": 2.52591872215271 + }, + { + "auxiliary_loss_clip": 0.01089572, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.03638864, + "balance_loss_mlp": 1.0153017, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.5397687382546326, + "language_loss": 0.72260225, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74378377, + "num_input_tokens_seen": 207419230, + "step": 9629, + "time_per_iteration": 2.559020757675171 + }, + { + "auxiliary_loss_clip": 0.01089128, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.03952777, + "balance_loss_mlp": 1.01937807, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 2.111620354855419, + "language_loss": 0.74529856, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76651585, + "num_input_tokens_seen": 207437615, + "step": 9630, + "time_per_iteration": 2.5147297382354736 + }, + { + "auxiliary_loss_clip": 0.01077298, + "auxiliary_loss_mlp": 0.00787696, + "balance_loss_clip": 1.03749299, + "balance_loss_mlp": 1.01085997, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.570739191210158, + "language_loss": 0.78826874, + "learning_rate": 1.587999618060523e-06, + "loss": 0.80691868, + "num_input_tokens_seen": 207457270, + "step": 9631, + "time_per_iteration": 2.5936620235443115 + }, + { + "auxiliary_loss_clip": 0.01111485, + "auxiliary_loss_mlp": 0.01026315, + "balance_loss_clip": 1.03834999, + "balance_loss_mlp": 1.01352978, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.8068787515011382, + "language_loss": 0.74892652, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.7703045, + "num_input_tokens_seen": 207477890, + "step": 9632, + "time_per_iteration": 2.480193853378296 + }, + { + "auxiliary_loss_clip": 0.01091008, + "auxiliary_loss_mlp": 0.01026111, + "balance_loss_clip": 1.03899837, + "balance_loss_mlp": 1.0124675, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 2.012962602706486, + "language_loss": 0.79316151, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81433266, + "num_input_tokens_seen": 207497670, + "step": 9633, + "time_per_iteration": 2.5568864345550537 + }, + { + "auxiliary_loss_clip": 0.01082563, + "auxiliary_loss_mlp": 0.01037819, + "balance_loss_clip": 1.03884637, + "balance_loss_mlp": 1.0240978, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.7563188477883869, + "language_loss": 0.77693903, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79814291, + "num_input_tokens_seen": 207516105, + "step": 9634, + "time_per_iteration": 2.5786101818084717 + }, + { + "auxiliary_loss_clip": 0.01093814, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.03762794, + "balance_loss_mlp": 1.01805449, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.1912193904873005, + "language_loss": 0.63574183, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.6569941, + "num_input_tokens_seen": 207533685, + "step": 9635, + "time_per_iteration": 2.5347909927368164 + }, + { + "auxiliary_loss_clip": 0.01084408, + "auxiliary_loss_mlp": 0.01034394, + "balance_loss_clip": 1.03752232, + "balance_loss_mlp": 1.02185369, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.416642595016995, + "language_loss": 0.77277547, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79396349, + "num_input_tokens_seen": 207552840, + "step": 9636, + "time_per_iteration": 2.5575127601623535 + }, + { + "auxiliary_loss_clip": 0.0108202, + "auxiliary_loss_mlp": 0.01027875, + "balance_loss_clip": 1.03496385, + "balance_loss_mlp": 1.01582313, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 1.9333052842627727, + "language_loss": 0.68403339, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70513237, + "num_input_tokens_seen": 207572095, + "step": 9637, + "time_per_iteration": 2.542168617248535 + }, + { + "auxiliary_loss_clip": 0.01069495, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.03677177, + "balance_loss_mlp": 1.02080941, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 5.6608969495046, + "language_loss": 0.72325277, + "learning_rate": 1.585332242234043e-06, + "loss": 0.74428618, + "num_input_tokens_seen": 207587495, + "step": 9638, + "time_per_iteration": 2.544402837753296 + }, + { + "auxiliary_loss_clip": 0.01102764, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.04162478, + "balance_loss_mlp": 1.01979721, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 1.6026392766461843, + "language_loss": 0.72239196, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.74373633, + "num_input_tokens_seen": 207606795, + "step": 9639, + "time_per_iteration": 2.4948065280914307 + }, + { + "auxiliary_loss_clip": 0.01088875, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.03790855, + "balance_loss_mlp": 1.01978016, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 9.70793302030323, + "language_loss": 0.69621074, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.71742344, + "num_input_tokens_seen": 207623620, + "step": 9640, + "time_per_iteration": 2.5191168785095215 + }, + { + "auxiliary_loss_clip": 0.01091223, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.03970647, + "balance_loss_mlp": 1.02366829, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 2.4049987717189265, + "language_loss": 0.77881765, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.80011207, + "num_input_tokens_seen": 207639380, + "step": 9641, + "time_per_iteration": 2.5128629207611084 + }, + { + "auxiliary_loss_clip": 0.01111348, + "auxiliary_loss_mlp": 0.01034317, + "balance_loss_clip": 1.03949213, + "balance_loss_mlp": 1.02190733, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.7042051271300673, + "language_loss": 0.73442292, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.75587958, + "num_input_tokens_seen": 207657915, + "step": 9642, + "time_per_iteration": 2.480365753173828 + }, + { + "auxiliary_loss_clip": 0.01092534, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.03840709, + "balance_loss_mlp": 1.01903617, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 3.7456062141671684, + "language_loss": 0.73700696, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75825202, + "num_input_tokens_seen": 207678620, + "step": 9643, + "time_per_iteration": 2.5942113399505615 + }, + { + "auxiliary_loss_clip": 0.0111393, + "auxiliary_loss_mlp": 0.01029845, + "balance_loss_clip": 1.03935742, + "balance_loss_mlp": 1.01707125, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 2.1441141952492466, + "language_loss": 0.67371035, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69514811, + "num_input_tokens_seen": 207696980, + "step": 9644, + "time_per_iteration": 2.4940803050994873 + }, + { + "auxiliary_loss_clip": 0.01107337, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.04122233, + "balance_loss_mlp": 1.01870656, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.1534512485486723, + "language_loss": 0.8538307, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87522262, + "num_input_tokens_seen": 207714065, + "step": 9645, + "time_per_iteration": 2.502521514892578 + }, + { + "auxiliary_loss_clip": 0.01113835, + "auxiliary_loss_mlp": 0.01034153, + "balance_loss_clip": 1.04054999, + "balance_loss_mlp": 1.02183914, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 2.0125740626461313, + "language_loss": 0.75746059, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77894044, + "num_input_tokens_seen": 207734720, + "step": 9646, + "time_per_iteration": 2.490468978881836 + }, + { + "auxiliary_loss_clip": 0.01094095, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.04113126, + "balance_loss_mlp": 1.01970983, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 1.7952208947652493, + "language_loss": 0.59184659, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61312288, + "num_input_tokens_seen": 207755435, + "step": 9647, + "time_per_iteration": 2.6860110759735107 + }, + { + "auxiliary_loss_clip": 0.01070926, + "auxiliary_loss_mlp": 0.01044541, + "balance_loss_clip": 1.03808463, + "balance_loss_mlp": 1.02869225, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.4926907155484304, + "language_loss": 0.84346664, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86462122, + "num_input_tokens_seen": 207773570, + "step": 9648, + "time_per_iteration": 2.5891292095184326 + }, + { + "auxiliary_loss_clip": 0.01033565, + "auxiliary_loss_mlp": 0.01003602, + "balance_loss_clip": 1.018767, + "balance_loss_mlp": 1.00248134, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8368925951623254, + "language_loss": 0.63002115, + "learning_rate": 1.581142210256242e-06, + "loss": 0.65039277, + "num_input_tokens_seen": 207830095, + "step": 9649, + "time_per_iteration": 4.470088958740234 + }, + { + "auxiliary_loss_clip": 0.01072201, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.03441966, + "balance_loss_mlp": 1.02248716, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 1.9888139046756772, + "language_loss": 0.8212713, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84236014, + "num_input_tokens_seen": 207848555, + "step": 9650, + "time_per_iteration": 2.545161247253418 + }, + { + "auxiliary_loss_clip": 0.01082369, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.04259562, + "balance_loss_mlp": 1.0218128, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 2.900755113185745, + "language_loss": 0.77960712, + "learning_rate": 1.580380592177698e-06, + "loss": 0.80077732, + "num_input_tokens_seen": 207867060, + "step": 9651, + "time_per_iteration": 2.577195882797241 + }, + { + "auxiliary_loss_clip": 0.01095074, + "auxiliary_loss_mlp": 0.01040219, + "balance_loss_clip": 1.0403657, + "balance_loss_mlp": 1.02672482, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 2.0172398935791156, + "language_loss": 0.74172199, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76307487, + "num_input_tokens_seen": 207884520, + "step": 9652, + "time_per_iteration": 2.5115127563476562 + }, + { + "auxiliary_loss_clip": 0.01093659, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.03852367, + "balance_loss_mlp": 1.01991069, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 2.1188365206042996, + "language_loss": 0.76770031, + "learning_rate": 1.579619037747193e-06, + "loss": 0.78897142, + "num_input_tokens_seen": 207905370, + "step": 9653, + "time_per_iteration": 2.577439785003662 + }, + { + "auxiliary_loss_clip": 0.01113134, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.03919435, + "balance_loss_mlp": 1.02066982, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 3.1285110799308873, + "language_loss": 0.74182719, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76330525, + "num_input_tokens_seen": 207923790, + "step": 9654, + "time_per_iteration": 2.4604549407958984 + }, + { + "auxiliary_loss_clip": 0.01048238, + "auxiliary_loss_mlp": 0.01034319, + "balance_loss_clip": 1.04043448, + "balance_loss_mlp": 1.0219456, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 1.865068539020379, + "language_loss": 0.70127845, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72210407, + "num_input_tokens_seen": 207942335, + "step": 9655, + "time_per_iteration": 2.6621060371398926 + }, + { + "auxiliary_loss_clip": 0.01116148, + "auxiliary_loss_mlp": 0.01036488, + "balance_loss_clip": 1.03818643, + "balance_loss_mlp": 1.02329707, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 1.9393951827402272, + "language_loss": 0.6920054, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71353173, + "num_input_tokens_seen": 207961975, + "step": 9656, + "time_per_iteration": 3.8509809970855713 + }, + { + "auxiliary_loss_clip": 0.01095543, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.03799713, + "balance_loss_mlp": 1.01908755, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.6292756304836526, + "language_loss": 0.71927881, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.74054724, + "num_input_tokens_seen": 207979520, + "step": 9657, + "time_per_iteration": 2.4866409301757812 + }, + { + "auxiliary_loss_clip": 0.01103426, + "auxiliary_loss_mlp": 0.01033571, + "balance_loss_clip": 1.03936303, + "balance_loss_mlp": 1.0198915, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 1.9859342223916547, + "language_loss": 0.70817888, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.72954881, + "num_input_tokens_seen": 207998375, + "step": 9658, + "time_per_iteration": 2.561589002609253 + }, + { + "auxiliary_loss_clip": 0.01029212, + "auxiliary_loss_mlp": 0.01005754, + "balance_loss_clip": 1.01462007, + "balance_loss_mlp": 1.00457406, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6521085049930855, + "language_loss": 0.5355624, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55591214, + "num_input_tokens_seen": 208060605, + "step": 9659, + "time_per_iteration": 4.5142662525177 + }, + { + "auxiliary_loss_clip": 0.01104818, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.04019094, + "balance_loss_mlp": 1.02672529, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 2.011363366137313, + "language_loss": 0.62232268, + "learning_rate": 1.576954100136366e-06, + "loss": 0.64377308, + "num_input_tokens_seen": 208080320, + "step": 9660, + "time_per_iteration": 2.5697338581085205 + }, + { + "auxiliary_loss_clip": 0.01099887, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.03493285, + "balance_loss_mlp": 1.02226257, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.4287554787905368, + "language_loss": 0.65328401, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67463923, + "num_input_tokens_seen": 208099305, + "step": 9661, + "time_per_iteration": 2.5479750633239746 + }, + { + "auxiliary_loss_clip": 0.01059065, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.03673077, + "balance_loss_mlp": 1.01663911, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.5159816296374204, + "language_loss": 0.74355787, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76442897, + "num_input_tokens_seen": 208116960, + "step": 9662, + "time_per_iteration": 2.6193628311157227 + }, + { + "auxiliary_loss_clip": 0.01039112, + "auxiliary_loss_mlp": 0.00999901, + "balance_loss_clip": 1.01458621, + "balance_loss_mlp": 0.99867862, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8750482968604066, + "language_loss": 0.58327734, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60366744, + "num_input_tokens_seen": 208182190, + "step": 9663, + "time_per_iteration": 3.1766486167907715 + }, + { + "auxiliary_loss_clip": 0.01090779, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.03814459, + "balance_loss_mlp": 1.01643968, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.10974465886977, + "language_loss": 0.81897408, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84016979, + "num_input_tokens_seen": 208197015, + "step": 9664, + "time_per_iteration": 3.943603754043579 + }, + { + "auxiliary_loss_clip": 0.01089299, + "auxiliary_loss_mlp": 0.00787229, + "balance_loss_clip": 1.03435671, + "balance_loss_mlp": 1.00810921, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 1.6760001109891884, + "language_loss": 0.81738728, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.83615255, + "num_input_tokens_seen": 208215795, + "step": 9665, + "time_per_iteration": 2.6040873527526855 + }, + { + "auxiliary_loss_clip": 0.01093465, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.03939962, + "balance_loss_mlp": 1.01868391, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.5774222070234176, + "language_loss": 0.8126744, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83394074, + "num_input_tokens_seen": 208234655, + "step": 9666, + "time_per_iteration": 2.575927734375 + }, + { + "auxiliary_loss_clip": 0.01097827, + "auxiliary_loss_mlp": 0.01036237, + "balance_loss_clip": 1.04124784, + "balance_loss_mlp": 1.02433944, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 2.7661009622485686, + "language_loss": 0.8019889, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82332957, + "num_input_tokens_seen": 208251300, + "step": 9667, + "time_per_iteration": 2.489210367202759 + }, + { + "auxiliary_loss_clip": 0.01106145, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.03772783, + "balance_loss_mlp": 1.02032232, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.613827804355796, + "language_loss": 0.78760469, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80900574, + "num_input_tokens_seen": 208272685, + "step": 9668, + "time_per_iteration": 2.576481342315674 + }, + { + "auxiliary_loss_clip": 0.01090393, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.03876138, + "balance_loss_mlp": 1.01785016, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 1.9330845303390036, + "language_loss": 0.6447556, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66595793, + "num_input_tokens_seen": 208294315, + "step": 9669, + "time_per_iteration": 2.7305614948272705 + }, + { + "auxiliary_loss_clip": 0.01062096, + "auxiliary_loss_mlp": 0.01033377, + "balance_loss_clip": 1.03899097, + "balance_loss_mlp": 1.02034712, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 1.5323161940272176, + "language_loss": 0.73099363, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75194836, + "num_input_tokens_seen": 208315610, + "step": 9670, + "time_per_iteration": 2.6420578956604004 + }, + { + "auxiliary_loss_clip": 0.01081181, + "auxiliary_loss_mlp": 0.01041479, + "balance_loss_clip": 1.03766251, + "balance_loss_mlp": 1.02876544, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 1.9984474454413688, + "language_loss": 0.78904748, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.810274, + "num_input_tokens_seen": 208334725, + "step": 9671, + "time_per_iteration": 2.5628437995910645 + }, + { + "auxiliary_loss_clip": 0.0106911, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.03729868, + "balance_loss_mlp": 1.01895297, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 2.081288871488614, + "language_loss": 0.61383343, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.6348536, + "num_input_tokens_seen": 208353825, + "step": 9672, + "time_per_iteration": 2.6273372173309326 + }, + { + "auxiliary_loss_clip": 0.01066832, + "auxiliary_loss_mlp": 0.01040701, + "balance_loss_clip": 1.03763509, + "balance_loss_mlp": 1.02736163, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 1.6354908146370508, + "language_loss": 0.81136405, + "learning_rate": 1.572007019492342e-06, + "loss": 0.83243942, + "num_input_tokens_seen": 208374160, + "step": 9673, + "time_per_iteration": 2.6280763149261475 + }, + { + "auxiliary_loss_clip": 0.01081723, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.04130912, + "balance_loss_mlp": 1.02135026, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 1.9092854713163045, + "language_loss": 0.88017917, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.90134847, + "num_input_tokens_seen": 208392105, + "step": 9674, + "time_per_iteration": 2.558434247970581 + }, + { + "auxiliary_loss_clip": 0.01111774, + "auxiliary_loss_mlp": 0.0078829, + "balance_loss_clip": 1.03817892, + "balance_loss_mlp": 1.01314139, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.528446293420111, + "language_loss": 0.79174829, + "learning_rate": 1.571246172811984e-06, + "loss": 0.81074893, + "num_input_tokens_seen": 208411755, + "step": 9675, + "time_per_iteration": 2.5101263523101807 + }, + { + "auxiliary_loss_clip": 0.01100106, + "auxiliary_loss_mlp": 0.01033619, + "balance_loss_clip": 1.0388186, + "balance_loss_mlp": 1.02006531, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 2.1326498867658046, + "language_loss": 0.70043015, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72176743, + "num_input_tokens_seen": 208429995, + "step": 9676, + "time_per_iteration": 2.485180377960205 + }, + { + "auxiliary_loss_clip": 0.01059926, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.03895807, + "balance_loss_mlp": 1.01570141, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.1910144496441735, + "language_loss": 0.63410819, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.65499669, + "num_input_tokens_seen": 208443655, + "step": 9677, + "time_per_iteration": 2.646524667739868 + }, + { + "auxiliary_loss_clip": 0.01023462, + "auxiliary_loss_mlp": 0.01001475, + "balance_loss_clip": 1.02554035, + "balance_loss_mlp": 1.00030637, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8016440887987955, + "language_loss": 0.54147637, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56172574, + "num_input_tokens_seen": 208498405, + "step": 9678, + "time_per_iteration": 3.207714080810547 + }, + { + "auxiliary_loss_clip": 0.01025672, + "auxiliary_loss_mlp": 0.01007996, + "balance_loss_clip": 1.02118897, + "balance_loss_mlp": 1.00641024, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7440782150046145, + "language_loss": 0.56239396, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58273065, + "num_input_tokens_seen": 208559075, + "step": 9679, + "time_per_iteration": 3.0381968021392822 + }, + { + "auxiliary_loss_clip": 0.01109972, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.03743422, + "balance_loss_mlp": 1.01917338, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 1.737877709267846, + "language_loss": 0.65344119, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67484879, + "num_input_tokens_seen": 208577770, + "step": 9680, + "time_per_iteration": 2.499100923538208 + }, + { + "auxiliary_loss_clip": 0.0108846, + "auxiliary_loss_mlp": 0.01027264, + "balance_loss_clip": 1.0373317, + "balance_loss_mlp": 1.01548588, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 1.977328511850355, + "language_loss": 0.83486295, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85602015, + "num_input_tokens_seen": 208595110, + "step": 9681, + "time_per_iteration": 2.534601926803589 + }, + { + "auxiliary_loss_clip": 0.01110927, + "auxiliary_loss_mlp": 0.01028284, + "balance_loss_clip": 1.03819335, + "balance_loss_mlp": 1.01600552, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 1.8546241476533565, + "language_loss": 0.75860071, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77999282, + "num_input_tokens_seen": 208612080, + "step": 9682, + "time_per_iteration": 2.4762351512908936 + }, + { + "auxiliary_loss_clip": 0.01056264, + "auxiliary_loss_mlp": 0.01035487, + "balance_loss_clip": 1.03525233, + "balance_loss_mlp": 1.02121806, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 1.8968213965619711, + "language_loss": 0.74988681, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77080429, + "num_input_tokens_seen": 208630235, + "step": 9683, + "time_per_iteration": 2.6503167152404785 + }, + { + "auxiliary_loss_clip": 0.0108717, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.03854156, + "balance_loss_mlp": 1.01766098, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 1.720758563175461, + "language_loss": 0.74009919, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.7612744, + "num_input_tokens_seen": 208647925, + "step": 9684, + "time_per_iteration": 2.5448365211486816 + }, + { + "auxiliary_loss_clip": 0.01093407, + "auxiliary_loss_mlp": 0.01038203, + "balance_loss_clip": 1.03911161, + "balance_loss_mlp": 1.02488124, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 2.4591071958590414, + "language_loss": 0.78578782, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80710393, + "num_input_tokens_seen": 208666180, + "step": 9685, + "time_per_iteration": 2.5617973804473877 + }, + { + "auxiliary_loss_clip": 0.01111417, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.03905463, + "balance_loss_mlp": 1.02087688, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.8532190127315478, + "language_loss": 0.75009811, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77154881, + "num_input_tokens_seen": 208684240, + "step": 9686, + "time_per_iteration": 2.444061756134033 + }, + { + "auxiliary_loss_clip": 0.0102923, + "auxiliary_loss_mlp": 0.01021969, + "balance_loss_clip": 1.01453114, + "balance_loss_mlp": 1.02086663, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.8257090074653991, + "language_loss": 0.57389104, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59440303, + "num_input_tokens_seen": 208736090, + "step": 9687, + "time_per_iteration": 4.245602607727051 + }, + { + "auxiliary_loss_clip": 0.01072114, + "auxiliary_loss_mlp": 0.01030604, + "balance_loss_clip": 1.03698444, + "balance_loss_mlp": 1.0161916, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 3.375118318481554, + "language_loss": 0.70135355, + "learning_rate": 1.566302259738727e-06, + "loss": 0.72238076, + "num_input_tokens_seen": 208754600, + "step": 9688, + "time_per_iteration": 2.5675840377807617 + }, + { + "auxiliary_loss_clip": 0.01101622, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.03738344, + "balance_loss_mlp": 1.01889467, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 2.0451778245297745, + "language_loss": 0.6478132, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.66914082, + "num_input_tokens_seen": 208773140, + "step": 9689, + "time_per_iteration": 2.509417772293091 + }, + { + "auxiliary_loss_clip": 0.01090404, + "auxiliary_loss_mlp": 0.0078717, + "balance_loss_clip": 1.03882194, + "balance_loss_mlp": 1.01166737, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.851121758499308, + "language_loss": 0.73272884, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75150454, + "num_input_tokens_seen": 208793410, + "step": 9690, + "time_per_iteration": 2.553427219390869 + }, + { + "auxiliary_loss_clip": 0.01089254, + "auxiliary_loss_mlp": 0.01038313, + "balance_loss_clip": 1.0366919, + "balance_loss_mlp": 1.02388895, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 1.692964391838581, + "language_loss": 0.75841761, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.77969325, + "num_input_tokens_seen": 208811920, + "step": 9691, + "time_per_iteration": 2.5359082221984863 + }, + { + "auxiliary_loss_clip": 0.01101382, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.03681457, + "balance_loss_mlp": 1.01633239, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 1.7634569892114038, + "language_loss": 0.8062728, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.8275789, + "num_input_tokens_seen": 208834720, + "step": 9692, + "time_per_iteration": 2.572535991668701 + }, + { + "auxiliary_loss_clip": 0.01028018, + "auxiliary_loss_mlp": 0.01009247, + "balance_loss_clip": 1.01335144, + "balance_loss_mlp": 1.00808454, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7589954772475821, + "language_loss": 0.56916434, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.58953696, + "num_input_tokens_seen": 208898415, + "step": 9693, + "time_per_iteration": 3.0765902996063232 + }, + { + "auxiliary_loss_clip": 0.01096518, + "auxiliary_loss_mlp": 0.00786404, + "balance_loss_clip": 1.03533256, + "balance_loss_mlp": 1.01185715, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 1.6007685806698053, + "language_loss": 0.79004842, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.80887765, + "num_input_tokens_seen": 208919045, + "step": 9694, + "time_per_iteration": 3.8807618618011475 + }, + { + "auxiliary_loss_clip": 0.01078851, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.03770614, + "balance_loss_mlp": 1.02243245, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.5250935060362156, + "language_loss": 0.76176476, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78291082, + "num_input_tokens_seen": 208939375, + "step": 9695, + "time_per_iteration": 2.5749073028564453 + }, + { + "auxiliary_loss_clip": 0.01027179, + "auxiliary_loss_mlp": 0.01003175, + "balance_loss_clip": 1.01258159, + "balance_loss_mlp": 1.00198925, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7722824934367444, + "language_loss": 0.55043417, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57073772, + "num_input_tokens_seen": 209004760, + "step": 9696, + "time_per_iteration": 3.2099177837371826 + }, + { + "auxiliary_loss_clip": 0.01075708, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.03986526, + "balance_loss_mlp": 1.01700878, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 1.8591119985358442, + "language_loss": 0.76430261, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78535247, + "num_input_tokens_seen": 209022930, + "step": 9697, + "time_per_iteration": 2.5392792224884033 + }, + { + "auxiliary_loss_clip": 0.01113123, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.03758454, + "balance_loss_mlp": 1.0188067, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.699767819917473, + "language_loss": 0.77386546, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.79531789, + "num_input_tokens_seen": 209043740, + "step": 9698, + "time_per_iteration": 3.921567916870117 + }, + { + "auxiliary_loss_clip": 0.01071583, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.03712368, + "balance_loss_mlp": 1.02222395, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.6084596605716308, + "language_loss": 0.83392954, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85500133, + "num_input_tokens_seen": 209068885, + "step": 9699, + "time_per_iteration": 2.712510347366333 + }, + { + "auxiliary_loss_clip": 0.01087192, + "auxiliary_loss_mlp": 0.01033532, + "balance_loss_clip": 1.03641319, + "balance_loss_mlp": 1.01931024, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.0176714780087073, + "language_loss": 0.66298932, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68419659, + "num_input_tokens_seen": 209087340, + "step": 9700, + "time_per_iteration": 2.5855231285095215 + }, + { + "auxiliary_loss_clip": 0.0110083, + "auxiliary_loss_mlp": 0.0103192, + "balance_loss_clip": 1.03688288, + "balance_loss_mlp": 1.01903987, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.5919089363235128, + "language_loss": 0.71417332, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73550075, + "num_input_tokens_seen": 209108840, + "step": 9701, + "time_per_iteration": 2.552030563354492 + }, + { + "auxiliary_loss_clip": 0.0108613, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.03619194, + "balance_loss_mlp": 1.01907206, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 1.783521519410721, + "language_loss": 0.85411823, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87529385, + "num_input_tokens_seen": 209127985, + "step": 9702, + "time_per_iteration": 4.058727502822876 + }, + { + "auxiliary_loss_clip": 0.01093036, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.03486216, + "balance_loss_mlp": 1.01831973, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.5216422300525767, + "language_loss": 0.77711606, + "learning_rate": 1.560601200301392e-06, + "loss": 0.79834664, + "num_input_tokens_seen": 209146885, + "step": 9703, + "time_per_iteration": 2.5280017852783203 + }, + { + "auxiliary_loss_clip": 0.01112103, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.03744507, + "balance_loss_mlp": 1.01686001, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 4.0664772205101105, + "language_loss": 0.71134675, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73276663, + "num_input_tokens_seen": 209166130, + "step": 9704, + "time_per_iteration": 2.4972457885742188 + }, + { + "auxiliary_loss_clip": 0.01088403, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.04265368, + "balance_loss_mlp": 1.01949692, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 1.729744324478098, + "language_loss": 0.81618983, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83738708, + "num_input_tokens_seen": 209183350, + "step": 9705, + "time_per_iteration": 2.5404393672943115 + }, + { + "auxiliary_loss_clip": 0.0106489, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.03608203, + "balance_loss_mlp": 1.01775277, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 2.083095865815948, + "language_loss": 0.8053838, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82633698, + "num_input_tokens_seen": 209203945, + "step": 9706, + "time_per_iteration": 2.585801839828491 + }, + { + "auxiliary_loss_clip": 0.01097794, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.03742778, + "balance_loss_mlp": 1.01940238, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 2.477424665590362, + "language_loss": 0.75106394, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.77236605, + "num_input_tokens_seen": 209227080, + "step": 9707, + "time_per_iteration": 2.732780933380127 + }, + { + "auxiliary_loss_clip": 0.01079921, + "auxiliary_loss_mlp": 0.01031397, + "balance_loss_clip": 1.03913832, + "balance_loss_mlp": 1.01921368, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.6486625773042023, + "language_loss": 0.81566226, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83677542, + "num_input_tokens_seen": 209248170, + "step": 9708, + "time_per_iteration": 2.6179630756378174 + }, + { + "auxiliary_loss_clip": 0.0109623, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.04087853, + "balance_loss_mlp": 1.01850891, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.5024775154525496, + "language_loss": 0.78799313, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80926549, + "num_input_tokens_seen": 209267730, + "step": 9709, + "time_per_iteration": 2.526174306869507 + }, + { + "auxiliary_loss_clip": 0.01017574, + "auxiliary_loss_mlp": 0.01005849, + "balance_loss_clip": 1.0118804, + "balance_loss_mlp": 1.00463891, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7618155826030154, + "language_loss": 0.56576705, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58600128, + "num_input_tokens_seen": 209332510, + "step": 9710, + "time_per_iteration": 3.146564245223999 + }, + { + "auxiliary_loss_clip": 0.01077113, + "auxiliary_loss_mlp": 0.01028621, + "balance_loss_clip": 1.0378933, + "balance_loss_mlp": 1.01740968, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.5541538853730885, + "language_loss": 0.6531918, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67424905, + "num_input_tokens_seen": 209353355, + "step": 9711, + "time_per_iteration": 2.6623027324676514 + }, + { + "auxiliary_loss_clip": 0.01113468, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.03689349, + "balance_loss_mlp": 1.01981401, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.773640185748023, + "language_loss": 0.78762579, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.80910051, + "num_input_tokens_seen": 209370960, + "step": 9712, + "time_per_iteration": 2.488757610321045 + }, + { + "auxiliary_loss_clip": 0.01072236, + "auxiliary_loss_mlp": 0.00786924, + "balance_loss_clip": 1.03372228, + "balance_loss_mlp": 1.00890756, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.504094943550871, + "language_loss": 0.7316227, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.75021422, + "num_input_tokens_seen": 209390955, + "step": 9713, + "time_per_iteration": 2.6179497241973877 + }, + { + "auxiliary_loss_clip": 0.01091146, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.03654754, + "balance_loss_mlp": 1.0171169, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 1.7942594156588376, + "language_loss": 0.69045496, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71167719, + "num_input_tokens_seen": 209410260, + "step": 9714, + "time_per_iteration": 2.540234327316284 + }, + { + "auxiliary_loss_clip": 0.01109877, + "auxiliary_loss_mlp": 0.01030724, + "balance_loss_clip": 1.03588855, + "balance_loss_mlp": 1.01775432, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.8622988785938341, + "language_loss": 0.8018434, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.8232494, + "num_input_tokens_seen": 209429920, + "step": 9715, + "time_per_iteration": 2.478945255279541 + }, + { + "auxiliary_loss_clip": 0.01088122, + "auxiliary_loss_mlp": 0.01033787, + "balance_loss_clip": 1.03425694, + "balance_loss_mlp": 1.02055442, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 2.224598048742477, + "language_loss": 0.72166967, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.74288875, + "num_input_tokens_seen": 209449470, + "step": 9716, + "time_per_iteration": 2.552072763442993 + }, + { + "auxiliary_loss_clip": 0.01083544, + "auxiliary_loss_mlp": 0.01033659, + "balance_loss_clip": 1.03606355, + "balance_loss_mlp": 1.02146339, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.7686244979918377, + "language_loss": 0.75020474, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.77137673, + "num_input_tokens_seen": 209467695, + "step": 9717, + "time_per_iteration": 2.5849974155426025 + }, + { + "auxiliary_loss_clip": 0.01098651, + "auxiliary_loss_mlp": 0.01035637, + "balance_loss_clip": 1.03567743, + "balance_loss_mlp": 1.02243447, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 2.0323020161954384, + "language_loss": 0.8008523, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82219517, + "num_input_tokens_seen": 209484250, + "step": 9718, + "time_per_iteration": 2.4527716636657715 + }, + { + "auxiliary_loss_clip": 0.01086716, + "auxiliary_loss_mlp": 0.01031093, + "balance_loss_clip": 1.03591502, + "balance_loss_mlp": 1.01776576, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.7083170752983856, + "language_loss": 0.67511809, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.69629616, + "num_input_tokens_seen": 209502830, + "step": 9719, + "time_per_iteration": 2.54215931892395 + }, + { + "auxiliary_loss_clip": 0.01111362, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.03730285, + "balance_loss_mlp": 1.01952386, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 2.1832606661477483, + "language_loss": 0.76061189, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.78204823, + "num_input_tokens_seen": 209525995, + "step": 9720, + "time_per_iteration": 2.532802104949951 + }, + { + "auxiliary_loss_clip": 0.01076505, + "auxiliary_loss_mlp": 0.01036644, + "balance_loss_clip": 1.03520632, + "balance_loss_mlp": 1.02469313, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.4785674678532308, + "language_loss": 0.82928276, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85041416, + "num_input_tokens_seen": 209545895, + "step": 9721, + "time_per_iteration": 2.55588960647583 + }, + { + "auxiliary_loss_clip": 0.01039165, + "auxiliary_loss_mlp": 0.01001236, + "balance_loss_clip": 1.0144273, + "balance_loss_mlp": 1.00006223, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9332007813588825, + "language_loss": 0.71330398, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73370802, + "num_input_tokens_seen": 209602315, + "step": 9722, + "time_per_iteration": 3.117319107055664 + }, + { + "auxiliary_loss_clip": 0.01092433, + "auxiliary_loss_mlp": 0.01038294, + "balance_loss_clip": 1.03577447, + "balance_loss_mlp": 1.0257889, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.0645246566155526, + "language_loss": 0.89334166, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91464895, + "num_input_tokens_seen": 209617615, + "step": 9723, + "time_per_iteration": 2.4645912647247314 + }, + { + "auxiliary_loss_clip": 0.01088404, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.04311967, + "balance_loss_mlp": 1.0171454, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.4432273107401823, + "language_loss": 0.68348706, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70466, + "num_input_tokens_seen": 209637005, + "step": 9724, + "time_per_iteration": 2.544124126434326 + }, + { + "auxiliary_loss_clip": 0.01102932, + "auxiliary_loss_mlp": 0.01036582, + "balance_loss_clip": 1.039294, + "balance_loss_mlp": 1.02332044, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.807815661131636, + "language_loss": 0.86239684, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88379204, + "num_input_tokens_seen": 209653170, + "step": 9725, + "time_per_iteration": 2.4518752098083496 + }, + { + "auxiliary_loss_clip": 0.01097181, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.04152524, + "balance_loss_mlp": 1.02744782, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 2.0087510121496983, + "language_loss": 0.8304562, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85183227, + "num_input_tokens_seen": 209671275, + "step": 9726, + "time_per_iteration": 3.8872323036193848 + }, + { + "auxiliary_loss_clip": 0.01058426, + "auxiliary_loss_mlp": 0.00787076, + "balance_loss_clip": 1.04306662, + "balance_loss_mlp": 1.01010334, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 2.0851336476345996, + "language_loss": 0.6673792, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.68583423, + "num_input_tokens_seen": 209690380, + "step": 9727, + "time_per_iteration": 2.6648616790771484 + }, + { + "auxiliary_loss_clip": 0.01077277, + "auxiliary_loss_mlp": 0.0104738, + "balance_loss_clip": 1.03815508, + "balance_loss_mlp": 1.03309834, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 1.8939032787781607, + "language_loss": 0.8117739, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.83302045, + "num_input_tokens_seen": 209708845, + "step": 9728, + "time_per_iteration": 2.5723607540130615 + }, + { + "auxiliary_loss_clip": 0.01096825, + "auxiliary_loss_mlp": 0.01036652, + "balance_loss_clip": 1.03702021, + "balance_loss_mlp": 1.02479053, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 1.8883601993267758, + "language_loss": 0.77684081, + "learning_rate": 1.550728272957027e-06, + "loss": 0.79817557, + "num_input_tokens_seen": 209729000, + "step": 9729, + "time_per_iteration": 2.5113677978515625 + }, + { + "auxiliary_loss_clip": 0.01094553, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.03544831, + "balance_loss_mlp": 1.02184737, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 1.7184935229225724, + "language_loss": 0.70226097, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72356039, + "num_input_tokens_seen": 209747435, + "step": 9730, + "time_per_iteration": 2.5356411933898926 + }, + { + "auxiliary_loss_clip": 0.01116016, + "auxiliary_loss_mlp": 0.01037856, + "balance_loss_clip": 1.03973603, + "balance_loss_mlp": 1.02378917, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 1.8104590166265047, + "language_loss": 0.7845158, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.80605453, + "num_input_tokens_seen": 209764910, + "step": 9731, + "time_per_iteration": 2.4685041904449463 + }, + { + "auxiliary_loss_clip": 0.01097721, + "auxiliary_loss_mlp": 0.01046286, + "balance_loss_clip": 1.03801632, + "balance_loss_mlp": 1.03206468, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 2.0684086050749126, + "language_loss": 0.71009183, + "learning_rate": 1.549589825316528e-06, + "loss": 0.73153192, + "num_input_tokens_seen": 209786115, + "step": 9732, + "time_per_iteration": 2.526305675506592 + }, + { + "auxiliary_loss_clip": 0.01059388, + "auxiliary_loss_mlp": 0.01039272, + "balance_loss_clip": 1.03481865, + "balance_loss_mlp": 1.02426374, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 1.7510051863735043, + "language_loss": 0.52563792, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.54662454, + "num_input_tokens_seen": 209806095, + "step": 9733, + "time_per_iteration": 3.988006353378296 + }, + { + "auxiliary_loss_clip": 0.01095774, + "auxiliary_loss_mlp": 0.0103647, + "balance_loss_clip": 1.03661466, + "balance_loss_mlp": 1.02297521, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 2.1778207085029457, + "language_loss": 0.87252879, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89385128, + "num_input_tokens_seen": 209823650, + "step": 9734, + "time_per_iteration": 2.511223316192627 + }, + { + "auxiliary_loss_clip": 0.01084268, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.03744793, + "balance_loss_mlp": 1.02099812, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.5963916871040333, + "language_loss": 0.72229505, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.74346626, + "num_input_tokens_seen": 209843220, + "step": 9735, + "time_per_iteration": 2.5299553871154785 + }, + { + "auxiliary_loss_clip": 0.01100232, + "auxiliary_loss_mlp": 0.01040078, + "balance_loss_clip": 1.03700614, + "balance_loss_mlp": 1.02609456, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 2.8952653470644756, + "language_loss": 0.74215198, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76355511, + "num_input_tokens_seen": 209854880, + "step": 9736, + "time_per_iteration": 2.433213949203491 + }, + { + "auxiliary_loss_clip": 0.01077161, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.03607893, + "balance_loss_mlp": 1.02233911, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 1.942817040201886, + "language_loss": 0.70867777, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72981101, + "num_input_tokens_seen": 209877870, + "step": 9737, + "time_per_iteration": 4.199691295623779 + }, + { + "auxiliary_loss_clip": 0.01072462, + "auxiliary_loss_mlp": 0.01039381, + "balance_loss_clip": 1.03706181, + "balance_loss_mlp": 1.02620268, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 1.7636976658255459, + "language_loss": 0.81906962, + "learning_rate": 1.547313391573169e-06, + "loss": 0.84018809, + "num_input_tokens_seen": 209896690, + "step": 9738, + "time_per_iteration": 2.589310884475708 + }, + { + "auxiliary_loss_clip": 0.01115642, + "auxiliary_loss_mlp": 0.00787831, + "balance_loss_clip": 1.03914273, + "balance_loss_mlp": 1.01140237, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.6307167669259224, + "language_loss": 0.68397492, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70300967, + "num_input_tokens_seen": 209914640, + "step": 9739, + "time_per_iteration": 2.452491283416748 + }, + { + "auxiliary_loss_clip": 0.01111522, + "auxiliary_loss_mlp": 0.01027029, + "balance_loss_clip": 1.03640151, + "balance_loss_mlp": 1.0139401, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 2.526446551073146, + "language_loss": 0.58765775, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.60904324, + "num_input_tokens_seen": 209933375, + "step": 9740, + "time_per_iteration": 2.4622416496276855 + }, + { + "auxiliary_loss_clip": 0.01084705, + "auxiliary_loss_mlp": 0.01029606, + "balance_loss_clip": 1.03778756, + "balance_loss_mlp": 1.01737523, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 2.1259023268483572, + "language_loss": 0.75180453, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77294767, + "num_input_tokens_seen": 209952055, + "step": 9741, + "time_per_iteration": 3.94122576713562 + }, + { + "auxiliary_loss_clip": 0.01079799, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.03870523, + "balance_loss_mlp": 1.01943946, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 1.584600684819687, + "language_loss": 0.75671571, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.77783644, + "num_input_tokens_seen": 209971190, + "step": 9742, + "time_per_iteration": 2.579705238342285 + }, + { + "auxiliary_loss_clip": 0.01087098, + "auxiliary_loss_mlp": 0.0102857, + "balance_loss_clip": 1.0374918, + "balance_loss_mlp": 1.01650572, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.6859565108814716, + "language_loss": 0.75093699, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77209365, + "num_input_tokens_seen": 209990695, + "step": 9743, + "time_per_iteration": 2.5167527198791504 + }, + { + "auxiliary_loss_clip": 0.01087332, + "auxiliary_loss_mlp": 0.01030164, + "balance_loss_clip": 1.03858042, + "balance_loss_mlp": 1.01833892, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.6268001699396888, + "language_loss": 0.81011724, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83129221, + "num_input_tokens_seen": 210010210, + "step": 9744, + "time_per_iteration": 2.583547592163086 + }, + { + "auxiliary_loss_clip": 0.01087243, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.03879786, + "balance_loss_mlp": 1.01807177, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.6942867990049928, + "language_loss": 0.71709675, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73827481, + "num_input_tokens_seen": 210030030, + "step": 9745, + "time_per_iteration": 2.5846216678619385 + }, + { + "auxiliary_loss_clip": 0.01019295, + "auxiliary_loss_mlp": 0.01000799, + "balance_loss_clip": 1.01718855, + "balance_loss_mlp": 0.99955952, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7253393624645269, + "language_loss": 0.53302419, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55322516, + "num_input_tokens_seen": 210094840, + "step": 9746, + "time_per_iteration": 3.2194530963897705 + }, + { + "auxiliary_loss_clip": 0.01094164, + "auxiliary_loss_mlp": 0.01032769, + "balance_loss_clip": 1.0390842, + "balance_loss_mlp": 1.01908338, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 2.124330420668836, + "language_loss": 0.73340702, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75467634, + "num_input_tokens_seen": 210114660, + "step": 9747, + "time_per_iteration": 2.5679404735565186 + }, + { + "auxiliary_loss_clip": 0.01086067, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.03708172, + "balance_loss_mlp": 1.02136016, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 3.3900216214184953, + "language_loss": 0.8151229, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83635104, + "num_input_tokens_seen": 210132770, + "step": 9748, + "time_per_iteration": 2.4953346252441406 + }, + { + "auxiliary_loss_clip": 0.01101803, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.03795373, + "balance_loss_mlp": 1.01799512, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 1.758573184427131, + "language_loss": 0.71919, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74051678, + "num_input_tokens_seen": 210151895, + "step": 9749, + "time_per_iteration": 2.5414841175079346 + }, + { + "auxiliary_loss_clip": 0.01089843, + "auxiliary_loss_mlp": 0.01027837, + "balance_loss_clip": 1.03953218, + "balance_loss_mlp": 1.01506364, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.2561656675407726, + "language_loss": 0.75129086, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.77246761, + "num_input_tokens_seen": 210168040, + "step": 9750, + "time_per_iteration": 2.5102453231811523 + }, + { + "auxiliary_loss_clip": 0.01078935, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.04317117, + "balance_loss_mlp": 1.0199579, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.8068652651517798, + "language_loss": 0.71003479, + "learning_rate": 1.542383242598344e-06, + "loss": 0.7311548, + "num_input_tokens_seen": 210187720, + "step": 9751, + "time_per_iteration": 2.5922772884368896 + }, + { + "auxiliary_loss_clip": 0.0111616, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.03947616, + "balance_loss_mlp": 1.02116585, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 1.7696133906560123, + "language_loss": 0.74616861, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76768494, + "num_input_tokens_seen": 210206080, + "step": 9752, + "time_per_iteration": 2.4440674781799316 + }, + { + "auxiliary_loss_clip": 0.01099928, + "auxiliary_loss_mlp": 0.010285, + "balance_loss_clip": 1.0385654, + "balance_loss_mlp": 1.01572132, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 1.7425574926982417, + "language_loss": 0.77049434, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79177862, + "num_input_tokens_seen": 210225660, + "step": 9753, + "time_per_iteration": 2.50671648979187 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.01026102, + "balance_loss_clip": 1.03890264, + "balance_loss_mlp": 1.01407409, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.8013959514570013, + "language_loss": 0.71102989, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.73238158, + "num_input_tokens_seen": 210242725, + "step": 9754, + "time_per_iteration": 2.450523853302002 + }, + { + "auxiliary_loss_clip": 0.01087892, + "auxiliary_loss_mlp": 0.01030881, + "balance_loss_clip": 1.03663969, + "balance_loss_mlp": 1.01708865, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 2.434358315089651, + "language_loss": 0.72120059, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74238837, + "num_input_tokens_seen": 210263225, + "step": 9755, + "time_per_iteration": 2.5571441650390625 + }, + { + "auxiliary_loss_clip": 0.01014294, + "auxiliary_loss_mlp": 0.01005688, + "balance_loss_clip": 1.01985657, + "balance_loss_mlp": 1.00439453, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7416104160457917, + "language_loss": 0.56947958, + "learning_rate": 1.540487810607967e-06, + "loss": 0.58967936, + "num_input_tokens_seen": 210322310, + "step": 9756, + "time_per_iteration": 3.178391456604004 + }, + { + "auxiliary_loss_clip": 0.0111047, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.03848875, + "balance_loss_mlp": 1.02173471, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 2.0634787975902715, + "language_loss": 0.76056182, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78200227, + "num_input_tokens_seen": 210340845, + "step": 9757, + "time_per_iteration": 2.5442492961883545 + }, + { + "auxiliary_loss_clip": 0.01022971, + "auxiliary_loss_mlp": 0.01004466, + "balance_loss_clip": 1.01776791, + "balance_loss_mlp": 1.00320852, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8455578931380895, + "language_loss": 0.60415035, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.6244247, + "num_input_tokens_seen": 210397815, + "step": 9758, + "time_per_iteration": 3.119826316833496 + }, + { + "auxiliary_loss_clip": 0.01118532, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.04075909, + "balance_loss_mlp": 1.0159502, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 2.599047411874615, + "language_loss": 0.72203374, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.74351466, + "num_input_tokens_seen": 210413900, + "step": 9759, + "time_per_iteration": 2.4718644618988037 + }, + { + "auxiliary_loss_clip": 0.0108858, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.03727615, + "balance_loss_mlp": 1.02099991, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.5756043382474239, + "language_loss": 0.73370838, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.75492603, + "num_input_tokens_seen": 210434110, + "step": 9760, + "time_per_iteration": 2.63358211517334 + }, + { + "auxiliary_loss_clip": 0.01099196, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.03842068, + "balance_loss_mlp": 1.01911926, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 1.7408274139291435, + "language_loss": 0.72576219, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74707639, + "num_input_tokens_seen": 210451685, + "step": 9761, + "time_per_iteration": 2.4819915294647217 + }, + { + "auxiliary_loss_clip": 0.01096412, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_clip": 1.03928137, + "balance_loss_mlp": 1.01872969, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.743486170683403, + "language_loss": 0.75101382, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.77230674, + "num_input_tokens_seen": 210470825, + "step": 9762, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.01075535, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.03870296, + "balance_loss_mlp": 1.02079272, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.295391831116469, + "language_loss": 0.72367644, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74477863, + "num_input_tokens_seen": 210500075, + "step": 9763, + "time_per_iteration": 2.9667675495147705 + }, + { + "auxiliary_loss_clip": 0.01099154, + "auxiliary_loss_mlp": 0.01034563, + "balance_loss_clip": 1.0377363, + "balance_loss_mlp": 1.02224255, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.5739301011450604, + "language_loss": 0.79865897, + "learning_rate": 1.53745602625755e-06, + "loss": 0.81999612, + "num_input_tokens_seen": 210518150, + "step": 9764, + "time_per_iteration": 3.8340649604797363 + }, + { + "auxiliary_loss_clip": 0.01086227, + "auxiliary_loss_mlp": 0.01034175, + "balance_loss_clip": 1.0416683, + "balance_loss_mlp": 1.02172387, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 1.9507442618381798, + "language_loss": 0.79024267, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.81144667, + "num_input_tokens_seen": 210537760, + "step": 9765, + "time_per_iteration": 2.5186378955841064 + }, + { + "auxiliary_loss_clip": 0.01081511, + "auxiliary_loss_mlp": 0.0103874, + "balance_loss_clip": 1.03905666, + "balance_loss_mlp": 1.02531707, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 1.8416580105716132, + "language_loss": 0.83941203, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.86061454, + "num_input_tokens_seen": 210555515, + "step": 9766, + "time_per_iteration": 2.5129966735839844 + }, + { + "auxiliary_loss_clip": 0.01105947, + "auxiliary_loss_mlp": 0.01036324, + "balance_loss_clip": 1.04003024, + "balance_loss_mlp": 1.02361083, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 1.7282659316141296, + "language_loss": 0.69688237, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71830511, + "num_input_tokens_seen": 210575000, + "step": 9767, + "time_per_iteration": 2.549257755279541 + }, + { + "auxiliary_loss_clip": 0.01097425, + "auxiliary_loss_mlp": 0.00787526, + "balance_loss_clip": 1.03678215, + "balance_loss_mlp": 1.00764179, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 2.1039210906789227, + "language_loss": 0.63602257, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65487206, + "num_input_tokens_seen": 210595185, + "step": 9768, + "time_per_iteration": 2.5859522819519043 + }, + { + "auxiliary_loss_clip": 0.01043246, + "auxiliary_loss_mlp": 0.00782503, + "balance_loss_clip": 1.01825178, + "balance_loss_mlp": 1.0361836, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.723566838371461, + "language_loss": 0.53938699, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.55764449, + "num_input_tokens_seen": 210653210, + "step": 9769, + "time_per_iteration": 3.0935494899749756 + }, + { + "auxiliary_loss_clip": 0.01080416, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.03841472, + "balance_loss_mlp": 1.02374053, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.4141090337780486, + "language_loss": 0.70547748, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.72664857, + "num_input_tokens_seen": 210673750, + "step": 9770, + "time_per_iteration": 2.581315040588379 + }, + { + "auxiliary_loss_clip": 0.0106611, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.03683901, + "balance_loss_mlp": 1.02077293, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 1.8064983361768363, + "language_loss": 0.67889071, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69989085, + "num_input_tokens_seen": 210692960, + "step": 9771, + "time_per_iteration": 2.605313777923584 + }, + { + "auxiliary_loss_clip": 0.01067209, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.03580904, + "balance_loss_mlp": 1.02188754, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.7052973001700296, + "language_loss": 0.65857887, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.67962825, + "num_input_tokens_seen": 210714040, + "step": 9772, + "time_per_iteration": 4.01865291595459 + }, + { + "auxiliary_loss_clip": 0.01118571, + "auxiliary_loss_mlp": 0.01044037, + "balance_loss_clip": 1.04177642, + "balance_loss_mlp": 1.0293324, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.6964720499189807, + "language_loss": 0.74325776, + "learning_rate": 1.534046611017519e-06, + "loss": 0.76488388, + "num_input_tokens_seen": 210733710, + "step": 9773, + "time_per_iteration": 2.521066188812256 + }, + { + "auxiliary_loss_clip": 0.01080907, + "auxiliary_loss_mlp": 0.01041614, + "balance_loss_clip": 1.0390569, + "balance_loss_mlp": 1.02748156, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 2.079384471812669, + "language_loss": 0.53757101, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55879623, + "num_input_tokens_seen": 210753580, + "step": 9774, + "time_per_iteration": 2.600245475769043 + }, + { + "auxiliary_loss_clip": 0.01105871, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.04047668, + "balance_loss_mlp": 1.02205133, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.250352830420552, + "language_loss": 0.64837056, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.66978568, + "num_input_tokens_seen": 210773495, + "step": 9775, + "time_per_iteration": 4.013902902603149 + }, + { + "auxiliary_loss_clip": 0.01099395, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.04133022, + "balance_loss_mlp": 1.01880491, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 1.6246862364354295, + "language_loss": 0.73812592, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.75943792, + "num_input_tokens_seen": 210793645, + "step": 9776, + "time_per_iteration": 2.5575692653656006 + }, + { + "auxiliary_loss_clip": 0.01111875, + "auxiliary_loss_mlp": 0.01032593, + "balance_loss_clip": 1.03809237, + "balance_loss_mlp": 1.0201596, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 2.018040131483488, + "language_loss": 0.74001086, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76145554, + "num_input_tokens_seen": 210813415, + "step": 9777, + "time_per_iteration": 2.4730143547058105 + }, + { + "auxiliary_loss_clip": 0.01076816, + "auxiliary_loss_mlp": 0.01034888, + "balance_loss_clip": 1.038064, + "balance_loss_mlp": 1.02340198, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.4407696526715994, + "language_loss": 0.74290252, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76401955, + "num_input_tokens_seen": 210833850, + "step": 9778, + "time_per_iteration": 2.60758376121521 + }, + { + "auxiliary_loss_clip": 0.01077382, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.03715062, + "balance_loss_mlp": 1.02050543, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.8398665779029335, + "language_loss": 0.70236313, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72348511, + "num_input_tokens_seen": 210853115, + "step": 9779, + "time_per_iteration": 2.5499143600463867 + }, + { + "auxiliary_loss_clip": 0.01114743, + "auxiliary_loss_mlp": 0.00793182, + "balance_loss_clip": 1.03845167, + "balance_loss_mlp": 1.02141297, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 1.9558306794631963, + "language_loss": 0.6662885, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.6853677, + "num_input_tokens_seen": 210872090, + "step": 9780, + "time_per_iteration": 3.889853000640869 + }, + { + "auxiliary_loss_clip": 0.01087679, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.04505301, + "balance_loss_mlp": 1.02198994, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 2.7006532753907564, + "language_loss": 0.72241211, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74364519, + "num_input_tokens_seen": 210888490, + "step": 9781, + "time_per_iteration": 2.5106546878814697 + }, + { + "auxiliary_loss_clip": 0.01092238, + "auxiliary_loss_mlp": 0.00791532, + "balance_loss_clip": 1.03898883, + "balance_loss_mlp": 1.02023554, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.3534517984080048, + "language_loss": 0.70462739, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72346509, + "num_input_tokens_seen": 210908220, + "step": 9782, + "time_per_iteration": 2.5486207008361816 + }, + { + "auxiliary_loss_clip": 0.01099559, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.03660679, + "balance_loss_mlp": 1.02572536, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 2.161754714537969, + "language_loss": 0.70372021, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72511542, + "num_input_tokens_seen": 210923945, + "step": 9783, + "time_per_iteration": 2.4505860805511475 + }, + { + "auxiliary_loss_clip": 0.01083231, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.03991699, + "balance_loss_mlp": 1.02077055, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 1.7518219762691998, + "language_loss": 0.68820012, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.70937335, + "num_input_tokens_seen": 210941955, + "step": 9784, + "time_per_iteration": 2.5787034034729004 + }, + { + "auxiliary_loss_clip": 0.01068515, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.03915787, + "balance_loss_mlp": 1.01985955, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 1.8369422479704325, + "language_loss": 0.69522029, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.71623063, + "num_input_tokens_seen": 210963105, + "step": 9785, + "time_per_iteration": 2.697991371154785 + }, + { + "auxiliary_loss_clip": 0.01099112, + "auxiliary_loss_mlp": 0.01027398, + "balance_loss_clip": 1.03823066, + "balance_loss_mlp": 1.01535821, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 1.8887212292080808, + "language_loss": 0.77096963, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79223472, + "num_input_tokens_seen": 210978720, + "step": 9786, + "time_per_iteration": 2.472264051437378 + }, + { + "auxiliary_loss_clip": 0.01098721, + "auxiliary_loss_mlp": 0.01035546, + "balance_loss_clip": 1.04028976, + "balance_loss_mlp": 1.02224243, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.5004041176995888, + "language_loss": 0.79224455, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81358725, + "num_input_tokens_seen": 210998750, + "step": 9787, + "time_per_iteration": 2.5327038764953613 + }, + { + "auxiliary_loss_clip": 0.01074992, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.03912926, + "balance_loss_mlp": 1.01884675, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.5344081525962847, + "language_loss": 0.66256833, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68362516, + "num_input_tokens_seen": 211017550, + "step": 9788, + "time_per_iteration": 2.556525468826294 + }, + { + "auxiliary_loss_clip": 0.01082947, + "auxiliary_loss_mlp": 0.01037752, + "balance_loss_clip": 1.03685665, + "balance_loss_mlp": 1.02304161, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.0761374385340434, + "language_loss": 0.80231154, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82351851, + "num_input_tokens_seen": 211034135, + "step": 9789, + "time_per_iteration": 2.531259536743164 + }, + { + "auxiliary_loss_clip": 0.01079249, + "auxiliary_loss_mlp": 0.00786765, + "balance_loss_clip": 1.03453588, + "balance_loss_mlp": 1.01000154, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.771163247482478, + "language_loss": 0.70251375, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.721174, + "num_input_tokens_seen": 211053850, + "step": 9790, + "time_per_iteration": 2.534313917160034 + }, + { + "auxiliary_loss_clip": 0.0107608, + "auxiliary_loss_mlp": 0.01032007, + "balance_loss_clip": 1.040277, + "balance_loss_mlp": 1.01922154, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 8.454696958345949, + "language_loss": 0.8373248, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85840559, + "num_input_tokens_seen": 211072165, + "step": 9791, + "time_per_iteration": 2.5829927921295166 + }, + { + "auxiliary_loss_clip": 0.01100541, + "auxiliary_loss_mlp": 0.01047077, + "balance_loss_clip": 1.03928971, + "balance_loss_mlp": 1.03283167, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.5533064488736121, + "language_loss": 0.76606023, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.7875365, + "num_input_tokens_seen": 211089630, + "step": 9792, + "time_per_iteration": 2.520554542541504 + }, + { + "auxiliary_loss_clip": 0.01056414, + "auxiliary_loss_mlp": 0.01043274, + "balance_loss_clip": 1.0333004, + "balance_loss_mlp": 1.0278008, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 1.8218083200241963, + "language_loss": 0.69043422, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71143103, + "num_input_tokens_seen": 211106120, + "step": 9793, + "time_per_iteration": 2.5907654762268066 + }, + { + "auxiliary_loss_clip": 0.01109965, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.03868628, + "balance_loss_mlp": 1.02062774, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 1.7369088137908923, + "language_loss": 0.59978366, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.62122118, + "num_input_tokens_seen": 211122450, + "step": 9794, + "time_per_iteration": 2.449765205383301 + }, + { + "auxiliary_loss_clip": 0.01081435, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.03863013, + "balance_loss_mlp": 1.01942217, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.8171701324838332, + "language_loss": 0.64642334, + "learning_rate": 1.525718531219257e-06, + "loss": 0.66756797, + "num_input_tokens_seen": 211141765, + "step": 9795, + "time_per_iteration": 2.583070755004883 + }, + { + "auxiliary_loss_clip": 0.01080582, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.03665304, + "balance_loss_mlp": 1.02210045, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.8290381374896718, + "language_loss": 0.74590302, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76704764, + "num_input_tokens_seen": 211160475, + "step": 9796, + "time_per_iteration": 2.5452826023101807 + }, + { + "auxiliary_loss_clip": 0.01087929, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.03788877, + "balance_loss_mlp": 1.01737356, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.485693062621769, + "language_loss": 0.82924139, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85042012, + "num_input_tokens_seen": 211180480, + "step": 9797, + "time_per_iteration": 2.559847354888916 + }, + { + "auxiliary_loss_clip": 0.01084673, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.0354116, + "balance_loss_mlp": 1.01720071, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 1.670108079573868, + "language_loss": 0.79379642, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.81494188, + "num_input_tokens_seen": 211198000, + "step": 9798, + "time_per_iteration": 2.5193421840667725 + }, + { + "auxiliary_loss_clip": 0.01108556, + "auxiliary_loss_mlp": 0.01033422, + "balance_loss_clip": 1.03775144, + "balance_loss_mlp": 1.02147102, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 2.618494668340049, + "language_loss": 0.73831493, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.75973469, + "num_input_tokens_seen": 211214765, + "step": 9799, + "time_per_iteration": 2.42594313621521 + }, + { + "auxiliary_loss_clip": 0.01080204, + "auxiliary_loss_mlp": 0.01033764, + "balance_loss_clip": 1.03908372, + "balance_loss_mlp": 1.01944721, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 2.000297352632636, + "language_loss": 0.75438011, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.77551979, + "num_input_tokens_seen": 211232335, + "step": 9800, + "time_per_iteration": 2.5530693531036377 + }, + { + "auxiliary_loss_clip": 0.01063226, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.03567576, + "balance_loss_mlp": 1.03047574, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 2.0618376103104312, + "language_loss": 0.78760672, + "learning_rate": 1.523448741022722e-06, + "loss": 0.8086853, + "num_input_tokens_seen": 211249985, + "step": 9801, + "time_per_iteration": 2.59871768951416 + }, + { + "auxiliary_loss_clip": 0.01079812, + "auxiliary_loss_mlp": 0.01033732, + "balance_loss_clip": 1.04100442, + "balance_loss_mlp": 1.02005327, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 2.050347508255539, + "language_loss": 0.66018283, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68131828, + "num_input_tokens_seen": 211268425, + "step": 9802, + "time_per_iteration": 2.582305908203125 + }, + { + "auxiliary_loss_clip": 0.0109933, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.03833342, + "balance_loss_mlp": 1.01806617, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.8212430810938476, + "language_loss": 0.7818222, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.80312383, + "num_input_tokens_seen": 211286680, + "step": 9803, + "time_per_iteration": 3.8443236351013184 + }, + { + "auxiliary_loss_clip": 0.01101908, + "auxiliary_loss_mlp": 0.01036838, + "balance_loss_clip": 1.0387677, + "balance_loss_mlp": 1.02437496, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.8522288904853887, + "language_loss": 0.72998148, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75136894, + "num_input_tokens_seen": 211307700, + "step": 9804, + "time_per_iteration": 2.521259307861328 + }, + { + "auxiliary_loss_clip": 0.01088233, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.04182434, + "balance_loss_mlp": 1.02138555, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.605320292980396, + "language_loss": 0.74651647, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.76774317, + "num_input_tokens_seen": 211324835, + "step": 9805, + "time_per_iteration": 2.5242626667022705 + }, + { + "auxiliary_loss_clip": 0.01109022, + "auxiliary_loss_mlp": 0.00787177, + "balance_loss_clip": 1.04005837, + "balance_loss_mlp": 1.00952387, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 1.717204136845522, + "language_loss": 0.77869302, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.79765499, + "num_input_tokens_seen": 211344130, + "step": 9806, + "time_per_iteration": 2.494640350341797 + }, + { + "auxiliary_loss_clip": 0.01111259, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03814399, + "balance_loss_mlp": 1.01787281, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 1.9855212292299347, + "language_loss": 0.76451027, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.78593314, + "num_input_tokens_seen": 211362915, + "step": 9807, + "time_per_iteration": 2.487518310546875 + }, + { + "auxiliary_loss_clip": 0.01108258, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.04268312, + "balance_loss_mlp": 1.01534653, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 1.7748492418605226, + "language_loss": 0.74086779, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.7622366, + "num_input_tokens_seen": 211380700, + "step": 9808, + "time_per_iteration": 2.487553119659424 + }, + { + "auxiliary_loss_clip": 0.01068213, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.04380679, + "balance_loss_mlp": 1.01799417, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 2.1271699493166487, + "language_loss": 0.72336638, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.74437749, + "num_input_tokens_seen": 211400095, + "step": 9809, + "time_per_iteration": 2.62705659866333 + }, + { + "auxiliary_loss_clip": 0.01093689, + "auxiliary_loss_mlp": 0.01034995, + "balance_loss_clip": 1.04045033, + "balance_loss_mlp": 1.02195358, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 1.9768195398791444, + "language_loss": 0.82200551, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.8432923, + "num_input_tokens_seen": 211417810, + "step": 9810, + "time_per_iteration": 2.508047342300415 + }, + { + "auxiliary_loss_clip": 0.01102053, + "auxiliary_loss_mlp": 0.01028912, + "balance_loss_clip": 1.03986835, + "balance_loss_mlp": 1.01663351, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.6721642447318088, + "language_loss": 0.81253982, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.83384949, + "num_input_tokens_seen": 211436020, + "step": 9811, + "time_per_iteration": 3.883561849594116 + }, + { + "auxiliary_loss_clip": 0.01104352, + "auxiliary_loss_mlp": 0.01031075, + "balance_loss_clip": 1.03946757, + "balance_loss_mlp": 1.01710987, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 1.6624653670722664, + "language_loss": 0.76807928, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.78943348, + "num_input_tokens_seen": 211454335, + "step": 9812, + "time_per_iteration": 2.513965368270874 + }, + { + "auxiliary_loss_clip": 0.01078846, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.04029775, + "balance_loss_mlp": 1.01998448, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 2.0878018853786977, + "language_loss": 0.70123708, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72234261, + "num_input_tokens_seen": 211472775, + "step": 9813, + "time_per_iteration": 2.536134958267212 + }, + { + "auxiliary_loss_clip": 0.01087342, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.04080617, + "balance_loss_mlp": 1.02157807, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 2.2328678252728116, + "language_loss": 0.72156942, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74278623, + "num_input_tokens_seen": 211492195, + "step": 9814, + "time_per_iteration": 3.934065580368042 + }, + { + "auxiliary_loss_clip": 0.01093714, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.0410459, + "balance_loss_mlp": 1.01751137, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 2.1763705642038085, + "language_loss": 0.78292441, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80416489, + "num_input_tokens_seen": 211510220, + "step": 9815, + "time_per_iteration": 2.5309579372406006 + }, + { + "auxiliary_loss_clip": 0.01087764, + "auxiliary_loss_mlp": 0.00788168, + "balance_loss_clip": 1.03954411, + "balance_loss_mlp": 1.01195502, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 3.684914932130535, + "language_loss": 0.75931346, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.77807271, + "num_input_tokens_seen": 211526260, + "step": 9816, + "time_per_iteration": 2.5609230995178223 + }, + { + "auxiliary_loss_clip": 0.01113324, + "auxiliary_loss_mlp": 0.01037378, + "balance_loss_clip": 1.04132223, + "balance_loss_mlp": 1.02461696, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 1.896639751456738, + "language_loss": 0.81112707, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83263409, + "num_input_tokens_seen": 211542890, + "step": 9817, + "time_per_iteration": 2.438495397567749 + }, + { + "auxiliary_loss_clip": 0.01068123, + "auxiliary_loss_mlp": 0.01037646, + "balance_loss_clip": 1.03910494, + "balance_loss_mlp": 1.02399039, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.725273257630142, + "language_loss": 0.76314938, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78420699, + "num_input_tokens_seen": 211562685, + "step": 9818, + "time_per_iteration": 4.051042079925537 + }, + { + "auxiliary_loss_clip": 0.01076578, + "auxiliary_loss_mlp": 0.01032048, + "balance_loss_clip": 1.04300404, + "balance_loss_mlp": 1.0199008, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 1.9294361333421874, + "language_loss": 0.66574407, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68683034, + "num_input_tokens_seen": 211579960, + "step": 9819, + "time_per_iteration": 2.5461270809173584 + }, + { + "auxiliary_loss_clip": 0.01114073, + "auxiliary_loss_mlp": 0.01032228, + "balance_loss_clip": 1.04092884, + "balance_loss_mlp": 1.01943135, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 1.637266323299819, + "language_loss": 0.78202331, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.80348635, + "num_input_tokens_seen": 211599310, + "step": 9820, + "time_per_iteration": 2.4735982418060303 + }, + { + "auxiliary_loss_clip": 0.01011357, + "auxiliary_loss_mlp": 0.01009019, + "balance_loss_clip": 1.02297664, + "balance_loss_mlp": 1.0077734, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 1.0410484768421304, + "language_loss": 0.65130115, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67150486, + "num_input_tokens_seen": 211658790, + "step": 9821, + "time_per_iteration": 3.1371514797210693 + }, + { + "auxiliary_loss_clip": 0.01072334, + "auxiliary_loss_mlp": 0.01037201, + "balance_loss_clip": 1.03776062, + "balance_loss_mlp": 1.02491653, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 1.866637047790167, + "language_loss": 0.61203998, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63313532, + "num_input_tokens_seen": 211677240, + "step": 9822, + "time_per_iteration": 2.5514883995056152 + }, + { + "auxiliary_loss_clip": 0.01114618, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.04027772, + "balance_loss_mlp": 1.02646101, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 1.9228268526462933, + "language_loss": 0.82235456, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.84389412, + "num_input_tokens_seen": 211695485, + "step": 9823, + "time_per_iteration": 2.4734208583831787 + }, + { + "auxiliary_loss_clip": 0.01092112, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.04015994, + "balance_loss_mlp": 1.01647043, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 2.003158329502571, + "language_loss": 0.72716427, + "learning_rate": 1.514753932336165e-06, + "loss": 0.74837852, + "num_input_tokens_seen": 211713090, + "step": 9824, + "time_per_iteration": 2.534318447113037 + }, + { + "auxiliary_loss_clip": 0.01089893, + "auxiliary_loss_mlp": 0.00789678, + "balance_loss_clip": 1.04036725, + "balance_loss_mlp": 1.01239514, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.0895637166885024, + "language_loss": 0.83182895, + "learning_rate": 1.514376116721693e-06, + "loss": 0.85062468, + "num_input_tokens_seen": 211732510, + "step": 9825, + "time_per_iteration": 2.5419535636901855 + }, + { + "auxiliary_loss_clip": 0.01097235, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.03819513, + "balance_loss_mlp": 1.0218817, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.7481737745251156, + "language_loss": 0.7643137, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.78561437, + "num_input_tokens_seen": 211748695, + "step": 9826, + "time_per_iteration": 2.4918174743652344 + }, + { + "auxiliary_loss_clip": 0.0108668, + "auxiliary_loss_mlp": 0.01026646, + "balance_loss_clip": 1.037992, + "balance_loss_mlp": 1.01436126, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.7093622234729342, + "language_loss": 0.72092712, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74206036, + "num_input_tokens_seen": 211768545, + "step": 9827, + "time_per_iteration": 2.523790121078491 + }, + { + "auxiliary_loss_clip": 0.01067216, + "auxiliary_loss_mlp": 0.01027879, + "balance_loss_clip": 1.03720737, + "balance_loss_mlp": 1.01629162, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.673243890709185, + "language_loss": 0.79903722, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.81998813, + "num_input_tokens_seen": 211786665, + "step": 9828, + "time_per_iteration": 2.5636303424835205 + }, + { + "auxiliary_loss_clip": 0.01070159, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.04254913, + "balance_loss_mlp": 1.01953912, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 2.19545181808018, + "language_loss": 0.88184428, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90287513, + "num_input_tokens_seen": 211801215, + "step": 9829, + "time_per_iteration": 2.5871691703796387 + }, + { + "auxiliary_loss_clip": 0.01028918, + "auxiliary_loss_mlp": 0.01001604, + "balance_loss_clip": 1.02365088, + "balance_loss_mlp": 1.0003705, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.756472002514238, + "language_loss": 0.57840121, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59870636, + "num_input_tokens_seen": 211857005, + "step": 9830, + "time_per_iteration": 3.0987348556518555 + }, + { + "auxiliary_loss_clip": 0.01108899, + "auxiliary_loss_mlp": 0.00787866, + "balance_loss_clip": 1.04092383, + "balance_loss_mlp": 1.01049888, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.1670634946625964, + "language_loss": 0.76108062, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.78004825, + "num_input_tokens_seen": 211876675, + "step": 9831, + "time_per_iteration": 2.5106663703918457 + }, + { + "auxiliary_loss_clip": 0.01083917, + "auxiliary_loss_mlp": 0.01028926, + "balance_loss_clip": 1.04151165, + "balance_loss_mlp": 1.01596808, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.784911574620458, + "language_loss": 0.77619344, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.79732186, + "num_input_tokens_seen": 211895725, + "step": 9832, + "time_per_iteration": 2.550844192504883 + }, + { + "auxiliary_loss_clip": 0.01099598, + "auxiliary_loss_mlp": 0.01026135, + "balance_loss_clip": 1.038517, + "balance_loss_mlp": 1.01344514, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 1.7770388404304318, + "language_loss": 0.8345865, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85584378, + "num_input_tokens_seen": 211913860, + "step": 9833, + "time_per_iteration": 2.500671863555908 + }, + { + "auxiliary_loss_clip": 0.0110316, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.03980684, + "balance_loss_mlp": 1.01597643, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.4785805711954314, + "language_loss": 0.74187219, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76319396, + "num_input_tokens_seen": 211932880, + "step": 9834, + "time_per_iteration": 2.512303590774536 + }, + { + "auxiliary_loss_clip": 0.01112082, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.03837514, + "balance_loss_mlp": 1.01863647, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 3.2239431820696236, + "language_loss": 0.77622437, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.79765713, + "num_input_tokens_seen": 211948625, + "step": 9835, + "time_per_iteration": 2.444979429244995 + }, + { + "auxiliary_loss_clip": 0.01090603, + "auxiliary_loss_mlp": 0.01031517, + "balance_loss_clip": 1.03800499, + "balance_loss_mlp": 1.01859426, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 1.9103873684530082, + "language_loss": 0.74125898, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76248014, + "num_input_tokens_seen": 211965355, + "step": 9836, + "time_per_iteration": 2.5275633335113525 + }, + { + "auxiliary_loss_clip": 0.01077196, + "auxiliary_loss_mlp": 0.010319, + "balance_loss_clip": 1.03770971, + "balance_loss_mlp": 1.01794076, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 1.9865068878752574, + "language_loss": 0.82167792, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.84276891, + "num_input_tokens_seen": 211982245, + "step": 9837, + "time_per_iteration": 2.5246269702911377 + }, + { + "auxiliary_loss_clip": 0.01074107, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.03695917, + "balance_loss_mlp": 1.0177784, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.6639652050668934, + "language_loss": 0.79549718, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81655937, + "num_input_tokens_seen": 212000250, + "step": 9838, + "time_per_iteration": 2.574160099029541 + }, + { + "auxiliary_loss_clip": 0.01066743, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.0425539, + "balance_loss_mlp": 1.02272391, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 1.9185663045283423, + "language_loss": 0.69497162, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.71599162, + "num_input_tokens_seen": 212017505, + "step": 9839, + "time_per_iteration": 2.569204330444336 + }, + { + "auxiliary_loss_clip": 0.01089936, + "auxiliary_loss_mlp": 0.0103793, + "balance_loss_clip": 1.04100585, + "balance_loss_mlp": 1.02504313, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 2.084320085397688, + "language_loss": 0.65990424, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.68118286, + "num_input_tokens_seen": 212034595, + "step": 9840, + "time_per_iteration": 2.5272700786590576 + }, + { + "auxiliary_loss_clip": 0.01087699, + "auxiliary_loss_mlp": 0.0103131, + "balance_loss_clip": 1.03908157, + "balance_loss_mlp": 1.01790547, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.8946270754325716, + "language_loss": 0.81893617, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.84012628, + "num_input_tokens_seen": 212055775, + "step": 9841, + "time_per_iteration": 3.930525302886963 + }, + { + "auxiliary_loss_clip": 0.01086308, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.04003787, + "balance_loss_mlp": 1.01933837, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.5354615079987628, + "language_loss": 0.6906454, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71182644, + "num_input_tokens_seen": 212074000, + "step": 9842, + "time_per_iteration": 2.532405376434326 + }, + { + "auxiliary_loss_clip": 0.0108932, + "auxiliary_loss_mlp": 0.010304, + "balance_loss_clip": 1.03821778, + "balance_loss_mlp": 1.01729906, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 1.8376882155106844, + "language_loss": 0.82905746, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.85025465, + "num_input_tokens_seen": 212091415, + "step": 9843, + "time_per_iteration": 2.5572309494018555 + }, + { + "auxiliary_loss_clip": 0.01086739, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.03759265, + "balance_loss_mlp": 1.01757288, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.5863809374432662, + "language_loss": 0.81397957, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.83516103, + "num_input_tokens_seen": 212105255, + "step": 9844, + "time_per_iteration": 2.496699810028076 + }, + { + "auxiliary_loss_clip": 0.01073007, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.04379535, + "balance_loss_mlp": 1.01615047, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 1.8759841586584602, + "language_loss": 0.74528074, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.7663008, + "num_input_tokens_seen": 212122765, + "step": 9845, + "time_per_iteration": 2.576406478881836 + }, + { + "auxiliary_loss_clip": 0.01071963, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.0367198, + "balance_loss_mlp": 1.01756835, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 1.9421099722789443, + "language_loss": 0.63590753, + "learning_rate": 1.506446264718213e-06, + "loss": 0.6569559, + "num_input_tokens_seen": 212143960, + "step": 9846, + "time_per_iteration": 2.711371421813965 + }, + { + "auxiliary_loss_clip": 0.0107157, + "auxiliary_loss_mlp": 0.00786157, + "balance_loss_clip": 1.03969145, + "balance_loss_mlp": 1.01349831, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 1.9881070420526656, + "language_loss": 0.76501119, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78358847, + "num_input_tokens_seen": 212162005, + "step": 9847, + "time_per_iteration": 2.585439682006836 + }, + { + "auxiliary_loss_clip": 0.01088007, + "auxiliary_loss_mlp": 0.01029809, + "balance_loss_clip": 1.03804719, + "balance_loss_mlp": 1.01657724, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.6555255614634647, + "language_loss": 0.62148297, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.64266121, + "num_input_tokens_seen": 212181635, + "step": 9848, + "time_per_iteration": 2.5455000400543213 + }, + { + "auxiliary_loss_clip": 0.01102427, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.03977418, + "balance_loss_mlp": 1.02164268, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 1.7745316391621855, + "language_loss": 0.76170218, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.78306657, + "num_input_tokens_seen": 212201615, + "step": 9849, + "time_per_iteration": 2.5041754245758057 + }, + { + "auxiliary_loss_clip": 0.01086643, + "auxiliary_loss_mlp": 0.0103282, + "balance_loss_clip": 1.03692365, + "balance_loss_mlp": 1.01948667, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 1.7556690663919403, + "language_loss": 0.75196922, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77316386, + "num_input_tokens_seen": 212219355, + "step": 9850, + "time_per_iteration": 3.974691867828369 + }, + { + "auxiliary_loss_clip": 0.01076381, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.03938556, + "balance_loss_mlp": 1.01824689, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 1.945095142186983, + "language_loss": 0.75400156, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.77508378, + "num_input_tokens_seen": 212236710, + "step": 9851, + "time_per_iteration": 2.5520565509796143 + }, + { + "auxiliary_loss_clip": 0.01091127, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.03992486, + "balance_loss_mlp": 1.01939535, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 1.8919764684737679, + "language_loss": 0.7040534, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.72528362, + "num_input_tokens_seen": 212256195, + "step": 9852, + "time_per_iteration": 2.548759937286377 + }, + { + "auxiliary_loss_clip": 0.01095862, + "auxiliary_loss_mlp": 0.0078804, + "balance_loss_clip": 1.04011524, + "balance_loss_mlp": 1.01216006, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 1.6090319990815676, + "language_loss": 0.80295765, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82179666, + "num_input_tokens_seen": 212274085, + "step": 9853, + "time_per_iteration": 3.914362907409668 + }, + { + "auxiliary_loss_clip": 0.01086923, + "auxiliary_loss_mlp": 0.01026305, + "balance_loss_clip": 1.03888106, + "balance_loss_mlp": 1.0150274, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 2.352007838498656, + "language_loss": 0.67241514, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69354743, + "num_input_tokens_seen": 212295530, + "step": 9854, + "time_per_iteration": 2.59879994392395 + }, + { + "auxiliary_loss_clip": 0.0108378, + "auxiliary_loss_mlp": 0.01028277, + "balance_loss_clip": 1.03945851, + "balance_loss_mlp": 1.01578355, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.7138799087444703, + "language_loss": 0.88820267, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.90932322, + "num_input_tokens_seen": 212313770, + "step": 9855, + "time_per_iteration": 2.5400307178497314 + }, + { + "auxiliary_loss_clip": 0.01101517, + "auxiliary_loss_mlp": 0.01029643, + "balance_loss_clip": 1.04023695, + "balance_loss_mlp": 1.01815104, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 1.6914566048389879, + "language_loss": 0.87011135, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.89142299, + "num_input_tokens_seen": 212331525, + "step": 9856, + "time_per_iteration": 3.8931527137756348 + }, + { + "auxiliary_loss_clip": 0.01100305, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.03776622, + "balance_loss_mlp": 1.02076912, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 2.1509412297441606, + "language_loss": 0.77796149, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79928887, + "num_input_tokens_seen": 212347295, + "step": 9857, + "time_per_iteration": 2.460188388824463 + }, + { + "auxiliary_loss_clip": 0.01070509, + "auxiliary_loss_mlp": 0.010456, + "balance_loss_clip": 1.03666258, + "balance_loss_mlp": 1.03054404, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 2.0611348538119443, + "language_loss": 0.64408517, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66524625, + "num_input_tokens_seen": 212365750, + "step": 9858, + "time_per_iteration": 2.548243522644043 + }, + { + "auxiliary_loss_clip": 0.01099923, + "auxiliary_loss_mlp": 0.01027326, + "balance_loss_clip": 1.04045558, + "balance_loss_mlp": 1.01499999, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 1.861983520034219, + "language_loss": 0.76944762, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79072011, + "num_input_tokens_seen": 212385300, + "step": 9859, + "time_per_iteration": 2.5366523265838623 + }, + { + "auxiliary_loss_clip": 0.01070226, + "auxiliary_loss_mlp": 0.0078616, + "balance_loss_clip": 1.04022706, + "balance_loss_mlp": 1.00947225, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 1.9034874480074724, + "language_loss": 0.75506556, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.77362943, + "num_input_tokens_seen": 212402140, + "step": 9860, + "time_per_iteration": 2.6008763313293457 + }, + { + "auxiliary_loss_clip": 0.01077186, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.0417496, + "balance_loss_mlp": 1.02084148, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 1.5388682379169103, + "language_loss": 0.76042235, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78151715, + "num_input_tokens_seen": 212421790, + "step": 9861, + "time_per_iteration": 2.5807487964630127 + }, + { + "auxiliary_loss_clip": 0.01074882, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.03693175, + "balance_loss_mlp": 1.01796889, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 1.6613216357323606, + "language_loss": 0.70428526, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72532618, + "num_input_tokens_seen": 212442115, + "step": 9862, + "time_per_iteration": 2.6039986610412598 + }, + { + "auxiliary_loss_clip": 0.01057782, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.03697205, + "balance_loss_mlp": 1.02276707, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 1.7937472333377105, + "language_loss": 0.77674246, + "learning_rate": 1.500032899685832e-06, + "loss": 0.79767048, + "num_input_tokens_seen": 212459535, + "step": 9863, + "time_per_iteration": 2.6328370571136475 + }, + { + "auxiliary_loss_clip": 0.0108896, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.04194498, + "balance_loss_mlp": 1.02306151, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 1.7081637176838518, + "language_loss": 0.704476, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72572637, + "num_input_tokens_seen": 212479385, + "step": 9864, + "time_per_iteration": 2.597909688949585 + }, + { + "auxiliary_loss_clip": 0.01086546, + "auxiliary_loss_mlp": 0.01032078, + "balance_loss_clip": 1.03879309, + "balance_loss_mlp": 1.01880407, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 1.582104930575943, + "language_loss": 0.67374563, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69493186, + "num_input_tokens_seen": 212500060, + "step": 9865, + "time_per_iteration": 2.595360040664673 + }, + { + "auxiliary_loss_clip": 0.01094316, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.03960347, + "balance_loss_mlp": 1.02137673, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 2.0892524101652827, + "language_loss": 0.7844094, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.80570334, + "num_input_tokens_seen": 212518590, + "step": 9866, + "time_per_iteration": 2.5086817741394043 + }, + { + "auxiliary_loss_clip": 0.01087272, + "auxiliary_loss_mlp": 0.01026949, + "balance_loss_clip": 1.0428443, + "balance_loss_mlp": 1.01518297, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 2.3946422043220816, + "language_loss": 0.72184849, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.74299073, + "num_input_tokens_seen": 212538190, + "step": 9867, + "time_per_iteration": 2.6354801654815674 + }, + { + "auxiliary_loss_clip": 0.01090008, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.04020524, + "balance_loss_mlp": 1.01884961, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.533993730977218, + "language_loss": 0.66655385, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68777627, + "num_input_tokens_seen": 212557820, + "step": 9868, + "time_per_iteration": 2.5394532680511475 + }, + { + "auxiliary_loss_clip": 0.01055969, + "auxiliary_loss_mlp": 0.00786531, + "balance_loss_clip": 1.03622556, + "balance_loss_mlp": 1.01135468, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 1.485757783492498, + "language_loss": 0.75397348, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77239847, + "num_input_tokens_seen": 212577645, + "step": 9869, + "time_per_iteration": 2.665998935699463 + }, + { + "auxiliary_loss_clip": 0.0107226, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.04438031, + "balance_loss_mlp": 1.02335644, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.695929421402375, + "language_loss": 0.74059808, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76169026, + "num_input_tokens_seen": 212603430, + "step": 9870, + "time_per_iteration": 2.9447684288024902 + }, + { + "auxiliary_loss_clip": 0.01071845, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.04079032, + "balance_loss_mlp": 1.01772022, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 2.1091478710565825, + "language_loss": 0.72341073, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.74443114, + "num_input_tokens_seen": 212620730, + "step": 9871, + "time_per_iteration": 2.6282167434692383 + }, + { + "auxiliary_loss_clip": 0.01081017, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.03962374, + "balance_loss_mlp": 1.02146745, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 1.8491600721977461, + "language_loss": 0.74199682, + "learning_rate": 1.496639802503271e-06, + "loss": 0.7631557, + "num_input_tokens_seen": 212639745, + "step": 9872, + "time_per_iteration": 2.592555522918701 + }, + { + "auxiliary_loss_clip": 0.01105774, + "auxiliary_loss_mlp": 0.01033745, + "balance_loss_clip": 1.03941345, + "balance_loss_mlp": 1.01982129, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.139377049560551, + "language_loss": 0.79569316, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.81708825, + "num_input_tokens_seen": 212655915, + "step": 9873, + "time_per_iteration": 2.4909873008728027 + }, + { + "auxiliary_loss_clip": 0.01102829, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.04025936, + "balance_loss_mlp": 1.01997149, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.5986979303218471, + "language_loss": 0.84910524, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87045956, + "num_input_tokens_seen": 212676115, + "step": 9874, + "time_per_iteration": 2.542112112045288 + }, + { + "auxiliary_loss_clip": 0.01029309, + "auxiliary_loss_mlp": 0.01004002, + "balance_loss_clip": 1.02346742, + "balance_loss_mlp": 1.00266063, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.7141555613170256, + "language_loss": 0.60065114, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62098432, + "num_input_tokens_seen": 212737560, + "step": 9875, + "time_per_iteration": 3.216447353363037 + }, + { + "auxiliary_loss_clip": 0.01091187, + "auxiliary_loss_mlp": 0.01033532, + "balance_loss_clip": 1.03756762, + "balance_loss_mlp": 1.01908958, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 2.2212223089953795, + "language_loss": 0.77613425, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.7973814, + "num_input_tokens_seen": 212755365, + "step": 9876, + "time_per_iteration": 2.501455307006836 + }, + { + "auxiliary_loss_clip": 0.01096131, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.03669047, + "balance_loss_mlp": 1.01772714, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.4844602311851767, + "language_loss": 0.75932467, + "learning_rate": 1.494755415907243e-06, + "loss": 0.78058332, + "num_input_tokens_seen": 212773875, + "step": 9877, + "time_per_iteration": 2.5205228328704834 + }, + { + "auxiliary_loss_clip": 0.01100634, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.03684437, + "balance_loss_mlp": 1.01781678, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 2.240245576893391, + "language_loss": 0.81115097, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.8324703, + "num_input_tokens_seen": 212790590, + "step": 9878, + "time_per_iteration": 2.467648506164551 + }, + { + "auxiliary_loss_clip": 0.0108592, + "auxiliary_loss_mlp": 0.00789427, + "balance_loss_clip": 1.03731871, + "balance_loss_mlp": 1.01361036, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 2.013554050118306, + "language_loss": 0.7108115, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.72956491, + "num_input_tokens_seen": 212812265, + "step": 9879, + "time_per_iteration": 2.748717784881592 + }, + { + "auxiliary_loss_clip": 0.01099443, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.03882217, + "balance_loss_mlp": 1.02052689, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.6633596186124735, + "language_loss": 0.57777631, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59909892, + "num_input_tokens_seen": 212831915, + "step": 9880, + "time_per_iteration": 3.878962755203247 + }, + { + "auxiliary_loss_clip": 0.01101919, + "auxiliary_loss_mlp": 0.01038005, + "balance_loss_clip": 1.03806078, + "balance_loss_mlp": 1.02470684, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 1.7697321674140203, + "language_loss": 0.77656424, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.7979635, + "num_input_tokens_seen": 212851350, + "step": 9881, + "time_per_iteration": 2.5421557426452637 + }, + { + "auxiliary_loss_clip": 0.01098524, + "auxiliary_loss_mlp": 0.0102823, + "balance_loss_clip": 1.03649139, + "balance_loss_mlp": 1.01602292, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 2.1422483564596257, + "language_loss": 0.82581383, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84708142, + "num_input_tokens_seen": 212867995, + "step": 9882, + "time_per_iteration": 2.466517686843872 + }, + { + "auxiliary_loss_clip": 0.01100199, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.03823781, + "balance_loss_mlp": 1.02385902, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.4012796297569277, + "language_loss": 0.79718667, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81854445, + "num_input_tokens_seen": 212885220, + "step": 9883, + "time_per_iteration": 2.4641411304473877 + }, + { + "auxiliary_loss_clip": 0.01085098, + "auxiliary_loss_mlp": 0.0078619, + "balance_loss_clip": 1.04476762, + "balance_loss_mlp": 1.01141906, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 2.134529982944676, + "language_loss": 0.74623209, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.76494491, + "num_input_tokens_seen": 212903195, + "step": 9884, + "time_per_iteration": 2.5674521923065186 + }, + { + "auxiliary_loss_clip": 0.01114208, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.04095745, + "balance_loss_mlp": 1.01915443, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 2.340225712451861, + "language_loss": 0.66142064, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68288374, + "num_input_tokens_seen": 212923340, + "step": 9885, + "time_per_iteration": 2.5157217979431152 + }, + { + "auxiliary_loss_clip": 0.01089189, + "auxiliary_loss_mlp": 0.01039127, + "balance_loss_clip": 1.03976357, + "balance_loss_mlp": 1.02637124, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 2.4255074821586575, + "language_loss": 0.77457261, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79585576, + "num_input_tokens_seen": 212942755, + "step": 9886, + "time_per_iteration": 2.573957920074463 + }, + { + "auxiliary_loss_clip": 0.0103233, + "auxiliary_loss_mlp": 0.0100115, + "balance_loss_clip": 1.022367, + "balance_loss_mlp": 0.99993438, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.862837093346675, + "language_loss": 0.64530551, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66564029, + "num_input_tokens_seen": 212999355, + "step": 9887, + "time_per_iteration": 2.991119861602783 + }, + { + "auxiliary_loss_clip": 0.01096838, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.03936362, + "balance_loss_mlp": 1.01738667, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 2.018216627000778, + "language_loss": 0.69329154, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71456099, + "num_input_tokens_seen": 213018570, + "step": 9888, + "time_per_iteration": 2.5026822090148926 + }, + { + "auxiliary_loss_clip": 0.01084951, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.03680241, + "balance_loss_mlp": 1.0177002, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.5614394370071636, + "language_loss": 0.79486835, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81603384, + "num_input_tokens_seen": 213037735, + "step": 9889, + "time_per_iteration": 3.958974599838257 + }, + { + "auxiliary_loss_clip": 0.01076651, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.03798008, + "balance_loss_mlp": 1.01938057, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 2.0143273525859646, + "language_loss": 0.70347524, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72456175, + "num_input_tokens_seen": 213057160, + "step": 9890, + "time_per_iteration": 2.544884443283081 + }, + { + "auxiliary_loss_clip": 0.01078601, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.03856206, + "balance_loss_mlp": 1.01922548, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 1.9948031566568865, + "language_loss": 0.69487274, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71598184, + "num_input_tokens_seen": 213073630, + "step": 9891, + "time_per_iteration": 3.916905641555786 + }, + { + "auxiliary_loss_clip": 0.01098576, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.0378921, + "balance_loss_mlp": 1.02174997, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 1.79919145934123, + "language_loss": 0.53321147, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55454111, + "num_input_tokens_seen": 213092450, + "step": 9892, + "time_per_iteration": 2.4892539978027344 + }, + { + "auxiliary_loss_clip": 0.0101252, + "auxiliary_loss_mlp": 0.01001987, + "balance_loss_clip": 1.02158308, + "balance_loss_mlp": 1.00059783, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6648049247026984, + "language_loss": 0.54517436, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56531942, + "num_input_tokens_seen": 213155465, + "step": 9893, + "time_per_iteration": 3.221587896347046 + }, + { + "auxiliary_loss_clip": 0.01072481, + "auxiliary_loss_mlp": 0.01028502, + "balance_loss_clip": 1.0386219, + "balance_loss_mlp": 1.0169028, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.6150938406979223, + "language_loss": 0.74806881, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.76907855, + "num_input_tokens_seen": 213174875, + "step": 9894, + "time_per_iteration": 4.0031208992004395 + }, + { + "auxiliary_loss_clip": 0.01078028, + "auxiliary_loss_mlp": 0.01027931, + "balance_loss_clip": 1.03744316, + "balance_loss_mlp": 1.01592112, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 1.8328721191747943, + "language_loss": 0.7779544, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79901403, + "num_input_tokens_seen": 213192695, + "step": 9895, + "time_per_iteration": 2.536914110183716 + }, + { + "auxiliary_loss_clip": 0.01061158, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.03524113, + "balance_loss_mlp": 1.01796484, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.8102334658804082, + "language_loss": 0.79003006, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.81095469, + "num_input_tokens_seen": 213211195, + "step": 9896, + "time_per_iteration": 2.6095926761627197 + }, + { + "auxiliary_loss_clip": 0.0110129, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.03842342, + "balance_loss_mlp": 1.02022529, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 1.4755857071709906, + "language_loss": 0.83230621, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.85364914, + "num_input_tokens_seen": 213231975, + "step": 9897, + "time_per_iteration": 2.5507583618164062 + }, + { + "auxiliary_loss_clip": 0.01094842, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.04020929, + "balance_loss_mlp": 1.02184904, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 1.8631106953418137, + "language_loss": 0.70362848, + "learning_rate": 1.486846243389939e-06, + "loss": 0.7249198, + "num_input_tokens_seen": 213249760, + "step": 9898, + "time_per_iteration": 2.539904832839966 + }, + { + "auxiliary_loss_clip": 0.01101505, + "auxiliary_loss_mlp": 0.01042889, + "balance_loss_clip": 1.03783953, + "balance_loss_mlp": 1.02703393, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.237914418290006, + "language_loss": 0.64122039, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66266435, + "num_input_tokens_seen": 213269890, + "step": 9899, + "time_per_iteration": 2.578726291656494 + }, + { + "auxiliary_loss_clip": 0.01109309, + "auxiliary_loss_mlp": 0.01026389, + "balance_loss_clip": 1.03830779, + "balance_loss_mlp": 1.01532078, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.6573328855171923, + "language_loss": 0.72140336, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.74276036, + "num_input_tokens_seen": 213289400, + "step": 9900, + "time_per_iteration": 2.5060765743255615 + }, + { + "auxiliary_loss_clip": 0.01108556, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.03844059, + "balance_loss_mlp": 1.01740289, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 1.6924728637463484, + "language_loss": 0.84392416, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.8653065, + "num_input_tokens_seen": 213308040, + "step": 9901, + "time_per_iteration": 2.4726016521453857 + }, + { + "auxiliary_loss_clip": 0.01005666, + "auxiliary_loss_mlp": 0.01007597, + "balance_loss_clip": 1.02270305, + "balance_loss_mlp": 1.00636292, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.8024002967358176, + "language_loss": 0.58276147, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60289413, + "num_input_tokens_seen": 213358585, + "step": 9902, + "time_per_iteration": 3.09171724319458 + }, + { + "auxiliary_loss_clip": 0.01054401, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.03985071, + "balance_loss_mlp": 1.02018487, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 1.677296382406316, + "language_loss": 0.77128893, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79216152, + "num_input_tokens_seen": 213379585, + "step": 9903, + "time_per_iteration": 2.887528657913208 + }, + { + "auxiliary_loss_clip": 0.01075971, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.04032373, + "balance_loss_mlp": 1.02293682, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 2.715804357459143, + "language_loss": 0.78005838, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.80116677, + "num_input_tokens_seen": 213401465, + "step": 9904, + "time_per_iteration": 2.6910061836242676 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.04118919, + "balance_loss_mlp": 1.02167392, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 1.599259808304985, + "language_loss": 0.72802532, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.74937546, + "num_input_tokens_seen": 213422720, + "step": 9905, + "time_per_iteration": 2.582089900970459 + }, + { + "auxiliary_loss_clip": 0.01101946, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.03781736, + "balance_loss_mlp": 1.01852167, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.8894650969618925, + "language_loss": 0.69754016, + "learning_rate": 1.483835475336295e-06, + "loss": 0.71887362, + "num_input_tokens_seen": 213439480, + "step": 9906, + "time_per_iteration": 2.4769270420074463 + }, + { + "auxiliary_loss_clip": 0.01101669, + "auxiliary_loss_mlp": 0.01036825, + "balance_loss_clip": 1.04016018, + "balance_loss_mlp": 1.02411699, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 2.0475935065787176, + "language_loss": 0.75090694, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77229184, + "num_input_tokens_seen": 213458895, + "step": 9907, + "time_per_iteration": 2.5450284481048584 + }, + { + "auxiliary_loss_clip": 0.01086305, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.04040527, + "balance_loss_mlp": 1.02199471, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.4449155807552163, + "language_loss": 0.66762429, + "learning_rate": 1.483082978767595e-06, + "loss": 0.68883157, + "num_input_tokens_seen": 213481730, + "step": 9908, + "time_per_iteration": 2.63934588432312 + }, + { + "auxiliary_loss_clip": 0.01039039, + "auxiliary_loss_mlp": 0.01031933, + "balance_loss_clip": 1.03553379, + "balance_loss_mlp": 1.02002418, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 2.03279110852491, + "language_loss": 0.76309848, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78380817, + "num_input_tokens_seen": 213497225, + "step": 9909, + "time_per_iteration": 2.656296730041504 + }, + { + "auxiliary_loss_clip": 0.01040471, + "auxiliary_loss_mlp": 0.01007415, + "balance_loss_clip": 1.01593447, + "balance_loss_mlp": 1.00614572, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9275788849983656, + "language_loss": 0.73391551, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75439435, + "num_input_tokens_seen": 213556890, + "step": 9910, + "time_per_iteration": 3.1315133571624756 + }, + { + "auxiliary_loss_clip": 0.0108864, + "auxiliary_loss_mlp": 0.01034284, + "balance_loss_clip": 1.04212213, + "balance_loss_mlp": 1.02173114, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 1.6124512825927848, + "language_loss": 0.69472575, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71595502, + "num_input_tokens_seen": 213575800, + "step": 9911, + "time_per_iteration": 2.5535411834716797 + }, + { + "auxiliary_loss_clip": 0.01109977, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.04162765, + "balance_loss_mlp": 1.0226469, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 1.9116322958736558, + "language_loss": 0.65578735, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.67724854, + "num_input_tokens_seen": 213592740, + "step": 9912, + "time_per_iteration": 2.451472759246826 + }, + { + "auxiliary_loss_clip": 0.01081522, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.03756559, + "balance_loss_mlp": 1.02162516, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 2.0174755107934064, + "language_loss": 0.73462057, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75578445, + "num_input_tokens_seen": 213611970, + "step": 9913, + "time_per_iteration": 2.600966215133667 + }, + { + "auxiliary_loss_clip": 0.01086639, + "auxiliary_loss_mlp": 0.00786152, + "balance_loss_clip": 1.04114628, + "balance_loss_mlp": 1.00907373, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 2.0391472080313835, + "language_loss": 0.80441689, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.82314479, + "num_input_tokens_seen": 213632230, + "step": 9914, + "time_per_iteration": 2.6229355335235596 + }, + { + "auxiliary_loss_clip": 0.01077418, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.03807545, + "balance_loss_mlp": 1.02171922, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.860059397535033, + "language_loss": 0.67537236, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.69648826, + "num_input_tokens_seen": 213649645, + "step": 9915, + "time_per_iteration": 2.5574097633361816 + }, + { + "auxiliary_loss_clip": 0.01087435, + "auxiliary_loss_mlp": 0.01033603, + "balance_loss_clip": 1.04017234, + "balance_loss_mlp": 1.02143192, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.6779341832809922, + "language_loss": 0.78899252, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.81020296, + "num_input_tokens_seen": 213668850, + "step": 9916, + "time_per_iteration": 2.5337300300598145 + }, + { + "auxiliary_loss_clip": 0.01090272, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.03732061, + "balance_loss_mlp": 1.01813126, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 2.112220237370678, + "language_loss": 0.82733524, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.8485465, + "num_input_tokens_seen": 213685695, + "step": 9917, + "time_per_iteration": 2.5230658054351807 + }, + { + "auxiliary_loss_clip": 0.01084245, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.03763437, + "balance_loss_mlp": 1.02403355, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 1.998308255896773, + "language_loss": 0.77536386, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79657054, + "num_input_tokens_seen": 213703515, + "step": 9918, + "time_per_iteration": 2.508625030517578 + }, + { + "auxiliary_loss_clip": 0.011028, + "auxiliary_loss_mlp": 0.01037278, + "balance_loss_clip": 1.04080439, + "balance_loss_mlp": 1.02462947, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.4941230028775716, + "language_loss": 0.78969336, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.81109411, + "num_input_tokens_seen": 213724170, + "step": 9919, + "time_per_iteration": 3.9412906169891357 + }, + { + "auxiliary_loss_clip": 0.01087666, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.04023504, + "balance_loss_mlp": 1.01963091, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 6.158667021537544, + "language_loss": 0.77535331, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.79655957, + "num_input_tokens_seen": 213740620, + "step": 9920, + "time_per_iteration": 2.521289587020874 + }, + { + "auxiliary_loss_clip": 0.01100342, + "auxiliary_loss_mlp": 0.01036639, + "balance_loss_clip": 1.04024756, + "balance_loss_mlp": 1.02221429, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.357372067891289, + "language_loss": 0.82598889, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.8473587, + "num_input_tokens_seen": 213755390, + "step": 9921, + "time_per_iteration": 2.4739091396331787 + }, + { + "auxiliary_loss_clip": 0.01100088, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.04002142, + "balance_loss_mlp": 1.01759648, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 1.8921866960932194, + "language_loss": 0.80833578, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82965207, + "num_input_tokens_seen": 213773225, + "step": 9922, + "time_per_iteration": 2.4793074131011963 + }, + { + "auxiliary_loss_clip": 0.01102302, + "auxiliary_loss_mlp": 0.00784794, + "balance_loss_clip": 1.03800893, + "balance_loss_mlp": 1.00918078, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 2.342034338988647, + "language_loss": 0.77173853, + "learning_rate": 1.477441761580111e-06, + "loss": 0.79060948, + "num_input_tokens_seen": 213791860, + "step": 9923, + "time_per_iteration": 2.5016345977783203 + }, + { + "auxiliary_loss_clip": 0.01099735, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.04068804, + "balance_loss_mlp": 1.02154887, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.958172843801533, + "language_loss": 0.76033211, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.78169525, + "num_input_tokens_seen": 213809455, + "step": 9924, + "time_per_iteration": 2.5159738063812256 + }, + { + "auxiliary_loss_clip": 0.01093692, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.03634071, + "balance_loss_mlp": 1.02405918, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 1.905418099156943, + "language_loss": 0.66658163, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68790829, + "num_input_tokens_seen": 213826615, + "step": 9925, + "time_per_iteration": 2.4532430171966553 + }, + { + "auxiliary_loss_clip": 0.01085397, + "auxiliary_loss_mlp": 0.01038931, + "balance_loss_clip": 1.04368818, + "balance_loss_mlp": 1.02529347, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.082044141284348, + "language_loss": 0.71400452, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.73524779, + "num_input_tokens_seen": 213844495, + "step": 9926, + "time_per_iteration": 2.5140082836151123 + }, + { + "auxiliary_loss_clip": 0.01067154, + "auxiliary_loss_mlp": 0.00788388, + "balance_loss_clip": 1.03603327, + "balance_loss_mlp": 1.01054525, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.8901173497320498, + "language_loss": 0.70230258, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.72085798, + "num_input_tokens_seen": 213869125, + "step": 9927, + "time_per_iteration": 2.775291919708252 + }, + { + "auxiliary_loss_clip": 0.01071971, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.04107928, + "balance_loss_mlp": 1.01650143, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.8761927242970284, + "language_loss": 0.6366837, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.65770817, + "num_input_tokens_seen": 213891115, + "step": 9928, + "time_per_iteration": 4.160938262939453 + }, + { + "auxiliary_loss_clip": 0.01109795, + "auxiliary_loss_mlp": 0.01031217, + "balance_loss_clip": 1.03759372, + "balance_loss_mlp": 1.01892602, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.6192415440609704, + "language_loss": 0.69579244, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71720254, + "num_input_tokens_seen": 213911925, + "step": 9929, + "time_per_iteration": 3.9009790420532227 + }, + { + "auxiliary_loss_clip": 0.01065301, + "auxiliary_loss_mlp": 0.01030544, + "balance_loss_clip": 1.03899539, + "balance_loss_mlp": 1.01898623, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.7669403123104326, + "language_loss": 0.76367021, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78462863, + "num_input_tokens_seen": 213930715, + "step": 9930, + "time_per_iteration": 2.596670627593994 + }, + { + "auxiliary_loss_clip": 0.01091422, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.04268742, + "balance_loss_mlp": 1.01958084, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 1.5495078475348034, + "language_loss": 0.6887964, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.71004808, + "num_input_tokens_seen": 213950015, + "step": 9931, + "time_per_iteration": 2.5316030979156494 + }, + { + "auxiliary_loss_clip": 0.01031462, + "auxiliary_loss_mlp": 0.01002353, + "balance_loss_clip": 1.0212822, + "balance_loss_mlp": 1.00084484, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.8549358942710722, + "language_loss": 0.6422137, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66255188, + "num_input_tokens_seen": 214003330, + "step": 9932, + "time_per_iteration": 3.0554797649383545 + }, + { + "auxiliary_loss_clip": 0.01081903, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.03865552, + "balance_loss_mlp": 1.01628447, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 2.030760897076716, + "language_loss": 0.73987758, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76099324, + "num_input_tokens_seen": 214021680, + "step": 9933, + "time_per_iteration": 3.9462296962738037 + }, + { + "auxiliary_loss_clip": 0.01042052, + "auxiliary_loss_mlp": 0.01002729, + "balance_loss_clip": 1.03416657, + "balance_loss_mlp": 1.00120318, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6685761530391885, + "language_loss": 0.52001274, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54046053, + "num_input_tokens_seen": 214090265, + "step": 9934, + "time_per_iteration": 3.2040419578552246 + }, + { + "auxiliary_loss_clip": 0.01043251, + "auxiliary_loss_mlp": 0.01003202, + "balance_loss_clip": 1.01859117, + "balance_loss_mlp": 1.00195599, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8303508399286358, + "language_loss": 0.54184294, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56230748, + "num_input_tokens_seen": 214146375, + "step": 9935, + "time_per_iteration": 3.0248618125915527 + }, + { + "auxiliary_loss_clip": 0.01093962, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.04049563, + "balance_loss_mlp": 1.02114224, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.5739976533121374, + "language_loss": 0.659971, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.68126082, + "num_input_tokens_seen": 214165340, + "step": 9936, + "time_per_iteration": 2.555187463760376 + }, + { + "auxiliary_loss_clip": 0.01063084, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.04163027, + "balance_loss_mlp": 1.0235424, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 2.4972349090020196, + "language_loss": 0.67417616, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.69517189, + "num_input_tokens_seen": 214181360, + "step": 9937, + "time_per_iteration": 2.578324556350708 + }, + { + "auxiliary_loss_clip": 0.01104724, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.03940046, + "balance_loss_mlp": 1.01831436, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 2.3822933670878768, + "language_loss": 0.77450418, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79586637, + "num_input_tokens_seen": 214198525, + "step": 9938, + "time_per_iteration": -0.15279340744018555 + }, + { + "auxiliary_loss_clip": 0.01101715, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.03743696, + "balance_loss_mlp": 1.01753688, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.639244316691942, + "language_loss": 0.75531, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.77663696, + "num_input_tokens_seen": 214218710, + "step": 9939, + "time_per_iteration": 2.553797483444214 + }, + { + "auxiliary_loss_clip": 0.01074603, + "auxiliary_loss_mlp": 0.01034866, + "balance_loss_clip": 1.04097939, + "balance_loss_mlp": 1.01957774, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 2.1198116975492236, + "language_loss": 0.67961419, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70070887, + "num_input_tokens_seen": 214237800, + "step": 9940, + "time_per_iteration": 2.6156911849975586 + }, + { + "auxiliary_loss_clip": 0.01086864, + "auxiliary_loss_mlp": 0.01033649, + "balance_loss_clip": 1.04089689, + "balance_loss_mlp": 1.02180552, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.3610750392125175, + "language_loss": 0.7028901, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72409523, + "num_input_tokens_seen": 214260355, + "step": 9941, + "time_per_iteration": 2.688088893890381 + }, + { + "auxiliary_loss_clip": 0.01090483, + "auxiliary_loss_mlp": 0.01032781, + "balance_loss_clip": 1.03843665, + "balance_loss_mlp": 1.01941228, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 2.0442114160901745, + "language_loss": 0.77498066, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79621327, + "num_input_tokens_seen": 214277120, + "step": 9942, + "time_per_iteration": 2.5859720706939697 + }, + { + "auxiliary_loss_clip": 0.0106455, + "auxiliary_loss_mlp": 0.01038434, + "balance_loss_clip": 1.03912997, + "balance_loss_mlp": 1.02556562, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 1.968505086226769, + "language_loss": 0.75784367, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.7788735, + "num_input_tokens_seen": 214295300, + "step": 9943, + "time_per_iteration": 2.607978105545044 + }, + { + "auxiliary_loss_clip": 0.01053578, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.03855121, + "balance_loss_mlp": 1.02142906, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.8145597272272949, + "language_loss": 0.62346232, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64432991, + "num_input_tokens_seen": 214317050, + "step": 9944, + "time_per_iteration": 2.76664662361145 + }, + { + "auxiliary_loss_clip": 0.01091241, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.03920972, + "balance_loss_mlp": 1.01992893, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 1.6320610528755304, + "language_loss": 0.72717929, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74841875, + "num_input_tokens_seen": 214337470, + "step": 9945, + "time_per_iteration": 2.6749441623687744 + }, + { + "auxiliary_loss_clip": 0.01062879, + "auxiliary_loss_mlp": 0.01034458, + "balance_loss_clip": 1.0400672, + "balance_loss_mlp": 1.02118385, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 1.9688571116306892, + "language_loss": 0.66930079, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69027412, + "num_input_tokens_seen": 214357975, + "step": 9946, + "time_per_iteration": 2.658874034881592 + }, + { + "auxiliary_loss_clip": 0.01099908, + "auxiliary_loss_mlp": 0.01042404, + "balance_loss_clip": 1.03763676, + "balance_loss_mlp": 1.02775884, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 2.8121075167084593, + "language_loss": 0.88837194, + "learning_rate": 1.468425107717461e-06, + "loss": 0.90979505, + "num_input_tokens_seen": 214374125, + "step": 9947, + "time_per_iteration": 2.4754021167755127 + }, + { + "auxiliary_loss_clip": 0.01106567, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.03714359, + "balance_loss_mlp": 1.02244759, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.7296805620945566, + "language_loss": 0.72271764, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.7441178, + "num_input_tokens_seen": 214393395, + "step": 9948, + "time_per_iteration": 2.477919816970825 + }, + { + "auxiliary_loss_clip": 0.01090151, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.03858924, + "balance_loss_mlp": 1.01865768, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 1.8877963961587216, + "language_loss": 0.89256656, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91379452, + "num_input_tokens_seen": 214411550, + "step": 9949, + "time_per_iteration": 2.5178425312042236 + }, + { + "auxiliary_loss_clip": 0.01098404, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.03819585, + "balance_loss_mlp": 1.01755762, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 2.025294997686301, + "language_loss": 0.70611203, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72738659, + "num_input_tokens_seen": 214429780, + "step": 9950, + "time_per_iteration": 2.458994150161743 + }, + { + "auxiliary_loss_clip": 0.01100612, + "auxiliary_loss_mlp": 0.01033299, + "balance_loss_clip": 1.03830361, + "balance_loss_mlp": 1.0205493, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.5658436154703812, + "language_loss": 0.78309256, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.80443162, + "num_input_tokens_seen": 214447775, + "step": 9951, + "time_per_iteration": 2.482403516769409 + }, + { + "auxiliary_loss_clip": 0.01090014, + "auxiliary_loss_mlp": 0.01035477, + "balance_loss_clip": 1.03847587, + "balance_loss_mlp": 1.02188134, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 1.6459489315392206, + "language_loss": 0.74011648, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.76137137, + "num_input_tokens_seen": 214467245, + "step": 9952, + "time_per_iteration": 2.5072500705718994 + }, + { + "auxiliary_loss_clip": 0.01090399, + "auxiliary_loss_mlp": 0.00790936, + "balance_loss_clip": 1.03710818, + "balance_loss_mlp": 1.01688266, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 3.179770037265985, + "language_loss": 0.79040891, + "learning_rate": 1.466172750724613e-06, + "loss": 0.80922222, + "num_input_tokens_seen": 214484385, + "step": 9953, + "time_per_iteration": 2.516258955001831 + }, + { + "auxiliary_loss_clip": 0.01078635, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.0368048, + "balance_loss_mlp": 1.01883459, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.4079486782394104, + "language_loss": 0.69663417, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.71772909, + "num_input_tokens_seen": 214503465, + "step": 9954, + "time_per_iteration": 2.620380401611328 + }, + { + "auxiliary_loss_clip": 0.0108761, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.03767979, + "balance_loss_mlp": 1.01731133, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 1.7690164240785402, + "language_loss": 0.73001599, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75118846, + "num_input_tokens_seen": 214520725, + "step": 9955, + "time_per_iteration": 2.51125168800354 + }, + { + "auxiliary_loss_clip": 0.01110852, + "auxiliary_loss_mlp": 0.01029638, + "balance_loss_clip": 1.0372901, + "balance_loss_mlp": 1.01684666, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.7835559076894665, + "language_loss": 0.68579543, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70720035, + "num_input_tokens_seen": 214540675, + "step": 9956, + "time_per_iteration": 3.9317078590393066 + }, + { + "auxiliary_loss_clip": 0.01114278, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.04028368, + "balance_loss_mlp": 1.02096188, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 1.9673797068581835, + "language_loss": 0.74095362, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.76243579, + "num_input_tokens_seen": 214559910, + "step": 9957, + "time_per_iteration": 2.4396958351135254 + }, + { + "auxiliary_loss_clip": 0.01073535, + "auxiliary_loss_mlp": 0.01028771, + "balance_loss_clip": 1.03662837, + "balance_loss_mlp": 1.01635551, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 1.7390693782133018, + "language_loss": 0.84983194, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.87085497, + "num_input_tokens_seen": 214575960, + "step": 9958, + "time_per_iteration": 2.5433621406555176 + }, + { + "auxiliary_loss_clip": 0.01080942, + "auxiliary_loss_mlp": 0.00788761, + "balance_loss_clip": 1.03764927, + "balance_loss_mlp": 1.01539588, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 1.7129298842779497, + "language_loss": 0.66301304, + "learning_rate": 1.463921122471864e-06, + "loss": 0.68171012, + "num_input_tokens_seen": 214594230, + "step": 9959, + "time_per_iteration": 2.57487416267395 + }, + { + "auxiliary_loss_clip": 0.01101359, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.03855109, + "balance_loss_mlp": 1.01731253, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.699800112394332, + "language_loss": 0.83356196, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85487413, + "num_input_tokens_seen": 214613130, + "step": 9960, + "time_per_iteration": 2.5008766651153564 + }, + { + "auxiliary_loss_clip": 0.01090775, + "auxiliary_loss_mlp": 0.01026026, + "balance_loss_clip": 1.03624046, + "balance_loss_mlp": 1.01406932, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.6932379922753558, + "language_loss": 0.79163313, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81280118, + "num_input_tokens_seen": 214634470, + "step": 9961, + "time_per_iteration": 2.5360066890716553 + }, + { + "auxiliary_loss_clip": 0.01109622, + "auxiliary_loss_mlp": 0.0102993, + "balance_loss_clip": 1.03743315, + "balance_loss_mlp": 1.01742542, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.6161378047906223, + "language_loss": 0.67028213, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69167769, + "num_input_tokens_seen": 214654030, + "step": 9962, + "time_per_iteration": 2.5018529891967773 + }, + { + "auxiliary_loss_clip": 0.0109492, + "auxiliary_loss_mlp": 0.01036387, + "balance_loss_clip": 1.03582644, + "balance_loss_mlp": 1.02237964, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 2.4311265134033238, + "language_loss": 0.74284446, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76415753, + "num_input_tokens_seen": 214676985, + "step": 9963, + "time_per_iteration": 2.5437021255493164 + }, + { + "auxiliary_loss_clip": 0.010987, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.03762984, + "balance_loss_mlp": 1.01888371, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 1.9373227927078158, + "language_loss": 0.68197942, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70327842, + "num_input_tokens_seen": 214700105, + "step": 9964, + "time_per_iteration": 2.629603862762451 + }, + { + "auxiliary_loss_clip": 0.01075555, + "auxiliary_loss_mlp": 0.01029123, + "balance_loss_clip": 1.03792787, + "balance_loss_mlp": 1.01572418, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 2.140079384373744, + "language_loss": 0.76875317, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.78979993, + "num_input_tokens_seen": 214717885, + "step": 9965, + "time_per_iteration": 2.54160475730896 + }, + { + "auxiliary_loss_clip": 0.01096892, + "auxiliary_loss_mlp": 0.01028845, + "balance_loss_clip": 1.04016089, + "balance_loss_mlp": 1.0164299, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 2.853214738541968, + "language_loss": 0.77355063, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79480797, + "num_input_tokens_seen": 214733680, + "step": 9966, + "time_per_iteration": 3.840423107147217 + }, + { + "auxiliary_loss_clip": 0.01073903, + "auxiliary_loss_mlp": 0.01027319, + "balance_loss_clip": 1.03661287, + "balance_loss_mlp": 1.01572585, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.5184891482317633, + "language_loss": 0.7356196, + "learning_rate": 1.460920090376422e-06, + "loss": 0.75663185, + "num_input_tokens_seen": 214753285, + "step": 9967, + "time_per_iteration": 2.5566506385803223 + }, + { + "auxiliary_loss_clip": 0.01104107, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.03896761, + "balance_loss_mlp": 1.02005625, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 2.1874404383089, + "language_loss": 0.68645012, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70782149, + "num_input_tokens_seen": 214767810, + "step": 9968, + "time_per_iteration": 3.831204652786255 + }, + { + "auxiliary_loss_clip": 0.01099824, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.03700709, + "balance_loss_mlp": 1.01981235, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.6247381169512767, + "language_loss": 0.79320014, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.81453145, + "num_input_tokens_seen": 214786040, + "step": 9969, + "time_per_iteration": 2.4817569255828857 + }, + { + "auxiliary_loss_clip": 0.01099553, + "auxiliary_loss_mlp": 0.01030315, + "balance_loss_clip": 1.03625309, + "balance_loss_mlp": 1.01775646, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 3.259396347304868, + "language_loss": 0.81121689, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.8325156, + "num_input_tokens_seen": 214803110, + "step": 9970, + "time_per_iteration": 2.464970350265503 + }, + { + "auxiliary_loss_clip": 0.01060044, + "auxiliary_loss_mlp": 0.01039992, + "balance_loss_clip": 1.03763771, + "balance_loss_mlp": 1.0249002, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 1.88287261595569, + "language_loss": 0.62268108, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64368141, + "num_input_tokens_seen": 214819945, + "step": 9971, + "time_per_iteration": 4.010250091552734 + }, + { + "auxiliary_loss_clip": 0.01107305, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.03753161, + "balance_loss_mlp": 1.02057898, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.619864367766327, + "language_loss": 0.7910223, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81241763, + "num_input_tokens_seen": 214838810, + "step": 9972, + "time_per_iteration": 2.5014054775238037 + }, + { + "auxiliary_loss_clip": 0.01070036, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.03677845, + "balance_loss_mlp": 1.02044678, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.3433235600783804, + "language_loss": 0.76146877, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78250587, + "num_input_tokens_seen": 214857040, + "step": 9973, + "time_per_iteration": 2.6118710041046143 + }, + { + "auxiliary_loss_clip": 0.01074863, + "auxiliary_loss_mlp": 0.01030518, + "balance_loss_clip": 1.03901875, + "balance_loss_mlp": 1.01800656, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 2.1030472768023993, + "language_loss": 0.65293252, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67398632, + "num_input_tokens_seen": 214873375, + "step": 9974, + "time_per_iteration": 2.558683395385742 + }, + { + "auxiliary_loss_clip": 0.01099061, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.03643036, + "balance_loss_mlp": 1.01751614, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.4002754057744817, + "language_loss": 0.74702406, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76831734, + "num_input_tokens_seen": 214893900, + "step": 9975, + "time_per_iteration": 2.52449893951416 + }, + { + "auxiliary_loss_clip": 0.01110682, + "auxiliary_loss_mlp": 0.01028474, + "balance_loss_clip": 1.03785825, + "balance_loss_mlp": 1.01533723, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.8546596572625194, + "language_loss": 0.77098382, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79237545, + "num_input_tokens_seen": 214912110, + "step": 9976, + "time_per_iteration": 2.4777016639709473 + }, + { + "auxiliary_loss_clip": 0.01090892, + "auxiliary_loss_mlp": 0.01038506, + "balance_loss_clip": 1.03621447, + "balance_loss_mlp": 1.02519083, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.394676883213683, + "language_loss": 0.74494195, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.76623595, + "num_input_tokens_seen": 214930140, + "step": 9977, + "time_per_iteration": 2.56866192817688 + }, + { + "auxiliary_loss_clip": 0.01075392, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.03867257, + "balance_loss_mlp": 1.01899171, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 2.122529937081405, + "language_loss": 0.69012928, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.71120179, + "num_input_tokens_seen": 214949200, + "step": 9978, + "time_per_iteration": 2.576420783996582 + }, + { + "auxiliary_loss_clip": 0.01118716, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.04264665, + "balance_loss_mlp": 1.01955771, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 1.930847031543128, + "language_loss": 0.80789244, + "learning_rate": 1.456420997543594e-06, + "loss": 0.82940757, + "num_input_tokens_seen": 214965775, + "step": 9979, + "time_per_iteration": 2.437883138656616 + }, + { + "auxiliary_loss_clip": 0.01105676, + "auxiliary_loss_mlp": 0.0103138, + "balance_loss_clip": 1.03632534, + "balance_loss_mlp": 1.01920891, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 1.7943686882502095, + "language_loss": 0.69894212, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.72031271, + "num_input_tokens_seen": 214982480, + "step": 9980, + "time_per_iteration": 2.443035125732422 + }, + { + "auxiliary_loss_clip": 0.01100059, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.03742337, + "balance_loss_mlp": 1.01756048, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 2.5929201752022677, + "language_loss": 0.68188822, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70320165, + "num_input_tokens_seen": 214998110, + "step": 9981, + "time_per_iteration": 2.466353178024292 + }, + { + "auxiliary_loss_clip": 0.01099796, + "auxiliary_loss_mlp": 0.01031603, + "balance_loss_clip": 1.03912926, + "balance_loss_mlp": 1.02048683, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 3.8311568841443346, + "language_loss": 0.78344584, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.80475974, + "num_input_tokens_seen": 215017995, + "step": 9982, + "time_per_iteration": 2.5512075424194336 + }, + { + "auxiliary_loss_clip": 0.01060003, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.03550911, + "balance_loss_mlp": 1.02544999, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.7094990159988908, + "language_loss": 0.73001802, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.7510249, + "num_input_tokens_seen": 215038285, + "step": 9983, + "time_per_iteration": 2.5976078510284424 + }, + { + "auxiliary_loss_clip": 0.01070241, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.03625476, + "balance_loss_mlp": 1.0180769, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 3.580002728486087, + "language_loss": 0.78129643, + "learning_rate": 1.454547250154447e-06, + "loss": 0.80230886, + "num_input_tokens_seen": 215057825, + "step": 9984, + "time_per_iteration": 2.5612192153930664 + }, + { + "auxiliary_loss_clip": 0.01099272, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.03801036, + "balance_loss_mlp": 1.0189836, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.8639196756054945, + "language_loss": 0.83415568, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.8554585, + "num_input_tokens_seen": 215077790, + "step": 9985, + "time_per_iteration": 2.6163461208343506 + }, + { + "auxiliary_loss_clip": 0.01099048, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.03778839, + "balance_loss_mlp": 1.02109194, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 1.9832185370832152, + "language_loss": 0.70600045, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.72732162, + "num_input_tokens_seen": 215097650, + "step": 9986, + "time_per_iteration": 2.5708351135253906 + }, + { + "auxiliary_loss_clip": 0.01113712, + "auxiliary_loss_mlp": 0.00787891, + "balance_loss_clip": 1.04021382, + "balance_loss_mlp": 1.01209807, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 1.5663550536344832, + "language_loss": 0.71615899, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73517501, + "num_input_tokens_seen": 215118235, + "step": 9987, + "time_per_iteration": 2.5417897701263428 + }, + { + "auxiliary_loss_clip": 0.01087486, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.03754115, + "balance_loss_mlp": 1.01894212, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.7036796877513132, + "language_loss": 0.84506053, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.86624324, + "num_input_tokens_seen": 215136755, + "step": 9988, + "time_per_iteration": 2.5509607791900635 + }, + { + "auxiliary_loss_clip": 0.0110018, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.03813744, + "balance_loss_mlp": 1.02334452, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 8.228756840956002, + "language_loss": 0.65192425, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.67328936, + "num_input_tokens_seen": 215155225, + "step": 9989, + "time_per_iteration": 2.4805426597595215 + }, + { + "auxiliary_loss_clip": 0.01096014, + "auxiliary_loss_mlp": 0.01035707, + "balance_loss_clip": 1.03547645, + "balance_loss_mlp": 1.02437568, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.6056280009443284, + "language_loss": 0.80855834, + "learning_rate": 1.452299436003257e-06, + "loss": 0.82987553, + "num_input_tokens_seen": 215174815, + "step": 9990, + "time_per_iteration": 2.531773090362549 + }, + { + "auxiliary_loss_clip": 0.01075435, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.04266191, + "balance_loss_mlp": 1.02238786, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 1.7997512067192623, + "language_loss": 0.83117062, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.85227585, + "num_input_tokens_seen": 215192045, + "step": 9991, + "time_per_iteration": 2.5668325424194336 + }, + { + "auxiliary_loss_clip": 0.01059375, + "auxiliary_loss_mlp": 0.01031658, + "balance_loss_clip": 1.03750944, + "balance_loss_mlp": 1.01810443, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 1.934758158684221, + "language_loss": 0.82606888, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.84697926, + "num_input_tokens_seen": 215209885, + "step": 9992, + "time_per_iteration": 2.6903655529022217 + }, + { + "auxiliary_loss_clip": 0.01086856, + "auxiliary_loss_mlp": 0.00787445, + "balance_loss_clip": 1.03711379, + "balance_loss_mlp": 1.01446509, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 1.9120570550179155, + "language_loss": 0.66055644, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.67929947, + "num_input_tokens_seen": 215228150, + "step": 9993, + "time_per_iteration": 2.741182327270508 + }, + { + "auxiliary_loss_clip": 0.01073553, + "auxiliary_loss_mlp": 0.01028204, + "balance_loss_clip": 1.03909647, + "balance_loss_mlp": 1.01515102, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 2.8458277610948555, + "language_loss": 0.80971175, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.83072937, + "num_input_tokens_seen": 215243755, + "step": 9994, + "time_per_iteration": 2.6786487102508545 + }, + { + "auxiliary_loss_clip": 0.01066765, + "auxiliary_loss_mlp": 0.01027861, + "balance_loss_clip": 1.03585184, + "balance_loss_mlp": 1.01611304, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 1.8170581974707647, + "language_loss": 0.72317708, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.74412334, + "num_input_tokens_seen": 215262130, + "step": 9995, + "time_per_iteration": 3.984469175338745 + }, + { + "auxiliary_loss_clip": 0.01089105, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.03649473, + "balance_loss_mlp": 1.01970673, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 1.7409343669513493, + "language_loss": 0.81288779, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.83410305, + "num_input_tokens_seen": 215281785, + "step": 9996, + "time_per_iteration": 2.6140377521514893 + }, + { + "auxiliary_loss_clip": 0.01049171, + "auxiliary_loss_mlp": 0.01043779, + "balance_loss_clip": 1.033584, + "balance_loss_mlp": 1.02959275, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 10.34631279393732, + "language_loss": 0.7867161, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80764556, + "num_input_tokens_seen": 215297550, + "step": 9997, + "time_per_iteration": 2.8118796348571777 + }, + { + "auxiliary_loss_clip": 0.01103018, + "auxiliary_loss_mlp": 0.01028977, + "balance_loss_clip": 1.03900909, + "balance_loss_mlp": 1.01580465, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 1.642982254397811, + "language_loss": 0.72720617, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.7485261, + "num_input_tokens_seen": 215316360, + "step": 9998, + "time_per_iteration": 2.697793960571289 + }, + { + "auxiliary_loss_clip": 0.01080905, + "auxiliary_loss_mlp": 0.01027578, + "balance_loss_clip": 1.04057693, + "balance_loss_mlp": 1.01567507, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.486840100577563, + "language_loss": 0.72455883, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74564362, + "num_input_tokens_seen": 215336405, + "step": 9999, + "time_per_iteration": 2.7487761974334717 + }, + { + "auxiliary_loss_clip": 0.01065872, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.03843439, + "balance_loss_mlp": 1.02114201, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 1.4672808716871348, + "language_loss": 0.78525561, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80625802, + "num_input_tokens_seen": 215356590, + "step": 10000, + "time_per_iteration": 2.6902832984924316 + }, + { + "auxiliary_loss_clip": 0.01116198, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.0403235, + "balance_loss_mlp": 1.0178721, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 2.5382312407458762, + "language_loss": 0.77747631, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79895103, + "num_input_tokens_seen": 215374295, + "step": 10001, + "time_per_iteration": 2.506680965423584 + }, + { + "auxiliary_loss_clip": 0.0110164, + "auxiliary_loss_mlp": 0.0102915, + "balance_loss_clip": 1.03822732, + "balance_loss_mlp": 1.01619208, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 1.8184031201503985, + "language_loss": 0.58914065, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.61044854, + "num_input_tokens_seen": 215394535, + "step": 10002, + "time_per_iteration": 2.691248655319214 + }, + { + "auxiliary_loss_clip": 0.0109344, + "auxiliary_loss_mlp": 0.01034637, + "balance_loss_clip": 1.03983879, + "balance_loss_mlp": 1.02042079, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 1.5107903509049625, + "language_loss": 0.77614057, + "learning_rate": 1.447431741055314e-06, + "loss": 0.79742134, + "num_input_tokens_seen": 215414355, + "step": 10003, + "time_per_iteration": 2.5917367935180664 + }, + { + "auxiliary_loss_clip": 0.0111465, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.0406009, + "balance_loss_mlp": 1.01877987, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 8.886514689833119, + "language_loss": 0.77389681, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.79535878, + "num_input_tokens_seen": 215428280, + "step": 10004, + "time_per_iteration": 3.924069881439209 + }, + { + "auxiliary_loss_clip": 0.01098082, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.03743792, + "balance_loss_mlp": 1.01664352, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.4582750139819272, + "language_loss": 0.72383267, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74510384, + "num_input_tokens_seen": 215448970, + "step": 10005, + "time_per_iteration": 2.5578627586364746 + }, + { + "auxiliary_loss_clip": 0.01109255, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.03986657, + "balance_loss_mlp": 1.0146904, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 2.1097385541079317, + "language_loss": 0.74524677, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.76660991, + "num_input_tokens_seen": 215465260, + "step": 10006, + "time_per_iteration": 3.9293792247772217 + }, + { + "auxiliary_loss_clip": 0.0108655, + "auxiliary_loss_mlp": 0.01031022, + "balance_loss_clip": 1.04013157, + "balance_loss_mlp": 1.01849294, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 1.8268246506023238, + "language_loss": 0.73822033, + "learning_rate": 1.445934699732685e-06, + "loss": 0.75939608, + "num_input_tokens_seen": 215482725, + "step": 10007, + "time_per_iteration": 2.5542588233947754 + }, + { + "auxiliary_loss_clip": 0.01087849, + "auxiliary_loss_mlp": 0.01025782, + "balance_loss_clip": 1.03711653, + "balance_loss_mlp": 1.01421225, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 1.78721342428747, + "language_loss": 0.70173252, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.7228688, + "num_input_tokens_seen": 215500420, + "step": 10008, + "time_per_iteration": 2.530632257461548 + }, + { + "auxiliary_loss_clip": 0.0110045, + "auxiliary_loss_mlp": 0.0102661, + "balance_loss_clip": 1.03837061, + "balance_loss_mlp": 1.01474833, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.7146552045949488, + "language_loss": 0.76314962, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78442025, + "num_input_tokens_seen": 215522260, + "step": 10009, + "time_per_iteration": 3.926481246948242 + }, + { + "auxiliary_loss_clip": 0.01085722, + "auxiliary_loss_mlp": 0.00789428, + "balance_loss_clip": 1.03799129, + "balance_loss_mlp": 1.0190345, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 2.84948881476015, + "language_loss": 0.74167722, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.76042873, + "num_input_tokens_seen": 215541715, + "step": 10010, + "time_per_iteration": 2.5889763832092285 + }, + { + "auxiliary_loss_clip": 0.01035235, + "auxiliary_loss_mlp": 0.01004011, + "balance_loss_clip": 1.01998329, + "balance_loss_mlp": 1.00284863, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.8025599094925673, + "language_loss": 0.55105728, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57144976, + "num_input_tokens_seen": 215603020, + "step": 10011, + "time_per_iteration": 3.181299924850464 + }, + { + "auxiliary_loss_clip": 0.01102279, + "auxiliary_loss_mlp": 0.01036126, + "balance_loss_clip": 1.03917122, + "balance_loss_mlp": 1.02390707, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.3793109772808514, + "language_loss": 0.61939204, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64077616, + "num_input_tokens_seen": 215625115, + "step": 10012, + "time_per_iteration": 2.6178345680236816 + }, + { + "auxiliary_loss_clip": 0.01071802, + "auxiliary_loss_mlp": 0.01026072, + "balance_loss_clip": 1.03939795, + "balance_loss_mlp": 1.01448512, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 1.6741393497811643, + "language_loss": 0.75156295, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.77254164, + "num_input_tokens_seen": 215643730, + "step": 10013, + "time_per_iteration": 2.5505754947662354 + }, + { + "auxiliary_loss_clip": 0.0110563, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.03795075, + "balance_loss_mlp": 1.01970887, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.6220764217896915, + "language_loss": 0.81536901, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83673269, + "num_input_tokens_seen": 215664425, + "step": 10014, + "time_per_iteration": 2.5219874382019043 + }, + { + "auxiliary_loss_clip": 0.01083955, + "auxiliary_loss_mlp": 0.0102623, + "balance_loss_clip": 1.03634024, + "balance_loss_mlp": 1.01432085, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.3948756444303017, + "language_loss": 0.72337914, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74448103, + "num_input_tokens_seen": 215684280, + "step": 10015, + "time_per_iteration": 2.5843329429626465 + }, + { + "auxiliary_loss_clip": 0.01020776, + "auxiliary_loss_mlp": 0.01007277, + "balance_loss_clip": 1.01716185, + "balance_loss_mlp": 1.00602531, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8416662579324772, + "language_loss": 0.5483712, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.56865168, + "num_input_tokens_seen": 215739780, + "step": 10016, + "time_per_iteration": 3.023139476776123 + }, + { + "auxiliary_loss_clip": 0.01086657, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.03799367, + "balance_loss_mlp": 1.01856041, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.6989356408620242, + "language_loss": 0.82890427, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85007763, + "num_input_tokens_seen": 215757885, + "step": 10017, + "time_per_iteration": 2.5541739463806152 + }, + { + "auxiliary_loss_clip": 0.01088339, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.04317355, + "balance_loss_mlp": 1.01959944, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 2.5493988119164457, + "language_loss": 0.83910334, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.86030191, + "num_input_tokens_seen": 215776415, + "step": 10018, + "time_per_iteration": 2.5687685012817383 + }, + { + "auxiliary_loss_clip": 0.01087911, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.03670549, + "balance_loss_mlp": 1.02173078, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 2.035362453600421, + "language_loss": 0.7856679, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80689418, + "num_input_tokens_seen": 215794865, + "step": 10019, + "time_per_iteration": 2.5579895973205566 + }, + { + "auxiliary_loss_clip": 0.01072989, + "auxiliary_loss_mlp": 0.0078647, + "balance_loss_clip": 1.04076838, + "balance_loss_mlp": 1.01004028, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1.4361806427390396, + "language_loss": 0.73806286, + "learning_rate": 1.441071641765681e-06, + "loss": 0.75665748, + "num_input_tokens_seen": 215816840, + "step": 10020, + "time_per_iteration": 2.642089366912842 + }, + { + "auxiliary_loss_clip": 0.01090328, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.0381645, + "balance_loss_mlp": 1.01978922, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 1.477871585532041, + "language_loss": 0.64007169, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66129625, + "num_input_tokens_seen": 215836100, + "step": 10021, + "time_per_iteration": 2.5580034255981445 + }, + { + "auxiliary_loss_clip": 0.01098366, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.03771758, + "balance_loss_mlp": 1.01953924, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.387329587016392, + "language_loss": 0.80269909, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.82400542, + "num_input_tokens_seen": 215858480, + "step": 10022, + "time_per_iteration": 2.5585343837738037 + }, + { + "auxiliary_loss_clip": 0.01106971, + "auxiliary_loss_mlp": 0.0103117, + "balance_loss_clip": 1.04113483, + "balance_loss_mlp": 1.01903415, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 3.8272796718036983, + "language_loss": 0.66632277, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68770415, + "num_input_tokens_seen": 215879950, + "step": 10023, + "time_per_iteration": 2.5940968990325928 + }, + { + "auxiliary_loss_clip": 0.01100837, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.0367192, + "balance_loss_mlp": 1.01994133, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 1.8587340628416016, + "language_loss": 0.73924708, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76058137, + "num_input_tokens_seen": 215899830, + "step": 10024, + "time_per_iteration": 2.574982166290283 + }, + { + "auxiliary_loss_clip": 0.01100314, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.04014325, + "balance_loss_mlp": 1.01930511, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.780291278895535, + "language_loss": 0.72702587, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.74834788, + "num_input_tokens_seen": 215920440, + "step": 10025, + "time_per_iteration": 2.5485243797302246 + }, + { + "auxiliary_loss_clip": 0.01115867, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.03976631, + "balance_loss_mlp": 1.02255642, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 2.278380399845514, + "language_loss": 0.67450213, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.69602048, + "num_input_tokens_seen": 215940535, + "step": 10026, + "time_per_iteration": 2.522057294845581 + }, + { + "auxiliary_loss_clip": 0.01104737, + "auxiliary_loss_mlp": 0.01030633, + "balance_loss_clip": 1.03670835, + "balance_loss_mlp": 1.01895642, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 1.8826848187594503, + "language_loss": 0.80337417, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82472783, + "num_input_tokens_seen": 215958045, + "step": 10027, + "time_per_iteration": 2.491328477859497 + }, + { + "auxiliary_loss_clip": 0.01082059, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.04274571, + "balance_loss_mlp": 1.01727355, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 1.899605678064577, + "language_loss": 0.71217263, + "learning_rate": 1.438080769071171e-06, + "loss": 0.73329163, + "num_input_tokens_seen": 215977330, + "step": 10028, + "time_per_iteration": 2.5809173583984375 + }, + { + "auxiliary_loss_clip": 0.0107371, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.0388236, + "balance_loss_mlp": 1.02302027, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 1.7290199032334437, + "language_loss": 0.84897351, + "learning_rate": 1.437707005721669e-06, + "loss": 0.870067, + "num_input_tokens_seen": 215997865, + "step": 10029, + "time_per_iteration": 2.6202635765075684 + }, + { + "auxiliary_loss_clip": 0.01088868, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.03800082, + "balance_loss_mlp": 1.01800966, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 1.7696920129827098, + "language_loss": 0.79993951, + "learning_rate": 1.437333263694373e-06, + "loss": 0.82112527, + "num_input_tokens_seen": 216016230, + "step": 10030, + "time_per_iteration": 2.513556718826294 + }, + { + "auxiliary_loss_clip": 0.01052144, + "auxiliary_loss_mlp": 0.01037801, + "balance_loss_clip": 1.03636169, + "balance_loss_mlp": 1.02405632, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 1.9813475529165967, + "language_loss": 0.71288133, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73378074, + "num_input_tokens_seen": 216035785, + "step": 10031, + "time_per_iteration": 2.669219970703125 + }, + { + "auxiliary_loss_clip": 0.01069588, + "auxiliary_loss_mlp": 0.01033822, + "balance_loss_clip": 1.03760839, + "balance_loss_mlp": 1.02010083, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 1.5132774954035395, + "language_loss": 0.73141527, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75244939, + "num_input_tokens_seen": 216059555, + "step": 10032, + "time_per_iteration": 2.6674206256866455 + }, + { + "auxiliary_loss_clip": 0.01093796, + "auxiliary_loss_mlp": 0.01031666, + "balance_loss_clip": 1.04071283, + "balance_loss_mlp": 1.01852334, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 2.0525892666054197, + "language_loss": 0.68470132, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70595598, + "num_input_tokens_seen": 216077235, + "step": 10033, + "time_per_iteration": 2.539726972579956 + }, + { + "auxiliary_loss_clip": 0.01085261, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.03887606, + "balance_loss_mlp": 1.01887321, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 2.134185519419214, + "language_loss": 0.75730848, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.77848244, + "num_input_tokens_seen": 216094985, + "step": 10034, + "time_per_iteration": 3.9015564918518066 + }, + { + "auxiliary_loss_clip": 0.01088412, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.0383203, + "balance_loss_mlp": 1.01845431, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 2.489865183325225, + "language_loss": 0.74558115, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.7667768, + "num_input_tokens_seen": 216115905, + "step": 10035, + "time_per_iteration": 2.6159863471984863 + }, + { + "auxiliary_loss_clip": 0.01082399, + "auxiliary_loss_mlp": 0.01024622, + "balance_loss_clip": 1.03924513, + "balance_loss_mlp": 1.01292145, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.8108608768309096, + "language_loss": 0.86822319, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88929343, + "num_input_tokens_seen": 216132420, + "step": 10036, + "time_per_iteration": 2.5128564834594727 + }, + { + "auxiliary_loss_clip": 0.01074886, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.0412364, + "balance_loss_mlp": 1.01705503, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 1.846408447273741, + "language_loss": 0.70473069, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72577584, + "num_input_tokens_seen": 216149800, + "step": 10037, + "time_per_iteration": 2.5700318813323975 + }, + { + "auxiliary_loss_clip": 0.01096522, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.03896713, + "balance_loss_mlp": 1.01970911, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 1.6500345768370672, + "language_loss": 0.84729314, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.86859417, + "num_input_tokens_seen": 216168200, + "step": 10038, + "time_per_iteration": 2.5120959281921387 + }, + { + "auxiliary_loss_clip": 0.01093517, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.0379976, + "balance_loss_mlp": 1.01883864, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 1.9736761768548143, + "language_loss": 0.7612592, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78250819, + "num_input_tokens_seen": 216187105, + "step": 10039, + "time_per_iteration": 2.552495002746582 + }, + { + "auxiliary_loss_clip": 0.01099093, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.03828049, + "balance_loss_mlp": 1.01819897, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 2.2320380375193873, + "language_loss": 0.71332216, + "learning_rate": 1.433597019260301e-06, + "loss": 0.73461097, + "num_input_tokens_seen": 216205440, + "step": 10040, + "time_per_iteration": 2.5314314365386963 + }, + { + "auxiliary_loss_clip": 0.01104374, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.04112959, + "balance_loss_mlp": 1.01626277, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 2.1767315235121205, + "language_loss": 0.78115016, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80249846, + "num_input_tokens_seen": 216223130, + "step": 10041, + "time_per_iteration": 2.515191078186035 + }, + { + "auxiliary_loss_clip": 0.01091591, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.04043984, + "balance_loss_mlp": 1.01795566, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.660766860055552, + "language_loss": 0.76066607, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.78188354, + "num_input_tokens_seen": 216240260, + "step": 10042, + "time_per_iteration": 2.5008740425109863 + }, + { + "auxiliary_loss_clip": 0.0106493, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.03642654, + "balance_loss_mlp": 1.01273203, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 2.1005259480863203, + "language_loss": 0.85002667, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.87092125, + "num_input_tokens_seen": 216258510, + "step": 10043, + "time_per_iteration": 3.965954065322876 + }, + { + "auxiliary_loss_clip": 0.01075978, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.03671026, + "balance_loss_mlp": 1.02416217, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.944853373709849, + "language_loss": 0.69597578, + "learning_rate": 1.432103122078974e-06, + "loss": 0.71711123, + "num_input_tokens_seen": 216277550, + "step": 10044, + "time_per_iteration": 3.9723825454711914 + }, + { + "auxiliary_loss_clip": 0.01103901, + "auxiliary_loss_mlp": 0.01027208, + "balance_loss_clip": 1.04527903, + "balance_loss_mlp": 1.01386905, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 2.42866345554089, + "language_loss": 0.78087759, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80218863, + "num_input_tokens_seen": 216296690, + "step": 10045, + "time_per_iteration": 2.5499019622802734 + }, + { + "auxiliary_loss_clip": 0.01061632, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.03772235, + "balance_loss_mlp": 1.01880646, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.7276766180751577, + "language_loss": 0.77439433, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.7953248, + "num_input_tokens_seen": 216316110, + "step": 10046, + "time_per_iteration": 2.6359434127807617 + }, + { + "auxiliary_loss_clip": 0.01057289, + "auxiliary_loss_mlp": 0.01036129, + "balance_loss_clip": 1.03355813, + "balance_loss_mlp": 1.02301645, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 1.530203271510399, + "language_loss": 0.86968774, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89062196, + "num_input_tokens_seen": 216333855, + "step": 10047, + "time_per_iteration": 2.581923007965088 + }, + { + "auxiliary_loss_clip": 0.01103544, + "auxiliary_loss_mlp": 0.0102462, + "balance_loss_clip": 1.04317439, + "balance_loss_mlp": 1.01305699, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.5130385978010459, + "language_loss": 0.75436294, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.7756446, + "num_input_tokens_seen": 216354890, + "step": 10048, + "time_per_iteration": 3.9316518306732178 + }, + { + "auxiliary_loss_clip": 0.01110704, + "auxiliary_loss_mlp": 0.01035817, + "balance_loss_clip": 1.04134667, + "balance_loss_mlp": 1.02115417, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 5.647231011256132, + "language_loss": 0.66300952, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68447471, + "num_input_tokens_seen": 216376055, + "step": 10049, + "time_per_iteration": 2.5822269916534424 + }, + { + "auxiliary_loss_clip": 0.01085332, + "auxiliary_loss_mlp": 0.0103781, + "balance_loss_clip": 1.0371691, + "balance_loss_mlp": 1.02421403, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.4933750124994163, + "language_loss": 0.66821849, + "learning_rate": 1.429862922631336e-06, + "loss": 0.68944991, + "num_input_tokens_seen": 216396295, + "step": 10050, + "time_per_iteration": 2.521122455596924 + }, + { + "auxiliary_loss_clip": 0.01079505, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.03853583, + "balance_loss_mlp": 1.01819277, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 1.8554980457976378, + "language_loss": 0.69577515, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.71687621, + "num_input_tokens_seen": 216416605, + "step": 10051, + "time_per_iteration": 2.6521992683410645 + }, + { + "auxiliary_loss_clip": 0.01097246, + "auxiliary_loss_mlp": 0.01031722, + "balance_loss_clip": 1.03660965, + "balance_loss_mlp": 1.01974773, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 2.3333945917612886, + "language_loss": 0.6423952, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.6636849, + "num_input_tokens_seen": 216435130, + "step": 10052, + "time_per_iteration": 2.465782403945923 + }, + { + "auxiliary_loss_clip": 0.01088852, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_clip": 1.03805947, + "balance_loss_mlp": 1.01535237, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 1.9917833193835313, + "language_loss": 0.68856055, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.70973068, + "num_input_tokens_seen": 216455640, + "step": 10053, + "time_per_iteration": 2.5840976238250732 + }, + { + "auxiliary_loss_clip": 0.01028069, + "auxiliary_loss_mlp": 0.0100337, + "balance_loss_clip": 1.02368617, + "balance_loss_mlp": 1.00227344, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.8018026378644575, + "language_loss": 0.60426438, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62457883, + "num_input_tokens_seen": 216518130, + "step": 10054, + "time_per_iteration": 3.2507219314575195 + }, + { + "auxiliary_loss_clip": 0.01056424, + "auxiliary_loss_mlp": 0.01030655, + "balance_loss_clip": 1.04382837, + "balance_loss_mlp": 1.01806021, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 1.5951397016679112, + "language_loss": 0.85608095, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87695169, + "num_input_tokens_seen": 216536845, + "step": 10055, + "time_per_iteration": 2.655701160430908 + }, + { + "auxiliary_loss_clip": 0.01091049, + "auxiliary_loss_mlp": 0.01040739, + "balance_loss_clip": 1.04066801, + "balance_loss_mlp": 1.02654696, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.809967490897323, + "language_loss": 0.73294115, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75425899, + "num_input_tokens_seen": 216551860, + "step": 10056, + "time_per_iteration": 2.477033853530884 + }, + { + "auxiliary_loss_clip": 0.01074125, + "auxiliary_loss_mlp": 0.01036273, + "balance_loss_clip": 1.04142225, + "balance_loss_mlp": 1.02298737, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 1.5265776167068597, + "language_loss": 0.80492866, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82603264, + "num_input_tokens_seen": 216574775, + "step": 10057, + "time_per_iteration": 2.622055768966675 + }, + { + "auxiliary_loss_clip": 0.01109472, + "auxiliary_loss_mlp": 0.00787098, + "balance_loss_clip": 1.03863764, + "balance_loss_mlp": 1.01336122, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.203609390783533, + "language_loss": 0.74722803, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.76619375, + "num_input_tokens_seen": 216590100, + "step": 10058, + "time_per_iteration": 2.441200017929077 + }, + { + "auxiliary_loss_clip": 0.01096729, + "auxiliary_loss_mlp": 0.01027782, + "balance_loss_clip": 1.0372988, + "balance_loss_mlp": 1.01575351, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 2.1534508456887695, + "language_loss": 0.71329868, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.7345438, + "num_input_tokens_seen": 216610145, + "step": 10059, + "time_per_iteration": 2.543438673019409 + }, + { + "auxiliary_loss_clip": 0.01086643, + "auxiliary_loss_mlp": 0.01028579, + "balance_loss_clip": 1.03793144, + "balance_loss_mlp": 1.01639533, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 2.424597540042457, + "language_loss": 0.76132894, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78248119, + "num_input_tokens_seen": 216630625, + "step": 10060, + "time_per_iteration": 2.532600164413452 + }, + { + "auxiliary_loss_clip": 0.01098083, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.036502, + "balance_loss_mlp": 1.02206254, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 2.2244763773548577, + "language_loss": 0.73717588, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75849724, + "num_input_tokens_seen": 216649255, + "step": 10061, + "time_per_iteration": 2.515824794769287 + }, + { + "auxiliary_loss_clip": 0.01069205, + "auxiliary_loss_mlp": 0.00787569, + "balance_loss_clip": 1.03727555, + "balance_loss_mlp": 1.01358938, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 1.9155354692803155, + "language_loss": 0.67381537, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69238311, + "num_input_tokens_seen": 216668100, + "step": 10062, + "time_per_iteration": 2.587156295776367 + }, + { + "auxiliary_loss_clip": 0.01093396, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.03691483, + "balance_loss_mlp": 1.01912308, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.0934210712138834, + "language_loss": 0.71068633, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73193645, + "num_input_tokens_seen": 216686125, + "step": 10063, + "time_per_iteration": 2.496239423751831 + }, + { + "auxiliary_loss_clip": 0.01107002, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.03790951, + "balance_loss_mlp": 1.02281022, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 1.7509912637358285, + "language_loss": 0.84598935, + "learning_rate": 1.424638822621926e-06, + "loss": 0.86740708, + "num_input_tokens_seen": 216704265, + "step": 10064, + "time_per_iteration": 2.4439849853515625 + }, + { + "auxiliary_loss_clip": 0.01098897, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.03793502, + "balance_loss_mlp": 1.01864564, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.4685377207523262, + "language_loss": 0.7937507, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.81504858, + "num_input_tokens_seen": 216721765, + "step": 10065, + "time_per_iteration": 2.4791414737701416 + }, + { + "auxiliary_loss_clip": 0.0106653, + "auxiliary_loss_mlp": 0.01033147, + "balance_loss_clip": 1.04216015, + "balance_loss_mlp": 1.01889563, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 1.9631116759794434, + "language_loss": 0.78413916, + "learning_rate": 1.423892870799226e-06, + "loss": 0.80513591, + "num_input_tokens_seen": 216738295, + "step": 10066, + "time_per_iteration": 2.562825918197632 + }, + { + "auxiliary_loss_clip": 0.01060878, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.03819776, + "balance_loss_mlp": 1.01878607, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 2.1079384813004705, + "language_loss": 0.72724432, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.74816787, + "num_input_tokens_seen": 216759875, + "step": 10067, + "time_per_iteration": 2.6640305519104004 + }, + { + "auxiliary_loss_clip": 0.01089594, + "auxiliary_loss_mlp": 0.0078549, + "balance_loss_clip": 1.03948879, + "balance_loss_mlp": 1.01107931, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.346722421466001, + "language_loss": 0.6892972, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.70804799, + "num_input_tokens_seen": 216780705, + "step": 10068, + "time_per_iteration": 2.581026077270508 + }, + { + "auxiliary_loss_clip": 0.01096887, + "auxiliary_loss_mlp": 0.01033899, + "balance_loss_clip": 1.03938198, + "balance_loss_mlp": 1.02137578, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 4.8685391118757, + "language_loss": 0.87230909, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.89361691, + "num_input_tokens_seen": 216797625, + "step": 10069, + "time_per_iteration": 2.506182909011841 + }, + { + "auxiliary_loss_clip": 0.01076567, + "auxiliary_loss_mlp": 0.01024266, + "balance_loss_clip": 1.03692257, + "balance_loss_mlp": 1.01288128, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 1.526687099184133, + "language_loss": 0.82826591, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.84927428, + "num_input_tokens_seen": 216817610, + "step": 10070, + "time_per_iteration": 2.5885255336761475 + }, + { + "auxiliary_loss_clip": 0.01092829, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.03813362, + "balance_loss_mlp": 1.01892841, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.512132514023569, + "language_loss": 0.85890388, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88014466, + "num_input_tokens_seen": 216836835, + "step": 10071, + "time_per_iteration": 2.5930097103118896 + }, + { + "auxiliary_loss_clip": 0.01101909, + "auxiliary_loss_mlp": 0.01032398, + "balance_loss_clip": 1.03912544, + "balance_loss_mlp": 1.0189867, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 1.615428585967105, + "language_loss": 0.77037328, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79171634, + "num_input_tokens_seen": 216856760, + "step": 10072, + "time_per_iteration": 4.005670785903931 + }, + { + "auxiliary_loss_clip": 0.01090166, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.03636837, + "balance_loss_mlp": 1.01536441, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 2.209314346321389, + "language_loss": 0.74752963, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76871747, + "num_input_tokens_seen": 216878795, + "step": 10073, + "time_per_iteration": 2.6189732551574707 + }, + { + "auxiliary_loss_clip": 0.01015094, + "auxiliary_loss_mlp": 0.0100186, + "balance_loss_clip": 1.02935171, + "balance_loss_mlp": 1.00048351, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7547475344909664, + "language_loss": 0.55225217, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57242173, + "num_input_tokens_seen": 216937800, + "step": 10074, + "time_per_iteration": 3.2505900859832764 + }, + { + "auxiliary_loss_clip": 0.01073018, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.04232025, + "balance_loss_mlp": 1.02480233, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 1.8094936446011878, + "language_loss": 0.81689227, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.83801103, + "num_input_tokens_seen": 216955280, + "step": 10075, + "time_per_iteration": 2.6116485595703125 + }, + { + "auxiliary_loss_clip": 0.01101141, + "auxiliary_loss_mlp": 0.0102348, + "balance_loss_clip": 1.03826797, + "balance_loss_mlp": 1.01123726, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 2.124959838017282, + "language_loss": 0.78273004, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80397618, + "num_input_tokens_seen": 216976950, + "step": 10076, + "time_per_iteration": 2.5469682216644287 + }, + { + "auxiliary_loss_clip": 0.01100008, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.03713512, + "balance_loss_mlp": 1.01774442, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 1.6994052790445462, + "language_loss": 0.72594124, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74724466, + "num_input_tokens_seen": 216996945, + "step": 10077, + "time_per_iteration": 2.5557994842529297 + }, + { + "auxiliary_loss_clip": 0.01111495, + "auxiliary_loss_mlp": 0.01029074, + "balance_loss_clip": 1.03958499, + "balance_loss_mlp": 1.01670003, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 36.37244585435063, + "language_loss": 0.55466735, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.57607305, + "num_input_tokens_seen": 217016580, + "step": 10078, + "time_per_iteration": 2.482950210571289 + }, + { + "auxiliary_loss_clip": 0.01066806, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.03607512, + "balance_loss_mlp": 1.01759994, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 1.532521720993168, + "language_loss": 0.70199001, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72295982, + "num_input_tokens_seen": 217037300, + "step": 10079, + "time_per_iteration": 2.659311294555664 + }, + { + "auxiliary_loss_clip": 0.01086417, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.03749657, + "balance_loss_mlp": 1.01885331, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.8889697135097516, + "language_loss": 0.62466854, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64583683, + "num_input_tokens_seen": 217055805, + "step": 10080, + "time_per_iteration": 2.537776231765747 + }, + { + "auxiliary_loss_clip": 0.01087986, + "auxiliary_loss_mlp": 0.01030295, + "balance_loss_clip": 1.03703547, + "balance_loss_mlp": 1.01755154, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 2.0287072227362857, + "language_loss": 0.7139802, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73516303, + "num_input_tokens_seen": 217074175, + "step": 10081, + "time_per_iteration": 2.578606128692627 + }, + { + "auxiliary_loss_clip": 0.01088144, + "auxiliary_loss_mlp": 0.01026355, + "balance_loss_clip": 1.04286683, + "balance_loss_mlp": 1.01405859, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 1.771134490365611, + "language_loss": 0.68795866, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.70910364, + "num_input_tokens_seen": 217095695, + "step": 10082, + "time_per_iteration": 3.9480671882629395 + }, + { + "auxiliary_loss_clip": 0.01110603, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.03985119, + "balance_loss_mlp": 1.01795876, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 1.3659074724538873, + "language_loss": 0.65917313, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.68057787, + "num_input_tokens_seen": 217116260, + "step": 10083, + "time_per_iteration": 3.8909735679626465 + }, + { + "auxiliary_loss_clip": 0.01098368, + "auxiliary_loss_mlp": 0.01027993, + "balance_loss_clip": 1.03775549, + "balance_loss_mlp": 1.01572025, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 2.427560007929621, + "language_loss": 0.73923266, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.76049626, + "num_input_tokens_seen": 217134465, + "step": 10084, + "time_per_iteration": 2.4919352531433105 + }, + { + "auxiliary_loss_clip": 0.01086371, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.03908849, + "balance_loss_mlp": 1.01924586, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 3.1147119012383615, + "language_loss": 0.72302085, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.74420309, + "num_input_tokens_seen": 217149920, + "step": 10085, + "time_per_iteration": 2.5389692783355713 + }, + { + "auxiliary_loss_clip": 0.0110912, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.0387938, + "balance_loss_mlp": 1.01844001, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 2.28883799147687, + "language_loss": 0.76265967, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.78404963, + "num_input_tokens_seen": 217168165, + "step": 10086, + "time_per_iteration": 2.5144565105438232 + }, + { + "auxiliary_loss_clip": 0.01078396, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.03846371, + "balance_loss_mlp": 1.0174458, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.3638182156046654, + "language_loss": 0.73000646, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.75108999, + "num_input_tokens_seen": 217190070, + "step": 10087, + "time_per_iteration": 3.9673447608947754 + }, + { + "auxiliary_loss_clip": 0.01095183, + "auxiliary_loss_mlp": 0.01024976, + "balance_loss_clip": 1.03723156, + "balance_loss_mlp": 1.01480782, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.7020182777110142, + "language_loss": 0.84054983, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.86175144, + "num_input_tokens_seen": 217209370, + "step": 10088, + "time_per_iteration": 2.530547618865967 + }, + { + "auxiliary_loss_clip": 0.01057305, + "auxiliary_loss_mlp": 0.00785011, + "balance_loss_clip": 1.03657377, + "balance_loss_mlp": 1.0093832, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 2.1312560347616247, + "language_loss": 0.71591777, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.73434097, + "num_input_tokens_seen": 217226990, + "step": 10089, + "time_per_iteration": 2.6258692741394043 + }, + { + "auxiliary_loss_clip": 0.01098715, + "auxiliary_loss_mlp": 0.01034237, + "balance_loss_clip": 1.03908098, + "balance_loss_mlp": 1.02309644, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 2.1155703923655618, + "language_loss": 0.82850921, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.84983873, + "num_input_tokens_seen": 217244585, + "step": 10090, + "time_per_iteration": 2.5076911449432373 + }, + { + "auxiliary_loss_clip": 0.01081239, + "auxiliary_loss_mlp": 0.01039936, + "balance_loss_clip": 1.04204226, + "balance_loss_mlp": 1.02659106, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.816886186304469, + "language_loss": 0.75057954, + "learning_rate": 1.4145758826341e-06, + "loss": 0.77179134, + "num_input_tokens_seen": 217263435, + "step": 10091, + "time_per_iteration": 2.5508944988250732 + }, + { + "auxiliary_loss_clip": 0.01105864, + "auxiliary_loss_mlp": 0.01035511, + "balance_loss_clip": 1.03675258, + "balance_loss_mlp": 1.02388263, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 1.5527249444712599, + "language_loss": 0.79681635, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81823015, + "num_input_tokens_seen": 217283725, + "step": 10092, + "time_per_iteration": 2.514124870300293 + }, + { + "auxiliary_loss_clip": 0.01087898, + "auxiliary_loss_mlp": 0.01029619, + "balance_loss_clip": 1.03711772, + "balance_loss_mlp": 1.01695275, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 1.796857633109894, + "language_loss": 0.76177251, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78294766, + "num_input_tokens_seen": 217301120, + "step": 10093, + "time_per_iteration": 2.504765510559082 + }, + { + "auxiliary_loss_clip": 0.01082817, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.03776622, + "balance_loss_mlp": 1.02035642, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 1.7667628140435616, + "language_loss": 0.87489903, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.8960557, + "num_input_tokens_seen": 217319585, + "step": 10094, + "time_per_iteration": 2.539463996887207 + }, + { + "auxiliary_loss_clip": 0.01099245, + "auxiliary_loss_mlp": 0.01024953, + "balance_loss_clip": 1.03777254, + "balance_loss_mlp": 1.01241803, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 1.6380361602165818, + "language_loss": 0.72406709, + "learning_rate": 1.413086446353919e-06, + "loss": 0.74530911, + "num_input_tokens_seen": 217338880, + "step": 10095, + "time_per_iteration": 2.507211446762085 + }, + { + "auxiliary_loss_clip": 0.01083323, + "auxiliary_loss_mlp": 0.01027, + "balance_loss_clip": 1.0356288, + "balance_loss_mlp": 1.01557398, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 2.252348080494498, + "language_loss": 0.76800525, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.78910851, + "num_input_tokens_seen": 217357480, + "step": 10096, + "time_per_iteration": 2.553959846496582 + }, + { + "auxiliary_loss_clip": 0.01109714, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.0382272, + "balance_loss_mlp": 1.02275896, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 1.8216081439368703, + "language_loss": 0.79915786, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.82060188, + "num_input_tokens_seen": 217374575, + "step": 10097, + "time_per_iteration": 2.4473273754119873 + }, + { + "auxiliary_loss_clip": 0.01086015, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.04188621, + "balance_loss_mlp": 1.01876497, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.5438980391260921, + "language_loss": 0.67334974, + "learning_rate": 1.411969602780478e-06, + "loss": 0.69451582, + "num_input_tokens_seen": 217392950, + "step": 10098, + "time_per_iteration": 2.5609679222106934 + }, + { + "auxiliary_loss_clip": 0.01107615, + "auxiliary_loss_mlp": 0.01025373, + "balance_loss_clip": 1.03776169, + "balance_loss_mlp": 1.01399493, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 2.1809886427641145, + "language_loss": 0.80454355, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.82587337, + "num_input_tokens_seen": 217412145, + "step": 10099, + "time_per_iteration": 2.463916778564453 + }, + { + "auxiliary_loss_clip": 0.0107941, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.0378598, + "balance_loss_mlp": 1.02145839, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 4.2772732445089785, + "language_loss": 0.70693994, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.72807664, + "num_input_tokens_seen": 217432080, + "step": 10100, + "time_per_iteration": 2.5603649616241455 + }, + { + "auxiliary_loss_clip": 0.01075409, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.03980482, + "balance_loss_mlp": 1.01961398, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 2.1576441637890666, + "language_loss": 0.70571029, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.72679132, + "num_input_tokens_seen": 217450945, + "step": 10101, + "time_per_iteration": 2.552314519882202 + }, + { + "auxiliary_loss_clip": 0.01080797, + "auxiliary_loss_mlp": 0.01025996, + "balance_loss_clip": 1.03707147, + "balance_loss_mlp": 1.01400352, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 1.9419885592159174, + "language_loss": 0.69043058, + "learning_rate": 1.410480790256154e-06, + "loss": 0.7114985, + "num_input_tokens_seen": 217473105, + "step": 10102, + "time_per_iteration": 2.5911049842834473 + }, + { + "auxiliary_loss_clip": 0.01110628, + "auxiliary_loss_mlp": 0.01029899, + "balance_loss_clip": 1.03876746, + "balance_loss_mlp": 1.01833558, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 2.134431057031402, + "language_loss": 0.73848087, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.75988615, + "num_input_tokens_seen": 217491780, + "step": 10103, + "time_per_iteration": 2.4777464866638184 + }, + { + "auxiliary_loss_clip": 0.01084011, + "auxiliary_loss_mlp": 0.01038861, + "balance_loss_clip": 1.04460454, + "balance_loss_mlp": 1.02581358, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 1.6861131771687243, + "language_loss": 0.76911712, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.79034585, + "num_input_tokens_seen": 217510605, + "step": 10104, + "time_per_iteration": 2.5619850158691406 + }, + { + "auxiliary_loss_clip": 0.01024129, + "auxiliary_loss_mlp": 0.01000516, + "balance_loss_clip": 1.02892244, + "balance_loss_mlp": 0.99924099, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7067489793114724, + "language_loss": 0.55994225, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58018875, + "num_input_tokens_seen": 217574815, + "step": 10105, + "time_per_iteration": 3.1713335514068604 + }, + { + "auxiliary_loss_clip": 0.01041295, + "auxiliary_loss_mlp": 0.01001101, + "balance_loss_clip": 1.02708519, + "balance_loss_mlp": 0.99978393, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7589759624522114, + "language_loss": 0.56881475, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58923876, + "num_input_tokens_seen": 217632375, + "step": 10106, + "time_per_iteration": 3.062274694442749 + }, + { + "auxiliary_loss_clip": 0.01057769, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.03403449, + "balance_loss_mlp": 1.01830578, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.5231891464913365, + "language_loss": 0.69024849, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.71112704, + "num_input_tokens_seen": 217653055, + "step": 10107, + "time_per_iteration": 2.6602609157562256 + }, + { + "auxiliary_loss_clip": 0.01098742, + "auxiliary_loss_mlp": 0.01026549, + "balance_loss_clip": 1.03704607, + "balance_loss_mlp": 1.01463389, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 2.383265547185116, + "language_loss": 0.80982554, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83107841, + "num_input_tokens_seen": 217671520, + "step": 10108, + "time_per_iteration": 2.479660749435425 + }, + { + "auxiliary_loss_clip": 0.01086565, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.03756785, + "balance_loss_mlp": 1.0180167, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 2.247146226624824, + "language_loss": 0.71179348, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.73297226, + "num_input_tokens_seen": 217691880, + "step": 10109, + "time_per_iteration": 2.6645328998565674 + }, + { + "auxiliary_loss_clip": 0.01082993, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.03634346, + "balance_loss_mlp": 1.01995742, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.615307358151303, + "language_loss": 0.80181587, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82295465, + "num_input_tokens_seen": 217710530, + "step": 10110, + "time_per_iteration": 2.539525270462036 + }, + { + "auxiliary_loss_clip": 0.010835, + "auxiliary_loss_mlp": 0.01030211, + "balance_loss_clip": 1.03693557, + "balance_loss_mlp": 1.01781344, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.376654815883698, + "language_loss": 0.70474136, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.72587848, + "num_input_tokens_seen": 217728650, + "step": 10111, + "time_per_iteration": 4.0014402866363525 + }, + { + "auxiliary_loss_clip": 0.01078302, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.03875852, + "balance_loss_mlp": 1.01744282, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 2.0117154825809367, + "language_loss": 0.65347505, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67455375, + "num_input_tokens_seen": 217747135, + "step": 10112, + "time_per_iteration": 2.5879979133605957 + }, + { + "auxiliary_loss_clip": 0.01040861, + "auxiliary_loss_mlp": 0.01004656, + "balance_loss_clip": 1.02544117, + "balance_loss_mlp": 1.00338066, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.6485341283140896, + "language_loss": 0.49555272, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.5160079, + "num_input_tokens_seen": 217811860, + "step": 10113, + "time_per_iteration": 3.143420934677124 + }, + { + "auxiliary_loss_clip": 0.01040704, + "auxiliary_loss_mlp": 0.01003105, + "balance_loss_clip": 1.02533448, + "balance_loss_mlp": 1.0018295, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.839224920970825, + "language_loss": 0.56921518, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.58965325, + "num_input_tokens_seen": 217866510, + "step": 10114, + "time_per_iteration": 3.030075788497925 + }, + { + "auxiliary_loss_clip": 0.01114013, + "auxiliary_loss_mlp": 0.0102681, + "balance_loss_clip": 1.04011869, + "balance_loss_mlp": 1.01368535, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 1.9186102390174857, + "language_loss": 0.70383108, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.72523928, + "num_input_tokens_seen": 217885650, + "step": 10115, + "time_per_iteration": 2.4729413986206055 + }, + { + "auxiliary_loss_clip": 0.01071014, + "auxiliary_loss_mlp": 0.01027665, + "balance_loss_clip": 1.03522587, + "balance_loss_mlp": 1.01520813, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 1.8493641633281328, + "language_loss": 0.72481185, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74579865, + "num_input_tokens_seen": 217905300, + "step": 10116, + "time_per_iteration": 2.5942163467407227 + }, + { + "auxiliary_loss_clip": 0.01088772, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.04132521, + "balance_loss_mlp": 1.02659941, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.531703364635155, + "language_loss": 0.53677702, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.55806595, + "num_input_tokens_seen": 217927845, + "step": 10117, + "time_per_iteration": 2.662885904312134 + }, + { + "auxiliary_loss_clip": 0.01089284, + "auxiliary_loss_mlp": 0.01026941, + "balance_loss_clip": 1.0377779, + "balance_loss_mlp": 1.01549745, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.785879291536565, + "language_loss": 0.7034502, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.72461247, + "num_input_tokens_seen": 217946145, + "step": 10118, + "time_per_iteration": 2.5185561180114746 + }, + { + "auxiliary_loss_clip": 0.01055036, + "auxiliary_loss_mlp": 0.0103151, + "balance_loss_clip": 1.03682446, + "balance_loss_mlp": 1.01933312, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.7941337197016602, + "language_loss": 0.74591386, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.76677936, + "num_input_tokens_seen": 217965190, + "step": 10119, + "time_per_iteration": 2.613905906677246 + }, + { + "auxiliary_loss_clip": 0.01095943, + "auxiliary_loss_mlp": 0.01030554, + "balance_loss_clip": 1.03750825, + "balance_loss_mlp": 1.01896644, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 2.069858026655161, + "language_loss": 0.67005479, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69131982, + "num_input_tokens_seen": 217983625, + "step": 10120, + "time_per_iteration": 2.4982779026031494 + }, + { + "auxiliary_loss_clip": 0.01102397, + "auxiliary_loss_mlp": 0.01035246, + "balance_loss_clip": 1.03989029, + "balance_loss_mlp": 1.02286005, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 1.696661199381485, + "language_loss": 0.74752969, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76890612, + "num_input_tokens_seen": 218006005, + "step": 10121, + "time_per_iteration": 5.2744951248168945 + }, + { + "auxiliary_loss_clip": 0.01098779, + "auxiliary_loss_mlp": 0.01025497, + "balance_loss_clip": 1.03736043, + "balance_loss_mlp": 1.01403534, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 1.9442267479161475, + "language_loss": 0.80375433, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82499713, + "num_input_tokens_seen": 218024195, + "step": 10122, + "time_per_iteration": 2.4725024700164795 + }, + { + "auxiliary_loss_clip": 0.0109483, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.03785467, + "balance_loss_mlp": 1.01873422, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 1.6542715319695749, + "language_loss": 0.55798459, + "learning_rate": 1.402670413578284e-06, + "loss": 0.57924676, + "num_input_tokens_seen": 218047190, + "step": 10123, + "time_per_iteration": 2.631117582321167 + }, + { + "auxiliary_loss_clip": 0.01100034, + "auxiliary_loss_mlp": 0.01033493, + "balance_loss_clip": 1.04006767, + "balance_loss_mlp": 1.02113152, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 1.9790485303433387, + "language_loss": 0.73741937, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.75875461, + "num_input_tokens_seen": 218065945, + "step": 10124, + "time_per_iteration": 2.4969887733459473 + }, + { + "auxiliary_loss_clip": 0.01085164, + "auxiliary_loss_mlp": 0.01033136, + "balance_loss_clip": 1.03588128, + "balance_loss_mlp": 1.0203805, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 2.01110865161991, + "language_loss": 0.65510881, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.67629176, + "num_input_tokens_seen": 218085285, + "step": 10125, + "time_per_iteration": 3.8776230812072754 + }, + { + "auxiliary_loss_clip": 0.01109377, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.03950143, + "balance_loss_mlp": 1.0205518, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 2.03139563504538, + "language_loss": 0.76294327, + "learning_rate": 1.40155545786479e-06, + "loss": 0.7843641, + "num_input_tokens_seen": 218104735, + "step": 10126, + "time_per_iteration": 2.4875481128692627 + }, + { + "auxiliary_loss_clip": 0.01077787, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.04146051, + "balance_loss_mlp": 1.01752257, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 2.858209441634897, + "language_loss": 0.71822596, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.73930967, + "num_input_tokens_seen": 218121855, + "step": 10127, + "time_per_iteration": 2.531590700149536 + }, + { + "auxiliary_loss_clip": 0.01114685, + "auxiliary_loss_mlp": 0.01028935, + "balance_loss_clip": 1.04054165, + "balance_loss_mlp": 1.01579213, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 2.0917846231890986, + "language_loss": 0.7219485, + "learning_rate": 1.400812267497691e-06, + "loss": 0.7433846, + "num_input_tokens_seen": 218137325, + "step": 10128, + "time_per_iteration": 2.4722726345062256 + }, + { + "auxiliary_loss_clip": 0.01066513, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.03676772, + "balance_loss_mlp": 1.01831293, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 2.235975837790555, + "language_loss": 0.73396081, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75492704, + "num_input_tokens_seen": 218155530, + "step": 10129, + "time_per_iteration": 2.568052291870117 + }, + { + "auxiliary_loss_clip": 0.01108865, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.03810024, + "balance_loss_mlp": 1.01753092, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 1.5310407447365664, + "language_loss": 0.65575331, + "learning_rate": 1.400069168015626e-06, + "loss": 0.67713946, + "num_input_tokens_seen": 218182535, + "step": 10130, + "time_per_iteration": 2.668473482131958 + }, + { + "auxiliary_loss_clip": 0.010827, + "auxiliary_loss_mlp": 0.01026036, + "balance_loss_clip": 1.04056454, + "balance_loss_mlp": 1.01517034, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 1.7854178451113443, + "language_loss": 0.7704854, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.79157281, + "num_input_tokens_seen": 218201740, + "step": 10131, + "time_per_iteration": 2.509843349456787 + }, + { + "auxiliary_loss_clip": 0.01079239, + "auxiliary_loss_mlp": 0.01030195, + "balance_loss_clip": 1.03994274, + "balance_loss_mlp": 1.01928115, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.8119145125513794, + "language_loss": 0.77362418, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.79471856, + "num_input_tokens_seen": 218219800, + "step": 10132, + "time_per_iteration": 2.585703134536743 + }, + { + "auxiliary_loss_clip": 0.01105255, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.03796816, + "balance_loss_mlp": 1.01913166, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.794455669509033, + "language_loss": 0.75940758, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.78076005, + "num_input_tokens_seen": 218237585, + "step": 10133, + "time_per_iteration": 2.497647762298584 + }, + { + "auxiliary_loss_clip": 0.01095238, + "auxiliary_loss_mlp": 0.01028711, + "balance_loss_clip": 1.03561091, + "balance_loss_mlp": 1.01631343, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 1.873089198292273, + "language_loss": 0.63727963, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.65851915, + "num_input_tokens_seen": 218258700, + "step": 10134, + "time_per_iteration": 2.5632131099700928 + }, + { + "auxiliary_loss_clip": 0.01084006, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.03696895, + "balance_loss_mlp": 1.01835966, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 1.7168237757893283, + "language_loss": 0.78609681, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.80723763, + "num_input_tokens_seen": 218275655, + "step": 10135, + "time_per_iteration": 2.507474422454834 + }, + { + "auxiliary_loss_clip": 0.01083452, + "auxiliary_loss_mlp": 0.01026831, + "balance_loss_clip": 1.03629935, + "balance_loss_mlp": 1.01584041, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 2.363236335467373, + "language_loss": 0.72389984, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.74500263, + "num_input_tokens_seen": 218295720, + "step": 10136, + "time_per_iteration": 2.5551064014434814 + }, + { + "auxiliary_loss_clip": 0.01110666, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.03961921, + "balance_loss_mlp": 1.01860738, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 1.7981181628078036, + "language_loss": 0.74066174, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.76207638, + "num_input_tokens_seen": 218316745, + "step": 10137, + "time_per_iteration": 2.6114141941070557 + }, + { + "auxiliary_loss_clip": 0.01095888, + "auxiliary_loss_mlp": 0.01038823, + "balance_loss_clip": 1.03672469, + "balance_loss_mlp": 1.02513814, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 1.72962104763926, + "language_loss": 0.80443662, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82578373, + "num_input_tokens_seen": 218335385, + "step": 10138, + "time_per_iteration": 2.5112786293029785 + }, + { + "auxiliary_loss_clip": 0.01080951, + "auxiliary_loss_mlp": 0.01027386, + "balance_loss_clip": 1.04020405, + "balance_loss_mlp": 1.01644874, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 1.5899562880518696, + "language_loss": 0.80836701, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.82945037, + "num_input_tokens_seen": 218353320, + "step": 10139, + "time_per_iteration": 2.5003786087036133 + }, + { + "auxiliary_loss_clip": 0.01080651, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.03951144, + "balance_loss_mlp": 1.01945555, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 2.1010908805724373, + "language_loss": 0.8361665, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85729343, + "num_input_tokens_seen": 218365620, + "step": 10140, + "time_per_iteration": 2.510923385620117 + }, + { + "auxiliary_loss_clip": 0.01099789, + "auxiliary_loss_mlp": 0.01028179, + "balance_loss_clip": 1.03891838, + "balance_loss_mlp": 1.01618648, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 1.917319986730665, + "language_loss": 0.75691509, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.77819479, + "num_input_tokens_seen": 218383785, + "step": 10141, + "time_per_iteration": 2.5229432582855225 + }, + { + "auxiliary_loss_clip": 0.01079984, + "auxiliary_loss_mlp": 0.0103105, + "balance_loss_clip": 1.03546834, + "balance_loss_mlp": 1.01809192, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 1.9392365331154207, + "language_loss": 0.76060402, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.78171438, + "num_input_tokens_seen": 218399055, + "step": 10142, + "time_per_iteration": 2.504120349884033 + }, + { + "auxiliary_loss_clip": 0.01109195, + "auxiliary_loss_mlp": 0.01029922, + "balance_loss_clip": 1.03872252, + "balance_loss_mlp": 1.01730323, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.7393159210216467, + "language_loss": 0.76719296, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.78858411, + "num_input_tokens_seen": 218419120, + "step": 10143, + "time_per_iteration": 2.469773292541504 + }, + { + "auxiliary_loss_clip": 0.01094025, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.03656435, + "balance_loss_mlp": 1.02279496, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.7772425180792084, + "language_loss": 0.75079644, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77210361, + "num_input_tokens_seen": 218435290, + "step": 10144, + "time_per_iteration": 2.4947926998138428 + }, + { + "auxiliary_loss_clip": 0.01084778, + "auxiliary_loss_mlp": 0.01027049, + "balance_loss_clip": 1.04163742, + "balance_loss_mlp": 1.01508081, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 1.8300954599103036, + "language_loss": 0.72733283, + "learning_rate": 1.394498830235383e-06, + "loss": 0.74845105, + "num_input_tokens_seen": 218457880, + "step": 10145, + "time_per_iteration": 2.7409489154815674 + }, + { + "auxiliary_loss_clip": 0.01084523, + "auxiliary_loss_mlp": 0.01029535, + "balance_loss_clip": 1.03455305, + "balance_loss_mlp": 1.01781654, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 1.812367853459384, + "language_loss": 0.69748074, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71862125, + "num_input_tokens_seen": 218475930, + "step": 10146, + "time_per_iteration": 2.539449691772461 + }, + { + "auxiliary_loss_clip": 0.01065625, + "auxiliary_loss_mlp": 0.00784931, + "balance_loss_clip": 1.04261255, + "balance_loss_mlp": 1.01222348, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 2.0080601145439916, + "language_loss": 0.77256197, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.79106748, + "num_input_tokens_seen": 218493675, + "step": 10147, + "time_per_iteration": 2.572662591934204 + }, + { + "auxiliary_loss_clip": 0.01085018, + "auxiliary_loss_mlp": 0.01023657, + "balance_loss_clip": 1.03516579, + "balance_loss_mlp": 1.01211798, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 1.86110956229174, + "language_loss": 0.78293884, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80402559, + "num_input_tokens_seen": 218511780, + "step": 10148, + "time_per_iteration": 2.5289101600646973 + }, + { + "auxiliary_loss_clip": 0.01077175, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.03503716, + "balance_loss_mlp": 1.01773477, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 2.3635902014251835, + "language_loss": 0.54202837, + "learning_rate": 1.39301427737093e-06, + "loss": 0.56311721, + "num_input_tokens_seen": 218531850, + "step": 10149, + "time_per_iteration": 2.619762420654297 + }, + { + "auxiliary_loss_clip": 0.01083485, + "auxiliary_loss_mlp": 0.01026576, + "balance_loss_clip": 1.03855777, + "balance_loss_mlp": 1.01503634, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 1.8681107549946896, + "language_loss": 0.80730009, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.82840073, + "num_input_tokens_seen": 218551245, + "step": 10150, + "time_per_iteration": 3.9417476654052734 + }, + { + "auxiliary_loss_clip": 0.01084342, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.03987634, + "balance_loss_mlp": 1.02071667, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 1.4856071233612973, + "language_loss": 0.68915397, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.71032679, + "num_input_tokens_seen": 218571365, + "step": 10151, + "time_per_iteration": 2.561948776245117 + }, + { + "auxiliary_loss_clip": 0.01106503, + "auxiliary_loss_mlp": 0.01026207, + "balance_loss_clip": 1.03653932, + "balance_loss_mlp": 1.01563323, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 1.7423662520513388, + "language_loss": 0.70889014, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.73021722, + "num_input_tokens_seen": 218588315, + "step": 10152, + "time_per_iteration": 2.5450708866119385 + }, + { + "auxiliary_loss_clip": 0.01080719, + "auxiliary_loss_mlp": 0.01031472, + "balance_loss_clip": 1.03937674, + "balance_loss_mlp": 1.01929474, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 1.782877085597693, + "language_loss": 0.78318816, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80431008, + "num_input_tokens_seen": 218605940, + "step": 10153, + "time_per_iteration": 2.5597496032714844 + }, + { + "auxiliary_loss_clip": 0.01085942, + "auxiliary_loss_mlp": 0.01030169, + "balance_loss_clip": 1.03910351, + "balance_loss_mlp": 1.01776481, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 2.514899705875235, + "language_loss": 0.79401416, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.81517529, + "num_input_tokens_seen": 218626100, + "step": 10154, + "time_per_iteration": 2.594357490539551 + }, + { + "auxiliary_loss_clip": 0.01095669, + "auxiliary_loss_mlp": 0.01027386, + "balance_loss_clip": 1.03743958, + "balance_loss_mlp": 1.01632333, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.5875644640180018, + "language_loss": 0.70466548, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.72589606, + "num_input_tokens_seen": 218645060, + "step": 10155, + "time_per_iteration": 2.5733187198638916 + }, + { + "auxiliary_loss_clip": 0.01099141, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.0399605, + "balance_loss_mlp": 1.01947212, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 1.5362458098545764, + "language_loss": 0.71467257, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.7359876, + "num_input_tokens_seen": 218667690, + "step": 10156, + "time_per_iteration": 2.6149215698242188 + }, + { + "auxiliary_loss_clip": 0.0108415, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.03883815, + "balance_loss_mlp": 1.01814198, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 1.8206241603048816, + "language_loss": 0.67108792, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69222701, + "num_input_tokens_seen": 218687505, + "step": 10157, + "time_per_iteration": 2.526320219039917 + }, + { + "auxiliary_loss_clip": 0.01067985, + "auxiliary_loss_mlp": 0.01024081, + "balance_loss_clip": 1.03313506, + "balance_loss_mlp": 1.0124526, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 2.5933821155179406, + "language_loss": 0.71980464, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74072534, + "num_input_tokens_seen": 218705315, + "step": 10158, + "time_per_iteration": 2.542780637741089 + }, + { + "auxiliary_loss_clip": 0.01099099, + "auxiliary_loss_mlp": 0.01037961, + "balance_loss_clip": 1.04322124, + "balance_loss_mlp": 1.02592087, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.6006702799657258, + "language_loss": 0.69394654, + "learning_rate": 1.389304508366635e-06, + "loss": 0.71531713, + "num_input_tokens_seen": 218725735, + "step": 10159, + "time_per_iteration": 2.584451675415039 + }, + { + "auxiliary_loss_clip": 0.01110662, + "auxiliary_loss_mlp": 0.01028962, + "balance_loss_clip": 1.03923595, + "balance_loss_mlp": 1.01648664, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 1.9284641850138533, + "language_loss": 0.78986657, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.81126279, + "num_input_tokens_seen": 218743215, + "step": 10160, + "time_per_iteration": 5.160766124725342 + }, + { + "auxiliary_loss_clip": 0.01039791, + "auxiliary_loss_mlp": 0.01001075, + "balance_loss_clip": 1.02613711, + "balance_loss_mlp": 0.99968666, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8181351945715727, + "language_loss": 0.61448401, + "learning_rate": 1.388562832007295e-06, + "loss": 0.6348927, + "num_input_tokens_seen": 218806440, + "step": 10161, + "time_per_iteration": 3.247194528579712 + }, + { + "auxiliary_loss_clip": 0.01089724, + "auxiliary_loss_mlp": 0.00784919, + "balance_loss_clip": 1.03910327, + "balance_loss_mlp": 1.01073456, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 1.7856100134031274, + "language_loss": 0.7660346, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.78478098, + "num_input_tokens_seen": 218825720, + "step": 10162, + "time_per_iteration": 2.5445828437805176 + }, + { + "auxiliary_loss_clip": 0.01108449, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.03837812, + "balance_loss_mlp": 1.01878023, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 1.7572601513815718, + "language_loss": 0.71885288, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.74024427, + "num_input_tokens_seen": 218847735, + "step": 10163, + "time_per_iteration": 2.58492112159729 + }, + { + "auxiliary_loss_clip": 0.01104489, + "auxiliary_loss_mlp": 0.0102415, + "balance_loss_clip": 1.03618336, + "balance_loss_mlp": 1.01287913, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 2.2297757113873313, + "language_loss": 0.60236794, + "learning_rate": 1.387450491396625e-06, + "loss": 0.62365437, + "num_input_tokens_seen": 218866585, + "step": 10164, + "time_per_iteration": 3.8871047496795654 + }, + { + "auxiliary_loss_clip": 0.0109438, + "auxiliary_loss_mlp": 0.01033188, + "balance_loss_clip": 1.04022074, + "balance_loss_mlp": 1.02130914, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 1.8873474024065287, + "language_loss": 0.75990093, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.78117657, + "num_input_tokens_seen": 218885560, + "step": 10165, + "time_per_iteration": 2.532177448272705 + }, + { + "auxiliary_loss_clip": 0.01086034, + "auxiliary_loss_mlp": 0.01025146, + "balance_loss_clip": 1.03884554, + "balance_loss_mlp": 1.0134995, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.5810355111808556, + "language_loss": 0.79265761, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81376946, + "num_input_tokens_seen": 218905055, + "step": 10166, + "time_per_iteration": 2.5334417819976807 + }, + { + "auxiliary_loss_clip": 0.01086589, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.04019165, + "balance_loss_mlp": 1.01701355, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 1.9142568512646563, + "language_loss": 0.67655414, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.69771248, + "num_input_tokens_seen": 218924030, + "step": 10167, + "time_per_iteration": 2.5724308490753174 + }, + { + "auxiliary_loss_clip": 0.01106213, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.03771079, + "balance_loss_mlp": 1.01895666, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 1.7961700419833686, + "language_loss": 0.79167277, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.81303483, + "num_input_tokens_seen": 218943750, + "step": 10168, + "time_per_iteration": 2.475111484527588 + }, + { + "auxiliary_loss_clip": 0.01115829, + "auxiliary_loss_mlp": 0.01035394, + "balance_loss_clip": 1.03994715, + "balance_loss_mlp": 1.02222157, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 3.1879555409815588, + "language_loss": 0.86169767, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.88320994, + "num_input_tokens_seen": 218957585, + "step": 10169, + "time_per_iteration": 2.4860074520111084 + }, + { + "auxiliary_loss_clip": 0.01105278, + "auxiliary_loss_mlp": 0.01028232, + "balance_loss_clip": 1.03640556, + "balance_loss_mlp": 1.01732993, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 1.6698845254163028, + "language_loss": 0.79124212, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.81257725, + "num_input_tokens_seen": 218980025, + "step": 10170, + "time_per_iteration": 2.6383047103881836 + }, + { + "auxiliary_loss_clip": 0.0109204, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.03851867, + "balance_loss_mlp": 1.02276063, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 2.367535272214396, + "language_loss": 0.6862554, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.70753646, + "num_input_tokens_seen": 218998200, + "step": 10171, + "time_per_iteration": 2.5391898155212402 + }, + { + "auxiliary_loss_clip": 0.01078992, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.03605032, + "balance_loss_mlp": 1.01899791, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 1.606489435806958, + "language_loss": 0.79047585, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.81159091, + "num_input_tokens_seen": 219017910, + "step": 10172, + "time_per_iteration": 2.601757526397705 + }, + { + "auxiliary_loss_clip": 0.01079576, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.03974771, + "balance_loss_mlp": 1.01977921, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 1.9051435859480492, + "language_loss": 0.66701716, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.68813622, + "num_input_tokens_seen": 219037730, + "step": 10173, + "time_per_iteration": 2.5703351497650146 + }, + { + "auxiliary_loss_clip": 0.01085336, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.03827262, + "balance_loss_mlp": 1.01953828, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 1.8851407338856854, + "language_loss": 0.55363429, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.57481283, + "num_input_tokens_seen": 219056755, + "step": 10174, + "time_per_iteration": 2.487077474594116 + }, + { + "auxiliary_loss_clip": 0.01091291, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.0400902, + "balance_loss_mlp": 1.02059317, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 2.0585531417243024, + "language_loss": 0.66235387, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68359697, + "num_input_tokens_seen": 219076985, + "step": 10175, + "time_per_iteration": 2.5436248779296875 + }, + { + "auxiliary_loss_clip": 0.01092964, + "auxiliary_loss_mlp": 0.00784987, + "balance_loss_clip": 1.03435385, + "balance_loss_mlp": 1.01135993, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 2.1660646922432507, + "language_loss": 0.82745576, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.84623528, + "num_input_tokens_seen": 219096050, + "step": 10176, + "time_per_iteration": 2.5258448123931885 + }, + { + "auxiliary_loss_clip": 0.01091179, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.03945029, + "balance_loss_mlp": 1.01963639, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 1.9120050131464255, + "language_loss": 0.77478898, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79602551, + "num_input_tokens_seen": 219112665, + "step": 10177, + "time_per_iteration": 2.5447707176208496 + }, + { + "auxiliary_loss_clip": 0.01094922, + "auxiliary_loss_mlp": 0.00787892, + "balance_loss_clip": 1.03725624, + "balance_loss_mlp": 1.0143013, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 2.1458630443362723, + "language_loss": 0.75486624, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.7736944, + "num_input_tokens_seen": 219129120, + "step": 10178, + "time_per_iteration": 2.4664957523345947 + }, + { + "auxiliary_loss_clip": 0.01084039, + "auxiliary_loss_mlp": 0.01040886, + "balance_loss_clip": 1.03765059, + "balance_loss_mlp": 1.02677155, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 2.9479238990543837, + "language_loss": 0.66898644, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.69023561, + "num_input_tokens_seen": 219148950, + "step": 10179, + "time_per_iteration": 2.5287182331085205 + }, + { + "auxiliary_loss_clip": 0.01091012, + "auxiliary_loss_mlp": 0.01038202, + "balance_loss_clip": 1.04047966, + "balance_loss_mlp": 1.02623916, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 1.7656587862876623, + "language_loss": 0.84259897, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.86389112, + "num_input_tokens_seen": 219165585, + "step": 10180, + "time_per_iteration": 2.4879043102264404 + }, + { + "auxiliary_loss_clip": 0.01108867, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.03928947, + "balance_loss_mlp": 1.01486194, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 1.6146685470597626, + "language_loss": 0.77675188, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.79811484, + "num_input_tokens_seen": 219183280, + "step": 10181, + "time_per_iteration": 2.4676477909088135 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.0372467, + "balance_loss_mlp": 1.01569438, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 1.9976146753898771, + "language_loss": 0.80407619, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.82543385, + "num_input_tokens_seen": 219197200, + "step": 10182, + "time_per_iteration": 2.410322904586792 + }, + { + "auxiliary_loss_clip": 0.01070267, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.03342152, + "balance_loss_mlp": 1.01888752, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 2.6237994279498236, + "language_loss": 0.83146781, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.85246748, + "num_input_tokens_seen": 219216825, + "step": 10183, + "time_per_iteration": 2.548956871032715 + }, + { + "auxiliary_loss_clip": 0.01040661, + "auxiliary_loss_mlp": 0.01001611, + "balance_loss_clip": 1.03084183, + "balance_loss_mlp": 1.0001868, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7022768072656922, + "language_loss": 0.62885171, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64927441, + "num_input_tokens_seen": 219283795, + "step": 10184, + "time_per_iteration": 3.2226603031158447 + }, + { + "auxiliary_loss_clip": 0.01099949, + "auxiliary_loss_mlp": 0.010286, + "balance_loss_clip": 1.03837574, + "balance_loss_mlp": 1.01743042, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 1.8147870050876351, + "language_loss": 0.82094753, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84223306, + "num_input_tokens_seen": 219302385, + "step": 10185, + "time_per_iteration": 2.489056348800659 + }, + { + "auxiliary_loss_clip": 0.01086258, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.03816962, + "balance_loss_mlp": 1.02179098, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 3.5252668067394377, + "language_loss": 0.74537563, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.76658237, + "num_input_tokens_seen": 219319765, + "step": 10186, + "time_per_iteration": 2.578298330307007 + }, + { + "auxiliary_loss_clip": 0.01093134, + "auxiliary_loss_mlp": 0.01026952, + "balance_loss_clip": 1.03544974, + "balance_loss_mlp": 1.01580584, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.6015830423530744, + "language_loss": 0.78132993, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80253077, + "num_input_tokens_seen": 219337440, + "step": 10187, + "time_per_iteration": 2.502084732055664 + }, + { + "auxiliary_loss_clip": 0.01105797, + "auxiliary_loss_mlp": 0.01028408, + "balance_loss_clip": 1.03572822, + "balance_loss_mlp": 1.01636779, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 1.76111000557454, + "language_loss": 0.83186466, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85320669, + "num_input_tokens_seen": 219357525, + "step": 10188, + "time_per_iteration": 2.500108480453491 + }, + { + "auxiliary_loss_clip": 0.01075087, + "auxiliary_loss_mlp": 0.01027012, + "balance_loss_clip": 1.03529668, + "balance_loss_mlp": 1.01556146, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 1.8096866681312471, + "language_loss": 0.75693601, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77795696, + "num_input_tokens_seen": 219374855, + "step": 10189, + "time_per_iteration": 3.9984209537506104 + }, + { + "auxiliary_loss_clip": 0.01094062, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.03628778, + "balance_loss_mlp": 1.01901889, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.6778316556019974, + "language_loss": 0.74275112, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76400685, + "num_input_tokens_seen": 219394740, + "step": 10190, + "time_per_iteration": 2.51216459274292 + }, + { + "auxiliary_loss_clip": 0.01099358, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.03943455, + "balance_loss_mlp": 1.01706481, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 1.8199000982029145, + "language_loss": 0.68294638, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70423615, + "num_input_tokens_seen": 219413755, + "step": 10191, + "time_per_iteration": 2.560763120651245 + }, + { + "auxiliary_loss_clip": 0.01095655, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.03646564, + "balance_loss_mlp": 1.01974344, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 2.979927296657686, + "language_loss": 0.73893303, + "learning_rate": 1.377078777445467e-06, + "loss": 0.76021528, + "num_input_tokens_seen": 219433560, + "step": 10192, + "time_per_iteration": 2.534044027328491 + }, + { + "auxiliary_loss_clip": 0.01076914, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.03871632, + "balance_loss_mlp": 1.01784396, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 1.8328183827410847, + "language_loss": 0.83676761, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85783231, + "num_input_tokens_seen": 219452640, + "step": 10193, + "time_per_iteration": 2.5681216716766357 + }, + { + "auxiliary_loss_clip": 0.01075755, + "auxiliary_loss_mlp": 0.01030134, + "balance_loss_clip": 1.03667557, + "balance_loss_mlp": 1.01802862, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 2.266010589479677, + "language_loss": 0.69968528, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.72074413, + "num_input_tokens_seen": 219468585, + "step": 10194, + "time_per_iteration": 2.5876457691192627 + }, + { + "auxiliary_loss_clip": 0.01024636, + "auxiliary_loss_mlp": 0.01001403, + "balance_loss_clip": 1.03119636, + "balance_loss_mlp": 0.99993652, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.8235849902142223, + "language_loss": 0.58698481, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60724521, + "num_input_tokens_seen": 219523015, + "step": 10195, + "time_per_iteration": 2.9530692100524902 + }, + { + "auxiliary_loss_clip": 0.01089044, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.03925681, + "balance_loss_mlp": 1.02026248, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 2.0283780970817014, + "language_loss": 0.69745886, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.71868044, + "num_input_tokens_seen": 219539980, + "step": 10196, + "time_per_iteration": 2.4988269805908203 + }, + { + "auxiliary_loss_clip": 0.01083896, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.03658199, + "balance_loss_mlp": 1.02445221, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 1.8623167749058633, + "language_loss": 0.71213996, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73334098, + "num_input_tokens_seen": 219556980, + "step": 10197, + "time_per_iteration": 2.5619845390319824 + }, + { + "auxiliary_loss_clip": 0.01096203, + "auxiliary_loss_mlp": 0.01043961, + "balance_loss_clip": 1.03658664, + "balance_loss_mlp": 1.03021014, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 1.9462460614312307, + "language_loss": 0.78875613, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.81015778, + "num_input_tokens_seen": 219576410, + "step": 10198, + "time_per_iteration": 3.835993528366089 + }, + { + "auxiliary_loss_clip": 0.01079207, + "auxiliary_loss_mlp": 0.01027939, + "balance_loss_clip": 1.03741086, + "balance_loss_mlp": 1.01554728, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 1.4314566593996108, + "language_loss": 0.74471033, + "learning_rate": 1.374488730519181e-06, + "loss": 0.76578182, + "num_input_tokens_seen": 219597180, + "step": 10199, + "time_per_iteration": 3.9720675945281982 + }, + { + "auxiliary_loss_clip": 0.01086606, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.03985882, + "balance_loss_mlp": 1.02434492, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 1.7137009607826121, + "language_loss": 0.61709464, + "learning_rate": 1.374118818580993e-06, + "loss": 0.63833314, + "num_input_tokens_seen": 219617630, + "step": 10200, + "time_per_iteration": 2.5755362510681152 + }, + { + "auxiliary_loss_clip": 0.01082855, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.03654432, + "balance_loss_mlp": 1.02041769, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 1.8980646399552337, + "language_loss": 0.68832231, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.70947582, + "num_input_tokens_seen": 219637025, + "step": 10201, + "time_per_iteration": 2.5203874111175537 + }, + { + "auxiliary_loss_clip": 0.01083913, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.03465509, + "balance_loss_mlp": 1.01517773, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 1.8709508096096885, + "language_loss": 0.83417147, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.85528451, + "num_input_tokens_seen": 219656625, + "step": 10202, + "time_per_iteration": 2.523024797439575 + }, + { + "auxiliary_loss_clip": 0.01050576, + "auxiliary_loss_mlp": 0.00999771, + "balance_loss_clip": 1.02662802, + "balance_loss_mlp": 0.99825668, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 0.9154062420573623, + "language_loss": 0.67095113, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69145465, + "num_input_tokens_seen": 219718090, + "step": 10203, + "time_per_iteration": 4.557872533798218 + }, + { + "auxiliary_loss_clip": 0.01098851, + "auxiliary_loss_mlp": 0.0103018, + "balance_loss_clip": 1.03732657, + "balance_loss_mlp": 1.01803279, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 1.7769324382872933, + "language_loss": 0.61393714, + "learning_rate": 1.37263940830327e-06, + "loss": 0.63522744, + "num_input_tokens_seen": 219740100, + "step": 10204, + "time_per_iteration": 2.666447401046753 + }, + { + "auxiliary_loss_clip": 0.01071889, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.03573871, + "balance_loss_mlp": 1.01917291, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 2.3897787938049118, + "language_loss": 0.72596067, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74699318, + "num_input_tokens_seen": 219761225, + "step": 10205, + "time_per_iteration": 2.592449426651001 + }, + { + "auxiliary_loss_clip": 0.01095865, + "auxiliary_loss_mlp": 0.01023205, + "balance_loss_clip": 1.03816569, + "balance_loss_mlp": 1.01073587, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 1.8067155011382532, + "language_loss": 0.76360196, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.78479266, + "num_input_tokens_seen": 219780085, + "step": 10206, + "time_per_iteration": 2.5326292514801025 + }, + { + "auxiliary_loss_clip": 0.01074129, + "auxiliary_loss_mlp": 0.01026323, + "balance_loss_clip": 1.03786755, + "balance_loss_mlp": 1.01322186, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 1.8978621964155893, + "language_loss": 0.75261116, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.77361566, + "num_input_tokens_seen": 219797895, + "step": 10207, + "time_per_iteration": 2.6151251792907715 + }, + { + "auxiliary_loss_clip": 0.01097819, + "auxiliary_loss_mlp": 0.0103179, + "balance_loss_clip": 1.03820753, + "balance_loss_mlp": 1.02004838, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.555077534421386, + "language_loss": 0.8252461, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.84654218, + "num_input_tokens_seen": 219811295, + "step": 10208, + "time_per_iteration": 2.4949758052825928 + }, + { + "auxiliary_loss_clip": 0.01092763, + "auxiliary_loss_mlp": 0.01032406, + "balance_loss_clip": 1.04013252, + "balance_loss_mlp": 1.01945424, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 1.7685576060596173, + "language_loss": 0.7251513, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74640298, + "num_input_tokens_seen": 219832735, + "step": 10209, + "time_per_iteration": 2.6282427310943604 + }, + { + "auxiliary_loss_clip": 0.0110913, + "auxiliary_loss_mlp": 0.01033454, + "balance_loss_clip": 1.0397408, + "balance_loss_mlp": 1.02165842, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.6759304400704642, + "language_loss": 0.7409184, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76234424, + "num_input_tokens_seen": 219852755, + "step": 10210, + "time_per_iteration": 2.5209672451019287 + }, + { + "auxiliary_loss_clip": 0.01024328, + "auxiliary_loss_mlp": 0.01000136, + "balance_loss_clip": 1.02747774, + "balance_loss_mlp": 0.99862796, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.8858236280253011, + "language_loss": 0.64987516, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.67011988, + "num_input_tokens_seen": 219922785, + "step": 10211, + "time_per_iteration": 3.3196659088134766 + }, + { + "auxiliary_loss_clip": 0.01086298, + "auxiliary_loss_mlp": 0.00785925, + "balance_loss_clip": 1.03694797, + "balance_loss_mlp": 1.00914359, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 1.6198379044707127, + "language_loss": 0.75674862, + "learning_rate": 1.369681730544801e-06, + "loss": 0.77547085, + "num_input_tokens_seen": 219942215, + "step": 10212, + "time_per_iteration": 2.5614516735076904 + }, + { + "auxiliary_loss_clip": 0.01083389, + "auxiliary_loss_mlp": 0.01038145, + "balance_loss_clip": 1.03661454, + "balance_loss_mlp": 1.02431655, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.7322671967399428, + "language_loss": 0.7396251, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76084042, + "num_input_tokens_seen": 219963830, + "step": 10213, + "time_per_iteration": 2.563505172729492 + }, + { + "auxiliary_loss_clip": 0.01091717, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.03780079, + "balance_loss_mlp": 1.02031684, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 1.4971214823645598, + "language_loss": 0.730308, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.75156212, + "num_input_tokens_seen": 219983815, + "step": 10214, + "time_per_iteration": 2.5438082218170166 + }, + { + "auxiliary_loss_clip": 0.01110875, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.03800249, + "balance_loss_mlp": 1.01543283, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 4.450962756596791, + "language_loss": 0.74865997, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.77005649, + "num_input_tokens_seen": 220003165, + "step": 10215, + "time_per_iteration": 2.4792234897613525 + }, + { + "auxiliary_loss_clip": 0.01095603, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.03800476, + "balance_loss_mlp": 1.01578116, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 1.8587732938396342, + "language_loss": 0.78437495, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80561221, + "num_input_tokens_seen": 220021015, + "step": 10216, + "time_per_iteration": 2.4932825565338135 + }, + { + "auxiliary_loss_clip": 0.01109399, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.03902853, + "balance_loss_mlp": 1.02040315, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 2.318602889699367, + "language_loss": 0.80001408, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.8214463, + "num_input_tokens_seen": 220035780, + "step": 10217, + "time_per_iteration": 2.423921823501587 + }, + { + "auxiliary_loss_clip": 0.01089122, + "auxiliary_loss_mlp": 0.01026092, + "balance_loss_clip": 1.03718984, + "balance_loss_mlp": 1.01382577, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.834210476196226, + "language_loss": 0.78120613, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.80235827, + "num_input_tokens_seen": 220054280, + "step": 10218, + "time_per_iteration": 2.5289647579193115 + }, + { + "auxiliary_loss_clip": 0.01099492, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.03815651, + "balance_loss_mlp": 1.02034843, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.5673785903940236, + "language_loss": 0.81909072, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84041107, + "num_input_tokens_seen": 220074120, + "step": 10219, + "time_per_iteration": 2.4970695972442627 + }, + { + "auxiliary_loss_clip": 0.01096317, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.03866565, + "balance_loss_mlp": 1.01834726, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 1.93357609024482, + "language_loss": 0.66760385, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.68888044, + "num_input_tokens_seen": 220096320, + "step": 10220, + "time_per_iteration": 2.6759612560272217 + }, + { + "auxiliary_loss_clip": 0.01097559, + "auxiliary_loss_mlp": 0.01027433, + "balance_loss_clip": 1.03649032, + "balance_loss_mlp": 1.01581633, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 2.1283990410274702, + "language_loss": 0.71633416, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.73758405, + "num_input_tokens_seen": 220114850, + "step": 10221, + "time_per_iteration": 2.494719982147217 + }, + { + "auxiliary_loss_clip": 0.01063045, + "auxiliary_loss_mlp": 0.01028043, + "balance_loss_clip": 1.03575158, + "balance_loss_mlp": 1.01618218, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 3.7930726611775216, + "language_loss": 0.79521847, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81612939, + "num_input_tokens_seen": 220133395, + "step": 10222, + "time_per_iteration": 2.57665753364563 + }, + { + "auxiliary_loss_clip": 0.01086133, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.03818393, + "balance_loss_mlp": 1.02349615, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 1.9537653220063085, + "language_loss": 0.76319802, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78443074, + "num_input_tokens_seen": 220152790, + "step": 10223, + "time_per_iteration": 2.5164098739624023 + }, + { + "auxiliary_loss_clip": 0.01086211, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.03788996, + "balance_loss_mlp": 1.01796508, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 2.493494849162463, + "language_loss": 0.78241336, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.80357814, + "num_input_tokens_seen": 220169535, + "step": 10224, + "time_per_iteration": 2.4829282760620117 + }, + { + "auxiliary_loss_clip": 0.01072507, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.0352149, + "balance_loss_mlp": 1.0163064, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.1559311676050361, + "language_loss": 0.66325742, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.684255, + "num_input_tokens_seen": 220195305, + "step": 10225, + "time_per_iteration": 2.8859102725982666 + }, + { + "auxiliary_loss_clip": 0.01099133, + "auxiliary_loss_mlp": 0.00786981, + "balance_loss_clip": 1.04110968, + "balance_loss_mlp": 1.01052558, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 2.496833825111756, + "language_loss": 0.63203716, + "learning_rate": 1.364509479649357e-06, + "loss": 0.65089834, + "num_input_tokens_seen": 220215040, + "step": 10226, + "time_per_iteration": 2.6053338050842285 + }, + { + "auxiliary_loss_clip": 0.01084718, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.03708267, + "balance_loss_mlp": 1.01979876, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 1.8368155433849405, + "language_loss": 0.75898898, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.78016782, + "num_input_tokens_seen": 220234205, + "step": 10227, + "time_per_iteration": 2.554015874862671 + }, + { + "auxiliary_loss_clip": 0.0105252, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.03829098, + "balance_loss_mlp": 1.02161133, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 2.004585210787863, + "language_loss": 0.61337602, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.63426995, + "num_input_tokens_seen": 220252730, + "step": 10228, + "time_per_iteration": 3.9705421924591064 + }, + { + "auxiliary_loss_clip": 0.01086942, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.03793836, + "balance_loss_mlp": 1.02023041, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 1.3152393509810452, + "language_loss": 0.74257982, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76377475, + "num_input_tokens_seen": 220273345, + "step": 10229, + "time_per_iteration": 2.5854804515838623 + }, + { + "auxiliary_loss_clip": 0.01110701, + "auxiliary_loss_mlp": 0.01036374, + "balance_loss_clip": 1.03990602, + "balance_loss_mlp": 1.02388668, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 3.324939528301568, + "language_loss": 0.77909106, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.80056179, + "num_input_tokens_seen": 220293845, + "step": 10230, + "time_per_iteration": 2.4635536670684814 + }, + { + "auxiliary_loss_clip": 0.0108274, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.03993201, + "balance_loss_mlp": 1.01746118, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.6797997199124084, + "language_loss": 0.73174465, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75286883, + "num_input_tokens_seen": 220316070, + "step": 10231, + "time_per_iteration": 2.595430850982666 + }, + { + "auxiliary_loss_clip": 0.01089274, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.03798175, + "balance_loss_mlp": 1.02025807, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.8893920630954821, + "language_loss": 0.69852841, + "learning_rate": 1.362294244324858e-06, + "loss": 0.71973825, + "num_input_tokens_seen": 220335695, + "step": 10232, + "time_per_iteration": 2.585747241973877 + }, + { + "auxiliary_loss_clip": 0.01094224, + "auxiliary_loss_mlp": 0.00784915, + "balance_loss_clip": 1.03705537, + "balance_loss_mlp": 1.00893879, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 1.9680161414636268, + "language_loss": 0.91989201, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.93868339, + "num_input_tokens_seen": 220353720, + "step": 10233, + "time_per_iteration": 2.478266477584839 + }, + { + "auxiliary_loss_clip": 0.01079566, + "auxiliary_loss_mlp": 0.01034115, + "balance_loss_clip": 1.04080832, + "balance_loss_mlp": 1.02266526, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 1.7495715166050214, + "language_loss": 0.7149142, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73605096, + "num_input_tokens_seen": 220372515, + "step": 10234, + "time_per_iteration": 2.558286666870117 + }, + { + "auxiliary_loss_clip": 0.01097236, + "auxiliary_loss_mlp": 0.00786267, + "balance_loss_clip": 1.03533542, + "balance_loss_mlp": 1.01059198, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 2.200766251748644, + "language_loss": 0.66462857, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.68346357, + "num_input_tokens_seen": 220393490, + "step": 10235, + "time_per_iteration": 2.54298734664917 + }, + { + "auxiliary_loss_clip": 0.01102499, + "auxiliary_loss_mlp": 0.0103039, + "balance_loss_clip": 1.03842068, + "balance_loss_mlp": 1.017712, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 1.739940392355627, + "language_loss": 0.81289065, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83421957, + "num_input_tokens_seen": 220412855, + "step": 10236, + "time_per_iteration": 2.5148465633392334 + }, + { + "auxiliary_loss_clip": 0.01111338, + "auxiliary_loss_mlp": 0.01029271, + "balance_loss_clip": 1.03766549, + "balance_loss_mlp": 1.01661694, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 1.4262432839947412, + "language_loss": 0.80673063, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82813668, + "num_input_tokens_seen": 220433440, + "step": 10237, + "time_per_iteration": 3.8444180488586426 + }, + { + "auxiliary_loss_clip": 0.01094292, + "auxiliary_loss_mlp": 0.01040143, + "balance_loss_clip": 1.03935122, + "balance_loss_mlp": 1.0276264, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 1.6382636584281152, + "language_loss": 0.76049191, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78183627, + "num_input_tokens_seen": 220453445, + "step": 10238, + "time_per_iteration": 3.959036350250244 + }, + { + "auxiliary_loss_clip": 0.01009162, + "auxiliary_loss_mlp": 0.01013501, + "balance_loss_clip": 1.03454757, + "balance_loss_mlp": 1.01179624, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.7688438613844096, + "language_loss": 0.57610494, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59633154, + "num_input_tokens_seen": 220509730, + "step": 10239, + "time_per_iteration": 3.2117981910705566 + }, + { + "auxiliary_loss_clip": 0.0108801, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.03454494, + "balance_loss_mlp": 1.02047002, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 1.8078174938220897, + "language_loss": 0.77351052, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79472625, + "num_input_tokens_seen": 220527295, + "step": 10240, + "time_per_iteration": 2.51479434967041 + }, + { + "auxiliary_loss_clip": 0.01110679, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.03921294, + "balance_loss_mlp": 1.0178833, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 2.3083528019141166, + "language_loss": 0.72301877, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.74443007, + "num_input_tokens_seen": 220542730, + "step": 10241, + "time_per_iteration": 3.8369054794311523 + }, + { + "auxiliary_loss_clip": 0.01105561, + "auxiliary_loss_mlp": 0.01025442, + "balance_loss_clip": 1.03683424, + "balance_loss_mlp": 1.0138911, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 14.319940167568046, + "language_loss": 0.71880233, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.74011242, + "num_input_tokens_seen": 220562995, + "step": 10242, + "time_per_iteration": 2.481170177459717 + }, + { + "auxiliary_loss_clip": 0.01098335, + "auxiliary_loss_mlp": 0.01028903, + "balance_loss_clip": 1.0377363, + "balance_loss_mlp": 1.01713133, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 2.170928545685879, + "language_loss": 0.72744012, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74871254, + "num_input_tokens_seen": 220581775, + "step": 10243, + "time_per_iteration": 2.475595235824585 + }, + { + "auxiliary_loss_clip": 0.01034927, + "auxiliary_loss_mlp": 0.01003229, + "balance_loss_clip": 1.02173257, + "balance_loss_mlp": 1.0019654, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.7592068581646891, + "language_loss": 0.56892526, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.58930683, + "num_input_tokens_seen": 220646395, + "step": 10244, + "time_per_iteration": 3.1413724422454834 + }, + { + "auxiliary_loss_clip": 0.01107209, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.03656387, + "balance_loss_mlp": 1.01681161, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 1.5500352338073242, + "language_loss": 0.63436699, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.65573406, + "num_input_tokens_seen": 220668335, + "step": 10245, + "time_per_iteration": 2.563464879989624 + }, + { + "auxiliary_loss_clip": 0.0105564, + "auxiliary_loss_mlp": 0.01032317, + "balance_loss_clip": 1.0324384, + "balance_loss_mlp": 1.02016997, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 1.7476371647555244, + "language_loss": 0.79063284, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81151241, + "num_input_tokens_seen": 220688915, + "step": 10246, + "time_per_iteration": 2.608859062194824 + }, + { + "auxiliary_loss_clip": 0.01077149, + "auxiliary_loss_mlp": 0.00788216, + "balance_loss_clip": 1.04020476, + "balance_loss_mlp": 1.01119256, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 2.947957169372509, + "language_loss": 0.87565172, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.89430535, + "num_input_tokens_seen": 220703465, + "step": 10247, + "time_per_iteration": 2.5552761554718018 + }, + { + "auxiliary_loss_clip": 0.01043612, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.03960216, + "balance_loss_mlp": 1.02254796, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 1.5998113456242489, + "language_loss": 0.79863495, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.81943142, + "num_input_tokens_seen": 220722090, + "step": 10248, + "time_per_iteration": 2.6468286514282227 + }, + { + "auxiliary_loss_clip": 0.01063432, + "auxiliary_loss_mlp": 0.01032661, + "balance_loss_clip": 1.03630614, + "balance_loss_mlp": 1.02045357, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 1.799877011203152, + "language_loss": 0.86936712, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.89032805, + "num_input_tokens_seen": 220741075, + "step": 10249, + "time_per_iteration": 2.5907704830169678 + }, + { + "auxiliary_loss_clip": 0.01107475, + "auxiliary_loss_mlp": 0.01025248, + "balance_loss_clip": 1.03703952, + "balance_loss_mlp": 1.01244521, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 2.273218177838173, + "language_loss": 0.69177878, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.71310604, + "num_input_tokens_seen": 220763395, + "step": 10250, + "time_per_iteration": 2.6226603984832764 + }, + { + "auxiliary_loss_clip": 0.0107604, + "auxiliary_loss_mlp": 0.01026132, + "balance_loss_clip": 1.03333259, + "balance_loss_mlp": 1.01482522, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 2.267348926726471, + "language_loss": 0.74363124, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76465297, + "num_input_tokens_seen": 220780640, + "step": 10251, + "time_per_iteration": 2.509047031402588 + }, + { + "auxiliary_loss_clip": 0.01091166, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.03396702, + "balance_loss_mlp": 1.01780295, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.0825681110960406, + "language_loss": 0.68289703, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.70412266, + "num_input_tokens_seen": 220797960, + "step": 10252, + "time_per_iteration": 2.4604580402374268 + }, + { + "auxiliary_loss_clip": 0.00989968, + "auxiliary_loss_mlp": 0.01004572, + "balance_loss_clip": 1.03122044, + "balance_loss_mlp": 1.00326717, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.9136717854383188, + "language_loss": 0.57880998, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.59875536, + "num_input_tokens_seen": 220856930, + "step": 10253, + "time_per_iteration": 3.4326703548431396 + }, + { + "auxiliary_loss_clip": 0.01086396, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.03560495, + "balance_loss_mlp": 1.01773763, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.461358401726161, + "language_loss": 0.79552978, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81669474, + "num_input_tokens_seen": 220877595, + "step": 10254, + "time_per_iteration": 2.8940834999084473 + }, + { + "auxiliary_loss_clip": 0.01085464, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.0382266, + "balance_loss_mlp": 1.0187844, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 1.689340356196069, + "language_loss": 0.80574417, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82690692, + "num_input_tokens_seen": 220896880, + "step": 10255, + "time_per_iteration": 2.551485061645508 + }, + { + "auxiliary_loss_clip": 0.01088236, + "auxiliary_loss_mlp": 0.010319, + "balance_loss_clip": 1.03644359, + "balance_loss_mlp": 1.01964569, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 2.0296515596003637, + "language_loss": 0.65380895, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.67501032, + "num_input_tokens_seen": 220916425, + "step": 10256, + "time_per_iteration": 2.558652877807617 + }, + { + "auxiliary_loss_clip": 0.01094125, + "auxiliary_loss_mlp": 0.01026196, + "balance_loss_clip": 1.03669965, + "balance_loss_mlp": 1.01522863, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 1.5967727580458746, + "language_loss": 0.72283745, + "learning_rate": 1.353073501949825e-06, + "loss": 0.74404073, + "num_input_tokens_seen": 220935050, + "step": 10257, + "time_per_iteration": 2.499439001083374 + }, + { + "auxiliary_loss_clip": 0.01086563, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.03654647, + "balance_loss_mlp": 1.01778221, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 1.6120629274208997, + "language_loss": 0.71972466, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74089229, + "num_input_tokens_seen": 220953085, + "step": 10258, + "time_per_iteration": 2.510300874710083 + }, + { + "auxiliary_loss_clip": 0.0108065, + "auxiliary_loss_mlp": 0.01037214, + "balance_loss_clip": 1.03400958, + "balance_loss_mlp": 1.02370739, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 3.1279935596064, + "language_loss": 0.63617224, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.6573509, + "num_input_tokens_seen": 220969050, + "step": 10259, + "time_per_iteration": 2.5416276454925537 + }, + { + "auxiliary_loss_clip": 0.01071367, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.03679192, + "balance_loss_mlp": 1.01696277, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 1.9535502989265747, + "language_loss": 0.71337712, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73438758, + "num_input_tokens_seen": 220985825, + "step": 10260, + "time_per_iteration": 2.5156314373016357 + }, + { + "auxiliary_loss_clip": 0.01103344, + "auxiliary_loss_mlp": 0.01034953, + "balance_loss_clip": 1.04115415, + "balance_loss_mlp": 1.02125573, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 2.038283578406827, + "language_loss": 0.68817651, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.7095595, + "num_input_tokens_seen": 221004465, + "step": 10261, + "time_per_iteration": 2.5381405353546143 + }, + { + "auxiliary_loss_clip": 0.01070509, + "auxiliary_loss_mlp": 0.01039098, + "balance_loss_clip": 1.03544199, + "balance_loss_mlp": 1.027946, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 1.8179393463184357, + "language_loss": 0.71136773, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.73246378, + "num_input_tokens_seen": 221023260, + "step": 10262, + "time_per_iteration": 2.5414698123931885 + }, + { + "auxiliary_loss_clip": 0.01087335, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.03633368, + "balance_loss_mlp": 1.0212512, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.8264791575709374, + "language_loss": 0.69918311, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72039181, + "num_input_tokens_seen": 221043090, + "step": 10263, + "time_per_iteration": 2.5504252910614014 + }, + { + "auxiliary_loss_clip": 0.01047187, + "auxiliary_loss_mlp": 0.01028447, + "balance_loss_clip": 1.03797102, + "balance_loss_mlp": 1.01612651, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 3.32835224168281, + "language_loss": 0.76334941, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78410578, + "num_input_tokens_seen": 221061435, + "step": 10264, + "time_per_iteration": 2.5779831409454346 + }, + { + "auxiliary_loss_clip": 0.01106245, + "auxiliary_loss_mlp": 0.01029297, + "balance_loss_clip": 1.03624678, + "balance_loss_mlp": 1.01686394, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 5.066960682632096, + "language_loss": 0.85550296, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87685847, + "num_input_tokens_seen": 221078705, + "step": 10265, + "time_per_iteration": 2.480560541152954 + }, + { + "auxiliary_loss_clip": 0.01053405, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.03426051, + "balance_loss_mlp": 1.01853704, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 1.9867782983933089, + "language_loss": 0.64988726, + "learning_rate": 1.349757776608153e-06, + "loss": 0.67072999, + "num_input_tokens_seen": 221099245, + "step": 10266, + "time_per_iteration": 4.344377756118774 + }, + { + "auxiliary_loss_clip": 0.01069706, + "auxiliary_loss_mlp": 0.01028356, + "balance_loss_clip": 1.0333643, + "balance_loss_mlp": 1.01644087, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.6163246522770094, + "language_loss": 0.75515342, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.77613401, + "num_input_tokens_seen": 221116930, + "step": 10267, + "time_per_iteration": 2.554121971130371 + }, + { + "auxiliary_loss_clip": 0.01078712, + "auxiliary_loss_mlp": 0.01026404, + "balance_loss_clip": 1.03581953, + "balance_loss_mlp": 1.01338005, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 1.7166279770541952, + "language_loss": 0.75002134, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.77107257, + "num_input_tokens_seen": 221137660, + "step": 10268, + "time_per_iteration": 2.587272882461548 + }, + { + "auxiliary_loss_clip": 0.01087673, + "auxiliary_loss_mlp": 0.01027021, + "balance_loss_clip": 1.03616226, + "balance_loss_mlp": 1.01489711, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 1.6121790360367356, + "language_loss": 0.75653851, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77768546, + "num_input_tokens_seen": 221156225, + "step": 10269, + "time_per_iteration": 2.5345916748046875 + }, + { + "auxiliary_loss_clip": 0.01104588, + "auxiliary_loss_mlp": 0.01027478, + "balance_loss_clip": 1.03485847, + "balance_loss_mlp": 1.01605737, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 2.254513479315107, + "language_loss": 0.77144992, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.79277062, + "num_input_tokens_seen": 221173820, + "step": 10270, + "time_per_iteration": 2.437910318374634 + }, + { + "auxiliary_loss_clip": 0.01086445, + "auxiliary_loss_mlp": 0.01025662, + "balance_loss_clip": 1.03582597, + "balance_loss_mlp": 1.01327586, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.7875085568229236, + "language_loss": 0.8252691, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84639013, + "num_input_tokens_seen": 221191815, + "step": 10271, + "time_per_iteration": 2.5061938762664795 + }, + { + "auxiliary_loss_clip": 0.0110873, + "auxiliary_loss_mlp": 0.00784467, + "balance_loss_clip": 1.03745782, + "balance_loss_mlp": 1.0106523, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 1.555063770090248, + "language_loss": 0.77043736, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.78936929, + "num_input_tokens_seen": 221211205, + "step": 10272, + "time_per_iteration": 2.481411933898926 + }, + { + "auxiliary_loss_clip": 0.01016631, + "auxiliary_loss_mlp": 0.01001503, + "balance_loss_clip": 1.0223248, + "balance_loss_mlp": 1.00011384, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8011639776995017, + "language_loss": 0.59090126, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61108255, + "num_input_tokens_seen": 221268430, + "step": 10273, + "time_per_iteration": 3.0405077934265137 + }, + { + "auxiliary_loss_clip": 0.01078612, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.03396821, + "balance_loss_mlp": 1.02209997, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 2.4027133598899586, + "language_loss": 0.72935045, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.75050086, + "num_input_tokens_seen": 221281930, + "step": 10274, + "time_per_iteration": 2.483344316482544 + }, + { + "auxiliary_loss_clip": 0.01096752, + "auxiliary_loss_mlp": 0.00785104, + "balance_loss_clip": 1.036479, + "balance_loss_mlp": 1.01038933, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 1.8340669538607355, + "language_loss": 0.77427703, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79309559, + "num_input_tokens_seen": 221301605, + "step": 10275, + "time_per_iteration": 2.4744439125061035 + }, + { + "auxiliary_loss_clip": 0.01066913, + "auxiliary_loss_mlp": 0.01026109, + "balance_loss_clip": 1.03537679, + "balance_loss_mlp": 1.01476634, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 1.6823389299960438, + "language_loss": 0.79322952, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81415975, + "num_input_tokens_seen": 221320105, + "step": 10276, + "time_per_iteration": 5.547476053237915 + }, + { + "auxiliary_loss_clip": 0.01053461, + "auxiliary_loss_mlp": 0.01036649, + "balance_loss_clip": 1.03901362, + "balance_loss_mlp": 1.02347684, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 1.9270541958919831, + "language_loss": 0.81211567, + "learning_rate": 1.345707936733612e-06, + "loss": 0.83301681, + "num_input_tokens_seen": 221335915, + "step": 10277, + "time_per_iteration": 2.5963456630706787 + }, + { + "auxiliary_loss_clip": 0.01079933, + "auxiliary_loss_mlp": 0.01026735, + "balance_loss_clip": 1.03674221, + "balance_loss_mlp": 1.01346087, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 1.7193063455956077, + "language_loss": 0.81607622, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.83714288, + "num_input_tokens_seen": 221353965, + "step": 10278, + "time_per_iteration": 2.5844919681549072 + }, + { + "auxiliary_loss_clip": 0.01062582, + "auxiliary_loss_mlp": 0.00785535, + "balance_loss_clip": 1.03555202, + "balance_loss_mlp": 1.01073933, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.5301185914636373, + "language_loss": 0.74026775, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.75874889, + "num_input_tokens_seen": 221374080, + "step": 10279, + "time_per_iteration": 4.073084831237793 + }, + { + "auxiliary_loss_clip": 0.01090273, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.03362036, + "balance_loss_mlp": 1.01634109, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 1.488923194662771, + "language_loss": 0.70864463, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.72982979, + "num_input_tokens_seen": 221392910, + "step": 10280, + "time_per_iteration": 2.514727830886841 + }, + { + "auxiliary_loss_clip": 0.01108591, + "auxiliary_loss_mlp": 0.01031955, + "balance_loss_clip": 1.03755963, + "balance_loss_mlp": 1.01996887, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.6093896808902897, + "language_loss": 0.72689259, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.74829811, + "num_input_tokens_seen": 221410990, + "step": 10281, + "time_per_iteration": 2.4567606449127197 + }, + { + "auxiliary_loss_clip": 0.01084615, + "auxiliary_loss_mlp": 0.01027857, + "balance_loss_clip": 1.03775978, + "balance_loss_mlp": 1.01691985, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.6278077466563692, + "language_loss": 0.76938921, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.79051393, + "num_input_tokens_seen": 221431020, + "step": 10282, + "time_per_iteration": 2.5994722843170166 + }, + { + "auxiliary_loss_clip": 0.01082563, + "auxiliary_loss_mlp": 0.01034803, + "balance_loss_clip": 1.03503931, + "balance_loss_mlp": 1.01906705, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.6542664626260082, + "language_loss": 0.69322509, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71439874, + "num_input_tokens_seen": 221453235, + "step": 10283, + "time_per_iteration": 2.5605452060699463 + }, + { + "auxiliary_loss_clip": 0.01102248, + "auxiliary_loss_mlp": 0.01026941, + "balance_loss_clip": 1.03720725, + "balance_loss_mlp": 1.01413751, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.6156015153088243, + "language_loss": 0.75176501, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77305686, + "num_input_tokens_seen": 221472560, + "step": 10284, + "time_per_iteration": 2.5085856914520264 + }, + { + "auxiliary_loss_clip": 0.01091694, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.04089141, + "balance_loss_mlp": 1.0191772, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.4856857004221031, + "language_loss": 0.75640005, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77762616, + "num_input_tokens_seen": 221492835, + "step": 10285, + "time_per_iteration": 2.509333372116089 + }, + { + "auxiliary_loss_clip": 0.01073046, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.03572083, + "balance_loss_mlp": 1.01970983, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.5022663645266483, + "language_loss": 0.72952276, + "learning_rate": 1.342396663517503e-06, + "loss": 0.7505703, + "num_input_tokens_seen": 221511870, + "step": 10286, + "time_per_iteration": 2.5647313594818115 + }, + { + "auxiliary_loss_clip": 0.01105529, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.03666496, + "balance_loss_mlp": 1.01718175, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 2.1744758048938744, + "language_loss": 0.75755203, + "learning_rate": 1.342028868767199e-06, + "loss": 0.77889574, + "num_input_tokens_seen": 221529915, + "step": 10287, + "time_per_iteration": 2.453536033630371 + }, + { + "auxiliary_loss_clip": 0.01069371, + "auxiliary_loss_mlp": 0.01034871, + "balance_loss_clip": 1.03658044, + "balance_loss_mlp": 1.02319479, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 1.7165971344134126, + "language_loss": 0.73420835, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.75525069, + "num_input_tokens_seen": 221549745, + "step": 10288, + "time_per_iteration": 2.5722339153289795 + }, + { + "auxiliary_loss_clip": 0.01091018, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.03441691, + "balance_loss_mlp": 1.01799178, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 1.5838363096575276, + "language_loss": 0.72793931, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.74914211, + "num_input_tokens_seen": 221572455, + "step": 10289, + "time_per_iteration": 2.6831247806549072 + }, + { + "auxiliary_loss_clip": 0.01081792, + "auxiliary_loss_mlp": 0.01032198, + "balance_loss_clip": 1.03572559, + "balance_loss_mlp": 1.01931763, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.498537633321843, + "language_loss": 0.79339182, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81453168, + "num_input_tokens_seen": 221591325, + "step": 10290, + "time_per_iteration": 2.553004264831543 + }, + { + "auxiliary_loss_clip": 0.01099041, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.03823638, + "balance_loss_mlp": 1.02105451, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 1.5833518444468346, + "language_loss": 0.81344706, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83476937, + "num_input_tokens_seen": 221611640, + "step": 10291, + "time_per_iteration": 2.52955961227417 + }, + { + "auxiliary_loss_clip": 0.01108247, + "auxiliary_loss_mlp": 0.0103369, + "balance_loss_clip": 1.03807414, + "balance_loss_mlp": 1.02170944, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 1.692201678811394, + "language_loss": 0.77624643, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.79766577, + "num_input_tokens_seen": 221631225, + "step": 10292, + "time_per_iteration": 2.4941632747650146 + }, + { + "auxiliary_loss_clip": 0.01082694, + "auxiliary_loss_mlp": 0.01040013, + "balance_loss_clip": 1.03734505, + "balance_loss_mlp": 1.02541006, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 1.794377095878869, + "language_loss": 0.73138344, + "learning_rate": 1.339822624710401e-06, + "loss": 0.7526105, + "num_input_tokens_seen": 221651035, + "step": 10293, + "time_per_iteration": 2.571413278579712 + }, + { + "auxiliary_loss_clip": 0.01076871, + "auxiliary_loss_mlp": 0.00785003, + "balance_loss_clip": 1.03779554, + "balance_loss_mlp": 1.01049256, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 1.5083177373802097, + "language_loss": 0.82837451, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.84699327, + "num_input_tokens_seen": 221671300, + "step": 10294, + "time_per_iteration": 2.5740389823913574 + }, + { + "auxiliary_loss_clip": 0.0108895, + "auxiliary_loss_mlp": 0.01031108, + "balance_loss_clip": 1.03736806, + "balance_loss_mlp": 1.01942575, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 2.1384213592833876, + "language_loss": 0.70723808, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.72843874, + "num_input_tokens_seen": 221687320, + "step": 10295, + "time_per_iteration": 2.515923261642456 + }, + { + "auxiliary_loss_clip": 0.01108449, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.03921723, + "balance_loss_mlp": 1.01958835, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.7178641077291836, + "language_loss": 0.70616865, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.72756886, + "num_input_tokens_seen": 221710175, + "step": 10296, + "time_per_iteration": 2.53440260887146 + }, + { + "auxiliary_loss_clip": 0.01075297, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.03913009, + "balance_loss_mlp": 1.019907, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 1.7830074676652925, + "language_loss": 0.71781969, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73890513, + "num_input_tokens_seen": 221728145, + "step": 10297, + "time_per_iteration": 2.5861740112304688 + }, + { + "auxiliary_loss_clip": 0.01044815, + "auxiliary_loss_mlp": 0.0100358, + "balance_loss_clip": 1.0209558, + "balance_loss_mlp": 1.00233996, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8816755927391389, + "language_loss": 0.64136255, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66184652, + "num_input_tokens_seen": 221786100, + "step": 10298, + "time_per_iteration": 2.9709692001342773 + }, + { + "auxiliary_loss_clip": 0.01109861, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.03855181, + "balance_loss_mlp": 1.01944184, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 1.9823745420851533, + "language_loss": 0.73649418, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.75790679, + "num_input_tokens_seen": 221806450, + "step": 10299, + "time_per_iteration": 2.5974173545837402 + }, + { + "auxiliary_loss_clip": 0.01101892, + "auxiliary_loss_mlp": 0.01028699, + "balance_loss_clip": 1.03896368, + "balance_loss_mlp": 1.01682615, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 2.5566539974626514, + "language_loss": 0.687989, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70929492, + "num_input_tokens_seen": 221823330, + "step": 10300, + "time_per_iteration": 2.480738401412964 + }, + { + "auxiliary_loss_clip": 0.01098053, + "auxiliary_loss_mlp": 0.00785341, + "balance_loss_clip": 1.04128623, + "balance_loss_mlp": 1.01004052, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 1.7742560864353165, + "language_loss": 0.66922784, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.68806183, + "num_input_tokens_seen": 221839360, + "step": 10301, + "time_per_iteration": 2.4901843070983887 + }, + { + "auxiliary_loss_clip": 0.01068234, + "auxiliary_loss_mlp": 0.01028135, + "balance_loss_clip": 1.0390507, + "balance_loss_mlp": 1.01672029, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 1.529122654058849, + "language_loss": 0.72958273, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.7505464, + "num_input_tokens_seen": 221859465, + "step": 10302, + "time_per_iteration": 2.6799395084381104 + }, + { + "auxiliary_loss_clip": 0.01082328, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.03887951, + "balance_loss_mlp": 1.01532888, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 1.7015760030566522, + "language_loss": 0.80575347, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82685632, + "num_input_tokens_seen": 221878555, + "step": 10303, + "time_per_iteration": 2.5071494579315186 + }, + { + "auxiliary_loss_clip": 0.01111572, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.03826904, + "balance_loss_mlp": 1.01704216, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 1.6482641788859587, + "language_loss": 0.76572907, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.78714907, + "num_input_tokens_seen": 221898790, + "step": 10304, + "time_per_iteration": 2.464911460876465 + }, + { + "auxiliary_loss_clip": 0.01079939, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.04153776, + "balance_loss_mlp": 1.01932335, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 2.2077018144314087, + "language_loss": 0.76596689, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.78708434, + "num_input_tokens_seen": 221918875, + "step": 10305, + "time_per_iteration": 3.916080951690674 + }, + { + "auxiliary_loss_clip": 0.01105065, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.04022217, + "balance_loss_mlp": 1.01931012, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.6726846660843253, + "language_loss": 0.7924422, + "learning_rate": 1.335045524968045e-06, + "loss": 0.81382418, + "num_input_tokens_seen": 221937895, + "step": 10306, + "time_per_iteration": 2.530425548553467 + }, + { + "auxiliary_loss_clip": 0.01052084, + "auxiliary_loss_mlp": 0.01028661, + "balance_loss_clip": 1.04066885, + "balance_loss_mlp": 1.01737797, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.557628871275947, + "language_loss": 0.8012374, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82204485, + "num_input_tokens_seen": 221955920, + "step": 10307, + "time_per_iteration": 2.6432037353515625 + }, + { + "auxiliary_loss_clip": 0.01021573, + "auxiliary_loss_mlp": 0.01003084, + "balance_loss_clip": 1.02691841, + "balance_loss_mlp": 1.00165904, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8092209614688123, + "language_loss": 0.59383929, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61408579, + "num_input_tokens_seen": 222011405, + "step": 10308, + "time_per_iteration": 3.1812753677368164 + }, + { + "auxiliary_loss_clip": 0.01082104, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.04003859, + "balance_loss_mlp": 1.0154705, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 1.7162497927668046, + "language_loss": 0.67710119, + "learning_rate": 1.333943721384037e-06, + "loss": 0.6981805, + "num_input_tokens_seen": 222034545, + "step": 10309, + "time_per_iteration": 2.6088027954101562 + }, + { + "auxiliary_loss_clip": 0.01079463, + "auxiliary_loss_mlp": 0.0102943, + "balance_loss_clip": 1.03533792, + "balance_loss_mlp": 1.01749158, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 1.5576319839388018, + "language_loss": 0.71857953, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.73966843, + "num_input_tokens_seen": 222052690, + "step": 10310, + "time_per_iteration": 2.5423073768615723 + }, + { + "auxiliary_loss_clip": 0.01093323, + "auxiliary_loss_mlp": 0.01032611, + "balance_loss_clip": 1.04148149, + "balance_loss_mlp": 1.01904464, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 2.4594977468660293, + "language_loss": 0.78862923, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.8098886, + "num_input_tokens_seen": 222069095, + "step": 10311, + "time_per_iteration": 2.525202989578247 + }, + { + "auxiliary_loss_clip": 0.01073605, + "auxiliary_loss_mlp": 0.01031523, + "balance_loss_clip": 1.03583932, + "balance_loss_mlp": 1.01941156, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.75126879216098, + "language_loss": 0.72338039, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.74443167, + "num_input_tokens_seen": 222087360, + "step": 10312, + "time_per_iteration": 2.559826135635376 + }, + { + "auxiliary_loss_clip": 0.01060855, + "auxiliary_loss_mlp": 0.01028366, + "balance_loss_clip": 1.03760147, + "balance_loss_mlp": 1.01633203, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 1.7678603314887196, + "language_loss": 0.71853507, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.73942727, + "num_input_tokens_seen": 222106130, + "step": 10313, + "time_per_iteration": 2.570801019668579 + }, + { + "auxiliary_loss_clip": 0.01103209, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.03893971, + "balance_loss_mlp": 1.01722169, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 2.0664286698498553, + "language_loss": 0.78303385, + "learning_rate": 1.332107887401416e-06, + "loss": 0.80436784, + "num_input_tokens_seen": 222123125, + "step": 10314, + "time_per_iteration": 3.8403100967407227 + }, + { + "auxiliary_loss_clip": 0.01096006, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.03559232, + "balance_loss_mlp": 1.02052689, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 1.807697478550938, + "language_loss": 0.78333187, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80462027, + "num_input_tokens_seen": 222140655, + "step": 10315, + "time_per_iteration": 3.851771354675293 + }, + { + "auxiliary_loss_clip": 0.01077511, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.04183197, + "balance_loss_mlp": 1.02366674, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 1.8403004707279857, + "language_loss": 0.75833726, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.77947164, + "num_input_tokens_seen": 222160450, + "step": 10316, + "time_per_iteration": 2.598686933517456 + }, + { + "auxiliary_loss_clip": 0.01109253, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.0361917, + "balance_loss_mlp": 1.01667225, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 1.7797223190782672, + "language_loss": 0.77644354, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.79782557, + "num_input_tokens_seen": 222179170, + "step": 10317, + "time_per_iteration": 2.5139169692993164 + }, + { + "auxiliary_loss_clip": 0.01028359, + "auxiliary_loss_mlp": 0.01000336, + "balance_loss_clip": 1.02421284, + "balance_loss_mlp": 0.99906081, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6956812243488443, + "language_loss": 0.59051585, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61080277, + "num_input_tokens_seen": 222242660, + "step": 10318, + "time_per_iteration": 4.550368547439575 + }, + { + "auxiliary_loss_clip": 0.01083668, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.04110122, + "balance_loss_mlp": 1.02097535, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 2.790269011975817, + "language_loss": 0.7773385, + "learning_rate": 1.330272686582143e-06, + "loss": 0.79851538, + "num_input_tokens_seen": 222262170, + "step": 10319, + "time_per_iteration": 2.539809465408325 + }, + { + "auxiliary_loss_clip": 0.01088697, + "auxiliary_loss_mlp": 0.01028265, + "balance_loss_clip": 1.03839445, + "balance_loss_mlp": 1.0164572, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 1.7207014502681377, + "language_loss": 0.66218185, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68335152, + "num_input_tokens_seen": 222280375, + "step": 10320, + "time_per_iteration": 2.538475751876831 + }, + { + "auxiliary_loss_clip": 0.01068927, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.03717184, + "balance_loss_mlp": 1.01701617, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 1.711420935965839, + "language_loss": 0.76076519, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78174269, + "num_input_tokens_seen": 222297325, + "step": 10321, + "time_per_iteration": 2.525261640548706 + }, + { + "auxiliary_loss_clip": 0.01081773, + "auxiliary_loss_mlp": 0.01022758, + "balance_loss_clip": 1.03942227, + "balance_loss_mlp": 1.01132631, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 2.1564900165092955, + "language_loss": 0.73388302, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75492841, + "num_input_tokens_seen": 222317095, + "step": 10322, + "time_per_iteration": 2.52860426902771 + }, + { + "auxiliary_loss_clip": 0.01071381, + "auxiliary_loss_mlp": 0.01025753, + "balance_loss_clip": 1.03729033, + "balance_loss_mlp": 1.01399326, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 1.776981103225701, + "language_loss": 0.72640198, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.74737334, + "num_input_tokens_seen": 222337055, + "step": 10323, + "time_per_iteration": 2.565063238143921 + }, + { + "auxiliary_loss_clip": 0.01105665, + "auxiliary_loss_mlp": 0.01028668, + "balance_loss_clip": 1.03933084, + "balance_loss_mlp": 1.01592481, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 2.3102252525514615, + "language_loss": 0.586932, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.60827535, + "num_input_tokens_seen": 222354515, + "step": 10324, + "time_per_iteration": 2.4527344703674316 + }, + { + "auxiliary_loss_clip": 0.01071201, + "auxiliary_loss_mlp": 0.01037858, + "balance_loss_clip": 1.04039443, + "balance_loss_mlp": 1.02391016, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 2.104094587113255, + "language_loss": 0.76805454, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.78914511, + "num_input_tokens_seen": 222372755, + "step": 10325, + "time_per_iteration": 2.5205447673797607 + }, + { + "auxiliary_loss_clip": 0.01101399, + "auxiliary_loss_mlp": 0.01027308, + "balance_loss_clip": 1.03905392, + "balance_loss_mlp": 1.01425457, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 2.371576641875241, + "language_loss": 0.72539979, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74668694, + "num_input_tokens_seen": 222391380, + "step": 10326, + "time_per_iteration": 2.501467227935791 + }, + { + "auxiliary_loss_clip": 0.01102767, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.04048181, + "balance_loss_mlp": 1.02509975, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 2.59792764824432, + "language_loss": 0.73704427, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.75844526, + "num_input_tokens_seen": 222411165, + "step": 10327, + "time_per_iteration": 2.5073843002319336 + }, + { + "auxiliary_loss_clip": 0.01088193, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.04096103, + "balance_loss_mlp": 1.02153933, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 2.5240700304934047, + "language_loss": 0.80002755, + "learning_rate": 1.326970926232066e-06, + "loss": 0.82125807, + "num_input_tokens_seen": 222428110, + "step": 10328, + "time_per_iteration": 2.4913713932037354 + }, + { + "auxiliary_loss_clip": 0.01077828, + "auxiliary_loss_mlp": 0.01037384, + "balance_loss_clip": 1.0369004, + "balance_loss_mlp": 1.02487898, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 1.6867383608971656, + "language_loss": 0.78065646, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.8018086, + "num_input_tokens_seen": 222446385, + "step": 10329, + "time_per_iteration": 2.5424201488494873 + }, + { + "auxiliary_loss_clip": 0.01027342, + "auxiliary_loss_mlp": 0.01006194, + "balance_loss_clip": 1.01567531, + "balance_loss_mlp": 1.00486517, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.8307831784421537, + "language_loss": 0.62188965, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64222497, + "num_input_tokens_seen": 222502150, + "step": 10330, + "time_per_iteration": 3.069375991821289 + }, + { + "auxiliary_loss_clip": 0.011051, + "auxiliary_loss_mlp": 0.01035233, + "balance_loss_clip": 1.03961062, + "balance_loss_mlp": 1.02148867, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 2.0695077976396643, + "language_loss": 0.77176577, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.79316908, + "num_input_tokens_seen": 222519880, + "step": 10331, + "time_per_iteration": 2.502328634262085 + }, + { + "auxiliary_loss_clip": 0.01114872, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.04056787, + "balance_loss_mlp": 1.02265573, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 1.8742113176044293, + "language_loss": 0.67774278, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.69924909, + "num_input_tokens_seen": 222538545, + "step": 10332, + "time_per_iteration": 2.464479684829712 + }, + { + "auxiliary_loss_clip": 0.0107873, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.03871882, + "balance_loss_mlp": 1.01891816, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 1.4222642310937574, + "language_loss": 0.76207268, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78316718, + "num_input_tokens_seen": 222556935, + "step": 10333, + "time_per_iteration": 2.5225982666015625 + }, + { + "auxiliary_loss_clip": 0.01085125, + "auxiliary_loss_mlp": 0.01029752, + "balance_loss_clip": 1.04086828, + "balance_loss_mlp": 1.01789665, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 2.185863346171761, + "language_loss": 0.69652915, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.71767795, + "num_input_tokens_seen": 222574035, + "step": 10334, + "time_per_iteration": 2.5230274200439453 + }, + { + "auxiliary_loss_clip": 0.01088844, + "auxiliary_loss_mlp": 0.00784995, + "balance_loss_clip": 1.04090142, + "balance_loss_mlp": 1.0100708, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 1.9573043984485898, + "language_loss": 0.70273542, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72147381, + "num_input_tokens_seen": 222592290, + "step": 10335, + "time_per_iteration": 2.51918363571167 + }, + { + "auxiliary_loss_clip": 0.01059882, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.03663707, + "balance_loss_mlp": 1.02041936, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.4624269149397995, + "language_loss": 0.80290651, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82383871, + "num_input_tokens_seen": 222612805, + "step": 10336, + "time_per_iteration": 2.615630865097046 + }, + { + "auxiliary_loss_clip": 0.0110888, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.0396378, + "balance_loss_mlp": 1.01919365, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 1.7862466225823384, + "language_loss": 0.73111564, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75252044, + "num_input_tokens_seen": 222632260, + "step": 10337, + "time_per_iteration": 2.468074321746826 + }, + { + "auxiliary_loss_clip": 0.01112796, + "auxiliary_loss_mlp": 0.01036184, + "balance_loss_clip": 1.03913784, + "balance_loss_mlp": 1.02311921, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 1.9997229042462794, + "language_loss": 0.63068038, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65217018, + "num_input_tokens_seen": 222653570, + "step": 10338, + "time_per_iteration": 2.516507148742676 + }, + { + "auxiliary_loss_clip": 0.01099956, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.04040265, + "balance_loss_mlp": 1.02195001, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 1.6080909449961953, + "language_loss": 0.71445632, + "learning_rate": 1.322938249724991e-06, + "loss": 0.7358017, + "num_input_tokens_seen": 222672480, + "step": 10339, + "time_per_iteration": 2.486429452896118 + }, + { + "auxiliary_loss_clip": 0.0106019, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.03747368, + "balance_loss_mlp": 1.02235532, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 1.6758144968377342, + "language_loss": 0.69712049, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.71808517, + "num_input_tokens_seen": 222691200, + "step": 10340, + "time_per_iteration": 2.5642330646514893 + }, + { + "auxiliary_loss_clip": 0.01065288, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.03641653, + "balance_loss_mlp": 1.01572096, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 1.9192231681758047, + "language_loss": 0.68937415, + "learning_rate": 1.322205369037788e-06, + "loss": 0.71030414, + "num_input_tokens_seen": 222709975, + "step": 10341, + "time_per_iteration": 2.6003472805023193 + }, + { + "auxiliary_loss_clip": 0.01099741, + "auxiliary_loss_mlp": 0.01031007, + "balance_loss_clip": 1.03924048, + "balance_loss_mlp": 1.0170356, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 1.7398247492128267, + "language_loss": 0.80931234, + "learning_rate": 1.321838967240299e-06, + "loss": 0.83061981, + "num_input_tokens_seen": 222729005, + "step": 10342, + "time_per_iteration": 2.4681191444396973 + }, + { + "auxiliary_loss_clip": 0.01027816, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 1.02269125, + "balance_loss_mlp": 1.00106454, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 1.8906652110824225, + "language_loss": 0.57344019, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59374118, + "num_input_tokens_seen": 222786090, + "step": 10343, + "time_per_iteration": 4.383916139602661 + }, + { + "auxiliary_loss_clip": 0.0106897, + "auxiliary_loss_mlp": 0.01025635, + "balance_loss_clip": 1.03554857, + "balance_loss_mlp": 1.01416671, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 1.827995475032794, + "language_loss": 0.73243034, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.75337642, + "num_input_tokens_seen": 222806100, + "step": 10344, + "time_per_iteration": 2.5724427700042725 + }, + { + "auxiliary_loss_clip": 0.01102498, + "auxiliary_loss_mlp": 0.01041809, + "balance_loss_clip": 1.04142594, + "balance_loss_mlp": 1.0301801, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 1.7640645888649056, + "language_loss": 0.59894419, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62038732, + "num_input_tokens_seen": 222826575, + "step": 10345, + "time_per_iteration": 2.531609296798706 + }, + { + "auxiliary_loss_clip": 0.01048528, + "auxiliary_loss_mlp": 0.01036025, + "balance_loss_clip": 1.03690147, + "balance_loss_mlp": 1.02268577, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 1.9353705140344668, + "language_loss": 0.77904141, + "learning_rate": 1.320373617348614e-06, + "loss": 0.79988688, + "num_input_tokens_seen": 222845285, + "step": 10346, + "time_per_iteration": 2.603142261505127 + }, + { + "auxiliary_loss_clip": 0.01078848, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.03928995, + "balance_loss_mlp": 1.02019036, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 1.7123622155346108, + "language_loss": 0.71470809, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73583049, + "num_input_tokens_seen": 222864575, + "step": 10347, + "time_per_iteration": 2.7059385776519775 + }, + { + "auxiliary_loss_clip": 0.0109624, + "auxiliary_loss_mlp": 0.01030858, + "balance_loss_clip": 1.0373286, + "balance_loss_mlp": 1.01856172, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 1.8654247065584086, + "language_loss": 0.71920455, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.74047554, + "num_input_tokens_seen": 222884420, + "step": 10348, + "time_per_iteration": 2.498401403427124 + }, + { + "auxiliary_loss_clip": 0.01021966, + "auxiliary_loss_mlp": 0.01001482, + "balance_loss_clip": 1.03225851, + "balance_loss_mlp": 1.00022411, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.811725789409365, + "language_loss": 0.54196525, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56219971, + "num_input_tokens_seen": 222944690, + "step": 10349, + "time_per_iteration": 3.1721293926239014 + }, + { + "auxiliary_loss_clip": 0.01075881, + "auxiliary_loss_mlp": 0.01026513, + "balance_loss_clip": 1.04153109, + "balance_loss_mlp": 1.01450253, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 1.82479656803921, + "language_loss": 0.69939268, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.72041661, + "num_input_tokens_seen": 222962990, + "step": 10350, + "time_per_iteration": 2.57356595993042 + }, + { + "auxiliary_loss_clip": 0.01111037, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.03903794, + "balance_loss_mlp": 1.02170825, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 2.03964845280813, + "language_loss": 0.56645346, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.58790648, + "num_input_tokens_seen": 222980715, + "step": 10351, + "time_per_iteration": 2.4460716247558594 + }, + { + "auxiliary_loss_clip": 0.01035462, + "auxiliary_loss_mlp": 0.01001648, + "balance_loss_clip": 1.02616692, + "balance_loss_mlp": 1.00040233, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.8033526009642041, + "language_loss": 0.61123174, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63160288, + "num_input_tokens_seen": 223040685, + "step": 10352, + "time_per_iteration": 3.0476980209350586 + }, + { + "auxiliary_loss_clip": 0.01106781, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.03785253, + "balance_loss_mlp": 1.01859462, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 2.0346916955962193, + "language_loss": 0.81658596, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.83796084, + "num_input_tokens_seen": 223059000, + "step": 10353, + "time_per_iteration": 3.892784595489502 + }, + { + "auxiliary_loss_clip": 0.0109372, + "auxiliary_loss_mlp": 0.01030166, + "balance_loss_clip": 1.03770709, + "balance_loss_mlp": 1.01884675, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 1.471011403383393, + "language_loss": 0.75709689, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.77833569, + "num_input_tokens_seen": 223079345, + "step": 10354, + "time_per_iteration": 2.5224435329437256 + }, + { + "auxiliary_loss_clip": 0.01072381, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.03809977, + "balance_loss_mlp": 1.01835728, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 1.5260825061607357, + "language_loss": 0.78664702, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80767578, + "num_input_tokens_seen": 223097880, + "step": 10355, + "time_per_iteration": 2.554327964782715 + }, + { + "auxiliary_loss_clip": 0.01099284, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.03988457, + "balance_loss_mlp": 1.02217841, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 2.047577098888076, + "language_loss": 0.78286564, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.8041991, + "num_input_tokens_seen": 223118185, + "step": 10356, + "time_per_iteration": 2.535832166671753 + }, + { + "auxiliary_loss_clip": 0.01093008, + "auxiliary_loss_mlp": 0.00784927, + "balance_loss_clip": 1.03905332, + "balance_loss_mlp": 1.00820613, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 2.1942806145280764, + "language_loss": 0.67501903, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.69379842, + "num_input_tokens_seen": 223137600, + "step": 10357, + "time_per_iteration": 3.95259165763855 + }, + { + "auxiliary_loss_clip": 0.01092412, + "auxiliary_loss_mlp": 0.01031835, + "balance_loss_clip": 1.03930557, + "balance_loss_mlp": 1.01811981, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 3.4018562099545018, + "language_loss": 0.76039255, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.78163499, + "num_input_tokens_seen": 223154360, + "step": 10358, + "time_per_iteration": 2.5029590129852295 + }, + { + "auxiliary_loss_clip": 0.01086061, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.03723025, + "balance_loss_mlp": 1.01966226, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 3.1089172565217766, + "language_loss": 0.82582486, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.84700352, + "num_input_tokens_seen": 223172255, + "step": 10359, + "time_per_iteration": 2.5493814945220947 + }, + { + "auxiliary_loss_clip": 0.01080427, + "auxiliary_loss_mlp": 0.01046967, + "balance_loss_clip": 1.03541434, + "balance_loss_mlp": 1.03285813, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 5.604555152594505, + "language_loss": 0.73099411, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75226796, + "num_input_tokens_seen": 223186965, + "step": 10360, + "time_per_iteration": 2.473407745361328 + }, + { + "auxiliary_loss_clip": 0.01097597, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.03653371, + "balance_loss_mlp": 1.02354574, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 2.001196274797408, + "language_loss": 0.77576435, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.79710084, + "num_input_tokens_seen": 223206045, + "step": 10361, + "time_per_iteration": 2.4644112586975098 + }, + { + "auxiliary_loss_clip": 0.01069927, + "auxiliary_loss_mlp": 0.0103138, + "balance_loss_clip": 1.03875542, + "balance_loss_mlp": 1.01941133, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 1.5414892471466635, + "language_loss": 0.67892194, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69993496, + "num_input_tokens_seen": 223224820, + "step": 10362, + "time_per_iteration": 2.5147337913513184 + }, + { + "auxiliary_loss_clip": 0.01089181, + "auxiliary_loss_mlp": 0.01031704, + "balance_loss_clip": 1.0372901, + "balance_loss_mlp": 1.01916337, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 2.092862548291175, + "language_loss": 0.67383564, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.69504452, + "num_input_tokens_seen": 223243205, + "step": 10363, + "time_per_iteration": 2.5770883560180664 + }, + { + "auxiliary_loss_clip": 0.01070685, + "auxiliary_loss_mlp": 0.01033479, + "balance_loss_clip": 1.03938055, + "balance_loss_mlp": 1.02033055, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 2.250168678401797, + "language_loss": 0.86724919, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88829088, + "num_input_tokens_seen": 223261370, + "step": 10364, + "time_per_iteration": 2.5342624187469482 + }, + { + "auxiliary_loss_clip": 0.01023784, + "auxiliary_loss_mlp": 0.01000774, + "balance_loss_clip": 1.02004421, + "balance_loss_mlp": 0.99938548, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.8808761643034688, + "language_loss": 0.60814065, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62838626, + "num_input_tokens_seen": 223315050, + "step": 10365, + "time_per_iteration": 3.1386258602142334 + }, + { + "auxiliary_loss_clip": 0.01076642, + "auxiliary_loss_mlp": 0.00786899, + "balance_loss_clip": 1.03966212, + "balance_loss_mlp": 1.0105809, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 7.625894810710482, + "language_loss": 0.74899304, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.76762843, + "num_input_tokens_seen": 223332130, + "step": 10366, + "time_per_iteration": 2.541701316833496 + }, + { + "auxiliary_loss_clip": 0.01101756, + "auxiliary_loss_mlp": 0.01040647, + "balance_loss_clip": 1.04080284, + "balance_loss_mlp": 1.0274806, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 1.9781159198130742, + "language_loss": 0.7611028, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78252685, + "num_input_tokens_seen": 223351605, + "step": 10367, + "time_per_iteration": 2.5061776638031006 + }, + { + "auxiliary_loss_clip": 0.01096769, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.03838134, + "balance_loss_mlp": 1.02496839, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 1.537283045540266, + "language_loss": 0.78581578, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80715531, + "num_input_tokens_seen": 223372090, + "step": 10368, + "time_per_iteration": 2.496858835220337 + }, + { + "auxiliary_loss_clip": 0.0104641, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.04031181, + "balance_loss_mlp": 1.02340984, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 1.895976559733545, + "language_loss": 0.68021685, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70104349, + "num_input_tokens_seen": 223390110, + "step": 10369, + "time_per_iteration": 2.6398022174835205 + }, + { + "auxiliary_loss_clip": 0.01112052, + "auxiliary_loss_mlp": 0.01035761, + "balance_loss_clip": 1.04059601, + "balance_loss_mlp": 1.0227617, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.5836196270929586, + "language_loss": 0.88088644, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.90236461, + "num_input_tokens_seen": 223404205, + "step": 10370, + "time_per_iteration": 2.3988420963287354 + }, + { + "auxiliary_loss_clip": 0.01106747, + "auxiliary_loss_mlp": 0.01030359, + "balance_loss_clip": 1.03732979, + "balance_loss_mlp": 1.01847959, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.505710372062568, + "language_loss": 0.65805304, + "learning_rate": 1.311224557923402e-06, + "loss": 0.67942405, + "num_input_tokens_seen": 223424855, + "step": 10371, + "time_per_iteration": 2.5088272094726562 + }, + { + "auxiliary_loss_clip": 0.01092385, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.03725612, + "balance_loss_mlp": 1.01713872, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.3053524897179611, + "language_loss": 0.77621478, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79741013, + "num_input_tokens_seen": 223447225, + "step": 10372, + "time_per_iteration": 2.5837314128875732 + }, + { + "auxiliary_loss_clip": 0.01096435, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.03546214, + "balance_loss_mlp": 1.01871586, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 1.5828462597840935, + "language_loss": 0.77391291, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79519272, + "num_input_tokens_seen": 223467520, + "step": 10373, + "time_per_iteration": 2.5462214946746826 + }, + { + "auxiliary_loss_clip": 0.01092891, + "auxiliary_loss_mlp": 0.01027112, + "balance_loss_clip": 1.03700876, + "balance_loss_mlp": 1.01590073, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 2.4228806480173706, + "language_loss": 0.69275683, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71395683, + "num_input_tokens_seen": 223488130, + "step": 10374, + "time_per_iteration": 2.4801623821258545 + }, + { + "auxiliary_loss_clip": 0.01092384, + "auxiliary_loss_mlp": 0.01029258, + "balance_loss_clip": 1.039446, + "balance_loss_mlp": 1.01715851, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 1.7237660611709427, + "language_loss": 0.7708413, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79205775, + "num_input_tokens_seen": 223505105, + "step": 10375, + "time_per_iteration": 2.5024116039276123 + }, + { + "auxiliary_loss_clip": 0.01083034, + "auxiliary_loss_mlp": 0.01026957, + "balance_loss_clip": 1.03987169, + "balance_loss_mlp": 1.01519108, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.4721036058764214, + "language_loss": 0.70102376, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72212368, + "num_input_tokens_seen": 223528065, + "step": 10376, + "time_per_iteration": 2.641611099243164 + }, + { + "auxiliary_loss_clip": 0.01079238, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.0372684, + "balance_loss_mlp": 1.01767731, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 1.65726625716149, + "language_loss": 0.76454961, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78565121, + "num_input_tokens_seen": 223547305, + "step": 10377, + "time_per_iteration": 2.587254285812378 + }, + { + "auxiliary_loss_clip": 0.01087826, + "auxiliary_loss_mlp": 0.0102827, + "balance_loss_clip": 1.03940511, + "balance_loss_mlp": 1.01745141, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 2.0477407822732894, + "language_loss": 0.68186331, + "learning_rate": 1.308665737227052e-06, + "loss": 0.70302427, + "num_input_tokens_seen": 223567205, + "step": 10378, + "time_per_iteration": 2.518216609954834 + }, + { + "auxiliary_loss_clip": 0.01084619, + "auxiliary_loss_mlp": 0.01026748, + "balance_loss_clip": 1.03818178, + "balance_loss_mlp": 1.01473141, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 1.7917867412069672, + "language_loss": 0.76723635, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78835005, + "num_input_tokens_seen": 223586560, + "step": 10379, + "time_per_iteration": 2.5792078971862793 + }, + { + "auxiliary_loss_clip": 0.01085049, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.04044986, + "balance_loss_mlp": 1.01665318, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.4348673256370892, + "language_loss": 0.79410815, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81524122, + "num_input_tokens_seen": 223610595, + "step": 10380, + "time_per_iteration": 2.6024346351623535 + }, + { + "auxiliary_loss_clip": 0.01093035, + "auxiliary_loss_mlp": 0.01029581, + "balance_loss_clip": 1.04064012, + "balance_loss_mlp": 1.01884627, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 1.5306960978845066, + "language_loss": 0.79842067, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.81964684, + "num_input_tokens_seen": 223630230, + "step": 10381, + "time_per_iteration": 2.5170445442199707 + }, + { + "auxiliary_loss_clip": 0.0108241, + "auxiliary_loss_mlp": 0.01032437, + "balance_loss_clip": 1.03534675, + "balance_loss_mlp": 1.0192883, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 2.161369883552236, + "language_loss": 0.7489233, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.77007174, + "num_input_tokens_seen": 223648360, + "step": 10382, + "time_per_iteration": 3.867119550704956 + }, + { + "auxiliary_loss_clip": 0.01095523, + "auxiliary_loss_mlp": 0.01024456, + "balance_loss_clip": 1.03789592, + "balance_loss_mlp": 1.01317883, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 1.6190919635976357, + "language_loss": 0.78386092, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80506068, + "num_input_tokens_seen": 223671255, + "step": 10383, + "time_per_iteration": 2.5709927082061768 + }, + { + "auxiliary_loss_clip": 0.01073515, + "auxiliary_loss_mlp": 0.01028624, + "balance_loss_clip": 1.03564739, + "balance_loss_mlp": 1.01720381, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 1.9323041271273642, + "language_loss": 0.75544882, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.77647024, + "num_input_tokens_seen": 223689860, + "step": 10384, + "time_per_iteration": 2.5589845180511475 + }, + { + "auxiliary_loss_clip": 0.01084772, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.03560615, + "balance_loss_mlp": 1.02313042, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 2.2162107632293377, + "language_loss": 0.66755736, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68877685, + "num_input_tokens_seen": 223707835, + "step": 10385, + "time_per_iteration": 2.5081567764282227 + }, + { + "auxiliary_loss_clip": 0.01034468, + "auxiliary_loss_mlp": 0.01008308, + "balance_loss_clip": 1.03001821, + "balance_loss_mlp": 1.0070858, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.753854596023269, + "language_loss": 0.6196686, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64009643, + "num_input_tokens_seen": 223771875, + "step": 10386, + "time_per_iteration": 3.1580936908721924 + }, + { + "auxiliary_loss_clip": 0.01096665, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.03673744, + "balance_loss_mlp": 1.01936424, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 2.8958572725957485, + "language_loss": 0.71930933, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.74059224, + "num_input_tokens_seen": 223788895, + "step": 10387, + "time_per_iteration": 2.50054931640625 + }, + { + "auxiliary_loss_clip": 0.01102334, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.03838325, + "balance_loss_mlp": 1.02301502, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 2.9658874948828493, + "language_loss": 0.65422416, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67561471, + "num_input_tokens_seen": 223810385, + "step": 10388, + "time_per_iteration": 2.5476651191711426 + }, + { + "auxiliary_loss_clip": 0.01070668, + "auxiliary_loss_mlp": 0.01027808, + "balance_loss_clip": 1.03812194, + "balance_loss_mlp": 1.01654899, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 1.7528521602825702, + "language_loss": 0.79301149, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81399626, + "num_input_tokens_seen": 223826040, + "step": 10389, + "time_per_iteration": 2.509653091430664 + }, + { + "auxiliary_loss_clip": 0.01083142, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.03612423, + "balance_loss_mlp": 1.0200417, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 1.8561598384607805, + "language_loss": 0.60762113, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.6287787, + "num_input_tokens_seen": 223842300, + "step": 10390, + "time_per_iteration": 2.4881911277770996 + }, + { + "auxiliary_loss_clip": 0.01088382, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.03770399, + "balance_loss_mlp": 1.02013266, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 1.9640398734552889, + "language_loss": 0.77128243, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79249144, + "num_input_tokens_seen": 223858320, + "step": 10391, + "time_per_iteration": 2.4894776344299316 + }, + { + "auxiliary_loss_clip": 0.01087652, + "auxiliary_loss_mlp": 0.01028608, + "balance_loss_clip": 1.03794158, + "balance_loss_mlp": 1.01593614, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 1.5299198989858573, + "language_loss": 0.6466549, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.66781747, + "num_input_tokens_seen": 223883545, + "step": 10392, + "time_per_iteration": 5.53208327293396 + }, + { + "auxiliary_loss_clip": 0.01089913, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.03911757, + "balance_loss_mlp": 1.01685345, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.7866888538317303, + "language_loss": 0.76745772, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.78864855, + "num_input_tokens_seen": 223901445, + "step": 10393, + "time_per_iteration": 2.520374059677124 + }, + { + "auxiliary_loss_clip": 0.0108032, + "auxiliary_loss_mlp": 0.00784953, + "balance_loss_clip": 1.03901064, + "balance_loss_mlp": 1.00790405, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.771515239873333, + "language_loss": 0.8265655, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84521818, + "num_input_tokens_seen": 223920170, + "step": 10394, + "time_per_iteration": 2.5773651599884033 + }, + { + "auxiliary_loss_clip": 0.01091014, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.03950429, + "balance_loss_mlp": 1.01928186, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 1.755840881824861, + "language_loss": 0.74949431, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77072972, + "num_input_tokens_seen": 223936495, + "step": 10395, + "time_per_iteration": 3.9070889949798584 + }, + { + "auxiliary_loss_clip": 0.01089312, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.03601873, + "balance_loss_mlp": 1.0184232, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 2.652138439941464, + "language_loss": 0.72603959, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74724436, + "num_input_tokens_seen": 223950070, + "step": 10396, + "time_per_iteration": 2.4720776081085205 + }, + { + "auxiliary_loss_clip": 0.01078763, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.043872, + "balance_loss_mlp": 1.02221608, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 11.242142536766837, + "language_loss": 0.75799924, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.77912927, + "num_input_tokens_seen": 223970065, + "step": 10397, + "time_per_iteration": 2.5743565559387207 + }, + { + "auxiliary_loss_clip": 0.01082056, + "auxiliary_loss_mlp": 0.0103089, + "balance_loss_clip": 1.03473949, + "balance_loss_mlp": 1.01837873, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 2.4450501163243086, + "language_loss": 0.75257838, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.77370787, + "num_input_tokens_seen": 223990315, + "step": 10398, + "time_per_iteration": 2.56254243850708 + }, + { + "auxiliary_loss_clip": 0.01110794, + "auxiliary_loss_mlp": 0.01029827, + "balance_loss_clip": 1.03671312, + "balance_loss_mlp": 1.01629722, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 1.7573489953865604, + "language_loss": 0.74180114, + "learning_rate": 1.300997001489483e-06, + "loss": 0.76320732, + "num_input_tokens_seen": 224009960, + "step": 10399, + "time_per_iteration": 2.4849932193756104 + }, + { + "auxiliary_loss_clip": 0.01072698, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.03841877, + "balance_loss_mlp": 1.02025211, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 1.551611372192619, + "language_loss": 0.74387175, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76491886, + "num_input_tokens_seen": 224028870, + "step": 10400, + "time_per_iteration": 2.5610952377319336 + }, + { + "auxiliary_loss_clip": 0.01023217, + "auxiliary_loss_mlp": 0.01003463, + "balance_loss_clip": 1.02647913, + "balance_loss_mlp": 1.00210404, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8429702007772868, + "language_loss": 0.56505823, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.585325, + "num_input_tokens_seen": 224094140, + "step": 10401, + "time_per_iteration": 3.249868392944336 + }, + { + "auxiliary_loss_clip": 0.01099944, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.03767502, + "balance_loss_mlp": 1.01737118, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.2794690756638296, + "language_loss": 0.82826817, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.84956908, + "num_input_tokens_seen": 224113235, + "step": 10402, + "time_per_iteration": 2.482293128967285 + }, + { + "auxiliary_loss_clip": 0.01037223, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.03736842, + "balance_loss_mlp": 1.01960289, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 2.564860651947877, + "language_loss": 0.69261849, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71330619, + "num_input_tokens_seen": 224134530, + "step": 10403, + "time_per_iteration": 2.7535903453826904 + }, + { + "auxiliary_loss_clip": 0.01075038, + "auxiliary_loss_mlp": 0.01032263, + "balance_loss_clip": 1.03579271, + "balance_loss_mlp": 1.01786876, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.5550691738195606, + "language_loss": 0.71659315, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.73766619, + "num_input_tokens_seen": 224154170, + "step": 10404, + "time_per_iteration": 2.7630813121795654 + }, + { + "auxiliary_loss_clip": 0.01068122, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.0386225, + "balance_loss_mlp": 1.0191499, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 1.7783686166310555, + "language_loss": 0.69349581, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71449363, + "num_input_tokens_seen": 224172730, + "step": 10405, + "time_per_iteration": 2.5756609439849854 + }, + { + "auxiliary_loss_clip": 0.01082225, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.03801572, + "balance_loss_mlp": 1.02007818, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.566962298246778, + "language_loss": 0.78846246, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.8096056, + "num_input_tokens_seen": 224192620, + "step": 10406, + "time_per_iteration": 2.528078079223633 + }, + { + "auxiliary_loss_clip": 0.01079428, + "auxiliary_loss_mlp": 0.01033924, + "balance_loss_clip": 1.0381856, + "balance_loss_mlp": 1.02164531, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 1.8508674870017676, + "language_loss": 0.69063848, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.71177202, + "num_input_tokens_seen": 224214660, + "step": 10407, + "time_per_iteration": 2.6205174922943115 + }, + { + "auxiliary_loss_clip": 0.01096013, + "auxiliary_loss_mlp": 0.00783655, + "balance_loss_clip": 1.03869891, + "balance_loss_mlp": 1.009426, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 1.629642239136061, + "language_loss": 0.85067582, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.8694725, + "num_input_tokens_seen": 224234170, + "step": 10408, + "time_per_iteration": 2.5103774070739746 + }, + { + "auxiliary_loss_clip": 0.01085378, + "auxiliary_loss_mlp": 0.00787292, + "balance_loss_clip": 1.03668761, + "balance_loss_mlp": 1.01578641, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.6257219481484102, + "language_loss": 0.79731441, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.81604111, + "num_input_tokens_seen": 224253115, + "step": 10409, + "time_per_iteration": 2.540896415710449 + }, + { + "auxiliary_loss_clip": 0.01086161, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.03662527, + "balance_loss_mlp": 1.02032137, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 2.811501506232355, + "language_loss": 0.69803405, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.71922135, + "num_input_tokens_seen": 224271375, + "step": 10410, + "time_per_iteration": 2.5354232788085938 + }, + { + "auxiliary_loss_clip": 0.01061325, + "auxiliary_loss_mlp": 0.01024336, + "balance_loss_clip": 1.03703201, + "balance_loss_mlp": 1.01270103, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 1.8128142464813768, + "language_loss": 0.67567825, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.69653487, + "num_input_tokens_seen": 224290315, + "step": 10411, + "time_per_iteration": 2.6177170276641846 + }, + { + "auxiliary_loss_clip": 0.01062388, + "auxiliary_loss_mlp": 0.01030892, + "balance_loss_clip": 1.03773892, + "balance_loss_mlp": 1.01920366, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 6.259631450100867, + "language_loss": 0.69355655, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.7144894, + "num_input_tokens_seen": 224310545, + "step": 10412, + "time_per_iteration": 2.6401069164276123 + }, + { + "auxiliary_loss_clip": 0.01076364, + "auxiliary_loss_mlp": 0.01038343, + "balance_loss_clip": 1.03599548, + "balance_loss_mlp": 1.02546906, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.494274797240139, + "language_loss": 0.69671708, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.71786416, + "num_input_tokens_seen": 224331115, + "step": 10413, + "time_per_iteration": 2.580209732055664 + }, + { + "auxiliary_loss_clip": 0.01076207, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.03482604, + "balance_loss_mlp": 1.01977921, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 2.4251597160614313, + "language_loss": 0.81072265, + "learning_rate": 1.295526482316796e-06, + "loss": 0.83182049, + "num_input_tokens_seen": 224347525, + "step": 10414, + "time_per_iteration": 2.4999582767486572 + }, + { + "auxiliary_loss_clip": 0.0109864, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.04002595, + "balance_loss_mlp": 1.02203941, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 1.6899991779662074, + "language_loss": 0.74586475, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.76719046, + "num_input_tokens_seen": 224367045, + "step": 10415, + "time_per_iteration": 2.512517213821411 + }, + { + "auxiliary_loss_clip": 0.01055623, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.03598559, + "balance_loss_mlp": 1.0164634, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.5513972733717871, + "language_loss": 0.74768817, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.76853192, + "num_input_tokens_seen": 224388860, + "step": 10416, + "time_per_iteration": 2.6437771320343018 + }, + { + "auxiliary_loss_clip": 0.01081191, + "auxiliary_loss_mlp": 0.01028181, + "balance_loss_clip": 1.04057097, + "balance_loss_mlp": 1.01689196, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 1.6596885946855786, + "language_loss": 0.84374994, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86484361, + "num_input_tokens_seen": 224409645, + "step": 10417, + "time_per_iteration": 2.6428351402282715 + }, + { + "auxiliary_loss_clip": 0.01098519, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.03787374, + "balance_loss_mlp": 1.01870775, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 3.1389044322443667, + "language_loss": 0.57318801, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.59448826, + "num_input_tokens_seen": 224428530, + "step": 10418, + "time_per_iteration": 2.4687135219573975 + }, + { + "auxiliary_loss_clip": 0.01102639, + "auxiliary_loss_mlp": 0.0103628, + "balance_loss_clip": 1.03803611, + "balance_loss_mlp": 1.02283931, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 1.8368170140089326, + "language_loss": 0.84816325, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.86955237, + "num_input_tokens_seen": 224447175, + "step": 10419, + "time_per_iteration": 2.489936351776123 + }, + { + "auxiliary_loss_clip": 0.01111679, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.04020083, + "balance_loss_mlp": 1.01777017, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 1.4376554861546764, + "language_loss": 0.64747703, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.66889012, + "num_input_tokens_seen": 224469445, + "step": 10420, + "time_per_iteration": 3.8574912548065186 + }, + { + "auxiliary_loss_clip": 0.01070009, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.03726792, + "balance_loss_mlp": 1.01945519, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 2.004632189187469, + "language_loss": 0.86015034, + "learning_rate": 1.292975627485741e-06, + "loss": 0.88117862, + "num_input_tokens_seen": 224486590, + "step": 10421, + "time_per_iteration": 2.548362970352173 + }, + { + "auxiliary_loss_clip": 0.01072644, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.03743339, + "balance_loss_mlp": 1.0198741, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 2.094769272788569, + "language_loss": 0.79985642, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.82089901, + "num_input_tokens_seen": 224502795, + "step": 10422, + "time_per_iteration": 2.5369112491607666 + }, + { + "auxiliary_loss_clip": 0.01095666, + "auxiliary_loss_mlp": 0.01025654, + "balance_loss_clip": 1.03545022, + "balance_loss_mlp": 1.01291609, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 1.9877085515847919, + "language_loss": 0.7444396, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76565278, + "num_input_tokens_seen": 224522300, + "step": 10423, + "time_per_iteration": 2.5348217487335205 + }, + { + "auxiliary_loss_clip": 0.01106551, + "auxiliary_loss_mlp": 0.01027551, + "balance_loss_clip": 1.03696132, + "balance_loss_mlp": 1.01588047, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 2.7841963141948907, + "language_loss": 0.77676523, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.79810631, + "num_input_tokens_seen": 224538260, + "step": 10424, + "time_per_iteration": 2.422861099243164 + }, + { + "auxiliary_loss_clip": 0.01106234, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.03700781, + "balance_loss_mlp": 1.01921892, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 1.8180902902581721, + "language_loss": 0.69285065, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.71423531, + "num_input_tokens_seen": 224559155, + "step": 10425, + "time_per_iteration": 2.479146957397461 + }, + { + "auxiliary_loss_clip": 0.01082064, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.03597403, + "balance_loss_mlp": 1.01942599, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.611407812190046, + "language_loss": 0.74569792, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76682281, + "num_input_tokens_seen": 224578660, + "step": 10426, + "time_per_iteration": 2.555128812789917 + }, + { + "auxiliary_loss_clip": 0.01098036, + "auxiliary_loss_mlp": 0.00786314, + "balance_loss_clip": 1.0368886, + "balance_loss_mlp": 1.01085114, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 1.425805946791349, + "language_loss": 0.80349433, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82233787, + "num_input_tokens_seen": 224599080, + "step": 10427, + "time_per_iteration": 2.553365468978882 + }, + { + "auxiliary_loss_clip": 0.01076682, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.03792226, + "balance_loss_mlp": 1.01962972, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 1.8475751978887245, + "language_loss": 0.68124717, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70233345, + "num_input_tokens_seen": 224614225, + "step": 10428, + "time_per_iteration": 2.507397174835205 + }, + { + "auxiliary_loss_clip": 0.01067205, + "auxiliary_loss_mlp": 0.01047759, + "balance_loss_clip": 1.0371592, + "balance_loss_mlp": 1.03386509, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.8234825614288965, + "language_loss": 0.71407247, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.7352221, + "num_input_tokens_seen": 224632365, + "step": 10429, + "time_per_iteration": 2.5222201347351074 + }, + { + "auxiliary_loss_clip": 0.01102614, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.0397048, + "balance_loss_mlp": 1.02147579, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.743923439680762, + "language_loss": 0.79471499, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.81608421, + "num_input_tokens_seen": 224651125, + "step": 10430, + "time_per_iteration": 3.9124057292938232 + }, + { + "auxiliary_loss_clip": 0.01049955, + "auxiliary_loss_mlp": 0.01004965, + "balance_loss_clip": 1.02635908, + "balance_loss_mlp": 1.00360036, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.7686165039481807, + "language_loss": 0.59174538, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61229455, + "num_input_tokens_seen": 224716115, + "step": 10431, + "time_per_iteration": 4.584550857543945 + }, + { + "auxiliary_loss_clip": 0.01031006, + "auxiliary_loss_mlp": 0.0100516, + "balance_loss_clip": 1.02803683, + "balance_loss_mlp": 1.00378323, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.8744097845670681, + "language_loss": 0.63741279, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.65777445, + "num_input_tokens_seen": 224782930, + "step": 10432, + "time_per_iteration": 3.1971993446350098 + }, + { + "auxiliary_loss_clip": 0.01084216, + "auxiliary_loss_mlp": 0.01030319, + "balance_loss_clip": 1.03753877, + "balance_loss_mlp": 1.0195545, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 3.99249862163295, + "language_loss": 0.6458472, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.66699255, + "num_input_tokens_seen": 224802010, + "step": 10433, + "time_per_iteration": 3.944110870361328 + }, + { + "auxiliary_loss_clip": 0.01102605, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.03975785, + "balance_loss_mlp": 1.01906776, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 2.750243860524733, + "language_loss": 0.61885941, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.64020354, + "num_input_tokens_seen": 224818875, + "step": 10434, + "time_per_iteration": 2.494591236114502 + }, + { + "auxiliary_loss_clip": 0.01069389, + "auxiliary_loss_mlp": 0.01025425, + "balance_loss_clip": 1.03280759, + "balance_loss_mlp": 1.01368928, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 1.6102089799984283, + "language_loss": 0.84554607, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86649418, + "num_input_tokens_seen": 224837790, + "step": 10435, + "time_per_iteration": 2.560455799102783 + }, + { + "auxiliary_loss_clip": 0.01049343, + "auxiliary_loss_mlp": 0.01002958, + "balance_loss_clip": 1.02559829, + "balance_loss_mlp": 1.00161076, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.7287984806956278, + "language_loss": 0.61533028, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63585329, + "num_input_tokens_seen": 224899685, + "step": 10436, + "time_per_iteration": 3.0884416103363037 + }, + { + "auxiliary_loss_clip": 0.01088045, + "auxiliary_loss_mlp": 0.01037397, + "balance_loss_clip": 1.04009748, + "balance_loss_mlp": 1.0241468, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.4383569300774937, + "language_loss": 0.77256131, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79381573, + "num_input_tokens_seen": 224918650, + "step": 10437, + "time_per_iteration": 2.5357086658477783 + }, + { + "auxiliary_loss_clip": 0.0103856, + "auxiliary_loss_mlp": 0.01002528, + "balance_loss_clip": 1.02459991, + "balance_loss_mlp": 1.00127625, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.8055904083249047, + "language_loss": 0.54314667, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56355751, + "num_input_tokens_seen": 224981575, + "step": 10438, + "time_per_iteration": 3.0434789657592773 + }, + { + "auxiliary_loss_clip": 0.01057016, + "auxiliary_loss_mlp": 0.0103828, + "balance_loss_clip": 1.0337944, + "balance_loss_mlp": 1.02538204, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 1.8268920442601184, + "language_loss": 0.84416324, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86511624, + "num_input_tokens_seen": 225000820, + "step": 10439, + "time_per_iteration": 2.6678171157836914 + }, + { + "auxiliary_loss_clip": 0.01076378, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.03852093, + "balance_loss_mlp": 1.02671027, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.508422080702262, + "language_loss": 0.80865014, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.82981455, + "num_input_tokens_seen": 225017585, + "step": 10440, + "time_per_iteration": 2.555166244506836 + }, + { + "auxiliary_loss_clip": 0.01055044, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.03748941, + "balance_loss_mlp": 1.01893353, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 1.7098780243676146, + "language_loss": 0.74343956, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76429057, + "num_input_tokens_seen": 225039085, + "step": 10441, + "time_per_iteration": 2.6546590328216553 + }, + { + "auxiliary_loss_clip": 0.01091845, + "auxiliary_loss_mlp": 0.01028767, + "balance_loss_clip": 1.03727984, + "balance_loss_mlp": 1.01571918, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 1.8383259840663142, + "language_loss": 0.72230935, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74351549, + "num_input_tokens_seen": 225058105, + "step": 10442, + "time_per_iteration": 2.509080171585083 + }, + { + "auxiliary_loss_clip": 0.01075829, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.03729677, + "balance_loss_mlp": 1.01780891, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.5202285859416114, + "language_loss": 0.71443617, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73549676, + "num_input_tokens_seen": 225077605, + "step": 10443, + "time_per_iteration": 2.588299512863159 + }, + { + "auxiliary_loss_clip": 0.01108643, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.03854513, + "balance_loss_mlp": 1.01713276, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 2.14392193710865, + "language_loss": 0.73154235, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75291967, + "num_input_tokens_seen": 225097775, + "step": 10444, + "time_per_iteration": 2.492387294769287 + }, + { + "auxiliary_loss_clip": 0.01071985, + "auxiliary_loss_mlp": 0.01028078, + "balance_loss_clip": 1.03842688, + "balance_loss_mlp": 1.01588273, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 1.7962479354736811, + "language_loss": 0.72045946, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.74146008, + "num_input_tokens_seen": 225115585, + "step": 10445, + "time_per_iteration": 2.553072929382324 + }, + { + "auxiliary_loss_clip": 0.01094436, + "auxiliary_loss_mlp": 0.01027553, + "balance_loss_clip": 1.03948474, + "balance_loss_mlp": 1.01539373, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 1.531478507658272, + "language_loss": 0.69112146, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71234137, + "num_input_tokens_seen": 225135575, + "step": 10446, + "time_per_iteration": 2.511483907699585 + }, + { + "auxiliary_loss_clip": 0.01068177, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.03662562, + "balance_loss_mlp": 1.01545405, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 1.776753116634411, + "language_loss": 0.73856747, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.7595427, + "num_input_tokens_seen": 225154230, + "step": 10447, + "time_per_iteration": 2.5679781436920166 + }, + { + "auxiliary_loss_clip": 0.01043792, + "auxiliary_loss_mlp": 0.01006207, + "balance_loss_clip": 1.02967763, + "balance_loss_mlp": 1.00492597, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6869506431524492, + "language_loss": 0.5236814, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54418141, + "num_input_tokens_seen": 225213650, + "step": 10448, + "time_per_iteration": 2.990557909011841 + }, + { + "auxiliary_loss_clip": 0.01085086, + "auxiliary_loss_mlp": 0.01048298, + "balance_loss_clip": 1.0377562, + "balance_loss_mlp": 1.03411841, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 1.8944972927434274, + "language_loss": 0.90761006, + "learning_rate": 1.282785392633079e-06, + "loss": 0.92894393, + "num_input_tokens_seen": 225230135, + "step": 10449, + "time_per_iteration": 2.5036983489990234 + }, + { + "auxiliary_loss_clip": 0.01106798, + "auxiliary_loss_mlp": 0.01028375, + "balance_loss_clip": 1.03751111, + "balance_loss_mlp": 1.01682949, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 1.4834722397891185, + "language_loss": 0.60122532, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62257707, + "num_input_tokens_seen": 225253520, + "step": 10450, + "time_per_iteration": 2.639711618423462 + }, + { + "auxiliary_loss_clip": 0.01081561, + "auxiliary_loss_mlp": 0.01028382, + "balance_loss_clip": 1.0379591, + "balance_loss_mlp": 1.01666951, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.588551342234234, + "language_loss": 0.76779187, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.78889132, + "num_input_tokens_seen": 225272460, + "step": 10451, + "time_per_iteration": 2.532128095626831 + }, + { + "auxiliary_loss_clip": 0.01083765, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.03846264, + "balance_loss_mlp": 1.01854515, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 2.194690266465946, + "language_loss": 0.77720118, + "learning_rate": 1.281694841064566e-06, + "loss": 0.79835111, + "num_input_tokens_seen": 225291700, + "step": 10452, + "time_per_iteration": 2.523129940032959 + }, + { + "auxiliary_loss_clip": 0.01078399, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.03804016, + "balance_loss_mlp": 1.01728773, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 1.6950617626260656, + "language_loss": 0.72531033, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74639511, + "num_input_tokens_seen": 225311470, + "step": 10453, + "time_per_iteration": 2.6004416942596436 + }, + { + "auxiliary_loss_clip": 0.01050795, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.03386927, + "balance_loss_mlp": 1.01974547, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.6545657219540946, + "language_loss": 0.80570495, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.82656199, + "num_input_tokens_seen": 225328385, + "step": 10454, + "time_per_iteration": 2.5904788970947266 + }, + { + "auxiliary_loss_clip": 0.01074603, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.03764808, + "balance_loss_mlp": 1.01815748, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 2.07112487949402, + "language_loss": 0.81907523, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84011316, + "num_input_tokens_seen": 225348415, + "step": 10455, + "time_per_iteration": 2.575608730316162 + }, + { + "auxiliary_loss_clip": 0.01060475, + "auxiliary_loss_mlp": 0.00784683, + "balance_loss_clip": 1.03584111, + "balance_loss_mlp": 1.00549078, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 1.541619301668581, + "language_loss": 0.81592834, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83437991, + "num_input_tokens_seen": 225367740, + "step": 10456, + "time_per_iteration": 2.6370105743408203 + }, + { + "auxiliary_loss_clip": 0.01091321, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.0441339, + "balance_loss_mlp": 1.01692295, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.5018675779804156, + "language_loss": 0.72194254, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74315262, + "num_input_tokens_seen": 225388405, + "step": 10457, + "time_per_iteration": 2.5565288066864014 + }, + { + "auxiliary_loss_clip": 0.01101149, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.03623724, + "balance_loss_mlp": 1.01517272, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 1.8241817018671518, + "language_loss": 0.79583764, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.81713033, + "num_input_tokens_seen": 225408360, + "step": 10458, + "time_per_iteration": 2.489292621612549 + }, + { + "auxiliary_loss_clip": 0.01099832, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.03849244, + "balance_loss_mlp": 1.02071261, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 1.4902561451532517, + "language_loss": 0.61248761, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.63381386, + "num_input_tokens_seen": 225431310, + "step": 10459, + "time_per_iteration": 3.933231830596924 + }, + { + "auxiliary_loss_clip": 0.01083965, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.03682375, + "balance_loss_mlp": 1.01722038, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 1.800175076774128, + "language_loss": 0.78969765, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.81082845, + "num_input_tokens_seen": 225450385, + "step": 10460, + "time_per_iteration": 2.5650625228881836 + }, + { + "auxiliary_loss_clip": 0.01067188, + "auxiliary_loss_mlp": 0.01027537, + "balance_loss_clip": 1.03648257, + "balance_loss_mlp": 1.01518083, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 1.752754339269406, + "language_loss": 0.74070787, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.76165515, + "num_input_tokens_seen": 225467325, + "step": 10461, + "time_per_iteration": 2.5390443801879883 + }, + { + "auxiliary_loss_clip": 0.01084882, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.03583491, + "balance_loss_mlp": 1.02319658, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 1.6507179811252104, + "language_loss": 0.70163506, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72283244, + "num_input_tokens_seen": 225487370, + "step": 10462, + "time_per_iteration": 2.543658971786499 + }, + { + "auxiliary_loss_clip": 0.01103109, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.03808475, + "balance_loss_mlp": 1.01845336, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 1.841764030354768, + "language_loss": 0.72116363, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74248308, + "num_input_tokens_seen": 225506915, + "step": 10463, + "time_per_iteration": 2.508042335510254 + }, + { + "auxiliary_loss_clip": 0.01087609, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.04092717, + "balance_loss_mlp": 1.02065718, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 1.6381239241135095, + "language_loss": 0.72186029, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.74305928, + "num_input_tokens_seen": 225525670, + "step": 10464, + "time_per_iteration": 2.5293281078338623 + }, + { + "auxiliary_loss_clip": 0.01085454, + "auxiliary_loss_mlp": 0.01028171, + "balance_loss_clip": 1.03850806, + "balance_loss_mlp": 1.01698923, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.6177134344889372, + "language_loss": 0.69458163, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.71571791, + "num_input_tokens_seen": 225542235, + "step": 10465, + "time_per_iteration": 2.492640972137451 + }, + { + "auxiliary_loss_clip": 0.010391, + "auxiliary_loss_mlp": 0.0100376, + "balance_loss_clip": 1.02578509, + "balance_loss_mlp": 1.00250244, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.7009259653390438, + "language_loss": 0.59794044, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.61836898, + "num_input_tokens_seen": 225607185, + "step": 10466, + "time_per_iteration": 3.272179365158081 + }, + { + "auxiliary_loss_clip": 0.01066533, + "auxiliary_loss_mlp": 0.01029548, + "balance_loss_clip": 1.03397846, + "balance_loss_mlp": 1.01871216, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 2.393849595449034, + "language_loss": 0.64921367, + "learning_rate": 1.276245767820154e-06, + "loss": 0.67017448, + "num_input_tokens_seen": 225628785, + "step": 10467, + "time_per_iteration": 2.7018699645996094 + }, + { + "auxiliary_loss_clip": 0.01019268, + "auxiliary_loss_mlp": 0.01002856, + "balance_loss_clip": 1.01706111, + "balance_loss_mlp": 1.00156808, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.7942453267081774, + "language_loss": 0.56899703, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.58921826, + "num_input_tokens_seen": 225678980, + "step": 10468, + "time_per_iteration": 2.908304452896118 + }, + { + "auxiliary_loss_clip": 0.01008467, + "auxiliary_loss_mlp": 0.01000136, + "balance_loss_clip": 1.02712178, + "balance_loss_mlp": 0.99870509, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7405193155279937, + "language_loss": 0.58006585, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.6001519, + "num_input_tokens_seen": 225740295, + "step": 10469, + "time_per_iteration": 5.97453761100769 + }, + { + "auxiliary_loss_clip": 0.01036168, + "auxiliary_loss_mlp": 0.00999944, + "balance_loss_clip": 1.02890873, + "balance_loss_mlp": 0.99828738, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6767443153658657, + "language_loss": 0.52137339, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54173458, + "num_input_tokens_seen": 225805615, + "step": 10470, + "time_per_iteration": 3.2685842514038086 + }, + { + "auxiliary_loss_clip": 0.01093847, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.04072762, + "balance_loss_mlp": 1.01885629, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.9510836308884505, + "language_loss": 0.74669957, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76793861, + "num_input_tokens_seen": 225826585, + "step": 10471, + "time_per_iteration": 2.686530590057373 + }, + { + "auxiliary_loss_clip": 0.01080681, + "auxiliary_loss_mlp": 0.01030393, + "balance_loss_clip": 1.04161811, + "balance_loss_mlp": 1.01884758, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 1.7565582357091873, + "language_loss": 0.62086272, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.6419735, + "num_input_tokens_seen": 225844095, + "step": 10472, + "time_per_iteration": 3.914329767227173 + }, + { + "auxiliary_loss_clip": 0.01111821, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.03961885, + "balance_loss_mlp": 1.01852083, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 1.6115408607331574, + "language_loss": 0.69474196, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71616828, + "num_input_tokens_seen": 225864310, + "step": 10473, + "time_per_iteration": 2.497291088104248 + }, + { + "auxiliary_loss_clip": 0.0108226, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.03530264, + "balance_loss_mlp": 1.01836729, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.5506833814762324, + "language_loss": 0.7438013, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.76492083, + "num_input_tokens_seen": 225883830, + "step": 10474, + "time_per_iteration": 2.5240895748138428 + }, + { + "auxiliary_loss_clip": 0.01084032, + "auxiliary_loss_mlp": 0.00784835, + "balance_loss_clip": 1.036659, + "balance_loss_mlp": 1.01253259, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 1.4450556041237104, + "language_loss": 0.66291058, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.68159926, + "num_input_tokens_seen": 225905755, + "step": 10475, + "time_per_iteration": 2.6339662075042725 + }, + { + "auxiliary_loss_clip": 0.01057661, + "auxiliary_loss_mlp": 0.01029973, + "balance_loss_clip": 1.03506255, + "balance_loss_mlp": 1.01895189, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 1.7687442924172339, + "language_loss": 0.9007566, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92163295, + "num_input_tokens_seen": 225922155, + "step": 10476, + "time_per_iteration": 2.537750005722046 + }, + { + "auxiliary_loss_clip": 0.01105513, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.03718662, + "balance_loss_mlp": 1.02496672, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 1.7832388501654397, + "language_loss": 0.74788046, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.76930022, + "num_input_tokens_seen": 225941060, + "step": 10477, + "time_per_iteration": 2.4685287475585938 + }, + { + "auxiliary_loss_clip": 0.01093125, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.03766918, + "balance_loss_mlp": 1.01692247, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 1.6778968843349085, + "language_loss": 0.70425665, + "learning_rate": 1.272253702758138e-06, + "loss": 0.72547966, + "num_input_tokens_seen": 225960870, + "step": 10478, + "time_per_iteration": 2.4794669151306152 + }, + { + "auxiliary_loss_clip": 0.01101266, + "auxiliary_loss_mlp": 0.01029021, + "balance_loss_clip": 1.03814077, + "balance_loss_mlp": 1.01621771, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.7074303051437973, + "language_loss": 0.67291868, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.69422162, + "num_input_tokens_seen": 225977895, + "step": 10479, + "time_per_iteration": 2.4521307945251465 + }, + { + "auxiliary_loss_clip": 0.0108355, + "auxiliary_loss_mlp": 0.00785358, + "balance_loss_clip": 1.0402422, + "balance_loss_mlp": 1.01043844, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 1.8990660902812975, + "language_loss": 0.73871839, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.75740749, + "num_input_tokens_seen": 225997835, + "step": 10480, + "time_per_iteration": 2.520901679992676 + }, + { + "auxiliary_loss_clip": 0.01095591, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.036443, + "balance_loss_mlp": 1.02116251, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 1.9568995595898655, + "language_loss": 0.78821027, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.80950725, + "num_input_tokens_seen": 226017620, + "step": 10481, + "time_per_iteration": 2.484386920928955 + }, + { + "auxiliary_loss_clip": 0.01032231, + "auxiliary_loss_mlp": 0.01010949, + "balance_loss_clip": 1.02637422, + "balance_loss_mlp": 1.00939965, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.8954171672310948, + "language_loss": 0.6186353, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63906717, + "num_input_tokens_seen": 226068755, + "step": 10482, + "time_per_iteration": 2.8795056343078613 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.03761482, + "balance_loss_mlp": 1.01893759, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 2.035149665709166, + "language_loss": 0.82634151, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.84767812, + "num_input_tokens_seen": 226084395, + "step": 10483, + "time_per_iteration": 2.448918581008911 + }, + { + "auxiliary_loss_clip": 0.01086029, + "auxiliary_loss_mlp": 0.0103189, + "balance_loss_clip": 1.03589058, + "balance_loss_mlp": 1.02026749, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.4582160143364844, + "language_loss": 0.72521245, + "learning_rate": 1.270077618961487e-06, + "loss": 0.74639165, + "num_input_tokens_seen": 226105890, + "step": 10484, + "time_per_iteration": 2.530829906463623 + }, + { + "auxiliary_loss_clip": 0.01076054, + "auxiliary_loss_mlp": 0.01028038, + "balance_loss_clip": 1.03542852, + "balance_loss_mlp": 1.0160991, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 2.1379980548812685, + "language_loss": 0.74501657, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.76605749, + "num_input_tokens_seen": 226126760, + "step": 10485, + "time_per_iteration": 2.630833148956299 + }, + { + "auxiliary_loss_clip": 0.01083941, + "auxiliary_loss_mlp": 0.00784366, + "balance_loss_clip": 1.03724194, + "balance_loss_mlp": 1.00880313, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.9531258039914436, + "language_loss": 0.81068009, + "learning_rate": 1.269352478979093e-06, + "loss": 0.82936317, + "num_input_tokens_seen": 226147315, + "step": 10486, + "time_per_iteration": 2.5729544162750244 + }, + { + "auxiliary_loss_clip": 0.01082417, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.03560126, + "balance_loss_mlp": 1.02316141, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 1.6826916855888017, + "language_loss": 0.63334012, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.65451086, + "num_input_tokens_seen": 226165935, + "step": 10487, + "time_per_iteration": 2.514842987060547 + }, + { + "auxiliary_loss_clip": 0.01108009, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_clip": 1.03787124, + "balance_loss_mlp": 1.02642083, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.5504982762983408, + "language_loss": 0.6718967, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69335985, + "num_input_tokens_seen": 226186890, + "step": 10488, + "time_per_iteration": 2.498486280441284 + }, + { + "auxiliary_loss_clip": 0.0108743, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.03654432, + "balance_loss_mlp": 1.01993954, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 2.094565465839246, + "language_loss": 0.6705246, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69171429, + "num_input_tokens_seen": 226206710, + "step": 10489, + "time_per_iteration": 2.5409953594207764 + }, + { + "auxiliary_loss_clip": 0.01072758, + "auxiliary_loss_mlp": 0.01041532, + "balance_loss_clip": 1.03659379, + "balance_loss_mlp": 1.02753639, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.7948449882716373, + "language_loss": 0.69217938, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.71332228, + "num_input_tokens_seen": 226225565, + "step": 10490, + "time_per_iteration": 2.5488736629486084 + }, + { + "auxiliary_loss_clip": 0.01086427, + "auxiliary_loss_mlp": 0.01035201, + "balance_loss_clip": 1.0366075, + "balance_loss_mlp": 1.02272558, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 1.825427942427149, + "language_loss": 0.78223962, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80345595, + "num_input_tokens_seen": 226243680, + "step": 10491, + "time_per_iteration": 2.5528132915496826 + }, + { + "auxiliary_loss_clip": 0.01083466, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.03730237, + "balance_loss_mlp": 1.02329659, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 1.8944825292091856, + "language_loss": 0.55792022, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.57911074, + "num_input_tokens_seen": 226264345, + "step": 10492, + "time_per_iteration": 2.5597493648529053 + }, + { + "auxiliary_loss_clip": 0.01110738, + "auxiliary_loss_mlp": 0.0104064, + "balance_loss_clip": 1.03852737, + "balance_loss_mlp": 1.0273962, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 1.8615324605616275, + "language_loss": 0.64181936, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.66333318, + "num_input_tokens_seen": 226283165, + "step": 10493, + "time_per_iteration": 2.48433518409729 + }, + { + "auxiliary_loss_clip": 0.01073759, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.03834248, + "balance_loss_mlp": 1.01732635, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.4202608420929337, + "language_loss": 0.82673806, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.84777343, + "num_input_tokens_seen": 226304080, + "step": 10494, + "time_per_iteration": 2.594668388366699 + }, + { + "auxiliary_loss_clip": 0.01091924, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.03738141, + "balance_loss_mlp": 1.02078748, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 3.434646394729339, + "language_loss": 0.79658103, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.8178373, + "num_input_tokens_seen": 226325925, + "step": 10495, + "time_per_iteration": 2.6996254920959473 + }, + { + "auxiliary_loss_clip": 0.01081919, + "auxiliary_loss_mlp": 0.01033486, + "balance_loss_clip": 1.03627288, + "balance_loss_mlp": 1.02038503, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 1.7743230445697442, + "language_loss": 0.7063098, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72746384, + "num_input_tokens_seen": 226344190, + "step": 10496, + "time_per_iteration": 2.5031557083129883 + }, + { + "auxiliary_loss_clip": 0.01090348, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.03776622, + "balance_loss_mlp": 1.02354538, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 1.9867693697785116, + "language_loss": 0.79791808, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.81918871, + "num_input_tokens_seen": 226361520, + "step": 10497, + "time_per_iteration": 2.5074574947357178 + }, + { + "auxiliary_loss_clip": 0.0107574, + "auxiliary_loss_mlp": 0.01032038, + "balance_loss_clip": 1.03615975, + "balance_loss_mlp": 1.02045727, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 1.9590378822756374, + "language_loss": 0.7390185, + "learning_rate": 1.265003970256247e-06, + "loss": 0.76009625, + "num_input_tokens_seen": 226381920, + "step": 10498, + "time_per_iteration": 3.965893268585205 + }, + { + "auxiliary_loss_clip": 0.01098145, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.03659105, + "balance_loss_mlp": 1.01632166, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 2.1780558938648276, + "language_loss": 0.69721425, + "learning_rate": 1.264641775364217e-06, + "loss": 0.71848333, + "num_input_tokens_seen": 226400035, + "step": 10499, + "time_per_iteration": 2.488330125808716 + }, + { + "auxiliary_loss_clip": 0.01097316, + "auxiliary_loss_mlp": 0.01040215, + "balance_loss_clip": 1.03871763, + "balance_loss_mlp": 1.02818084, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 1.69885770270483, + "language_loss": 0.7015202, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.7228955, + "num_input_tokens_seen": 226418280, + "step": 10500, + "time_per_iteration": 2.520646810531616 + }, + { + "auxiliary_loss_clip": 0.01110193, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.0394671, + "balance_loss_mlp": 1.01985013, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 1.7522789144702988, + "language_loss": 0.7435782, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.7649951, + "num_input_tokens_seen": 226436650, + "step": 10501, + "time_per_iteration": 2.4520750045776367 + }, + { + "auxiliary_loss_clip": 0.01096283, + "auxiliary_loss_mlp": 0.00783534, + "balance_loss_clip": 1.03789592, + "balance_loss_mlp": 1.00925899, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 1.6276588518667332, + "language_loss": 0.75542927, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77422738, + "num_input_tokens_seen": 226456275, + "step": 10502, + "time_per_iteration": 2.5318431854248047 + }, + { + "auxiliary_loss_clip": 0.01101556, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.0386728, + "balance_loss_mlp": 1.03218639, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 3.800702862327018, + "language_loss": 0.85384303, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87530255, + "num_input_tokens_seen": 226473610, + "step": 10503, + "time_per_iteration": 2.506032705307007 + }, + { + "auxiliary_loss_clip": 0.01087914, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.04024482, + "balance_loss_mlp": 1.02131426, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 1.9024358108121282, + "language_loss": 0.86522186, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88643765, + "num_input_tokens_seen": 226493665, + "step": 10504, + "time_per_iteration": 2.5335652828216553 + }, + { + "auxiliary_loss_clip": 0.01078504, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.04135215, + "balance_loss_mlp": 1.02256835, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.6384216761153434, + "language_loss": 0.76190734, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78304321, + "num_input_tokens_seen": 226511625, + "step": 10505, + "time_per_iteration": 2.5487937927246094 + }, + { + "auxiliary_loss_clip": 0.01068355, + "auxiliary_loss_mlp": 0.01032616, + "balance_loss_clip": 1.03893781, + "balance_loss_mlp": 1.01916361, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 1.8319588736677788, + "language_loss": 0.81721222, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.83822191, + "num_input_tokens_seen": 226530085, + "step": 10506, + "time_per_iteration": 2.6095211505889893 + }, + { + "auxiliary_loss_clip": 0.011114, + "auxiliary_loss_mlp": 0.01036511, + "balance_loss_clip": 1.0398252, + "balance_loss_mlp": 1.02427411, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 2.01629539375655, + "language_loss": 0.74459285, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76607203, + "num_input_tokens_seen": 226548115, + "step": 10507, + "time_per_iteration": 3.8870036602020264 + }, + { + "auxiliary_loss_clip": 0.01089849, + "auxiliary_loss_mlp": 0.01036765, + "balance_loss_clip": 1.04218125, + "balance_loss_mlp": 1.0238905, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 1.6501918638687654, + "language_loss": 0.67755139, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.69881749, + "num_input_tokens_seen": 226567955, + "step": 10508, + "time_per_iteration": 4.017300605773926 + }, + { + "auxiliary_loss_clip": 0.01075927, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.03583074, + "balance_loss_mlp": 1.01983523, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 1.610016961259701, + "language_loss": 0.70742261, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.72850376, + "num_input_tokens_seen": 226588205, + "step": 10509, + "time_per_iteration": 2.6113953590393066 + }, + { + "auxiliary_loss_clip": 0.01092941, + "auxiliary_loss_mlp": 0.01026058, + "balance_loss_clip": 1.03823936, + "balance_loss_mlp": 1.01464987, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 1.5948657287673125, + "language_loss": 0.79462647, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81581646, + "num_input_tokens_seen": 226606965, + "step": 10510, + "time_per_iteration": 2.5095455646514893 + }, + { + "auxiliary_loss_clip": 0.01066681, + "auxiliary_loss_mlp": 0.00785347, + "balance_loss_clip": 1.03814363, + "balance_loss_mlp": 1.0115844, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 1.5847971526489208, + "language_loss": 0.70468187, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.72320211, + "num_input_tokens_seen": 226627845, + "step": 10511, + "time_per_iteration": 4.087428092956543 + }, + { + "auxiliary_loss_clip": 0.01105327, + "auxiliary_loss_mlp": 0.01031349, + "balance_loss_clip": 1.03692687, + "balance_loss_mlp": 1.01991105, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 1.8580616848983822, + "language_loss": 0.79993069, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82129741, + "num_input_tokens_seen": 226645855, + "step": 10512, + "time_per_iteration": 2.464200019836426 + }, + { + "auxiliary_loss_clip": 0.01100752, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.04056525, + "balance_loss_mlp": 1.02010477, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 1.9764285329037476, + "language_loss": 0.70472342, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.72606009, + "num_input_tokens_seen": 226665375, + "step": 10513, + "time_per_iteration": 2.5635552406311035 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.03888941, + "balance_loss_mlp": 1.01708317, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 1.716797906945892, + "language_loss": 0.66222686, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68355048, + "num_input_tokens_seen": 226685270, + "step": 10514, + "time_per_iteration": 2.5186305046081543 + }, + { + "auxiliary_loss_clip": 0.01074081, + "auxiliary_loss_mlp": 0.01028489, + "balance_loss_clip": 1.03432751, + "balance_loss_mlp": 1.01656771, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 1.971696350095617, + "language_loss": 0.74315715, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76418281, + "num_input_tokens_seen": 226705325, + "step": 10515, + "time_per_iteration": 2.618481397628784 + }, + { + "auxiliary_loss_clip": 0.01082854, + "auxiliary_loss_mlp": 0.01026245, + "balance_loss_clip": 1.04078698, + "balance_loss_mlp": 1.01483631, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 1.7586305557149635, + "language_loss": 0.90038139, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.92147237, + "num_input_tokens_seen": 226723815, + "step": 10516, + "time_per_iteration": 2.5367228984832764 + }, + { + "auxiliary_loss_clip": 0.01117405, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.04213536, + "balance_loss_mlp": 1.01790667, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 1.7411984915827756, + "language_loss": 0.81769997, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.83919251, + "num_input_tokens_seen": 226741550, + "step": 10517, + "time_per_iteration": 2.4627575874328613 + }, + { + "auxiliary_loss_clip": 0.01053949, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.03771651, + "balance_loss_mlp": 1.02563858, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 1.7297809544068898, + "language_loss": 0.7778542, + "learning_rate": 1.257765386189541e-06, + "loss": 0.79876888, + "num_input_tokens_seen": 226761115, + "step": 10518, + "time_per_iteration": 2.6264665126800537 + }, + { + "auxiliary_loss_clip": 0.01092704, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.03918958, + "balance_loss_mlp": 1.01821136, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.6727278190339019, + "language_loss": 0.85325098, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87448186, + "num_input_tokens_seen": 226782225, + "step": 10519, + "time_per_iteration": 2.5523741245269775 + }, + { + "auxiliary_loss_clip": 0.01083174, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.03699279, + "balance_loss_mlp": 1.02389896, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 1.4426770262871134, + "language_loss": 0.71873581, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.73992419, + "num_input_tokens_seen": 226802375, + "step": 10520, + "time_per_iteration": 2.5580546855926514 + }, + { + "auxiliary_loss_clip": 0.01095247, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.03676105, + "balance_loss_mlp": 1.02065825, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 1.6578344010073633, + "language_loss": 0.71758783, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.73886198, + "num_input_tokens_seen": 226822165, + "step": 10521, + "time_per_iteration": 2.487914800643921 + }, + { + "auxiliary_loss_clip": 0.01071764, + "auxiliary_loss_mlp": 0.01036686, + "balance_loss_clip": 1.03828335, + "balance_loss_mlp": 1.02196383, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 1.6807197210836269, + "language_loss": 0.72257626, + "learning_rate": 1.256319016853377e-06, + "loss": 0.74366081, + "num_input_tokens_seen": 226841645, + "step": 10522, + "time_per_iteration": 2.568253755569458 + }, + { + "auxiliary_loss_clip": 0.01066753, + "auxiliary_loss_mlp": 0.01029309, + "balance_loss_clip": 1.04293251, + "balance_loss_mlp": 1.01737583, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 1.9482538328279864, + "language_loss": 0.81763285, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.83859342, + "num_input_tokens_seen": 226860355, + "step": 10523, + "time_per_iteration": 2.5915725231170654 + }, + { + "auxiliary_loss_clip": 0.01096528, + "auxiliary_loss_mlp": 0.01027074, + "balance_loss_clip": 1.03789043, + "balance_loss_mlp": 1.01523089, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 1.980881709990781, + "language_loss": 0.73682535, + "learning_rate": 1.255596001333195e-06, + "loss": 0.75806141, + "num_input_tokens_seen": 226878390, + "step": 10524, + "time_per_iteration": 2.503743886947632 + }, + { + "auxiliary_loss_clip": 0.01096141, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.04060507, + "balance_loss_mlp": 1.02089047, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 2.937902524645182, + "language_loss": 0.84871769, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.87002409, + "num_input_tokens_seen": 226898420, + "step": 10525, + "time_per_iteration": 2.588752269744873 + }, + { + "auxiliary_loss_clip": 0.01078106, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.03365541, + "balance_loss_mlp": 1.01505256, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 1.5877539651621269, + "language_loss": 0.66856855, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.68962991, + "num_input_tokens_seen": 226916305, + "step": 10526, + "time_per_iteration": 2.5043728351593018 + }, + { + "auxiliary_loss_clip": 0.01104247, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.04123306, + "balance_loss_mlp": 1.02062964, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 1.5695055229957688, + "language_loss": 0.73526955, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75665253, + "num_input_tokens_seen": 226937705, + "step": 10527, + "time_per_iteration": 2.5358808040618896 + }, + { + "auxiliary_loss_clip": 0.01096332, + "auxiliary_loss_mlp": 0.01031171, + "balance_loss_clip": 1.03936195, + "balance_loss_mlp": 1.01972675, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 2.628399565810566, + "language_loss": 0.71817124, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.73944628, + "num_input_tokens_seen": 226954880, + "step": 10528, + "time_per_iteration": 2.59552264213562 + }, + { + "auxiliary_loss_clip": 0.01094437, + "auxiliary_loss_mlp": 0.01028836, + "balance_loss_clip": 1.03611851, + "balance_loss_mlp": 1.01602137, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 1.939395114410684, + "language_loss": 0.66771805, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68895084, + "num_input_tokens_seen": 226972595, + "step": 10529, + "time_per_iteration": 2.456913948059082 + }, + { + "auxiliary_loss_clip": 0.01100628, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.03807127, + "balance_loss_mlp": 1.01672602, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 2.3215196514771894, + "language_loss": 0.75099337, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.7722984, + "num_input_tokens_seen": 226991910, + "step": 10530, + "time_per_iteration": 2.5166869163513184 + }, + { + "auxiliary_loss_clip": 0.01100665, + "auxiliary_loss_mlp": 0.00787797, + "balance_loss_clip": 1.04024553, + "balance_loss_mlp": 1.01569068, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 1.5957438975326417, + "language_loss": 0.73999476, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.75887942, + "num_input_tokens_seen": 227010175, + "step": 10531, + "time_per_iteration": 2.5931880474090576 + }, + { + "auxiliary_loss_clip": 0.01068481, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.0345068, + "balance_loss_mlp": 1.01580334, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 2.66832208196273, + "language_loss": 0.79366064, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.81462693, + "num_input_tokens_seen": 227025540, + "step": 10532, + "time_per_iteration": 2.524585247039795 + }, + { + "auxiliary_loss_clip": 0.0109445, + "auxiliary_loss_mlp": 0.01024925, + "balance_loss_clip": 1.0368259, + "balance_loss_mlp": 1.01436889, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 1.5063211651932675, + "language_loss": 0.74266708, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.76386082, + "num_input_tokens_seen": 227045520, + "step": 10533, + "time_per_iteration": 2.531118869781494 + }, + { + "auxiliary_loss_clip": 0.01088628, + "auxiliary_loss_mlp": 0.01036176, + "balance_loss_clip": 1.04040611, + "balance_loss_mlp": 1.02297938, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 2.4538112656450113, + "language_loss": 0.76876557, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.79001367, + "num_input_tokens_seen": 227059420, + "step": 10534, + "time_per_iteration": 2.489245891571045 + }, + { + "auxiliary_loss_clip": 0.01075489, + "auxiliary_loss_mlp": 0.01040224, + "balance_loss_clip": 1.0376786, + "balance_loss_mlp": 1.02655053, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.513563332197795, + "language_loss": 0.85622549, + "learning_rate": 1.251621437204777e-06, + "loss": 0.87738258, + "num_input_tokens_seen": 227081310, + "step": 10535, + "time_per_iteration": 2.6177361011505127 + }, + { + "auxiliary_loss_clip": 0.01100602, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.03882122, + "balance_loss_mlp": 1.01838899, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 1.9602123185922353, + "language_loss": 0.76321566, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78453285, + "num_input_tokens_seen": 227100365, + "step": 10536, + "time_per_iteration": 2.526759147644043 + }, + { + "auxiliary_loss_clip": 0.01094546, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.03875113, + "balance_loss_mlp": 1.0177604, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 1.9444093213721925, + "language_loss": 0.59998196, + "learning_rate": 1.250899157568855e-06, + "loss": 0.62123144, + "num_input_tokens_seen": 227119680, + "step": 10537, + "time_per_iteration": 3.973539352416992 + }, + { + "auxiliary_loss_clip": 0.01023145, + "auxiliary_loss_mlp": 0.01001659, + "balance_loss_clip": 1.03101969, + "balance_loss_mlp": 1.00019884, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.7826699306019819, + "language_loss": 0.52401054, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54425859, + "num_input_tokens_seen": 227184465, + "step": 10538, + "time_per_iteration": 3.271909475326538 + }, + { + "auxiliary_loss_clip": 0.0108849, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.03758752, + "balance_loss_mlp": 1.0167985, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 1.785509499663211, + "language_loss": 0.83286905, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85405242, + "num_input_tokens_seen": 227202185, + "step": 10539, + "time_per_iteration": 2.5774717330932617 + }, + { + "auxiliary_loss_clip": 0.01082994, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.03720069, + "balance_loss_mlp": 1.01722229, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.74746656512703, + "language_loss": 0.86958325, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.89072186, + "num_input_tokens_seen": 227222020, + "step": 10540, + "time_per_iteration": 2.561495304107666 + }, + { + "auxiliary_loss_clip": 0.01083062, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.03696024, + "balance_loss_mlp": 1.01899767, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 1.7199646866032428, + "language_loss": 0.72492909, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74605703, + "num_input_tokens_seen": 227240885, + "step": 10541, + "time_per_iteration": 2.609320878982544 + }, + { + "auxiliary_loss_clip": 0.01101306, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.03793716, + "balance_loss_mlp": 1.01880002, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 2.241530031887131, + "language_loss": 0.84868586, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.87001985, + "num_input_tokens_seen": 227257880, + "step": 10542, + "time_per_iteration": 2.5793919563293457 + }, + { + "auxiliary_loss_clip": 0.01098041, + "auxiliary_loss_mlp": 0.01027509, + "balance_loss_clip": 1.03885901, + "balance_loss_mlp": 1.01450884, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 1.6306502789302466, + "language_loss": 0.77579951, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.79705495, + "num_input_tokens_seen": 227274840, + "step": 10543, + "time_per_iteration": 2.5054731369018555 + }, + { + "auxiliary_loss_clip": 0.01055056, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.03563094, + "balance_loss_mlp": 1.0194366, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.6385980734790173, + "language_loss": 0.73702031, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75788164, + "num_input_tokens_seen": 227294835, + "step": 10544, + "time_per_iteration": 2.5859904289245605 + }, + { + "auxiliary_loss_clip": 0.01082034, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.03790951, + "balance_loss_mlp": 1.02080226, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 1.9337740477926275, + "language_loss": 0.6832059, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70436072, + "num_input_tokens_seen": 227314935, + "step": 10545, + "time_per_iteration": 2.6033241748809814 + }, + { + "auxiliary_loss_clip": 0.01084084, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.03618336, + "balance_loss_mlp": 1.02166057, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 2.2348258232641505, + "language_loss": 0.7162692, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.73745084, + "num_input_tokens_seen": 227332905, + "step": 10546, + "time_per_iteration": 3.8842544555664062 + }, + { + "auxiliary_loss_clip": 0.01091283, + "auxiliary_loss_mlp": 0.0102749, + "balance_loss_clip": 1.03733659, + "balance_loss_mlp": 1.01613498, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.357550578734522, + "language_loss": 0.77875954, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.79994726, + "num_input_tokens_seen": 227354915, + "step": 10547, + "time_per_iteration": 3.960388422012329 + }, + { + "auxiliary_loss_clip": 0.01065754, + "auxiliary_loss_mlp": 0.01032103, + "balance_loss_clip": 1.03406513, + "balance_loss_mlp": 1.0198487, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.7761098996400293, + "language_loss": 0.63565892, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.65663749, + "num_input_tokens_seen": 227372990, + "step": 10548, + "time_per_iteration": 2.558922290802002 + }, + { + "auxiliary_loss_clip": 0.01082999, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.03667426, + "balance_loss_mlp": 1.01923144, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 1.955875792108172, + "language_loss": 0.61780417, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.63894713, + "num_input_tokens_seen": 227393270, + "step": 10549, + "time_per_iteration": 2.6088898181915283 + }, + { + "auxiliary_loss_clip": 0.01058167, + "auxiliary_loss_mlp": 0.01028106, + "balance_loss_clip": 1.03806317, + "balance_loss_mlp": 1.01706696, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 1.5351086264604867, + "language_loss": 0.73456025, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.75542301, + "num_input_tokens_seen": 227413630, + "step": 10550, + "time_per_iteration": 3.9695441722869873 + }, + { + "auxiliary_loss_clip": 0.0100266, + "auxiliary_loss_mlp": 0.01001434, + "balance_loss_clip": 1.01397312, + "balance_loss_mlp": 0.99997979, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6879621401315671, + "language_loss": 0.57727134, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59731233, + "num_input_tokens_seen": 227476630, + "step": 10551, + "time_per_iteration": 3.1948065757751465 + }, + { + "auxiliary_loss_clip": 0.01072036, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.03960013, + "balance_loss_mlp": 1.0151633, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 1.69756485178634, + "language_loss": 0.67153227, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69251347, + "num_input_tokens_seen": 227496060, + "step": 10552, + "time_per_iteration": 2.5714244842529297 + }, + { + "auxiliary_loss_clip": 0.01072104, + "auxiliary_loss_mlp": 0.01029449, + "balance_loss_clip": 1.03612089, + "balance_loss_mlp": 1.01658618, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 1.7273900690083654, + "language_loss": 0.81999224, + "learning_rate": 1.24512502014147e-06, + "loss": 0.84100783, + "num_input_tokens_seen": 227513440, + "step": 10553, + "time_per_iteration": 2.5873239040374756 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.03832245, + "balance_loss_mlp": 1.01934874, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 2.0535091885207724, + "language_loss": 0.55393964, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.57524699, + "num_input_tokens_seen": 227535395, + "step": 10554, + "time_per_iteration": 2.66570782661438 + }, + { + "auxiliary_loss_clip": 0.01090598, + "auxiliary_loss_mlp": 0.01028703, + "balance_loss_clip": 1.03940797, + "balance_loss_mlp": 1.01651394, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 2.0507047877694307, + "language_loss": 0.70662886, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.72782189, + "num_input_tokens_seen": 227554545, + "step": 10555, + "time_per_iteration": 2.5427963733673096 + }, + { + "auxiliary_loss_clip": 0.0102667, + "auxiliary_loss_mlp": 0.01001924, + "balance_loss_clip": 1.02273798, + "balance_loss_mlp": 1.00069571, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.768927993630889, + "language_loss": 0.55314529, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57343125, + "num_input_tokens_seen": 227608575, + "step": 10556, + "time_per_iteration": 3.045254945755005 + }, + { + "auxiliary_loss_clip": 0.01089012, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.03687024, + "balance_loss_mlp": 1.01731133, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 1.8054348455867575, + "language_loss": 0.68206024, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70326513, + "num_input_tokens_seen": 227628175, + "step": 10557, + "time_per_iteration": 2.5723249912261963 + }, + { + "auxiliary_loss_clip": 0.01077479, + "auxiliary_loss_mlp": 0.01034883, + "balance_loss_clip": 1.03510571, + "balance_loss_mlp": 1.02255046, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.7233972761198761, + "language_loss": 0.70164657, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.72277015, + "num_input_tokens_seen": 227645330, + "step": 10558, + "time_per_iteration": 2.5047454833984375 + }, + { + "auxiliary_loss_clip": 0.0108616, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.03999698, + "balance_loss_mlp": 1.01613224, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.5896154627905474, + "language_loss": 0.78300655, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80415344, + "num_input_tokens_seen": 227665250, + "step": 10559, + "time_per_iteration": 2.549590826034546 + }, + { + "auxiliary_loss_clip": 0.01084934, + "auxiliary_loss_mlp": 0.01037077, + "balance_loss_clip": 1.03663993, + "balance_loss_mlp": 1.02411306, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 1.9576938440010785, + "language_loss": 0.67729032, + "learning_rate": 1.242601136020078e-06, + "loss": 0.69851041, + "num_input_tokens_seen": 227685070, + "step": 10560, + "time_per_iteration": 2.549433708190918 + }, + { + "auxiliary_loss_clip": 0.0108257, + "auxiliary_loss_mlp": 0.01039993, + "balance_loss_clip": 1.03630352, + "balance_loss_mlp": 1.02605128, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 1.8616636705562717, + "language_loss": 0.76985544, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.79108107, + "num_input_tokens_seen": 227704430, + "step": 10561, + "time_per_iteration": 2.569769859313965 + }, + { + "auxiliary_loss_clip": 0.01087622, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.03796577, + "balance_loss_mlp": 1.01819801, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 1.904674523468591, + "language_loss": 0.72441125, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74559015, + "num_input_tokens_seen": 227724920, + "step": 10562, + "time_per_iteration": 2.576662302017212 + }, + { + "auxiliary_loss_clip": 0.01097431, + "auxiliary_loss_mlp": 0.01032684, + "balance_loss_clip": 1.03995991, + "balance_loss_mlp": 1.01952946, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 2.8593023211441033, + "language_loss": 0.8088994, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.83020055, + "num_input_tokens_seen": 227743400, + "step": 10563, + "time_per_iteration": 2.4870877265930176 + }, + { + "auxiliary_loss_clip": 0.01082045, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.04034185, + "balance_loss_mlp": 1.02276254, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 2.243216889151289, + "language_loss": 0.80948257, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.8306514, + "num_input_tokens_seen": 227759990, + "step": 10564, + "time_per_iteration": 2.5609982013702393 + }, + { + "auxiliary_loss_clip": 0.01081514, + "auxiliary_loss_mlp": 0.01038817, + "balance_loss_clip": 1.03971624, + "balance_loss_mlp": 1.02494144, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.55146090560981, + "language_loss": 0.7262218, + "learning_rate": 1.240799222993407e-06, + "loss": 0.7474252, + "num_input_tokens_seen": 227780835, + "step": 10565, + "time_per_iteration": 2.639836072921753 + }, + { + "auxiliary_loss_clip": 0.01094597, + "auxiliary_loss_mlp": 0.01031296, + "balance_loss_clip": 1.03827131, + "balance_loss_mlp": 1.01763475, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 1.9175968216730581, + "language_loss": 0.68946874, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71072769, + "num_input_tokens_seen": 227798580, + "step": 10566, + "time_per_iteration": 2.4870307445526123 + }, + { + "auxiliary_loss_clip": 0.01095158, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.03828096, + "balance_loss_mlp": 1.02089095, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.5963396740231894, + "language_loss": 0.69727767, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.71855307, + "num_input_tokens_seen": 227819210, + "step": 10567, + "time_per_iteration": 2.535710573196411 + }, + { + "auxiliary_loss_clip": 0.01095542, + "auxiliary_loss_mlp": 0.01028201, + "balance_loss_clip": 1.04300332, + "balance_loss_mlp": 1.01695323, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 1.8626197517743353, + "language_loss": 0.84522688, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86646426, + "num_input_tokens_seen": 227838340, + "step": 10568, + "time_per_iteration": 2.5074243545532227 + }, + { + "auxiliary_loss_clip": 0.01058869, + "auxiliary_loss_mlp": 0.01039135, + "balance_loss_clip": 1.04156578, + "balance_loss_mlp": 1.0260638, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 1.7697464809189671, + "language_loss": 0.84041762, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.86139762, + "num_input_tokens_seen": 227859170, + "step": 10569, + "time_per_iteration": 2.6775002479553223 + }, + { + "auxiliary_loss_clip": 0.01097443, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.03916085, + "balance_loss_mlp": 1.01692367, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.6430141818593216, + "language_loss": 0.69300056, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71427059, + "num_input_tokens_seen": 227878545, + "step": 10570, + "time_per_iteration": 2.5021378993988037 + }, + { + "auxiliary_loss_clip": 0.01099807, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.03807199, + "balance_loss_mlp": 1.01876664, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 1.6486580787950484, + "language_loss": 0.65871722, + "learning_rate": 1.2386378775476e-06, + "loss": 0.68002951, + "num_input_tokens_seen": 227898875, + "step": 10571, + "time_per_iteration": 2.5564448833465576 + }, + { + "auxiliary_loss_clip": 0.01104579, + "auxiliary_loss_mlp": 0.01027409, + "balance_loss_clip": 1.04042172, + "balance_loss_mlp": 1.01494551, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.7713311403265748, + "language_loss": 0.71233553, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73365545, + "num_input_tokens_seen": 227917130, + "step": 10572, + "time_per_iteration": 2.4843251705169678 + }, + { + "auxiliary_loss_clip": 0.01075187, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.0363152, + "balance_loss_mlp": 1.01859879, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 1.4083202598842577, + "language_loss": 0.81385899, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83491391, + "num_input_tokens_seen": 227939550, + "step": 10573, + "time_per_iteration": 2.6022627353668213 + }, + { + "auxiliary_loss_clip": 0.01091274, + "auxiliary_loss_mlp": 0.01030503, + "balance_loss_clip": 1.0388453, + "balance_loss_mlp": 1.01841497, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 1.6141386554762005, + "language_loss": 0.69002855, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71124625, + "num_input_tokens_seen": 227962200, + "step": 10574, + "time_per_iteration": 2.738844633102417 + }, + { + "auxiliary_loss_clip": 0.01109528, + "auxiliary_loss_mlp": 0.01027475, + "balance_loss_clip": 1.03956556, + "balance_loss_mlp": 1.01494622, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.3325537115895014, + "language_loss": 0.8660686, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.88743865, + "num_input_tokens_seen": 227979270, + "step": 10575, + "time_per_iteration": 3.8155055046081543 + }, + { + "auxiliary_loss_clip": 0.01109239, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.03939342, + "balance_loss_mlp": 1.01878428, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 2.0030041306584563, + "language_loss": 0.72213882, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74353814, + "num_input_tokens_seen": 228000550, + "step": 10576, + "time_per_iteration": 2.4968883991241455 + }, + { + "auxiliary_loss_clip": 0.01089871, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.03925872, + "balance_loss_mlp": 1.02000165, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 1.5600574988079687, + "language_loss": 0.69240618, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71363485, + "num_input_tokens_seen": 228022005, + "step": 10577, + "time_per_iteration": 2.5839946269989014 + }, + { + "auxiliary_loss_clip": 0.01067328, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.03625107, + "balance_loss_mlp": 1.01995277, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.7317600226230725, + "language_loss": 0.72062105, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.74161756, + "num_input_tokens_seen": 228043770, + "step": 10578, + "time_per_iteration": 2.719874143600464 + }, + { + "auxiliary_loss_clip": 0.01021625, + "auxiliary_loss_mlp": 0.00770629, + "balance_loss_clip": 1.02529371, + "balance_loss_mlp": 1.01208818, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.7026955199567267, + "language_loss": 0.54541039, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56333292, + "num_input_tokens_seen": 228104985, + "step": 10579, + "time_per_iteration": 3.253638982772827 + }, + { + "auxiliary_loss_clip": 0.01086378, + "auxiliary_loss_mlp": 0.01029923, + "balance_loss_clip": 1.0377028, + "balance_loss_mlp": 1.01717365, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 2.396740357738586, + "language_loss": 0.77945763, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.80062068, + "num_input_tokens_seen": 228125620, + "step": 10580, + "time_per_iteration": 2.5658862590789795 + }, + { + "auxiliary_loss_clip": 0.0108074, + "auxiliary_loss_mlp": 0.00783536, + "balance_loss_clip": 1.03921103, + "balance_loss_mlp": 1.00873637, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 2.4370329345016284, + "language_loss": 0.66725898, + "learning_rate": 1.235037946268301e-06, + "loss": 0.68590176, + "num_input_tokens_seen": 228143495, + "step": 10581, + "time_per_iteration": 2.5811774730682373 + }, + { + "auxiliary_loss_clip": 0.01095828, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.04072952, + "balance_loss_mlp": 1.01765358, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.4179251893570455, + "language_loss": 0.68543196, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70668578, + "num_input_tokens_seen": 228166500, + "step": 10582, + "time_per_iteration": 2.5556700229644775 + }, + { + "auxiliary_loss_clip": 0.0108419, + "auxiliary_loss_mlp": 0.01036269, + "balance_loss_clip": 1.04179013, + "balance_loss_mlp": 1.0239011, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 2.3198661430020158, + "language_loss": 0.84813017, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.86933476, + "num_input_tokens_seen": 228185325, + "step": 10583, + "time_per_iteration": 2.573737621307373 + }, + { + "auxiliary_loss_clip": 0.01088817, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.04400635, + "balance_loss_mlp": 1.01596332, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.5038935965841294, + "language_loss": 0.75519216, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77635801, + "num_input_tokens_seen": 228204050, + "step": 10584, + "time_per_iteration": 2.5386343002319336 + }, + { + "auxiliary_loss_clip": 0.01094055, + "auxiliary_loss_mlp": 0.01036339, + "balance_loss_clip": 1.03997135, + "balance_loss_mlp": 1.02245712, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 1.8629655573226724, + "language_loss": 0.72640276, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.74770677, + "num_input_tokens_seen": 228222430, + "step": 10585, + "time_per_iteration": 5.3067967891693115 + }, + { + "auxiliary_loss_clip": 0.01076809, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.04106331, + "balance_loss_mlp": 1.017717, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 1.7410569703313215, + "language_loss": 0.82380742, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.84486729, + "num_input_tokens_seen": 228241925, + "step": 10586, + "time_per_iteration": 2.5842554569244385 + }, + { + "auxiliary_loss_clip": 0.01097977, + "auxiliary_loss_mlp": 0.01026002, + "balance_loss_clip": 1.03952742, + "balance_loss_mlp": 1.01419473, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 1.6122861917773275, + "language_loss": 0.7263, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74753976, + "num_input_tokens_seen": 228262535, + "step": 10587, + "time_per_iteration": 2.5400798320770264 + }, + { + "auxiliary_loss_clip": 0.01086407, + "auxiliary_loss_mlp": 0.01025559, + "balance_loss_clip": 1.03835928, + "balance_loss_mlp": 1.01397169, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 1.9977663847203795, + "language_loss": 0.77079427, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.79191393, + "num_input_tokens_seen": 228281340, + "step": 10588, + "time_per_iteration": 3.921621561050415 + }, + { + "auxiliary_loss_clip": 0.01061936, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.03755939, + "balance_loss_mlp": 1.0152545, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.432642160747503, + "language_loss": 0.80006671, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82096499, + "num_input_tokens_seen": 228300865, + "step": 10589, + "time_per_iteration": 2.6001532077789307 + }, + { + "auxiliary_loss_clip": 0.01088129, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.03787804, + "balance_loss_mlp": 1.01867509, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 2.014756581317245, + "language_loss": 0.67392558, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69511688, + "num_input_tokens_seen": 228320815, + "step": 10590, + "time_per_iteration": 2.5307834148406982 + }, + { + "auxiliary_loss_clip": 0.0110423, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.03927469, + "balance_loss_mlp": 1.02252293, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 1.6268985843307984, + "language_loss": 0.79291576, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.8143068, + "num_input_tokens_seen": 228339065, + "step": 10591, + "time_per_iteration": 2.4899609088897705 + }, + { + "auxiliary_loss_clip": 0.01087565, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.04006362, + "balance_loss_mlp": 1.01813149, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.476222640370243, + "language_loss": 0.8902781, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91144836, + "num_input_tokens_seen": 228359210, + "step": 10592, + "time_per_iteration": 2.5395405292510986 + }, + { + "auxiliary_loss_clip": 0.01095085, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.03732872, + "balance_loss_mlp": 1.01722145, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.3845321305252674, + "language_loss": 0.68453342, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70576632, + "num_input_tokens_seen": 228379630, + "step": 10593, + "time_per_iteration": 2.5410890579223633 + }, + { + "auxiliary_loss_clip": 0.01059228, + "auxiliary_loss_mlp": 0.01041815, + "balance_loss_clip": 1.03358293, + "balance_loss_mlp": 1.02866042, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 1.6788798902175317, + "language_loss": 0.63272697, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.65373731, + "num_input_tokens_seen": 228401410, + "step": 10594, + "time_per_iteration": 2.687072992324829 + }, + { + "auxiliary_loss_clip": 0.01031247, + "auxiliary_loss_mlp": 0.01003319, + "balance_loss_clip": 1.02186799, + "balance_loss_mlp": 1.00213242, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7706815545726196, + "language_loss": 0.54639351, + "learning_rate": 1.230002918781022e-06, + "loss": 0.5667392, + "num_input_tokens_seen": 228470335, + "step": 10595, + "time_per_iteration": 3.2205002307891846 + }, + { + "auxiliary_loss_clip": 0.01113028, + "auxiliary_loss_mlp": 0.01035685, + "balance_loss_clip": 1.03995776, + "balance_loss_mlp": 1.02271485, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 1.716430089546519, + "language_loss": 0.66389728, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.68538439, + "num_input_tokens_seen": 228490765, + "step": 10596, + "time_per_iteration": 2.493497371673584 + }, + { + "auxiliary_loss_clip": 0.01094543, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.038589, + "balance_loss_mlp": 1.02345443, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 2.533463305799403, + "language_loss": 0.79094434, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.81225055, + "num_input_tokens_seen": 228509700, + "step": 10597, + "time_per_iteration": 2.533220052719116 + }, + { + "auxiliary_loss_clip": 0.01100456, + "auxiliary_loss_mlp": 0.01032748, + "balance_loss_clip": 1.04082847, + "balance_loss_mlp": 1.02132785, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 1.6471371615898471, + "language_loss": 0.74772608, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.76905811, + "num_input_tokens_seen": 228529050, + "step": 10598, + "time_per_iteration": 2.504966974258423 + }, + { + "auxiliary_loss_clip": 0.01077132, + "auxiliary_loss_mlp": 0.007904, + "balance_loss_clip": 1.03781414, + "balance_loss_mlp": 1.02102137, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 1.8015614543218852, + "language_loss": 0.68382251, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70249784, + "num_input_tokens_seen": 228544665, + "step": 10599, + "time_per_iteration": 2.5411181449890137 + }, + { + "auxiliary_loss_clip": 0.01080667, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.03808689, + "balance_loss_mlp": 1.01914954, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 1.9271204030144624, + "language_loss": 0.80587375, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.8270008, + "num_input_tokens_seen": 228562060, + "step": 10600, + "time_per_iteration": 2.5319392681121826 + }, + { + "auxiliary_loss_clip": 0.01097512, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.03736496, + "balance_loss_mlp": 1.02058268, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.5201785343886824, + "language_loss": 0.79992706, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82122838, + "num_input_tokens_seen": 228582550, + "step": 10601, + "time_per_iteration": 2.537689447402954 + }, + { + "auxiliary_loss_clip": 0.0107755, + "auxiliary_loss_mlp": 0.01026697, + "balance_loss_clip": 1.03894663, + "balance_loss_mlp": 1.01497912, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 1.806291525451451, + "language_loss": 0.66894019, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.68998265, + "num_input_tokens_seen": 228604960, + "step": 10602, + "time_per_iteration": 2.6020562648773193 + }, + { + "auxiliary_loss_clip": 0.01035194, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.03490067, + "balance_loss_mlp": 1.01494145, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 1.5914099239666384, + "language_loss": 0.79837358, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81900203, + "num_input_tokens_seen": 228622195, + "step": 10603, + "time_per_iteration": 2.6560206413269043 + }, + { + "auxiliary_loss_clip": 0.0106639, + "auxiliary_loss_mlp": 0.00791115, + "balance_loss_clip": 1.03710616, + "balance_loss_mlp": 1.02125776, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 1.874311847492079, + "language_loss": 0.77045524, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.78903031, + "num_input_tokens_seen": 228639735, + "step": 10604, + "time_per_iteration": 2.5924839973449707 + }, + { + "auxiliary_loss_clip": 0.01091034, + "auxiliary_loss_mlp": 0.0102782, + "balance_loss_clip": 1.03786027, + "balance_loss_mlp": 1.01539803, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.7642229448911142, + "language_loss": 0.76682401, + "learning_rate": 1.226409972197281e-06, + "loss": 0.78801262, + "num_input_tokens_seen": 228658195, + "step": 10605, + "time_per_iteration": 2.539275884628296 + }, + { + "auxiliary_loss_clip": 0.01049274, + "auxiliary_loss_mlp": 0.01034901, + "balance_loss_clip": 1.03460908, + "balance_loss_mlp": 1.0202266, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 1.6943622817758432, + "language_loss": 0.65842664, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.67926836, + "num_input_tokens_seen": 228677415, + "step": 10606, + "time_per_iteration": 2.6188032627105713 + }, + { + "auxiliary_loss_clip": 0.01082432, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.03741598, + "balance_loss_mlp": 1.01977038, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 1.5060057860549643, + "language_loss": 0.74787706, + "learning_rate": 1.225691734459971e-06, + "loss": 0.76901031, + "num_input_tokens_seen": 228696450, + "step": 10607, + "time_per_iteration": 2.5601186752319336 + }, + { + "auxiliary_loss_clip": 0.01087619, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.04268575, + "balance_loss_mlp": 1.01981235, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 1.625230585795342, + "language_loss": 0.65903187, + "learning_rate": 1.225332659627278e-06, + "loss": 0.68022335, + "num_input_tokens_seen": 228721600, + "step": 10608, + "time_per_iteration": 2.835698127746582 + }, + { + "auxiliary_loss_clip": 0.00993806, + "auxiliary_loss_mlp": 0.01003597, + "balance_loss_clip": 1.02922952, + "balance_loss_mlp": 1.00214314, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7259006701989857, + "language_loss": 0.51886308, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.53883708, + "num_input_tokens_seen": 228784535, + "step": 10609, + "time_per_iteration": 3.326773166656494 + }, + { + "auxiliary_loss_clip": 0.01094753, + "auxiliary_loss_mlp": 0.01023027, + "balance_loss_clip": 1.03626096, + "balance_loss_mlp": 1.01241696, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.6244175203052962, + "language_loss": 0.74367881, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.76485658, + "num_input_tokens_seen": 228804110, + "step": 10610, + "time_per_iteration": 2.8030338287353516 + }, + { + "auxiliary_loss_clip": 0.01027526, + "auxiliary_loss_mlp": 0.01001043, + "balance_loss_clip": 1.02578855, + "balance_loss_mlp": 0.9997372, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8444297028765864, + "language_loss": 0.63132167, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65160728, + "num_input_tokens_seen": 228867705, + "step": 10611, + "time_per_iteration": 3.192202568054199 + }, + { + "auxiliary_loss_clip": 0.01096088, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.04088545, + "balance_loss_mlp": 1.01874256, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 2.1266035582521443, + "language_loss": 0.72293448, + "learning_rate": 1.223896654187282e-06, + "loss": 0.74420488, + "num_input_tokens_seen": 228889215, + "step": 10612, + "time_per_iteration": 2.558593988418579 + }, + { + "auxiliary_loss_clip": 0.01028617, + "auxiliary_loss_mlp": 0.01000551, + "balance_loss_clip": 1.02391768, + "balance_loss_mlp": 0.99934673, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7098322955574932, + "language_loss": 0.57913148, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.59942317, + "num_input_tokens_seen": 228948465, + "step": 10613, + "time_per_iteration": 4.628596067428589 + }, + { + "auxiliary_loss_clip": 0.01065626, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.03560281, + "balance_loss_mlp": 1.01849449, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 1.697003726146515, + "language_loss": 0.75300199, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77397299, + "num_input_tokens_seen": 228967955, + "step": 10614, + "time_per_iteration": 2.6400740146636963 + }, + { + "auxiliary_loss_clip": 0.0108878, + "auxiliary_loss_mlp": 0.00789274, + "balance_loss_clip": 1.03912497, + "balance_loss_mlp": 1.01940835, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 1.9305483265645065, + "language_loss": 0.80068582, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.81946635, + "num_input_tokens_seen": 228985495, + "step": 10615, + "time_per_iteration": 2.5855653285980225 + }, + { + "auxiliary_loss_clip": 0.01023924, + "auxiliary_loss_mlp": 0.01001207, + "balance_loss_clip": 1.02456212, + "balance_loss_mlp": 1.00005686, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6609482387874969, + "language_loss": 0.55633688, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57658815, + "num_input_tokens_seen": 229052995, + "step": 10616, + "time_per_iteration": 3.21382999420166 + }, + { + "auxiliary_loss_clip": 0.01081936, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.03583002, + "balance_loss_mlp": 1.01957989, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 7.199249743760886, + "language_loss": 0.84164095, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.86278093, + "num_input_tokens_seen": 229071030, + "step": 10617, + "time_per_iteration": 2.546757221221924 + }, + { + "auxiliary_loss_clip": 0.01097971, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.03711343, + "balance_loss_mlp": 1.02587509, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 1.7834522254103407, + "language_loss": 0.87007916, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89144516, + "num_input_tokens_seen": 229088275, + "step": 10618, + "time_per_iteration": 2.479863166809082 + }, + { + "auxiliary_loss_clip": 0.01057729, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.03724813, + "balance_loss_mlp": 1.0240643, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 1.7048161234910315, + "language_loss": 0.73478442, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75570858, + "num_input_tokens_seen": 229105190, + "step": 10619, + "time_per_iteration": 2.6371538639068604 + }, + { + "auxiliary_loss_clip": 0.01087406, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.03632832, + "balance_loss_mlp": 1.02292585, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 1.8394048520634028, + "language_loss": 0.76501107, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78625315, + "num_input_tokens_seen": 229122290, + "step": 10620, + "time_per_iteration": 2.513417959213257 + }, + { + "auxiliary_loss_clip": 0.01085036, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.03768075, + "balance_loss_mlp": 1.0214994, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 3.4623536299118802, + "language_loss": 0.70625973, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.72744375, + "num_input_tokens_seen": 229141620, + "step": 10621, + "time_per_iteration": 2.5744690895080566 + }, + { + "auxiliary_loss_clip": 0.01078944, + "auxiliary_loss_mlp": 0.01027197, + "balance_loss_clip": 1.03351283, + "balance_loss_mlp": 1.01633143, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 2.4354088198756507, + "language_loss": 0.77532756, + "learning_rate": 1.220308702586529e-06, + "loss": 0.79638892, + "num_input_tokens_seen": 229161570, + "step": 10622, + "time_per_iteration": 2.5386087894439697 + }, + { + "auxiliary_loss_clip": 0.01069983, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.03689754, + "balance_loss_mlp": 1.01947868, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 1.982349775054232, + "language_loss": 0.74486357, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.76587415, + "num_input_tokens_seen": 229178465, + "step": 10623, + "time_per_iteration": 5.300457715988159 + }, + { + "auxiliary_loss_clip": 0.01082646, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.03433108, + "balance_loss_mlp": 1.01910865, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 1.4123451840168164, + "language_loss": 0.76775539, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78887892, + "num_input_tokens_seen": 229198975, + "step": 10624, + "time_per_iteration": 2.5468196868896484 + }, + { + "auxiliary_loss_clip": 0.01046308, + "auxiliary_loss_mlp": 0.01038088, + "balance_loss_clip": 1.03643274, + "balance_loss_mlp": 1.02594066, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.8563133958409395, + "language_loss": 0.80021334, + "learning_rate": 1.21923289302382e-06, + "loss": 0.82105732, + "num_input_tokens_seen": 229218825, + "step": 10625, + "time_per_iteration": 2.6852328777313232 + }, + { + "auxiliary_loss_clip": 0.01086704, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.0389812, + "balance_loss_mlp": 1.0244348, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 1.9805391918811175, + "language_loss": 0.73081028, + "learning_rate": 1.218874349031654e-06, + "loss": 0.7520417, + "num_input_tokens_seen": 229236060, + "step": 10626, + "time_per_iteration": 3.8844687938690186 + }, + { + "auxiliary_loss_clip": 0.01087223, + "auxiliary_loss_mlp": 0.01031883, + "balance_loss_clip": 1.03671002, + "balance_loss_mlp": 1.01958609, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 1.7459912572064038, + "language_loss": 0.73049319, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.75168431, + "num_input_tokens_seen": 229255160, + "step": 10627, + "time_per_iteration": 2.5216500759124756 + }, + { + "auxiliary_loss_clip": 0.01084389, + "auxiliary_loss_mlp": 0.01032766, + "balance_loss_clip": 1.0376972, + "balance_loss_mlp": 1.01897335, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 1.7498744173717078, + "language_loss": 0.66715705, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.68832862, + "num_input_tokens_seen": 229278705, + "step": 10628, + "time_per_iteration": 2.6234233379364014 + }, + { + "auxiliary_loss_clip": 0.01103771, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.03736091, + "balance_loss_mlp": 1.01625872, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 1.8088700466854297, + "language_loss": 0.68440163, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.70571268, + "num_input_tokens_seen": 229299990, + "step": 10629, + "time_per_iteration": 2.5285661220550537 + }, + { + "auxiliary_loss_clip": 0.01073556, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.03982925, + "balance_loss_mlp": 1.02466726, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 1.525273910218152, + "language_loss": 0.75580662, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77693009, + "num_input_tokens_seen": 229319230, + "step": 10630, + "time_per_iteration": 2.585092067718506 + }, + { + "auxiliary_loss_clip": 0.01084004, + "auxiliary_loss_mlp": 0.01034175, + "balance_loss_clip": 1.03681135, + "balance_loss_mlp": 1.02271867, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.68123864272701, + "language_loss": 0.70551634, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.72669816, + "num_input_tokens_seen": 229338600, + "step": 10631, + "time_per_iteration": 2.553377628326416 + }, + { + "auxiliary_loss_clip": 0.01023401, + "auxiliary_loss_mlp": 0.01003214, + "balance_loss_clip": 1.02293193, + "balance_loss_mlp": 1.0016408, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7669052218146686, + "language_loss": 0.63028049, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.65054661, + "num_input_tokens_seen": 229402420, + "step": 10632, + "time_per_iteration": 3.1966795921325684 + }, + { + "auxiliary_loss_clip": 0.01084947, + "auxiliary_loss_mlp": 0.01034015, + "balance_loss_clip": 1.03842092, + "balance_loss_mlp": 1.02209973, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 1.9294946291815969, + "language_loss": 0.66229236, + "learning_rate": 1.216365371217893e-06, + "loss": 0.68348199, + "num_input_tokens_seen": 229419185, + "step": 10633, + "time_per_iteration": 2.552764415740967 + }, + { + "auxiliary_loss_clip": 0.01041353, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.03705561, + "balance_loss_mlp": 1.0213387, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 1.9667574030115986, + "language_loss": 0.82007074, + "learning_rate": 1.216007064569225e-06, + "loss": 0.84081304, + "num_input_tokens_seen": 229436735, + "step": 10634, + "time_per_iteration": 2.6581647396087646 + }, + { + "auxiliary_loss_clip": 0.01084513, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.03957093, + "balance_loss_mlp": 1.02049422, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 1.5998754826832327, + "language_loss": 0.75050831, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77169108, + "num_input_tokens_seen": 229455595, + "step": 10635, + "time_per_iteration": 2.5308456420898438 + }, + { + "auxiliary_loss_clip": 0.0109347, + "auxiliary_loss_mlp": 0.01031985, + "balance_loss_clip": 1.03821421, + "balance_loss_mlp": 1.01972461, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 1.6398251517255955, + "language_loss": 0.71796834, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73922288, + "num_input_tokens_seen": 229476230, + "step": 10636, + "time_per_iteration": 2.5443918704986572 + }, + { + "auxiliary_loss_clip": 0.0108799, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.03783464, + "balance_loss_mlp": 1.02408767, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 2.3152411707440383, + "language_loss": 0.73855585, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.75980318, + "num_input_tokens_seen": 229494300, + "step": 10637, + "time_per_iteration": 2.5376334190368652 + }, + { + "auxiliary_loss_clip": 0.01093353, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.03733158, + "balance_loss_mlp": 1.01874948, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 1.7574470293202038, + "language_loss": 0.7759431, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.79718673, + "num_input_tokens_seen": 229512985, + "step": 10638, + "time_per_iteration": 2.4902961254119873 + }, + { + "auxiliary_loss_clip": 0.01085546, + "auxiliary_loss_mlp": 0.01033191, + "balance_loss_clip": 1.03722572, + "balance_loss_mlp": 1.02097845, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 1.6472630730060214, + "language_loss": 0.8168813, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.8380686, + "num_input_tokens_seen": 229534270, + "step": 10639, + "time_per_iteration": 2.623180627822876 + }, + { + "auxiliary_loss_clip": 0.01024798, + "auxiliary_loss_mlp": 0.01008083, + "balance_loss_clip": 1.02058208, + "balance_loss_mlp": 1.0069741, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8097543018368745, + "language_loss": 0.59057498, + "learning_rate": 1.21385784946359e-06, + "loss": 0.6109038, + "num_input_tokens_seen": 229596455, + "step": 10640, + "time_per_iteration": 3.094064950942993 + }, + { + "auxiliary_loss_clip": 0.01077691, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.03505087, + "balance_loss_mlp": 1.01860905, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 2.0004483854090194, + "language_loss": 0.78249389, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80357039, + "num_input_tokens_seen": 229612860, + "step": 10641, + "time_per_iteration": 2.55521559715271 + }, + { + "auxiliary_loss_clip": 0.01064215, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.03840911, + "balance_loss_mlp": 1.02188921, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 1.7605405144010378, + "language_loss": 0.63275725, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65374017, + "num_input_tokens_seen": 229633960, + "step": 10642, + "time_per_iteration": 2.6367416381835938 + }, + { + "auxiliary_loss_clip": 0.01023784, + "auxiliary_loss_mlp": 0.01009325, + "balance_loss_clip": 1.01989901, + "balance_loss_mlp": 1.00781715, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 0.9186626590139303, + "language_loss": 0.55989778, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.58022892, + "num_input_tokens_seen": 229686730, + "step": 10643, + "time_per_iteration": 3.0957283973693848 + }, + { + "auxiliary_loss_clip": 0.01076696, + "auxiliary_loss_mlp": 0.01025561, + "balance_loss_clip": 1.03767824, + "balance_loss_mlp": 1.01366985, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 1.869268047431947, + "language_loss": 0.76367986, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.78470242, + "num_input_tokens_seen": 229704800, + "step": 10644, + "time_per_iteration": 2.589582920074463 + }, + { + "auxiliary_loss_clip": 0.01074414, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.03845167, + "balance_loss_mlp": 1.01866651, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.5506676687480787, + "language_loss": 0.82183152, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84288692, + "num_input_tokens_seen": 229725265, + "step": 10645, + "time_per_iteration": 2.626453161239624 + }, + { + "auxiliary_loss_clip": 0.01101633, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.03813922, + "balance_loss_mlp": 1.02339339, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 1.8248219168910742, + "language_loss": 0.73576993, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.7571578, + "num_input_tokens_seen": 229744840, + "step": 10646, + "time_per_iteration": 2.5459043979644775 + }, + { + "auxiliary_loss_clip": 0.01070311, + "auxiliary_loss_mlp": 0.01033661, + "balance_loss_clip": 1.03601599, + "balance_loss_mlp": 1.02047634, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 3.0329530619297964, + "language_loss": 0.79693675, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.81797642, + "num_input_tokens_seen": 229759095, + "step": 10647, + "time_per_iteration": 2.561713933944702 + }, + { + "auxiliary_loss_clip": 0.01063798, + "auxiliary_loss_mlp": 0.01029492, + "balance_loss_clip": 1.03699231, + "balance_loss_mlp": 1.01746941, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 1.4714823235803258, + "language_loss": 0.75659335, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.77752626, + "num_input_tokens_seen": 229777750, + "step": 10648, + "time_per_iteration": 2.618218183517456 + }, + { + "auxiliary_loss_clip": 0.01079695, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.03484452, + "balance_loss_mlp": 1.01912701, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 1.8215906032668436, + "language_loss": 0.78631699, + "learning_rate": 1.210636039936138e-06, + "loss": 0.80742645, + "num_input_tokens_seen": 229796785, + "step": 10649, + "time_per_iteration": 2.5896358489990234 + }, + { + "auxiliary_loss_clip": 0.0105315, + "auxiliary_loss_mlp": 0.01034399, + "balance_loss_clip": 1.03871858, + "balance_loss_mlp": 1.02163172, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 2.0627578325323275, + "language_loss": 0.75343543, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.77431095, + "num_input_tokens_seen": 229815425, + "step": 10650, + "time_per_iteration": 2.6042821407318115 + }, + { + "auxiliary_loss_clip": 0.01106651, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.03691292, + "balance_loss_mlp": 1.02023828, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 2.124133180054986, + "language_loss": 0.70639223, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.72779042, + "num_input_tokens_seen": 229834545, + "step": 10651, + "time_per_iteration": 2.506472587585449 + }, + { + "auxiliary_loss_clip": 0.01076281, + "auxiliary_loss_mlp": 0.01035259, + "balance_loss_clip": 1.03668284, + "balance_loss_mlp": 1.02267003, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 2.3233289959957935, + "language_loss": 0.63792026, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.65903568, + "num_input_tokens_seen": 229849175, + "step": 10652, + "time_per_iteration": 3.994550943374634 + }, + { + "auxiliary_loss_clip": 0.01086032, + "auxiliary_loss_mlp": 0.01026127, + "balance_loss_clip": 1.03702414, + "balance_loss_mlp": 1.01421189, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 1.945934460867734, + "language_loss": 0.79144204, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.8125636, + "num_input_tokens_seen": 229865400, + "step": 10653, + "time_per_iteration": 2.5494399070739746 + }, + { + "auxiliary_loss_clip": 0.0108468, + "auxiliary_loss_mlp": 0.01053337, + "balance_loss_clip": 1.03476858, + "balance_loss_mlp": 1.0374285, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 2.506373753271721, + "language_loss": 0.70548904, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72686911, + "num_input_tokens_seen": 229882945, + "step": 10654, + "time_per_iteration": 2.540357828140259 + }, + { + "auxiliary_loss_clip": 0.0110224, + "auxiliary_loss_mlp": 0.01038472, + "balance_loss_clip": 1.03888631, + "balance_loss_mlp": 1.02534103, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 1.9879002018190768, + "language_loss": 0.72897911, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.75038624, + "num_input_tokens_seen": 229901590, + "step": 10655, + "time_per_iteration": 2.5071895122528076 + }, + { + "auxiliary_loss_clip": 0.01075262, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.03950953, + "balance_loss_mlp": 1.01967013, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 1.6574084248589094, + "language_loss": 0.82783759, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.8489092, + "num_input_tokens_seen": 229922535, + "step": 10656, + "time_per_iteration": 2.632880687713623 + }, + { + "auxiliary_loss_clip": 0.01059402, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.03604412, + "balance_loss_mlp": 1.02277684, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.5307208687193583, + "language_loss": 0.72402805, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74496621, + "num_input_tokens_seen": 229939575, + "step": 10657, + "time_per_iteration": 2.5616750717163086 + }, + { + "auxiliary_loss_clip": 0.01076301, + "auxiliary_loss_mlp": 0.01034233, + "balance_loss_clip": 1.03701544, + "balance_loss_mlp": 1.02262223, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 1.6143709492864202, + "language_loss": 0.77109498, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79220033, + "num_input_tokens_seen": 229958840, + "step": 10658, + "time_per_iteration": 2.5812625885009766 + }, + { + "auxiliary_loss_clip": 0.01109909, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.03756893, + "balance_loss_mlp": 1.02066398, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 1.6040513429097012, + "language_loss": 0.76382488, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78525835, + "num_input_tokens_seen": 229979680, + "step": 10659, + "time_per_iteration": 2.5037031173706055 + }, + { + "auxiliary_loss_clip": 0.01097619, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.03777409, + "balance_loss_mlp": 1.01901102, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 1.7913422737024063, + "language_loss": 0.78062034, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80191088, + "num_input_tokens_seen": 229996830, + "step": 10660, + "time_per_iteration": 2.483149766921997 + }, + { + "auxiliary_loss_clip": 0.01087589, + "auxiliary_loss_mlp": 0.010357, + "balance_loss_clip": 1.0388366, + "balance_loss_mlp": 1.02194905, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 1.757879581550092, + "language_loss": 0.68496054, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70619345, + "num_input_tokens_seen": 230015115, + "step": 10661, + "time_per_iteration": 2.5265753269195557 + }, + { + "auxiliary_loss_clip": 0.01106617, + "auxiliary_loss_mlp": 0.01033158, + "balance_loss_clip": 1.03826094, + "balance_loss_mlp": 1.02225637, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.5420823415939664, + "language_loss": 0.75890774, + "learning_rate": 1.205986598033362e-06, + "loss": 0.7803055, + "num_input_tokens_seen": 230035515, + "step": 10662, + "time_per_iteration": 3.891908645629883 + }, + { + "auxiliary_loss_clip": 0.01092904, + "auxiliary_loss_mlp": 0.01035063, + "balance_loss_clip": 1.03514111, + "balance_loss_mlp": 1.02218819, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 1.9166916891724162, + "language_loss": 0.70033383, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.72161353, + "num_input_tokens_seen": 230054355, + "step": 10663, + "time_per_iteration": 2.5537500381469727 + }, + { + "auxiliary_loss_clip": 0.01075068, + "auxiliary_loss_mlp": 0.0103879, + "balance_loss_clip": 1.03740311, + "balance_loss_mlp": 1.0250566, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 1.9311504788833653, + "language_loss": 0.68331772, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70445633, + "num_input_tokens_seen": 230074605, + "step": 10664, + "time_per_iteration": 2.615206241607666 + }, + { + "auxiliary_loss_clip": 0.01083814, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.03548193, + "balance_loss_mlp": 1.01928163, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.8779500932300415, + "language_loss": 0.66696513, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68811095, + "num_input_tokens_seen": 230093820, + "step": 10665, + "time_per_iteration": 3.9181010723114014 + }, + { + "auxiliary_loss_clip": 0.01096171, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.036533, + "balance_loss_mlp": 1.01583552, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.605038628058924, + "language_loss": 0.645751, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66698748, + "num_input_tokens_seen": 230114285, + "step": 10666, + "time_per_iteration": 2.514486312866211 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01032719, + "balance_loss_clip": 1.03799176, + "balance_loss_mlp": 1.02086365, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 1.5483041282076118, + "language_loss": 0.71107274, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.73238927, + "num_input_tokens_seen": 230132760, + "step": 10667, + "time_per_iteration": 2.500885009765625 + }, + { + "auxiliary_loss_clip": 0.01063031, + "auxiliary_loss_mlp": 0.00789424, + "balance_loss_clip": 1.03703284, + "balance_loss_mlp": 1.0124836, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 2.1429649200452947, + "language_loss": 0.7761206, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79464513, + "num_input_tokens_seen": 230149690, + "step": 10668, + "time_per_iteration": 2.56838321685791 + }, + { + "auxiliary_loss_clip": 0.01100436, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.03936613, + "balance_loss_mlp": 1.02212846, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.515874035966659, + "language_loss": 0.67534292, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69669074, + "num_input_tokens_seen": 230166950, + "step": 10669, + "time_per_iteration": 2.5067434310913086 + }, + { + "auxiliary_loss_clip": 0.01107529, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.04118359, + "balance_loss_mlp": 1.02344155, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 2.0794713233847877, + "language_loss": 0.78583986, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80727947, + "num_input_tokens_seen": 230184785, + "step": 10670, + "time_per_iteration": 2.4940426349639893 + }, + { + "auxiliary_loss_clip": 0.01075647, + "auxiliary_loss_mlp": 0.01028839, + "balance_loss_clip": 1.03982055, + "balance_loss_mlp": 1.01607776, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 2.5087119477926425, + "language_loss": 0.88529766, + "learning_rate": 1.20277073264638e-06, + "loss": 0.90634251, + "num_input_tokens_seen": 230201385, + "step": 10671, + "time_per_iteration": 2.5395915508270264 + }, + { + "auxiliary_loss_clip": 0.01097744, + "auxiliary_loss_mlp": 0.0102375, + "balance_loss_clip": 1.03886104, + "balance_loss_mlp": 1.01241875, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.482995887741132, + "language_loss": 0.68903661, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71025157, + "num_input_tokens_seen": 230220380, + "step": 10672, + "time_per_iteration": 2.4820749759674072 + }, + { + "auxiliary_loss_clip": 0.01103532, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.03869033, + "balance_loss_mlp": 1.01925528, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 2.1619931151472205, + "language_loss": 0.74549836, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76686513, + "num_input_tokens_seen": 230239845, + "step": 10673, + "time_per_iteration": 2.523625373840332 + }, + { + "auxiliary_loss_clip": 0.01073728, + "auxiliary_loss_mlp": 0.01036104, + "balance_loss_clip": 1.03991282, + "balance_loss_mlp": 1.02327693, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 1.5930787829842543, + "language_loss": 0.69138294, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71248126, + "num_input_tokens_seen": 230262420, + "step": 10674, + "time_per_iteration": 2.6199512481689453 + }, + { + "auxiliary_loss_clip": 0.01113578, + "auxiliary_loss_mlp": 0.01034777, + "balance_loss_clip": 1.03821135, + "balance_loss_mlp": 1.02205193, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 1.8267320230228428, + "language_loss": 0.66165471, + "learning_rate": 1.201342244560338e-06, + "loss": 0.68313825, + "num_input_tokens_seen": 230279950, + "step": 10675, + "time_per_iteration": 2.4864213466644287 + }, + { + "auxiliary_loss_clip": 0.01111771, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.04024839, + "balance_loss_mlp": 1.02460313, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 1.7515998993860813, + "language_loss": 0.6624611, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.68394423, + "num_input_tokens_seen": 230299705, + "step": 10676, + "time_per_iteration": 2.503316640853882 + }, + { + "auxiliary_loss_clip": 0.0111234, + "auxiliary_loss_mlp": 0.01028498, + "balance_loss_clip": 1.04013729, + "balance_loss_mlp": 1.01450264, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 2.025926092179471, + "language_loss": 0.75690615, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.77831453, + "num_input_tokens_seen": 230320030, + "step": 10677, + "time_per_iteration": 2.514413356781006 + }, + { + "auxiliary_loss_clip": 0.01026168, + "auxiliary_loss_mlp": 0.01003983, + "balance_loss_clip": 1.02320063, + "balance_loss_mlp": 1.00276744, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.7674776320867205, + "language_loss": 0.60715884, + "learning_rate": 1.200271196442818e-06, + "loss": 0.6274603, + "num_input_tokens_seen": 230381495, + "step": 10678, + "time_per_iteration": 3.224430561065674 + }, + { + "auxiliary_loss_clip": 0.0109653, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.03887713, + "balance_loss_mlp": 1.02336085, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 1.6599348145572048, + "language_loss": 0.67208672, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69340456, + "num_input_tokens_seen": 230401385, + "step": 10679, + "time_per_iteration": 2.5160202980041504 + }, + { + "auxiliary_loss_clip": 0.01098628, + "auxiliary_loss_mlp": 0.01039237, + "balance_loss_clip": 1.03889012, + "balance_loss_mlp": 1.02445543, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 1.6922279659592054, + "language_loss": 0.72859657, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.74997526, + "num_input_tokens_seen": 230421340, + "step": 10680, + "time_per_iteration": 2.5372424125671387 + }, + { + "auxiliary_loss_clip": 0.01071857, + "auxiliary_loss_mlp": 0.01026686, + "balance_loss_clip": 1.03520751, + "balance_loss_mlp": 1.01578999, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 1.7693621300510207, + "language_loss": 0.6754694, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.69645476, + "num_input_tokens_seen": 230441270, + "step": 10681, + "time_per_iteration": 2.595010995864868 + }, + { + "auxiliary_loss_clip": 0.01105926, + "auxiliary_loss_mlp": 0.01027745, + "balance_loss_clip": 1.03621423, + "balance_loss_mlp": 1.01591969, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 1.9759917465024635, + "language_loss": 0.74773932, + "learning_rate": 1.198843556910427e-06, + "loss": 0.76907599, + "num_input_tokens_seen": 230457455, + "step": 10682, + "time_per_iteration": 2.446984052658081 + }, + { + "auxiliary_loss_clip": 0.01045584, + "auxiliary_loss_mlp": 0.01027991, + "balance_loss_clip": 1.03712058, + "balance_loss_mlp": 1.016976, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.4183250818430206, + "language_loss": 0.7905221, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81125778, + "num_input_tokens_seen": 230478955, + "step": 10683, + "time_per_iteration": 2.6508278846740723 + }, + { + "auxiliary_loss_clip": 0.01110467, + "auxiliary_loss_mlp": 0.01034158, + "balance_loss_clip": 1.03834367, + "balance_loss_mlp": 1.02148557, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.820414586406883, + "language_loss": 0.67033017, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69177639, + "num_input_tokens_seen": 230496425, + "step": 10684, + "time_per_iteration": 2.4523346424102783 + }, + { + "auxiliary_loss_clip": 0.01098402, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.03757918, + "balance_loss_mlp": 1.02088892, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 2.096095937857753, + "language_loss": 0.71103621, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73235202, + "num_input_tokens_seen": 230516245, + "step": 10685, + "time_per_iteration": 2.5548653602600098 + }, + { + "auxiliary_loss_clip": 0.01071244, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.03491533, + "balance_loss_mlp": 1.02060032, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 1.529714359533987, + "language_loss": 0.75596321, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77699578, + "num_input_tokens_seen": 230534745, + "step": 10686, + "time_per_iteration": 2.574174642562866 + }, + { + "auxiliary_loss_clip": 0.01077799, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.04172373, + "balance_loss_mlp": 1.01783204, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 2.9893543470126636, + "language_loss": 0.68566996, + "learning_rate": 1.197059691144867e-06, + "loss": 0.70675528, + "num_input_tokens_seen": 230555895, + "step": 10687, + "time_per_iteration": 2.6129000186920166 + }, + { + "auxiliary_loss_clip": 0.01083928, + "auxiliary_loss_mlp": 0.0103331, + "balance_loss_clip": 1.03662455, + "balance_loss_mlp": 1.02087033, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 1.9504893445404612, + "language_loss": 0.66426373, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.68543607, + "num_input_tokens_seen": 230577460, + "step": 10688, + "time_per_iteration": 2.606288194656372 + }, + { + "auxiliary_loss_clip": 0.01107812, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.03751671, + "balance_loss_mlp": 1.01989651, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 1.9398742556351503, + "language_loss": 0.73157191, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75297266, + "num_input_tokens_seen": 230595030, + "step": 10689, + "time_per_iteration": 2.448148012161255 + }, + { + "auxiliary_loss_clip": 0.01097194, + "auxiliary_loss_mlp": 0.01031155, + "balance_loss_clip": 1.03837478, + "balance_loss_mlp": 1.01992571, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 2.0876584893358805, + "language_loss": 0.7145409, + "learning_rate": 1.195989736948226e-06, + "loss": 0.73582435, + "num_input_tokens_seen": 230615135, + "step": 10690, + "time_per_iteration": 2.510194778442383 + }, + { + "auxiliary_loss_clip": 0.01079313, + "auxiliary_loss_mlp": 0.01030675, + "balance_loss_clip": 1.03566325, + "balance_loss_mlp": 1.01806879, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 1.8016449168418145, + "language_loss": 0.77499115, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.79609108, + "num_input_tokens_seen": 230631965, + "step": 10691, + "time_per_iteration": 3.928469181060791 + }, + { + "auxiliary_loss_clip": 0.01089317, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.03904366, + "balance_loss_mlp": 1.01830685, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 2.0544852232811754, + "language_loss": 0.74434912, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76554573, + "num_input_tokens_seen": 230649565, + "step": 10692, + "time_per_iteration": 2.5260448455810547 + }, + { + "auxiliary_loss_clip": 0.01097025, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.03669238, + "balance_loss_mlp": 1.02143645, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 2.0732558200951052, + "language_loss": 0.61448371, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63578808, + "num_input_tokens_seen": 230669265, + "step": 10693, + "time_per_iteration": 2.5173304080963135 + }, + { + "auxiliary_loss_clip": 0.01073114, + "auxiliary_loss_mlp": 0.01025598, + "balance_loss_clip": 1.03783488, + "balance_loss_mlp": 1.01325965, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 1.5807021092004159, + "language_loss": 0.5953294, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.61631656, + "num_input_tokens_seen": 230690575, + "step": 10694, + "time_per_iteration": 2.63952374458313 + }, + { + "auxiliary_loss_clip": 0.01089077, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.03687632, + "balance_loss_mlp": 1.02082491, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.460974579309108, + "language_loss": 0.79723209, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.81844997, + "num_input_tokens_seen": 230709420, + "step": 10695, + "time_per_iteration": 2.5102667808532715 + }, + { + "auxiliary_loss_clip": 0.01109631, + "auxiliary_loss_mlp": 0.01039429, + "balance_loss_clip": 1.03777957, + "balance_loss_mlp": 1.026739, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 1.7222336275599037, + "language_loss": 0.73721987, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.7587105, + "num_input_tokens_seen": 230729350, + "step": 10696, + "time_per_iteration": 2.5229713916778564 + }, + { + "auxiliary_loss_clip": 0.01072787, + "auxiliary_loss_mlp": 0.01028325, + "balance_loss_clip": 1.03618991, + "balance_loss_mlp": 1.01615405, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 1.6906056994983796, + "language_loss": 0.75699425, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.77800536, + "num_input_tokens_seen": 230749220, + "step": 10697, + "time_per_iteration": 2.5625908374786377 + }, + { + "auxiliary_loss_clip": 0.01082497, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.03581583, + "balance_loss_mlp": 1.01986766, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 1.8433623439356899, + "language_loss": 0.65914893, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68028855, + "num_input_tokens_seen": 230770245, + "step": 10698, + "time_per_iteration": 2.6539034843444824 + }, + { + "auxiliary_loss_clip": 0.01043119, + "auxiliary_loss_mlp": 0.0100344, + "balance_loss_clip": 1.01932836, + "balance_loss_mlp": 1.00206935, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 1.2503049063321305, + "language_loss": 0.63498163, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65544713, + "num_input_tokens_seen": 230837030, + "step": 10699, + "time_per_iteration": 3.0581884384155273 + }, + { + "auxiliary_loss_clip": 0.01096487, + "auxiliary_loss_mlp": 0.01024817, + "balance_loss_clip": 1.03864038, + "balance_loss_mlp": 1.01382637, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 1.6248772242465586, + "language_loss": 0.69215727, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71337032, + "num_input_tokens_seen": 230856845, + "step": 10700, + "time_per_iteration": 5.349958419799805 + }, + { + "auxiliary_loss_clip": 0.01107929, + "auxiliary_loss_mlp": 0.01024682, + "balance_loss_clip": 1.03784144, + "balance_loss_mlp": 1.01314282, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 1.740074667730253, + "language_loss": 0.73258615, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.75391227, + "num_input_tokens_seen": 230878785, + "step": 10701, + "time_per_iteration": 2.5069432258605957 + }, + { + "auxiliary_loss_clip": 0.01098587, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.03620863, + "balance_loss_mlp": 1.01763403, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 1.9822277230578806, + "language_loss": 0.81884062, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84013855, + "num_input_tokens_seen": 230895445, + "step": 10702, + "time_per_iteration": 2.485189437866211 + }, + { + "auxiliary_loss_clip": 0.01077349, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.03441417, + "balance_loss_mlp": 1.02397299, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 1.892178901806213, + "language_loss": 0.74422443, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76535702, + "num_input_tokens_seen": 230911375, + "step": 10703, + "time_per_iteration": 3.90875244140625 + }, + { + "auxiliary_loss_clip": 0.00985179, + "auxiliary_loss_mlp": 0.01003421, + "balance_loss_clip": 1.01602709, + "balance_loss_mlp": 1.00199056, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.655118093932809, + "language_loss": 0.54591918, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.5658052, + "num_input_tokens_seen": 230975990, + "step": 10704, + "time_per_iteration": 3.2133750915527344 + }, + { + "auxiliary_loss_clip": 0.01067165, + "auxiliary_loss_mlp": 0.0102325, + "balance_loss_clip": 1.03984773, + "balance_loss_mlp": 1.01282549, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.5656041012221376, + "language_loss": 0.76886475, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.78976882, + "num_input_tokens_seen": 230997110, + "step": 10705, + "time_per_iteration": 2.5902888774871826 + }, + { + "auxiliary_loss_clip": 0.01072254, + "auxiliary_loss_mlp": 0.01032448, + "balance_loss_clip": 1.03522885, + "balance_loss_mlp": 1.02078331, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 1.7628866599601776, + "language_loss": 0.79084647, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81189346, + "num_input_tokens_seen": 231015590, + "step": 10706, + "time_per_iteration": 2.53989315032959 + }, + { + "auxiliary_loss_clip": 0.01067623, + "auxiliary_loss_mlp": 0.01032826, + "balance_loss_clip": 1.033602, + "balance_loss_mlp": 1.02001119, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 2.09196144795845, + "language_loss": 0.79975772, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.82076222, + "num_input_tokens_seen": 231033800, + "step": 10707, + "time_per_iteration": 2.5570809841156006 + }, + { + "auxiliary_loss_clip": 0.01095537, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.03617036, + "balance_loss_mlp": 1.01977921, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 2.430675151707551, + "language_loss": 0.85490155, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.87617087, + "num_input_tokens_seen": 231053160, + "step": 10708, + "time_per_iteration": 2.507781505584717 + }, + { + "auxiliary_loss_clip": 0.01063857, + "auxiliary_loss_mlp": 0.01039504, + "balance_loss_clip": 1.03857517, + "balance_loss_mlp": 1.02596784, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 2.310358254164252, + "language_loss": 0.65404481, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67507851, + "num_input_tokens_seen": 231069470, + "step": 10709, + "time_per_iteration": 2.595135450363159 + }, + { + "auxiliary_loss_clip": 0.01106285, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.03671169, + "balance_loss_mlp": 1.01774192, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 1.8010250619603243, + "language_loss": 0.80538428, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.82673609, + "num_input_tokens_seen": 231088205, + "step": 10710, + "time_per_iteration": 2.4793200492858887 + }, + { + "auxiliary_loss_clip": 0.01095147, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.03561211, + "balance_loss_mlp": 1.01789427, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 1.6801219724805279, + "language_loss": 0.65769738, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.67894357, + "num_input_tokens_seen": 231107850, + "step": 10711, + "time_per_iteration": 2.5856051445007324 + }, + { + "auxiliary_loss_clip": 0.01077833, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.04254329, + "balance_loss_mlp": 1.02016997, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 1.5874691447264566, + "language_loss": 0.78942645, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.810525, + "num_input_tokens_seen": 231127200, + "step": 10712, + "time_per_iteration": 2.592106819152832 + }, + { + "auxiliary_loss_clip": 0.01095215, + "auxiliary_loss_mlp": 0.01035634, + "balance_loss_clip": 1.03543472, + "balance_loss_mlp": 1.02328992, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 1.6502827891128635, + "language_loss": 0.82609928, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.84740776, + "num_input_tokens_seen": 231146360, + "step": 10713, + "time_per_iteration": 2.492992639541626 + }, + { + "auxiliary_loss_clip": 0.01105242, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.03849769, + "balance_loss_mlp": 1.02529478, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.4415022433147315, + "language_loss": 0.78526127, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80667549, + "num_input_tokens_seen": 231168350, + "step": 10714, + "time_per_iteration": 2.5010437965393066 + }, + { + "auxiliary_loss_clip": 0.01073708, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.03567171, + "balance_loss_mlp": 1.02064371, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.429510608203221, + "language_loss": 0.81507838, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83613098, + "num_input_tokens_seen": 231188385, + "step": 10715, + "time_per_iteration": 2.597356081008911 + }, + { + "auxiliary_loss_clip": 0.01077905, + "auxiliary_loss_mlp": 0.01032225, + "balance_loss_clip": 1.03378403, + "balance_loss_mlp": 1.02011895, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 2.391603752972217, + "language_loss": 0.81440055, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83550191, + "num_input_tokens_seen": 231209880, + "step": 10716, + "time_per_iteration": 2.567639112472534 + }, + { + "auxiliary_loss_clip": 0.01076389, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.03721952, + "balance_loss_mlp": 1.02370667, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 1.8761185512391365, + "language_loss": 0.77988088, + "learning_rate": 1.186372540666424e-06, + "loss": 0.80101955, + "num_input_tokens_seen": 231230765, + "step": 10717, + "time_per_iteration": 2.610337495803833 + }, + { + "auxiliary_loss_clip": 0.01104986, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.0378381, + "balance_loss_mlp": 1.01920652, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.6964793201411565, + "language_loss": 0.68676591, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70812237, + "num_input_tokens_seen": 231252350, + "step": 10718, + "time_per_iteration": 2.526170253753662 + }, + { + "auxiliary_loss_clip": 0.01032978, + "auxiliary_loss_mlp": 0.01008761, + "balance_loss_clip": 1.01939046, + "balance_loss_mlp": 1.00758076, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.772576237681771, + "language_loss": 0.49658263, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51699996, + "num_input_tokens_seen": 231313865, + "step": 10719, + "time_per_iteration": 3.2545406818389893 + }, + { + "auxiliary_loss_clip": 0.01111479, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.0397985, + "balance_loss_mlp": 1.0286057, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 1.8224569597914426, + "language_loss": 0.78261185, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80413663, + "num_input_tokens_seen": 231331710, + "step": 10720, + "time_per_iteration": 2.464001178741455 + }, + { + "auxiliary_loss_clip": 0.010867, + "auxiliary_loss_mlp": 0.01035724, + "balance_loss_clip": 1.03712773, + "balance_loss_mlp": 1.02273667, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 1.7049677385774804, + "language_loss": 0.76939267, + "learning_rate": 1.18494967730604e-06, + "loss": 0.79061693, + "num_input_tokens_seen": 231350705, + "step": 10721, + "time_per_iteration": 2.5521111488342285 + }, + { + "auxiliary_loss_clip": 0.0107624, + "auxiliary_loss_mlp": 0.01033391, + "balance_loss_clip": 1.03623903, + "balance_loss_mlp": 1.02096903, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 1.8867814526378333, + "language_loss": 0.7289089, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.75000525, + "num_input_tokens_seen": 231369550, + "step": 10722, + "time_per_iteration": 2.5758020877838135 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.03904021, + "balance_loss_mlp": 1.0204947, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 1.7175229646101169, + "language_loss": 0.77943432, + "learning_rate": 1.184238431012635e-06, + "loss": 0.80083168, + "num_input_tokens_seen": 231389285, + "step": 10723, + "time_per_iteration": 2.508270502090454 + }, + { + "auxiliary_loss_clip": 0.01099986, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.03770494, + "balance_loss_mlp": 1.02068007, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 1.8833660437019557, + "language_loss": 0.5864929, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60782635, + "num_input_tokens_seen": 231408820, + "step": 10724, + "time_per_iteration": 2.541466474533081 + }, + { + "auxiliary_loss_clip": 0.01094911, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.03866494, + "balance_loss_mlp": 1.02152836, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 1.857538111681203, + "language_loss": 0.83943677, + "learning_rate": 1.183527308454271e-06, + "loss": 0.86070943, + "num_input_tokens_seen": 231428100, + "step": 10725, + "time_per_iteration": 2.5026473999023438 + }, + { + "auxiliary_loss_clip": 0.01083616, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.03489971, + "balance_loss_mlp": 1.02459383, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 1.8793116716435447, + "language_loss": 0.82239032, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.84359789, + "num_input_tokens_seen": 231445810, + "step": 10726, + "time_per_iteration": 2.538616895675659 + }, + { + "auxiliary_loss_clip": 0.01094501, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.03625321, + "balance_loss_mlp": 1.02374887, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 1.7217019998434009, + "language_loss": 0.81072003, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.83203042, + "num_input_tokens_seen": 231463570, + "step": 10727, + "time_per_iteration": 2.49383282661438 + }, + { + "auxiliary_loss_clip": 0.01103252, + "auxiliary_loss_mlp": 0.01030496, + "balance_loss_clip": 1.03792632, + "balance_loss_mlp": 1.0178833, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 2.4965734116377942, + "language_loss": 0.791372, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81270945, + "num_input_tokens_seen": 231482155, + "step": 10728, + "time_per_iteration": 2.4856793880462646 + }, + { + "auxiliary_loss_clip": 0.010232, + "auxiliary_loss_mlp": 0.01036034, + "balance_loss_clip": 1.03572488, + "balance_loss_mlp": 1.02210474, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 1.6412189808494695, + "language_loss": 0.74160379, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76219618, + "num_input_tokens_seen": 231502465, + "step": 10729, + "time_per_iteration": 2.84660267829895 + }, + { + "auxiliary_loss_clip": 0.01075382, + "auxiliary_loss_mlp": 0.01034042, + "balance_loss_clip": 1.04068279, + "balance_loss_mlp": 1.02145994, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.8341320947960646, + "language_loss": 0.66785014, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.6889444, + "num_input_tokens_seen": 231522740, + "step": 10730, + "time_per_iteration": 4.323291540145874 + }, + { + "auxiliary_loss_clip": 0.01045341, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.03647614, + "balance_loss_mlp": 1.01815557, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 1.6242795209044594, + "language_loss": 0.63822514, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65899521, + "num_input_tokens_seen": 231542050, + "step": 10731, + "time_per_iteration": 2.6461358070373535 + }, + { + "auxiliary_loss_clip": 0.01105792, + "auxiliary_loss_mlp": 0.01033371, + "balance_loss_clip": 1.03678143, + "balance_loss_mlp": 1.02167082, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 1.6292537941411842, + "language_loss": 0.67954171, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.70093334, + "num_input_tokens_seen": 231560380, + "step": 10732, + "time_per_iteration": 2.4734530448913574 + }, + { + "auxiliary_loss_clip": 0.01096116, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.03679085, + "balance_loss_mlp": 1.02326918, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 1.710046091057466, + "language_loss": 0.75712347, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.77843708, + "num_input_tokens_seen": 231580810, + "step": 10733, + "time_per_iteration": 2.5369744300842285 + }, + { + "auxiliary_loss_clip": 0.01098785, + "auxiliary_loss_mlp": 0.01039892, + "balance_loss_clip": 1.03820229, + "balance_loss_mlp": 1.02666605, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 1.984953096963954, + "language_loss": 0.6647867, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.68617356, + "num_input_tokens_seen": 231600585, + "step": 10734, + "time_per_iteration": 2.542736530303955 + }, + { + "auxiliary_loss_clip": 0.01107706, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.04120278, + "balance_loss_mlp": 1.02532673, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 2.344353347254017, + "language_loss": 0.73335731, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.75480533, + "num_input_tokens_seen": 231618765, + "step": 10735, + "time_per_iteration": 2.4669041633605957 + }, + { + "auxiliary_loss_clip": 0.01046055, + "auxiliary_loss_mlp": 0.00787337, + "balance_loss_clip": 1.03531909, + "balance_loss_mlp": 1.01391506, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 1.769851821522435, + "language_loss": 0.74800116, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.76633501, + "num_input_tokens_seen": 231638525, + "step": 10736, + "time_per_iteration": 2.6443064212799072 + }, + { + "auxiliary_loss_clip": 0.01102691, + "auxiliary_loss_mlp": 0.01030444, + "balance_loss_clip": 1.04011691, + "balance_loss_mlp": 1.01754546, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 2.3168225654948875, + "language_loss": 0.70609915, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.72743058, + "num_input_tokens_seen": 231656785, + "step": 10737, + "time_per_iteration": 2.497847080230713 + }, + { + "auxiliary_loss_clip": 0.01032626, + "auxiliary_loss_mlp": 0.01003183, + "balance_loss_clip": 1.01770639, + "balance_loss_mlp": 1.00198483, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7813204639379189, + "language_loss": 0.58500129, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60535932, + "num_input_tokens_seen": 231719075, + "step": 10738, + "time_per_iteration": 3.1762871742248535 + }, + { + "auxiliary_loss_clip": 0.01073087, + "auxiliary_loss_mlp": 0.01027247, + "balance_loss_clip": 1.03997445, + "balance_loss_mlp": 1.01525474, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 1.8449617508384977, + "language_loss": 0.74379617, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76479948, + "num_input_tokens_seen": 231737810, + "step": 10739, + "time_per_iteration": 4.017168045043945 + }, + { + "auxiliary_loss_clip": 0.01089754, + "auxiliary_loss_mlp": 0.00787377, + "balance_loss_clip": 1.03843188, + "balance_loss_mlp": 1.01397634, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 1.734705617852928, + "language_loss": 0.71410245, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.7328738, + "num_input_tokens_seen": 231756140, + "step": 10740, + "time_per_iteration": 2.55741810798645 + }, + { + "auxiliary_loss_clip": 0.01027809, + "auxiliary_loss_mlp": 0.00999753, + "balance_loss_clip": 1.02273357, + "balance_loss_mlp": 0.99845999, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.6628763083831545, + "language_loss": 0.55314875, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57342434, + "num_input_tokens_seen": 231823665, + "step": 10741, + "time_per_iteration": 4.818443059921265 + }, + { + "auxiliary_loss_clip": 0.01107349, + "auxiliary_loss_mlp": 0.0103369, + "balance_loss_clip": 1.03827596, + "balance_loss_mlp": 1.02201939, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 1.8635666008698664, + "language_loss": 0.80573636, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.82714677, + "num_input_tokens_seen": 231844500, + "step": 10742, + "time_per_iteration": 2.52644681930542 + }, + { + "auxiliary_loss_clip": 0.01083242, + "auxiliary_loss_mlp": 0.01028499, + "balance_loss_clip": 1.03633404, + "balance_loss_mlp": 1.01678717, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 2.4806653089138946, + "language_loss": 0.81827974, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.83939719, + "num_input_tokens_seen": 231864510, + "step": 10743, + "time_per_iteration": 2.5649142265319824 + }, + { + "auxiliary_loss_clip": 0.01084996, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.04103112, + "balance_loss_mlp": 1.01785362, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 1.9457046909617843, + "language_loss": 0.72075367, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.74189937, + "num_input_tokens_seen": 231881555, + "step": 10744, + "time_per_iteration": 2.5138468742370605 + }, + { + "auxiliary_loss_clip": 0.01108293, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.03772449, + "balance_loss_mlp": 1.01893651, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 1.7399079530100137, + "language_loss": 0.66602194, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68740988, + "num_input_tokens_seen": 231905945, + "step": 10745, + "time_per_iteration": 2.6695830821990967 + }, + { + "auxiliary_loss_clip": 0.01095568, + "auxiliary_loss_mlp": 0.01035373, + "balance_loss_clip": 1.03650928, + "balance_loss_mlp": 1.02243829, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 2.4615505988355024, + "language_loss": 0.73407769, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.75538701, + "num_input_tokens_seen": 231922535, + "step": 10746, + "time_per_iteration": 2.487253427505493 + }, + { + "auxiliary_loss_clip": 0.01099982, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.0389322, + "balance_loss_mlp": 1.02168298, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 1.460959762464095, + "language_loss": 0.66358662, + "learning_rate": 1.175713157660413e-06, + "loss": 0.68491936, + "num_input_tokens_seen": 231944800, + "step": 10747, + "time_per_iteration": 2.543606996536255 + }, + { + "auxiliary_loss_clip": 0.01086158, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.040797, + "balance_loss_mlp": 1.02482319, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 1.6227589634632975, + "language_loss": 0.67213959, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69336903, + "num_input_tokens_seen": 231962970, + "step": 10748, + "time_per_iteration": 2.530475616455078 + }, + { + "auxiliary_loss_clip": 0.01113162, + "auxiliary_loss_mlp": 0.01040421, + "balance_loss_clip": 1.03946543, + "balance_loss_mlp": 1.02708769, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 1.7253829082692396, + "language_loss": 0.75941634, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78095222, + "num_input_tokens_seen": 231981195, + "step": 10749, + "time_per_iteration": 2.4684014320373535 + }, + { + "auxiliary_loss_clip": 0.0107207, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.03483081, + "balance_loss_mlp": 1.02628577, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 1.5287751419560436, + "language_loss": 0.7697928, + "learning_rate": 1.17464876058473e-06, + "loss": 0.7909134, + "num_input_tokens_seen": 232001735, + "step": 10750, + "time_per_iteration": 2.599959373474121 + }, + { + "auxiliary_loss_clip": 0.01093723, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.03934991, + "balance_loss_mlp": 1.01911867, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 1.982609519196789, + "language_loss": 0.68403465, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70530486, + "num_input_tokens_seen": 232019830, + "step": 10751, + "time_per_iteration": 2.5396275520324707 + }, + { + "auxiliary_loss_clip": 0.01090121, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.03819633, + "balance_loss_mlp": 1.02038455, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 1.9948023528023426, + "language_loss": 0.71233034, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.73356169, + "num_input_tokens_seen": 232039625, + "step": 10752, + "time_per_iteration": 2.5323193073272705 + }, + { + "auxiliary_loss_clip": 0.01079024, + "auxiliary_loss_mlp": 0.01037798, + "balance_loss_clip": 1.03662348, + "balance_loss_mlp": 1.02335572, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 4.659649573876688, + "language_loss": 0.78227007, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80343831, + "num_input_tokens_seen": 232055855, + "step": 10753, + "time_per_iteration": 2.511215925216675 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01043237, + "balance_loss_clip": 1.03921962, + "balance_loss_mlp": 1.03084528, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 1.7292863076245988, + "language_loss": 0.8475771, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.86910248, + "num_input_tokens_seen": 232073475, + "step": 10754, + "time_per_iteration": 2.51213002204895 + }, + { + "auxiliary_loss_clip": 0.01084695, + "auxiliary_loss_mlp": 0.01038073, + "balance_loss_clip": 1.03652668, + "balance_loss_mlp": 1.02545476, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 2.2507102663776375, + "language_loss": 0.59696364, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.61819136, + "num_input_tokens_seen": 232091090, + "step": 10755, + "time_per_iteration": 2.5140185356140137 + }, + { + "auxiliary_loss_clip": 0.01073443, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.03756607, + "balance_loss_mlp": 1.02066028, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 2.1763052449966884, + "language_loss": 0.67887831, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.69995034, + "num_input_tokens_seen": 232107320, + "step": 10756, + "time_per_iteration": 2.519484043121338 + }, + { + "auxiliary_loss_clip": 0.01071164, + "auxiliary_loss_mlp": 0.0104248, + "balance_loss_clip": 1.03862834, + "balance_loss_mlp": 1.02786446, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 2.6495682462564654, + "language_loss": 0.73768914, + "learning_rate": 1.172166263444844e-06, + "loss": 0.75882554, + "num_input_tokens_seen": 232123930, + "step": 10757, + "time_per_iteration": 2.5767314434051514 + }, + { + "auxiliary_loss_clip": 0.01058943, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.04149616, + "balance_loss_mlp": 1.01830792, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 1.9890584897278025, + "language_loss": 0.74637449, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.7672708, + "num_input_tokens_seen": 232142905, + "step": 10758, + "time_per_iteration": 2.591008186340332 + }, + { + "auxiliary_loss_clip": 0.01074277, + "auxiliary_loss_mlp": 0.01032116, + "balance_loss_clip": 1.04017925, + "balance_loss_mlp": 1.01912189, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.5665831968537518, + "language_loss": 0.67982197, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.70088589, + "num_input_tokens_seen": 232162230, + "step": 10759, + "time_per_iteration": 2.5653076171875 + }, + { + "auxiliary_loss_clip": 0.01076027, + "auxiliary_loss_mlp": 0.01034024, + "balance_loss_clip": 1.0356946, + "balance_loss_mlp": 1.0200702, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 2.6767163976055564, + "language_loss": 0.75280559, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77390611, + "num_input_tokens_seen": 232182700, + "step": 10760, + "time_per_iteration": 2.577657699584961 + }, + { + "auxiliary_loss_clip": 0.01081889, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.03400064, + "balance_loss_mlp": 1.02096128, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 2.1457542250923383, + "language_loss": 0.6537292, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.67489266, + "num_input_tokens_seen": 232208235, + "step": 10761, + "time_per_iteration": 2.793678045272827 + }, + { + "auxiliary_loss_clip": 0.01064524, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.03952003, + "balance_loss_mlp": 1.01956296, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 2.691367679127516, + "language_loss": 0.69507748, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71604955, + "num_input_tokens_seen": 232228720, + "step": 10762, + "time_per_iteration": 2.6064181327819824 + }, + { + "auxiliary_loss_clip": 0.01113191, + "auxiliary_loss_mlp": 0.01032917, + "balance_loss_clip": 1.03908443, + "balance_loss_mlp": 1.02010202, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 2.1550699648300107, + "language_loss": 0.82997417, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.85143524, + "num_input_tokens_seen": 232244655, + "step": 10763, + "time_per_iteration": 2.4468226432800293 + }, + { + "auxiliary_loss_clip": 0.01041708, + "auxiliary_loss_mlp": 0.01003134, + "balance_loss_clip": 1.01763368, + "balance_loss_mlp": 1.00188279, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.7178166113170772, + "language_loss": 0.5779472, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59839565, + "num_input_tokens_seen": 232308685, + "step": 10764, + "time_per_iteration": 3.246119499206543 + }, + { + "auxiliary_loss_clip": 0.01074275, + "auxiliary_loss_mlp": 0.01034693, + "balance_loss_clip": 1.03737903, + "balance_loss_mlp": 1.02218151, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 1.7882582856362779, + "language_loss": 0.60231578, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.62340546, + "num_input_tokens_seen": 232327520, + "step": 10765, + "time_per_iteration": 2.664862632751465 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.03808594, + "balance_loss_mlp": 1.01583946, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 2.601342048524849, + "language_loss": 0.62936431, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65073037, + "num_input_tokens_seen": 232349025, + "step": 10766, + "time_per_iteration": 2.5155725479125977 + }, + { + "auxiliary_loss_clip": 0.01083722, + "auxiliary_loss_mlp": 0.01034892, + "balance_loss_clip": 1.03673732, + "balance_loss_mlp": 1.02087307, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 1.7035968320643127, + "language_loss": 0.75789464, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.77908075, + "num_input_tokens_seen": 232367835, + "step": 10767, + "time_per_iteration": 2.544235944747925 + }, + { + "auxiliary_loss_clip": 0.01100124, + "auxiliary_loss_mlp": 0.01032247, + "balance_loss_clip": 1.03880954, + "balance_loss_mlp": 1.01970649, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 1.9178373725294062, + "language_loss": 0.77569056, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.79701436, + "num_input_tokens_seen": 232385840, + "step": 10768, + "time_per_iteration": 2.468111753463745 + }, + { + "auxiliary_loss_clip": 0.01060561, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.03844023, + "balance_loss_mlp": 1.01776338, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.6734653304041587, + "language_loss": 0.71524322, + "learning_rate": 1.167914135250663e-06, + "loss": 0.7361573, + "num_input_tokens_seen": 232406205, + "step": 10769, + "time_per_iteration": 4.009365558624268 + }, + { + "auxiliary_loss_clip": 0.01107484, + "auxiliary_loss_mlp": 0.01033151, + "balance_loss_clip": 1.03935421, + "balance_loss_mlp": 1.02070558, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.9675535972396452, + "language_loss": 0.7214489, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.74285531, + "num_input_tokens_seen": 232424995, + "step": 10770, + "time_per_iteration": 2.452566623687744 + }, + { + "auxiliary_loss_clip": 0.01070731, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.03674281, + "balance_loss_mlp": 1.01728714, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 1.7926108212100622, + "language_loss": 0.7352742, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75629115, + "num_input_tokens_seen": 232445870, + "step": 10771, + "time_per_iteration": 2.7192792892456055 + }, + { + "auxiliary_loss_clip": 0.01072603, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.03630257, + "balance_loss_mlp": 1.02071381, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 2.149520472917965, + "language_loss": 0.741907, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.76297474, + "num_input_tokens_seen": 232464285, + "step": 10772, + "time_per_iteration": 2.5347797870635986 + }, + { + "auxiliary_loss_clip": 0.01084278, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.03940511, + "balance_loss_mlp": 1.01914763, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.4925691973003317, + "language_loss": 0.82980812, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85095406, + "num_input_tokens_seen": 232485815, + "step": 10773, + "time_per_iteration": 2.6020233631134033 + }, + { + "auxiliary_loss_clip": 0.01095559, + "auxiliary_loss_mlp": 0.00785626, + "balance_loss_clip": 1.03757477, + "balance_loss_mlp": 1.01295435, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 1.492190015844518, + "language_loss": 0.7888521, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.80766398, + "num_input_tokens_seen": 232504875, + "step": 10774, + "time_per_iteration": 2.4853177070617676 + }, + { + "auxiliary_loss_clip": 0.01099789, + "auxiliary_loss_mlp": 0.01037461, + "balance_loss_clip": 1.03829122, + "balance_loss_mlp": 1.02493179, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 2.3486306343146244, + "language_loss": 0.69258976, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.71396232, + "num_input_tokens_seen": 232521945, + "step": 10775, + "time_per_iteration": 2.508967638015747 + }, + { + "auxiliary_loss_clip": 0.01076953, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.04035926, + "balance_loss_mlp": 1.02459884, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 1.9964279025709317, + "language_loss": 0.6570127, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.67815405, + "num_input_tokens_seen": 232541500, + "step": 10776, + "time_per_iteration": 2.5700719356536865 + }, + { + "auxiliary_loss_clip": 0.01080352, + "auxiliary_loss_mlp": 0.01036913, + "balance_loss_clip": 1.03493881, + "balance_loss_mlp": 1.02377045, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 2.378883810921812, + "language_loss": 0.78561085, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.80678356, + "num_input_tokens_seen": 232559720, + "step": 10777, + "time_per_iteration": 3.9503233432769775 + }, + { + "auxiliary_loss_clip": 0.01097566, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.04234338, + "balance_loss_mlp": 1.01873028, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 1.8539494197163027, + "language_loss": 0.73406875, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75535715, + "num_input_tokens_seen": 232579370, + "step": 10778, + "time_per_iteration": 4.317025899887085 + }, + { + "auxiliary_loss_clip": 0.01091898, + "auxiliary_loss_mlp": 0.01030295, + "balance_loss_clip": 1.03476238, + "balance_loss_mlp": 1.01854706, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 1.4645261642371676, + "language_loss": 0.77937901, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.80060095, + "num_input_tokens_seen": 232600495, + "step": 10779, + "time_per_iteration": 2.5587470531463623 + }, + { + "auxiliary_loss_clip": 0.01030093, + "auxiliary_loss_mlp": 0.01000108, + "balance_loss_clip": 1.02036703, + "balance_loss_mlp": 0.99889803, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7192576263371642, + "language_loss": 0.59413499, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61443698, + "num_input_tokens_seen": 232663165, + "step": 10780, + "time_per_iteration": 4.460371971130371 + }, + { + "auxiliary_loss_clip": 0.01020616, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.03719664, + "balance_loss_mlp": 1.01755738, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 1.9390187131851686, + "language_loss": 0.79268229, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81318653, + "num_input_tokens_seen": 232683385, + "step": 10781, + "time_per_iteration": 2.9183521270751953 + }, + { + "auxiliary_loss_clip": 0.01113693, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.04026341, + "balance_loss_mlp": 1.02173603, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 2.141203443765911, + "language_loss": 0.7890631, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.8105644, + "num_input_tokens_seen": 232699095, + "step": 10782, + "time_per_iteration": 2.594644784927368 + }, + { + "auxiliary_loss_clip": 0.01101141, + "auxiliary_loss_mlp": 0.00784812, + "balance_loss_clip": 1.03910315, + "balance_loss_mlp": 1.00894487, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 2.1197321037722388, + "language_loss": 0.6439662, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.6628257, + "num_input_tokens_seen": 232717920, + "step": 10783, + "time_per_iteration": 2.5316011905670166 + }, + { + "auxiliary_loss_clip": 0.01112955, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.03939986, + "balance_loss_mlp": 1.02026546, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 1.7445508451507303, + "language_loss": 0.8865329, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90800309, + "num_input_tokens_seen": 232737605, + "step": 10784, + "time_per_iteration": 2.5041744709014893 + }, + { + "auxiliary_loss_clip": 0.01085048, + "auxiliary_loss_mlp": 0.0102645, + "balance_loss_clip": 1.04044604, + "balance_loss_mlp": 1.01337898, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 2.2517179017266353, + "language_loss": 0.730865, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75197995, + "num_input_tokens_seen": 232755110, + "step": 10785, + "time_per_iteration": 2.4927921295166016 + }, + { + "auxiliary_loss_clip": 0.01071694, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.03565121, + "balance_loss_mlp": 1.01844835, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.4504981767277063, + "language_loss": 0.69174457, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71276569, + "num_input_tokens_seen": 232779040, + "step": 10786, + "time_per_iteration": 2.637023687362671 + }, + { + "auxiliary_loss_clip": 0.0107124, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.03702283, + "balance_loss_mlp": 1.01659942, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 1.955527208248782, + "language_loss": 0.71080935, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73180962, + "num_input_tokens_seen": 232800515, + "step": 10787, + "time_per_iteration": 2.614147186279297 + }, + { + "auxiliary_loss_clip": 0.01113598, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.03914118, + "balance_loss_mlp": 1.02028656, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 2.055583677718124, + "language_loss": 0.84486711, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.86633927, + "num_input_tokens_seen": 232818450, + "step": 10788, + "time_per_iteration": 2.4580438137054443 + }, + { + "auxiliary_loss_clip": 0.01072576, + "auxiliary_loss_mlp": 0.01030618, + "balance_loss_clip": 1.03824687, + "balance_loss_mlp": 1.01737952, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 1.8839890860574426, + "language_loss": 0.77342075, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79445267, + "num_input_tokens_seen": 232834785, + "step": 10789, + "time_per_iteration": 2.5165505409240723 + }, + { + "auxiliary_loss_clip": 0.01093536, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.03598702, + "balance_loss_mlp": 1.01721287, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.6244439796516261, + "language_loss": 0.75932133, + "learning_rate": 1.160483857897479e-06, + "loss": 0.78054976, + "num_input_tokens_seen": 232856050, + "step": 10790, + "time_per_iteration": 2.6491663455963135 + }, + { + "auxiliary_loss_clip": 0.01108302, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.03946781, + "balance_loss_mlp": 1.02140069, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 2.0093216122545896, + "language_loss": 0.60459471, + "learning_rate": 1.160130384362823e-06, + "loss": 0.62600356, + "num_input_tokens_seen": 232873945, + "step": 10791, + "time_per_iteration": 2.446176052093506 + }, + { + "auxiliary_loss_clip": 0.01072077, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.03958428, + "balance_loss_mlp": 1.0174973, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.9670572843143324, + "language_loss": 0.85917628, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.88019502, + "num_input_tokens_seen": 232892160, + "step": 10792, + "time_per_iteration": 2.5741450786590576 + }, + { + "auxiliary_loss_clip": 0.01087515, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.03758466, + "balance_loss_mlp": 1.02231252, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 2.0601792562184547, + "language_loss": 0.77837694, + "learning_rate": 1.159423532850735e-06, + "loss": 0.79960227, + "num_input_tokens_seen": 232911725, + "step": 10793, + "time_per_iteration": 2.5487234592437744 + }, + { + "auxiliary_loss_clip": 0.0107859, + "auxiliary_loss_mlp": 0.01027874, + "balance_loss_clip": 1.0392127, + "balance_loss_mlp": 1.01485062, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 1.8377240530924162, + "language_loss": 0.74552679, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.76659143, + "num_input_tokens_seen": 232929085, + "step": 10794, + "time_per_iteration": 2.588306427001953 + }, + { + "auxiliary_loss_clip": 0.01096046, + "auxiliary_loss_mlp": 0.00784646, + "balance_loss_clip": 1.03560793, + "balance_loss_mlp": 1.00858915, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 2.3336258793750413, + "language_loss": 0.70063579, + "learning_rate": 1.158716808837621e-06, + "loss": 0.71944273, + "num_input_tokens_seen": 232949455, + "step": 10795, + "time_per_iteration": 2.5507302284240723 + }, + { + "auxiliary_loss_clip": 0.01089269, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.03774738, + "balance_loss_mlp": 1.02191687, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 1.7307885102877514, + "language_loss": 0.53935683, + "learning_rate": 1.158363494676679e-06, + "loss": 0.56059659, + "num_input_tokens_seen": 232969445, + "step": 10796, + "time_per_iteration": 2.571962833404541 + }, + { + "auxiliary_loss_clip": 0.0109816, + "auxiliary_loss_mlp": 0.01027203, + "balance_loss_clip": 1.03845549, + "balance_loss_mlp": 1.01566398, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 1.5254690241461684, + "language_loss": 0.77848792, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.79974163, + "num_input_tokens_seen": 232988900, + "step": 10797, + "time_per_iteration": 2.5275557041168213 + }, + { + "auxiliary_loss_clip": 0.0106141, + "auxiliary_loss_mlp": 0.0102776, + "balance_loss_clip": 1.04046857, + "balance_loss_mlp": 1.01644707, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 2.01200120791438, + "language_loss": 0.70773047, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.72862214, + "num_input_tokens_seen": 233005060, + "step": 10798, + "time_per_iteration": 2.613248586654663 + }, + { + "auxiliary_loss_clip": 0.01065558, + "auxiliary_loss_mlp": 0.01027845, + "balance_loss_clip": 1.03709698, + "balance_loss_mlp": 1.01646614, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.7369758207834491, + "language_loss": 0.76312768, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.78406179, + "num_input_tokens_seen": 233023375, + "step": 10799, + "time_per_iteration": 2.5846493244171143 + }, + { + "auxiliary_loss_clip": 0.01096481, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.03801656, + "balance_loss_mlp": 1.02311933, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 1.7850094599071913, + "language_loss": 0.71932316, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.74064875, + "num_input_tokens_seen": 233043130, + "step": 10800, + "time_per_iteration": 2.525892734527588 + }, + { + "auxiliary_loss_clip": 0.01036204, + "auxiliary_loss_mlp": 0.01012844, + "balance_loss_clip": 1.02189076, + "balance_loss_mlp": 1.01123524, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 1.3964363959007293, + "language_loss": 0.60226667, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62275708, + "num_input_tokens_seen": 233110560, + "step": 10801, + "time_per_iteration": 3.2199690341949463 + }, + { + "auxiliary_loss_clip": 0.01100035, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.03923988, + "balance_loss_mlp": 1.02536178, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 2.606048263019423, + "language_loss": 0.78505588, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80644923, + "num_input_tokens_seen": 233130080, + "step": 10802, + "time_per_iteration": 2.5339558124542236 + }, + { + "auxiliary_loss_clip": 0.01108572, + "auxiliary_loss_mlp": 0.01039378, + "balance_loss_clip": 1.03718543, + "balance_loss_mlp": 1.02625942, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.628258856748789, + "language_loss": 0.7490561, + "learning_rate": 1.155891189918541e-06, + "loss": 0.77053565, + "num_input_tokens_seen": 233150235, + "step": 10803, + "time_per_iteration": 2.503297805786133 + }, + { + "auxiliary_loss_clip": 0.01047514, + "auxiliary_loss_mlp": 0.01033403, + "balance_loss_clip": 1.03505397, + "balance_loss_mlp": 1.02104068, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 3.233980426963653, + "language_loss": 0.70581353, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.72662276, + "num_input_tokens_seen": 233166710, + "step": 10804, + "time_per_iteration": 2.6159756183624268 + }, + { + "auxiliary_loss_clip": 0.01095566, + "auxiliary_loss_mlp": 0.01029845, + "balance_loss_clip": 1.04000044, + "balance_loss_mlp": 1.01717353, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 1.7186501701784938, + "language_loss": 0.7280727, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74932683, + "num_input_tokens_seen": 233185445, + "step": 10805, + "time_per_iteration": 2.508727788925171 + }, + { + "auxiliary_loss_clip": 0.01082426, + "auxiliary_loss_mlp": 0.01029346, + "balance_loss_clip": 1.03741968, + "balance_loss_mlp": 1.01754451, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 2.472781373991973, + "language_loss": 0.66088378, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.68200147, + "num_input_tokens_seen": 233205805, + "step": 10806, + "time_per_iteration": 2.593019485473633 + }, + { + "auxiliary_loss_clip": 0.01087898, + "auxiliary_loss_mlp": 0.00784409, + "balance_loss_clip": 1.03662145, + "balance_loss_mlp": 1.00834572, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 2.444582300884639, + "language_loss": 0.79103303, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.80975616, + "num_input_tokens_seen": 233224215, + "step": 10807, + "time_per_iteration": 2.508929491043091 + }, + { + "auxiliary_loss_clip": 0.01024422, + "auxiliary_loss_mlp": 0.01007793, + "balance_loss_clip": 1.01980305, + "balance_loss_mlp": 1.00626683, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.7795136053045956, + "language_loss": 0.58864272, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.6089648, + "num_input_tokens_seen": 233294440, + "step": 10808, + "time_per_iteration": 4.63586688041687 + }, + { + "auxiliary_loss_clip": 0.01088008, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.03988504, + "balance_loss_mlp": 1.01629984, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.8207848275677214, + "language_loss": 0.62903976, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.6502068, + "num_input_tokens_seen": 233316125, + "step": 10809, + "time_per_iteration": 2.6601736545562744 + }, + { + "auxiliary_loss_clip": 0.01096209, + "auxiliary_loss_mlp": 0.00783379, + "balance_loss_clip": 1.03831279, + "balance_loss_mlp": 1.00902462, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.7898135189730002, + "language_loss": 0.814785, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83358085, + "num_input_tokens_seen": 233336140, + "step": 10810, + "time_per_iteration": 2.588871955871582 + }, + { + "auxiliary_loss_clip": 0.01074225, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.0373497, + "balance_loss_mlp": 1.02410424, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.5827835015202556, + "language_loss": 0.71862185, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.73971725, + "num_input_tokens_seen": 233356095, + "step": 10811, + "time_per_iteration": 2.6089158058166504 + }, + { + "auxiliary_loss_clip": 0.0105107, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.04375672, + "balance_loss_mlp": 1.01810455, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 1.7525204743255034, + "language_loss": 0.77620107, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.79701173, + "num_input_tokens_seen": 233376830, + "step": 10812, + "time_per_iteration": 2.6667354106903076 + }, + { + "auxiliary_loss_clip": 0.01094441, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.03827488, + "balance_loss_mlp": 1.02031267, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 1.890557051759369, + "language_loss": 0.85433686, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87561345, + "num_input_tokens_seen": 233395275, + "step": 10813, + "time_per_iteration": 2.5099005699157715 + }, + { + "auxiliary_loss_clip": 0.01067381, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.0348376, + "balance_loss_mlp": 1.02188146, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 2.2667465136846943, + "language_loss": 0.79894125, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.81996989, + "num_input_tokens_seen": 233413345, + "step": 10814, + "time_per_iteration": 2.5397024154663086 + }, + { + "auxiliary_loss_clip": 0.01062621, + "auxiliary_loss_mlp": 0.00786453, + "balance_loss_clip": 1.03896356, + "balance_loss_mlp": 1.00645459, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 1.5161634010550664, + "language_loss": 0.65540689, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67389762, + "num_input_tokens_seen": 233436105, + "step": 10815, + "time_per_iteration": 2.784630298614502 + }, + { + "auxiliary_loss_clip": 0.01113056, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.03862357, + "balance_loss_mlp": 1.01941073, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 1.8653314857749332, + "language_loss": 0.75280809, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77428377, + "num_input_tokens_seen": 233452320, + "step": 10816, + "time_per_iteration": 3.8703465461730957 + }, + { + "auxiliary_loss_clip": 0.01082646, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.03866863, + "balance_loss_mlp": 1.02021015, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.6818816060432134, + "language_loss": 0.72664881, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.74780047, + "num_input_tokens_seen": 233469920, + "step": 10817, + "time_per_iteration": 4.233865737915039 + }, + { + "auxiliary_loss_clip": 0.01067492, + "auxiliary_loss_mlp": 0.0104173, + "balance_loss_clip": 1.03493023, + "balance_loss_mlp": 1.02741921, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.465798716362421, + "language_loss": 0.72209191, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74318409, + "num_input_tokens_seen": 233499780, + "step": 10818, + "time_per_iteration": 4.36546516418457 + }, + { + "auxiliary_loss_clip": 0.01080152, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.03738284, + "balance_loss_mlp": 1.01678741, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 1.8389009891727541, + "language_loss": 0.65284097, + "learning_rate": 1.150246104600249e-06, + "loss": 0.67393839, + "num_input_tokens_seen": 233518235, + "step": 10819, + "time_per_iteration": 2.5477025508880615 + }, + { + "auxiliary_loss_clip": 0.01075335, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.03602982, + "balance_loss_mlp": 1.0218606, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 1.8708765659572038, + "language_loss": 0.83945853, + "learning_rate": 1.14989356009286e-06, + "loss": 0.86056209, + "num_input_tokens_seen": 233535215, + "step": 10820, + "time_per_iteration": 2.6009325981140137 + }, + { + "auxiliary_loss_clip": 0.01102907, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.03822231, + "balance_loss_mlp": 1.01763439, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.0361158991282338, + "language_loss": 0.78221142, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.80355084, + "num_input_tokens_seen": 233552775, + "step": 10821, + "time_per_iteration": 2.469614267349243 + }, + { + "auxiliary_loss_clip": 0.01076003, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.03945732, + "balance_loss_mlp": 1.01641262, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.528245507586087, + "language_loss": 0.80212843, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82316417, + "num_input_tokens_seen": 233572080, + "step": 10822, + "time_per_iteration": 2.570295810699463 + }, + { + "auxiliary_loss_clip": 0.01076058, + "auxiliary_loss_mlp": 0.01026325, + "balance_loss_clip": 1.037027, + "balance_loss_mlp": 1.01476789, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 1.8140927505067028, + "language_loss": 0.87250751, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89353132, + "num_input_tokens_seen": 233589155, + "step": 10823, + "time_per_iteration": 2.5395734310150146 + }, + { + "auxiliary_loss_clip": 0.01108799, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.03718066, + "balance_loss_mlp": 1.01967657, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 1.7479068227122119, + "language_loss": 0.66900325, + "learning_rate": 1.148483704558183e-06, + "loss": 0.69041425, + "num_input_tokens_seen": 233608180, + "step": 10824, + "time_per_iteration": 2.533613443374634 + }, + { + "auxiliary_loss_clip": 0.01089043, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.03653038, + "balance_loss_mlp": 1.01918149, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 2.797305843123813, + "language_loss": 0.87735647, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.89856637, + "num_input_tokens_seen": 233625750, + "step": 10825, + "time_per_iteration": 2.494980812072754 + }, + { + "auxiliary_loss_clip": 0.01090448, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.03720713, + "balance_loss_mlp": 1.01662171, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 2.244732216280115, + "language_loss": 0.73206115, + "learning_rate": 1.147778970474885e-06, + "loss": 0.75327575, + "num_input_tokens_seen": 233644235, + "step": 10826, + "time_per_iteration": 2.502307891845703 + }, + { + "auxiliary_loss_clip": 0.01101314, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.03999078, + "balance_loss_mlp": 1.01863539, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 2.082191236876549, + "language_loss": 0.68971097, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.71102935, + "num_input_tokens_seen": 233662845, + "step": 10827, + "time_per_iteration": 2.461458444595337 + }, + { + "auxiliary_loss_clip": 0.01087093, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.03717375, + "balance_loss_mlp": 1.01939607, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 1.956977286298672, + "language_loss": 0.76755702, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.78873742, + "num_input_tokens_seen": 233681990, + "step": 10828, + "time_per_iteration": 2.538895845413208 + }, + { + "auxiliary_loss_clip": 0.01098203, + "auxiliary_loss_mlp": 0.01027382, + "balance_loss_clip": 1.03946972, + "balance_loss_mlp": 1.01598549, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 1.845372111498304, + "language_loss": 0.8926639, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91391969, + "num_input_tokens_seen": 233698930, + "step": 10829, + "time_per_iteration": 2.49698805809021 + }, + { + "auxiliary_loss_clip": 0.01040847, + "auxiliary_loss_mlp": 0.01003437, + "balance_loss_clip": 1.01726055, + "balance_loss_mlp": 1.00222158, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.6468861240069984, + "language_loss": 0.55441332, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57485616, + "num_input_tokens_seen": 233769825, + "step": 10830, + "time_per_iteration": 3.224823474884033 + }, + { + "auxiliary_loss_clip": 0.01076734, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.03698134, + "balance_loss_mlp": 1.01686358, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 1.8468766852587188, + "language_loss": 0.74901825, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.77008641, + "num_input_tokens_seen": 233787095, + "step": 10831, + "time_per_iteration": 2.559882402420044 + }, + { + "auxiliary_loss_clip": 0.01024253, + "auxiliary_loss_mlp": 0.01000358, + "balance_loss_clip": 1.02023482, + "balance_loss_mlp": 0.99905294, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.6433499206561896, + "language_loss": 0.51024532, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53049141, + "num_input_tokens_seen": 233853050, + "step": 10832, + "time_per_iteration": 3.2183475494384766 + }, + { + "auxiliary_loss_clip": 0.01091328, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.03718877, + "balance_loss_mlp": 1.02052534, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 2.2675436416416983, + "language_loss": 0.82866722, + "learning_rate": 1.145313419848316e-06, + "loss": 0.84991914, + "num_input_tokens_seen": 233871385, + "step": 10833, + "time_per_iteration": 2.539659261703491 + }, + { + "auxiliary_loss_clip": 0.01089093, + "auxiliary_loss_mlp": 0.01036349, + "balance_loss_clip": 1.03957152, + "balance_loss_mlp": 1.02384984, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 2.0150143147862507, + "language_loss": 0.83618259, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85743701, + "num_input_tokens_seen": 233888175, + "step": 10834, + "time_per_iteration": 2.500420570373535 + }, + { + "auxiliary_loss_clip": 0.01098123, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.03775632, + "balance_loss_mlp": 1.02295923, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 1.506117679789368, + "language_loss": 0.77330196, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79464179, + "num_input_tokens_seen": 233911470, + "step": 10835, + "time_per_iteration": 2.563380479812622 + }, + { + "auxiliary_loss_clip": 0.01085391, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.040411, + "balance_loss_mlp": 1.02320218, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.4675255036248633, + "language_loss": 0.77480686, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79601979, + "num_input_tokens_seen": 233932135, + "step": 10836, + "time_per_iteration": 2.577540636062622 + }, + { + "auxiliary_loss_clip": 0.0107192, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.03894758, + "balance_loss_mlp": 1.01941276, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 2.3385840011820282, + "language_loss": 0.82533896, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84637529, + "num_input_tokens_seen": 233947880, + "step": 10837, + "time_per_iteration": 2.5280227661132812 + }, + { + "auxiliary_loss_clip": 0.01070959, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.03985631, + "balance_loss_mlp": 1.0203588, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 1.7852471160976675, + "language_loss": 0.58364999, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.60470009, + "num_input_tokens_seen": 233971475, + "step": 10838, + "time_per_iteration": 2.794718027114868 + }, + { + "auxiliary_loss_clip": 0.01040663, + "auxiliary_loss_mlp": 0.01006199, + "balance_loss_clip": 1.0169487, + "balance_loss_mlp": 1.00512064, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7274251215956774, + "language_loss": 0.60958636, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.63005495, + "num_input_tokens_seen": 234030690, + "step": 10839, + "time_per_iteration": 3.1185834407806396 + }, + { + "auxiliary_loss_clip": 0.01085601, + "auxiliary_loss_mlp": 0.01027683, + "balance_loss_clip": 1.03666115, + "balance_loss_mlp": 1.01624513, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.6739563623864795, + "language_loss": 0.67641318, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.69754606, + "num_input_tokens_seen": 234052470, + "step": 10840, + "time_per_iteration": 2.64815092086792 + }, + { + "auxiliary_loss_clip": 0.01064177, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.03569424, + "balance_loss_mlp": 1.01843202, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 1.9474680950646095, + "language_loss": 0.73262733, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75356948, + "num_input_tokens_seen": 234071495, + "step": 10841, + "time_per_iteration": 2.6112797260284424 + }, + { + "auxiliary_loss_clip": 0.01110645, + "auxiliary_loss_mlp": 0.01033798, + "balance_loss_clip": 1.0379777, + "balance_loss_mlp": 1.02126884, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.4773492185626065, + "language_loss": 0.62787426, + "learning_rate": 1.142145760331648e-06, + "loss": 0.6493187, + "num_input_tokens_seen": 234092325, + "step": 10842, + "time_per_iteration": 2.5213606357574463 + }, + { + "auxiliary_loss_clip": 0.01035363, + "auxiliary_loss_mlp": 0.01002822, + "balance_loss_clip": 1.02118802, + "balance_loss_mlp": 1.00182664, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8463672212232929, + "language_loss": 0.56147408, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58185595, + "num_input_tokens_seen": 234148005, + "step": 10843, + "time_per_iteration": 3.0202598571777344 + }, + { + "auxiliary_loss_clip": 0.01101656, + "auxiliary_loss_mlp": 0.01038961, + "balance_loss_clip": 1.03770614, + "balance_loss_mlp": 1.02552009, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 1.6773474343015606, + "language_loss": 0.828035, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.84944117, + "num_input_tokens_seen": 234164280, + "step": 10844, + "time_per_iteration": 2.4833710193634033 + }, + { + "auxiliary_loss_clip": 0.01098385, + "auxiliary_loss_mlp": 0.01029329, + "balance_loss_clip": 1.03728831, + "balance_loss_mlp": 1.01649618, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 2.077900733313284, + "language_loss": 0.59636086, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.61763799, + "num_input_tokens_seen": 234185090, + "step": 10845, + "time_per_iteration": 2.548968553543091 + }, + { + "auxiliary_loss_clip": 0.01099984, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.03880823, + "balance_loss_mlp": 1.01677001, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 2.091054401889517, + "language_loss": 0.7937665, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81506211, + "num_input_tokens_seen": 234204050, + "step": 10846, + "time_per_iteration": 3.8867971897125244 + }, + { + "auxiliary_loss_clip": 0.01034825, + "auxiliary_loss_mlp": 0.01001706, + "balance_loss_clip": 1.02056575, + "balance_loss_mlp": 1.00061476, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.7449381766435149, + "language_loss": 0.6020146, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.6223799, + "num_input_tokens_seen": 234269790, + "step": 10847, + "time_per_iteration": 3.2183175086975098 + }, + { + "auxiliary_loss_clip": 0.01115772, + "auxiliary_loss_mlp": 0.01041456, + "balance_loss_clip": 1.04113114, + "balance_loss_mlp": 1.02837324, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.8120169668174, + "language_loss": 0.80988997, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.83146226, + "num_input_tokens_seen": 234290135, + "step": 10848, + "time_per_iteration": 2.544066905975342 + }, + { + "auxiliary_loss_clip": 0.01088277, + "auxiliary_loss_mlp": 0.01035117, + "balance_loss_clip": 1.03760076, + "balance_loss_mlp": 1.02301168, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 3.6354759927776548, + "language_loss": 0.7523973, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.77363127, + "num_input_tokens_seen": 234309535, + "step": 10849, + "time_per_iteration": 2.5597898960113525 + }, + { + "auxiliary_loss_clip": 0.01059093, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.0357666, + "balance_loss_mlp": 1.02317894, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 1.416184795344904, + "language_loss": 0.68250722, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70345175, + "num_input_tokens_seen": 234328755, + "step": 10850, + "time_per_iteration": 2.6185190677642822 + }, + { + "auxiliary_loss_clip": 0.01084319, + "auxiliary_loss_mlp": 0.00784098, + "balance_loss_clip": 1.03772867, + "balance_loss_mlp": 1.00883222, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 1.7948936087414975, + "language_loss": 0.66825646, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68694067, + "num_input_tokens_seen": 234348655, + "step": 10851, + "time_per_iteration": 2.5662763118743896 + }, + { + "auxiliary_loss_clip": 0.01089198, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.04136252, + "balance_loss_mlp": 1.01853704, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 2.6535981320643844, + "language_loss": 0.73673689, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.75793493, + "num_input_tokens_seen": 234367445, + "step": 10852, + "time_per_iteration": 2.552741289138794 + }, + { + "auxiliary_loss_clip": 0.01086573, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.03940094, + "balance_loss_mlp": 1.01766562, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 1.7751625330413754, + "language_loss": 0.66367018, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68485129, + "num_input_tokens_seen": 234384825, + "step": 10853, + "time_per_iteration": 2.514789581298828 + }, + { + "auxiliary_loss_clip": 0.01011277, + "auxiliary_loss_mlp": 0.0100139, + "balance_loss_clip": 1.02000296, + "balance_loss_mlp": 1.00000668, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.7223150624548311, + "language_loss": 0.63072127, + "learning_rate": 1.137926314758634e-06, + "loss": 0.65084791, + "num_input_tokens_seen": 234450630, + "step": 10854, + "time_per_iteration": 4.630734205245972 + }, + { + "auxiliary_loss_clip": 0.01095443, + "auxiliary_loss_mlp": 0.0104241, + "balance_loss_clip": 1.03637886, + "balance_loss_mlp": 1.02713907, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 1.6978022973130333, + "language_loss": 0.77782345, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.79920197, + "num_input_tokens_seen": 234473505, + "step": 10855, + "time_per_iteration": 2.583150863647461 + }, + { + "auxiliary_loss_clip": 0.01069276, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.03323829, + "balance_loss_mlp": 1.01720095, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 2.257752124681326, + "language_loss": 0.79010236, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81108856, + "num_input_tokens_seen": 234492485, + "step": 10856, + "time_per_iteration": 3.9577157497406006 + }, + { + "auxiliary_loss_clip": 0.01110234, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.0382998, + "balance_loss_mlp": 1.01691365, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 1.504921792299299, + "language_loss": 0.7373727, + "learning_rate": 1.136872187988815e-06, + "loss": 0.75878382, + "num_input_tokens_seen": 234512645, + "step": 10857, + "time_per_iteration": 3.8830676078796387 + }, + { + "auxiliary_loss_clip": 0.01088845, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.03803396, + "balance_loss_mlp": 1.02070236, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 1.9997293574171637, + "language_loss": 0.62626219, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.6474793, + "num_input_tokens_seen": 234529310, + "step": 10858, + "time_per_iteration": 2.543156147003174 + }, + { + "auxiliary_loss_clip": 0.01105718, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.03653944, + "balance_loss_mlp": 1.02067947, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 2.118610447873736, + "language_loss": 0.78523827, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80662441, + "num_input_tokens_seen": 234546685, + "step": 10859, + "time_per_iteration": 2.439804792404175 + }, + { + "auxiliary_loss_clip": 0.01100464, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.03716373, + "balance_loss_mlp": 1.01834214, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.496511111290527, + "language_loss": 0.67768341, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.6990037, + "num_input_tokens_seen": 234566255, + "step": 10860, + "time_per_iteration": 2.5201399326324463 + }, + { + "auxiliary_loss_clip": 0.01101014, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.04219747, + "balance_loss_mlp": 1.01795506, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 2.0296975956061822, + "language_loss": 0.66773653, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68905663, + "num_input_tokens_seen": 234585405, + "step": 10861, + "time_per_iteration": 2.470863103866577 + }, + { + "auxiliary_loss_clip": 0.01089393, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.03825879, + "balance_loss_mlp": 1.0185796, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 1.7064096510872873, + "language_loss": 0.65419567, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67541224, + "num_input_tokens_seen": 234608095, + "step": 10862, + "time_per_iteration": 2.674995183944702 + }, + { + "auxiliary_loss_clip": 0.01087565, + "auxiliary_loss_mlp": 0.01032565, + "balance_loss_clip": 1.03752661, + "balance_loss_mlp": 1.0202384, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.8515995145069566, + "language_loss": 0.76960623, + "learning_rate": 1.13476481851592e-06, + "loss": 0.7908076, + "num_input_tokens_seen": 234627335, + "step": 10863, + "time_per_iteration": 2.509773015975952 + }, + { + "auxiliary_loss_clip": 0.01088656, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.03767371, + "balance_loss_mlp": 1.01945353, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 1.7485438131262032, + "language_loss": 0.7496413, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.77084064, + "num_input_tokens_seen": 234646540, + "step": 10864, + "time_per_iteration": 2.5541629791259766 + }, + { + "auxiliary_loss_clip": 0.01096195, + "auxiliary_loss_mlp": 0.01035596, + "balance_loss_clip": 1.03746128, + "balance_loss_mlp": 1.0236752, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 2.0607523566270265, + "language_loss": 0.86101133, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88232923, + "num_input_tokens_seen": 234665470, + "step": 10865, + "time_per_iteration": 2.5562562942504883 + }, + { + "auxiliary_loss_clip": 0.01082487, + "auxiliary_loss_mlp": 0.00784578, + "balance_loss_clip": 1.03818643, + "balance_loss_mlp": 1.0074079, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 1.613110110397674, + "language_loss": 0.81552613, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83419681, + "num_input_tokens_seen": 234683955, + "step": 10866, + "time_per_iteration": 2.5847620964050293 + }, + { + "auxiliary_loss_clip": 0.01087972, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.03894162, + "balance_loss_mlp": 1.017501, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.4194325761346736, + "language_loss": 0.82335746, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.84453118, + "num_input_tokens_seen": 234704595, + "step": 10867, + "time_per_iteration": 2.5538136959075928 + }, + { + "auxiliary_loss_clip": 0.010893, + "auxiliary_loss_mlp": 0.01026039, + "balance_loss_clip": 1.03794134, + "balance_loss_mlp": 1.01400495, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 1.769531282493091, + "language_loss": 0.81221932, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.83337271, + "num_input_tokens_seen": 234724090, + "step": 10868, + "time_per_iteration": 2.5559310913085938 + }, + { + "auxiliary_loss_clip": 0.01080513, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.03910804, + "balance_loss_mlp": 1.01614511, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 1.8119193745419222, + "language_loss": 0.79635876, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81745803, + "num_input_tokens_seen": 234742560, + "step": 10869, + "time_per_iteration": 2.5338783264160156 + }, + { + "auxiliary_loss_clip": 0.0110191, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.04027116, + "balance_loss_mlp": 1.02400899, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 1.6470435219842774, + "language_loss": 0.71647787, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.73786157, + "num_input_tokens_seen": 234762315, + "step": 10870, + "time_per_iteration": 2.5453672409057617 + }, + { + "auxiliary_loss_clip": 0.01072374, + "auxiliary_loss_mlp": 0.01034361, + "balance_loss_clip": 1.04071116, + "balance_loss_mlp": 1.02201152, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 2.0343108444804243, + "language_loss": 0.74712098, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.7681883, + "num_input_tokens_seen": 234781300, + "step": 10871, + "time_per_iteration": 2.5975208282470703 + }, + { + "auxiliary_loss_clip": 0.01096596, + "auxiliary_loss_mlp": 0.00782041, + "balance_loss_clip": 1.03843784, + "balance_loss_mlp": 1.0076139, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.875299406284731, + "language_loss": 0.55750573, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.57629216, + "num_input_tokens_seen": 234801040, + "step": 10872, + "time_per_iteration": 2.568873167037964 + }, + { + "auxiliary_loss_clip": 0.01084774, + "auxiliary_loss_mlp": 0.01030444, + "balance_loss_clip": 1.03760636, + "balance_loss_mlp": 1.01895797, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 2.0157718828644065, + "language_loss": 0.74593991, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.76709205, + "num_input_tokens_seen": 234821415, + "step": 10873, + "time_per_iteration": 2.549917221069336 + }, + { + "auxiliary_loss_clip": 0.01099346, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.0393014, + "balance_loss_mlp": 1.02011669, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 2.8793260969181955, + "language_loss": 0.75525743, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77657235, + "num_input_tokens_seen": 234843795, + "step": 10874, + "time_per_iteration": 2.5836093425750732 + }, + { + "auxiliary_loss_clip": 0.01066368, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.03696704, + "balance_loss_mlp": 1.02151453, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 1.5187063072012412, + "language_loss": 0.81436902, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83537251, + "num_input_tokens_seen": 234862350, + "step": 10875, + "time_per_iteration": 2.607084035873413 + }, + { + "auxiliary_loss_clip": 0.01109289, + "auxiliary_loss_mlp": 0.01033868, + "balance_loss_clip": 1.03787315, + "balance_loss_mlp": 1.02200723, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.5632735539068399, + "language_loss": 0.69671249, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.71814412, + "num_input_tokens_seen": 234881790, + "step": 10876, + "time_per_iteration": 2.519221782684326 + }, + { + "auxiliary_loss_clip": 0.01022337, + "auxiliary_loss_mlp": 0.01032827, + "balance_loss_clip": 1.03706181, + "balance_loss_mlp": 1.02051878, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 2.095460281106697, + "language_loss": 0.795169, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81572068, + "num_input_tokens_seen": 234897775, + "step": 10877, + "time_per_iteration": 2.6558961868286133 + }, + { + "auxiliary_loss_clip": 0.01087244, + "auxiliary_loss_mlp": 0.007837, + "balance_loss_clip": 1.03653634, + "balance_loss_mlp": 1.00687265, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 2.4313923327090117, + "language_loss": 0.80020893, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.81891835, + "num_input_tokens_seen": 234918395, + "step": 10878, + "time_per_iteration": 2.5630314350128174 + }, + { + "auxiliary_loss_clip": 0.01087121, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.03700638, + "balance_loss_mlp": 1.01731443, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 1.841258597107234, + "language_loss": 0.84446859, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86564398, + "num_input_tokens_seen": 234936260, + "step": 10879, + "time_per_iteration": 2.5075719356536865 + }, + { + "auxiliary_loss_clip": 0.0108959, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.0381403, + "balance_loss_mlp": 1.01776934, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 2.4825455538715664, + "language_loss": 0.71814543, + "learning_rate": 1.128800362199601e-06, + "loss": 0.7393471, + "num_input_tokens_seen": 234952110, + "step": 10880, + "time_per_iteration": 2.5109384059906006 + }, + { + "auxiliary_loss_clip": 0.01068131, + "auxiliary_loss_mlp": 0.01035287, + "balance_loss_clip": 1.03466284, + "balance_loss_mlp": 1.02333605, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 1.847362264682207, + "language_loss": 0.84179729, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86283147, + "num_input_tokens_seen": 234970810, + "step": 10881, + "time_per_iteration": 2.534040927886963 + }, + { + "auxiliary_loss_clip": 0.01075729, + "auxiliary_loss_mlp": 0.01034845, + "balance_loss_clip": 1.03714895, + "balance_loss_mlp": 1.02121353, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 4.525366677783798, + "language_loss": 0.77792585, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.79903162, + "num_input_tokens_seen": 234989565, + "step": 10882, + "time_per_iteration": 2.5615649223327637 + }, + { + "auxiliary_loss_clip": 0.01113417, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.04128957, + "balance_loss_mlp": 1.01773167, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 2.3371318049137706, + "language_loss": 0.81885612, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84029591, + "num_input_tokens_seen": 235007955, + "step": 10883, + "time_per_iteration": 2.4713690280914307 + }, + { + "auxiliary_loss_clip": 0.01067735, + "auxiliary_loss_mlp": 0.0103696, + "balance_loss_clip": 1.03847122, + "balance_loss_mlp": 1.0242939, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.7435056228217864, + "language_loss": 0.85496563, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87601256, + "num_input_tokens_seen": 235024860, + "step": 10884, + "time_per_iteration": 2.5894830226898193 + }, + { + "auxiliary_loss_clip": 0.01087766, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.03778529, + "balance_loss_mlp": 1.0211308, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 3.9736559598563046, + "language_loss": 0.8019731, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82318646, + "num_input_tokens_seen": 235043815, + "step": 10885, + "time_per_iteration": 3.912710189819336 + }, + { + "auxiliary_loss_clip": 0.01071218, + "auxiliary_loss_mlp": 0.0102659, + "balance_loss_clip": 1.03616595, + "balance_loss_mlp": 1.01410866, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 1.9799649291500878, + "language_loss": 0.72243744, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.74341553, + "num_input_tokens_seen": 235062985, + "step": 10886, + "time_per_iteration": 2.6095807552337646 + }, + { + "auxiliary_loss_clip": 0.0109142, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.03834307, + "balance_loss_mlp": 1.01833439, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 1.9253947887259737, + "language_loss": 0.78439051, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80560243, + "num_input_tokens_seen": 235081670, + "step": 10887, + "time_per_iteration": 2.4991934299468994 + }, + { + "auxiliary_loss_clip": 0.01081157, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.03582907, + "balance_loss_mlp": 1.01706386, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 1.8295015868568918, + "language_loss": 0.78982043, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81092072, + "num_input_tokens_seen": 235098510, + "step": 10888, + "time_per_iteration": 2.5154595375061035 + }, + { + "auxiliary_loss_clip": 0.01094807, + "auxiliary_loss_mlp": 0.01027046, + "balance_loss_clip": 1.03706431, + "balance_loss_mlp": 1.01581025, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 1.6388845895555322, + "language_loss": 0.66443855, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.68565708, + "num_input_tokens_seen": 235119990, + "step": 10889, + "time_per_iteration": 2.635481595993042 + }, + { + "auxiliary_loss_clip": 0.01082975, + "auxiliary_loss_mlp": 0.01037286, + "balance_loss_clip": 1.0362339, + "balance_loss_mlp": 1.0228734, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.4411008137715386, + "language_loss": 0.79942608, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.8206287, + "num_input_tokens_seen": 235139255, + "step": 10890, + "time_per_iteration": 2.535886287689209 + }, + { + "auxiliary_loss_clip": 0.01096971, + "auxiliary_loss_mlp": 0.00785566, + "balance_loss_clip": 1.03591442, + "balance_loss_mlp": 1.01205635, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 2.1220157167283005, + "language_loss": 0.65587497, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.67470032, + "num_input_tokens_seen": 235158455, + "step": 10891, + "time_per_iteration": 2.531151056289673 + }, + { + "auxiliary_loss_clip": 0.01095059, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.03685737, + "balance_loss_mlp": 1.01994455, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 1.7752708378056687, + "language_loss": 0.79383391, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81510198, + "num_input_tokens_seen": 235177350, + "step": 10892, + "time_per_iteration": 2.506728410720825 + }, + { + "auxiliary_loss_clip": 0.01103002, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.04070961, + "balance_loss_mlp": 1.0207864, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 1.9735708056841559, + "language_loss": 0.78078663, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.80213904, + "num_input_tokens_seen": 235196435, + "step": 10893, + "time_per_iteration": 3.959026336669922 + }, + { + "auxiliary_loss_clip": 0.01113494, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.03963482, + "balance_loss_mlp": 1.01593578, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.6143446364421794, + "language_loss": 0.70238882, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72381794, + "num_input_tokens_seen": 235215430, + "step": 10894, + "time_per_iteration": 2.4856536388397217 + }, + { + "auxiliary_loss_clip": 0.01100924, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.0370717, + "balance_loss_mlp": 1.01997137, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 2.7203556887317437, + "language_loss": 0.62210202, + "learning_rate": 1.123545533127549e-06, + "loss": 0.64343238, + "num_input_tokens_seen": 235232015, + "step": 10895, + "time_per_iteration": 3.8678109645843506 + }, + { + "auxiliary_loss_clip": 0.01096384, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.03733349, + "balance_loss_mlp": 1.02252483, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 2.0154067341578075, + "language_loss": 0.79223192, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.81353974, + "num_input_tokens_seen": 235248115, + "step": 10896, + "time_per_iteration": 3.919327735900879 + }, + { + "auxiliary_loss_clip": 0.01086508, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.03857374, + "balance_loss_mlp": 1.02164006, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 1.4040748850926725, + "language_loss": 0.70801938, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72921598, + "num_input_tokens_seen": 235270785, + "step": 10897, + "time_per_iteration": 2.583308696746826 + }, + { + "auxiliary_loss_clip": 0.01110685, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.03797257, + "balance_loss_mlp": 1.02001405, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 1.5446789999960828, + "language_loss": 0.75449693, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.77592826, + "num_input_tokens_seen": 235287905, + "step": 10898, + "time_per_iteration": 2.4441328048706055 + }, + { + "auxiliary_loss_clip": 0.01088076, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.03904319, + "balance_loss_mlp": 1.02148664, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 2.607113137539881, + "language_loss": 0.7370007, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75821513, + "num_input_tokens_seen": 235305525, + "step": 10899, + "time_per_iteration": 2.522364616394043 + }, + { + "auxiliary_loss_clip": 0.01088621, + "auxiliary_loss_mlp": 0.01028141, + "balance_loss_clip": 1.03916264, + "balance_loss_mlp": 1.01621389, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 1.8367856406331626, + "language_loss": 0.55525243, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.57642007, + "num_input_tokens_seen": 235324415, + "step": 10900, + "time_per_iteration": 2.5799546241760254 + }, + { + "auxiliary_loss_clip": 0.0109805, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.03920555, + "balance_loss_mlp": 1.02137923, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 1.7363179209225317, + "language_loss": 0.76883984, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.79016644, + "num_input_tokens_seen": 235341595, + "step": 10901, + "time_per_iteration": 2.492652416229248 + }, + { + "auxiliary_loss_clip": 0.01107665, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.03799772, + "balance_loss_mlp": 1.01632953, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 1.803819093287761, + "language_loss": 0.73036075, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.7517308, + "num_input_tokens_seen": 235361700, + "step": 10902, + "time_per_iteration": 2.4898486137390137 + }, + { + "auxiliary_loss_clip": 0.01108466, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.03959239, + "balance_loss_mlp": 1.01951742, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 1.5487699655676994, + "language_loss": 0.67837745, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.6997757, + "num_input_tokens_seen": 235382065, + "step": 10903, + "time_per_iteration": 2.4815502166748047 + }, + { + "auxiliary_loss_clip": 0.01089874, + "auxiliary_loss_mlp": 0.00786394, + "balance_loss_clip": 1.03664851, + "balance_loss_mlp": 1.01119137, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 1.6766263631888907, + "language_loss": 0.6687727, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.68753541, + "num_input_tokens_seen": 235402130, + "step": 10904, + "time_per_iteration": 2.6318843364715576 + }, + { + "auxiliary_loss_clip": 0.01101284, + "auxiliary_loss_mlp": 0.01040065, + "balance_loss_clip": 1.03723419, + "balance_loss_mlp": 1.02643991, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 1.7050090131214304, + "language_loss": 0.90187824, + "learning_rate": 1.120046465383464e-06, + "loss": 0.92329174, + "num_input_tokens_seen": 235420435, + "step": 10905, + "time_per_iteration": 2.5309221744537354 + }, + { + "auxiliary_loss_clip": 0.01096587, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.0370605, + "balance_loss_mlp": 1.02168131, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 1.741994651852035, + "language_loss": 0.75895923, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.78026205, + "num_input_tokens_seen": 235439960, + "step": 10906, + "time_per_iteration": 2.5316450595855713 + }, + { + "auxiliary_loss_clip": 0.01114595, + "auxiliary_loss_mlp": 0.01040804, + "balance_loss_clip": 1.04115486, + "balance_loss_mlp": 1.02823925, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 2.4926634801994014, + "language_loss": 0.74042159, + "learning_rate": 1.119347051825267e-06, + "loss": 0.76197553, + "num_input_tokens_seen": 235457495, + "step": 10907, + "time_per_iteration": 2.4513485431671143 + }, + { + "auxiliary_loss_clip": 0.01069272, + "auxiliary_loss_mlp": 0.01030498, + "balance_loss_clip": 1.03824818, + "balance_loss_mlp": 1.01720047, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.4563662230411119, + "language_loss": 0.72106874, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74206644, + "num_input_tokens_seen": 235479525, + "step": 10908, + "time_per_iteration": 2.651982307434082 + }, + { + "auxiliary_loss_clip": 0.01111297, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.03956842, + "balance_loss_mlp": 1.02095902, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 2.3400954174110056, + "language_loss": 0.81519341, + "learning_rate": 1.118647771844861e-06, + "loss": 0.83665097, + "num_input_tokens_seen": 235496305, + "step": 10909, + "time_per_iteration": 2.4321510791778564 + }, + { + "auxiliary_loss_clip": 0.01111168, + "auxiliary_loss_mlp": 0.01037052, + "balance_loss_clip": 1.03928018, + "balance_loss_mlp": 1.02397442, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 2.1365357843882196, + "language_loss": 0.6432696, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.66475177, + "num_input_tokens_seen": 235512545, + "step": 10910, + "time_per_iteration": 2.4814348220825195 + }, + { + "auxiliary_loss_clip": 0.01088948, + "auxiliary_loss_mlp": 0.01035976, + "balance_loss_clip": 1.03747988, + "balance_loss_mlp": 1.02165258, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 2.7659221474624665, + "language_loss": 0.75299627, + "learning_rate": 1.117948625548313e-06, + "loss": 0.7742455, + "num_input_tokens_seen": 235526045, + "step": 10911, + "time_per_iteration": 2.4738376140594482 + }, + { + "auxiliary_loss_clip": 0.01105677, + "auxiliary_loss_mlp": 0.01029545, + "balance_loss_clip": 1.03766096, + "balance_loss_mlp": 1.01815462, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 1.5200852465908343, + "language_loss": 0.75296658, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77431881, + "num_input_tokens_seen": 235545285, + "step": 10912, + "time_per_iteration": 2.514439821243286 + }, + { + "auxiliary_loss_clip": 0.01076924, + "auxiliary_loss_mlp": 0.00787216, + "balance_loss_clip": 1.0415169, + "balance_loss_mlp": 1.00960577, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 1.618503188776137, + "language_loss": 0.77449143, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79313278, + "num_input_tokens_seen": 235563150, + "step": 10913, + "time_per_iteration": 2.5515522956848145 + }, + { + "auxiliary_loss_clip": 0.01078895, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.03542709, + "balance_loss_mlp": 1.01988077, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 1.6684927938194365, + "language_loss": 0.71227038, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.73336875, + "num_input_tokens_seen": 235582535, + "step": 10914, + "time_per_iteration": 2.531829357147217 + }, + { + "auxiliary_loss_clip": 0.01078783, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.03844404, + "balance_loss_mlp": 1.01913905, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 1.7070527763054648, + "language_loss": 0.7402603, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76136351, + "num_input_tokens_seen": 235601490, + "step": 10915, + "time_per_iteration": 2.5312304496765137 + }, + { + "auxiliary_loss_clip": 0.01067605, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.0364567, + "balance_loss_mlp": 1.01738238, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 1.5556434291678525, + "language_loss": 0.79454994, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.81553173, + "num_input_tokens_seen": 235619165, + "step": 10916, + "time_per_iteration": 2.5691590309143066 + }, + { + "auxiliary_loss_clip": 0.01087859, + "auxiliary_loss_mlp": 0.01036089, + "balance_loss_clip": 1.03686178, + "balance_loss_mlp": 1.0242393, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 1.8932857043353615, + "language_loss": 0.76368535, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78492486, + "num_input_tokens_seen": 235637115, + "step": 10917, + "time_per_iteration": 2.4964098930358887 + }, + { + "auxiliary_loss_clip": 0.01108189, + "auxiliary_loss_mlp": 0.00785681, + "balance_loss_clip": 1.03817725, + "balance_loss_mlp": 1.01213837, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 2.058053847025102, + "language_loss": 0.70270252, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.72164118, + "num_input_tokens_seen": 235656330, + "step": 10918, + "time_per_iteration": 2.5073118209838867 + }, + { + "auxiliary_loss_clip": 0.01073582, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.03773141, + "balance_loss_mlp": 1.02238274, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.717615659092299, + "language_loss": 0.76589179, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78696871, + "num_input_tokens_seen": 235674510, + "step": 10919, + "time_per_iteration": 2.5614089965820312 + }, + { + "auxiliary_loss_clip": 0.01033647, + "auxiliary_loss_mlp": 0.00769247, + "balance_loss_clip": 1.01979208, + "balance_loss_mlp": 1.010167, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.8189887772813239, + "language_loss": 0.53056657, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.54859543, + "num_input_tokens_seen": 235735050, + "step": 10920, + "time_per_iteration": 3.1434290409088135 + }, + { + "auxiliary_loss_clip": 0.0109809, + "auxiliary_loss_mlp": 0.01029961, + "balance_loss_clip": 1.03791666, + "balance_loss_mlp": 1.01754498, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 1.7048351079846842, + "language_loss": 0.65675789, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.67803842, + "num_input_tokens_seen": 235757545, + "step": 10921, + "time_per_iteration": 2.5793631076812744 + }, + { + "auxiliary_loss_clip": 0.01083497, + "auxiliary_loss_mlp": 0.01040231, + "balance_loss_clip": 1.0362246, + "balance_loss_mlp": 1.02535915, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 1.9742450489124095, + "language_loss": 0.81682491, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83806217, + "num_input_tokens_seen": 235777265, + "step": 10922, + "time_per_iteration": 2.5383758544921875 + }, + { + "auxiliary_loss_clip": 0.01061919, + "auxiliary_loss_mlp": 0.00786362, + "balance_loss_clip": 1.0395447, + "balance_loss_mlp": 1.00730646, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 2.6797044190952404, + "language_loss": 0.71179086, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.73027366, + "num_input_tokens_seen": 235796565, + "step": 10923, + "time_per_iteration": 2.6199796199798584 + }, + { + "auxiliary_loss_clip": 0.01074722, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.03949857, + "balance_loss_mlp": 1.01933503, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 2.1300040269042864, + "language_loss": 0.81046969, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.83153129, + "num_input_tokens_seen": 235814805, + "step": 10924, + "time_per_iteration": 3.9857194423675537 + }, + { + "auxiliary_loss_clip": 0.01094173, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.03714895, + "balance_loss_mlp": 1.01877975, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 1.5318290311240728, + "language_loss": 0.72245705, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74370289, + "num_input_tokens_seen": 235833405, + "step": 10925, + "time_per_iteration": 2.5227954387664795 + }, + { + "auxiliary_loss_clip": 0.01096496, + "auxiliary_loss_mlp": 0.01028031, + "balance_loss_clip": 1.03675818, + "balance_loss_mlp": 1.0155493, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 2.1032170221013704, + "language_loss": 0.72379535, + "learning_rate": 1.112709300197942e-06, + "loss": 0.74504066, + "num_input_tokens_seen": 235848530, + "step": 10926, + "time_per_iteration": 2.454610824584961 + }, + { + "auxiliary_loss_clip": 0.01072578, + "auxiliary_loss_mlp": 0.01032557, + "balance_loss_clip": 1.03776121, + "balance_loss_mlp": 1.01945615, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.6901319476694325, + "language_loss": 0.72691488, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74796629, + "num_input_tokens_seen": 235867225, + "step": 10927, + "time_per_iteration": 2.5982627868652344 + }, + { + "auxiliary_loss_clip": 0.0101282, + "auxiliary_loss_mlp": 0.01002606, + "balance_loss_clip": 1.0197196, + "balance_loss_mlp": 1.00136662, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7425789246489861, + "language_loss": 0.64459002, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66474426, + "num_input_tokens_seen": 235932925, + "step": 10928, + "time_per_iteration": 3.1639184951782227 + }, + { + "auxiliary_loss_clip": 0.01097391, + "auxiliary_loss_mlp": 0.01031812, + "balance_loss_clip": 1.0370152, + "balance_loss_mlp": 1.01933718, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 1.8127148322452193, + "language_loss": 0.778157, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.79944897, + "num_input_tokens_seen": 235952680, + "step": 10929, + "time_per_iteration": 2.5535881519317627 + }, + { + "auxiliary_loss_clip": 0.01076061, + "auxiliary_loss_mlp": 0.01032997, + "balance_loss_clip": 1.03775978, + "balance_loss_mlp": 1.02052212, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 1.8692325043586049, + "language_loss": 0.65370613, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.6747967, + "num_input_tokens_seen": 235972075, + "step": 10930, + "time_per_iteration": 2.599954605102539 + }, + { + "auxiliary_loss_clip": 0.01062329, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.03582382, + "balance_loss_mlp": 1.01740372, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 1.540139726926366, + "language_loss": 0.70585549, + "learning_rate": 1.110964538515258e-06, + "loss": 0.72678411, + "num_input_tokens_seen": 235990340, + "step": 10931, + "time_per_iteration": 2.588287591934204 + }, + { + "auxiliary_loss_clip": 0.01066136, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.04021871, + "balance_loss_mlp": 1.02209318, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 1.9249751670693578, + "language_loss": 0.68712294, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.70812333, + "num_input_tokens_seen": 236007470, + "step": 10932, + "time_per_iteration": 3.9705886840820312 + }, + { + "auxiliary_loss_clip": 0.01083894, + "auxiliary_loss_mlp": 0.00784373, + "balance_loss_clip": 1.03421426, + "balance_loss_mlp": 1.00894582, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 1.7262791622039473, + "language_loss": 0.80009705, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.81877971, + "num_input_tokens_seen": 236029030, + "step": 10933, + "time_per_iteration": 2.6912357807159424 + }, + { + "auxiliary_loss_clip": 0.01062026, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.03975034, + "balance_loss_mlp": 1.01804948, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 1.577479343975404, + "language_loss": 0.7354871, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.75641948, + "num_input_tokens_seen": 236047160, + "step": 10934, + "time_per_iteration": 2.59609317779541 + }, + { + "auxiliary_loss_clip": 0.01092295, + "auxiliary_loss_mlp": 0.01042411, + "balance_loss_clip": 1.03622735, + "balance_loss_mlp": 1.02836251, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.5900709566484772, + "language_loss": 0.76386952, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78521657, + "num_input_tokens_seen": 236069215, + "step": 10935, + "time_per_iteration": 4.12696099281311 + }, + { + "auxiliary_loss_clip": 0.01071261, + "auxiliary_loss_mlp": 0.01040486, + "balance_loss_clip": 1.03728843, + "balance_loss_mlp": 1.02656865, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 1.7686229124657173, + "language_loss": 0.78033817, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.80145562, + "num_input_tokens_seen": 236088335, + "step": 10936, + "time_per_iteration": 2.5832505226135254 + }, + { + "auxiliary_loss_clip": 0.01069666, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.03751242, + "balance_loss_mlp": 1.01660132, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 1.840250693405559, + "language_loss": 0.69199264, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71297562, + "num_input_tokens_seen": 236108540, + "step": 10937, + "time_per_iteration": 2.595842123031616 + }, + { + "auxiliary_loss_clip": 0.01083317, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.0394367, + "balance_loss_mlp": 1.02059031, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 2.211503383760117, + "language_loss": 0.69073159, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.7118957, + "num_input_tokens_seen": 236124495, + "step": 10938, + "time_per_iteration": 2.5007216930389404 + }, + { + "auxiliary_loss_clip": 0.01082918, + "auxiliary_loss_mlp": 0.01032266, + "balance_loss_clip": 1.03909218, + "balance_loss_mlp": 1.01921892, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 1.997515160961871, + "language_loss": 0.71224135, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73339319, + "num_input_tokens_seen": 236142550, + "step": 10939, + "time_per_iteration": 2.52908992767334 + }, + { + "auxiliary_loss_clip": 0.01088779, + "auxiliary_loss_mlp": 0.0078919, + "balance_loss_clip": 1.04016244, + "balance_loss_mlp": 1.01674581, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 1.9809672850798477, + "language_loss": 0.77577031, + "learning_rate": 1.107826092473037e-06, + "loss": 0.79455006, + "num_input_tokens_seen": 236156620, + "step": 10940, + "time_per_iteration": 2.5060229301452637 + }, + { + "auxiliary_loss_clip": 0.01067396, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.03545451, + "balance_loss_mlp": 1.01865435, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 1.8593915106700138, + "language_loss": 0.68431807, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70530993, + "num_input_tokens_seen": 236177095, + "step": 10941, + "time_per_iteration": 2.7031853199005127 + }, + { + "auxiliary_loss_clip": 0.01093888, + "auxiliary_loss_mlp": 0.00785908, + "balance_loss_clip": 1.03533781, + "balance_loss_mlp": 1.01122284, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 1.9365051030933063, + "language_loss": 0.68396783, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.70276582, + "num_input_tokens_seen": 236194695, + "step": 10942, + "time_per_iteration": 2.5266947746276855 + }, + { + "auxiliary_loss_clip": 0.01079372, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.03797984, + "balance_loss_mlp": 1.02235544, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 1.78271150059956, + "language_loss": 0.71440423, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73556513, + "num_input_tokens_seen": 236213885, + "step": 10943, + "time_per_iteration": 2.539088726043701 + }, + { + "auxiliary_loss_clip": 0.01072207, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.03580832, + "balance_loss_mlp": 1.02075434, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 1.6602153534335515, + "language_loss": 0.5902738, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61133307, + "num_input_tokens_seen": 236237315, + "step": 10944, + "time_per_iteration": 2.6308932304382324 + }, + { + "auxiliary_loss_clip": 0.01102569, + "auxiliary_loss_mlp": 0.01034697, + "balance_loss_clip": 1.03838277, + "balance_loss_mlp": 1.02163219, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.5134433290499227, + "language_loss": 0.72436273, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74573541, + "num_input_tokens_seen": 236256345, + "step": 10945, + "time_per_iteration": 2.5506606101989746 + }, + { + "auxiliary_loss_clip": 0.01087112, + "auxiliary_loss_mlp": 0.01026005, + "balance_loss_clip": 1.03854847, + "balance_loss_mlp": 1.01456702, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.4777038696822193, + "language_loss": 0.70577073, + "learning_rate": 1.105735316926046e-06, + "loss": 0.72690189, + "num_input_tokens_seen": 236281890, + "step": 10946, + "time_per_iteration": 2.727125406265259 + }, + { + "auxiliary_loss_clip": 0.01098843, + "auxiliary_loss_mlp": 0.01032271, + "balance_loss_clip": 1.03856599, + "balance_loss_mlp": 1.01964712, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 2.0722928857646834, + "language_loss": 0.81608349, + "learning_rate": 1.105386972944934e-06, + "loss": 0.8373946, + "num_input_tokens_seen": 236298370, + "step": 10947, + "time_per_iteration": 2.5034046173095703 + }, + { + "auxiliary_loss_clip": 0.01054511, + "auxiliary_loss_mlp": 0.00786713, + "balance_loss_clip": 1.03518009, + "balance_loss_mlp": 1.00965214, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 1.5638099324036803, + "language_loss": 0.77447677, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.792889, + "num_input_tokens_seen": 236317380, + "step": 10948, + "time_per_iteration": 2.620025157928467 + }, + { + "auxiliary_loss_clip": 0.01099287, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.03937781, + "balance_loss_mlp": 1.0159955, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 1.634848404571932, + "language_loss": 0.78899598, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81026942, + "num_input_tokens_seen": 236336210, + "step": 10949, + "time_per_iteration": 2.5271012783050537 + }, + { + "auxiliary_loss_clip": 0.01032543, + "auxiliary_loss_mlp": 0.01001961, + "balance_loss_clip": 1.01848471, + "balance_loss_mlp": 1.00080478, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7377619949118958, + "language_loss": 0.61851352, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63885856, + "num_input_tokens_seen": 236403090, + "step": 10950, + "time_per_iteration": 3.187147617340088 + }, + { + "auxiliary_loss_clip": 0.01096108, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.03713667, + "balance_loss_mlp": 1.02284098, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 2.149396029143752, + "language_loss": 0.66765654, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.68895811, + "num_input_tokens_seen": 236420475, + "step": 10951, + "time_per_iteration": 2.4940176010131836 + }, + { + "auxiliary_loss_clip": 0.01096812, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.03847194, + "balance_loss_mlp": 1.01919723, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.3849748252602445, + "language_loss": 0.76362801, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.78490806, + "num_input_tokens_seen": 236441915, + "step": 10952, + "time_per_iteration": 2.5679867267608643 + }, + { + "auxiliary_loss_clip": 0.01107508, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.03843665, + "balance_loss_mlp": 1.01596212, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.7050995072644481, + "language_loss": 0.73508406, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75644064, + "num_input_tokens_seen": 236460340, + "step": 10953, + "time_per_iteration": 2.4455037117004395 + }, + { + "auxiliary_loss_clip": 0.01077469, + "auxiliary_loss_mlp": 0.01036487, + "balance_loss_clip": 1.03841233, + "balance_loss_mlp": 1.02426815, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 2.0280863441706964, + "language_loss": 0.78416699, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80530655, + "num_input_tokens_seen": 236478280, + "step": 10954, + "time_per_iteration": 2.594600200653076 + }, + { + "auxiliary_loss_clip": 0.01086415, + "auxiliary_loss_mlp": 0.01037053, + "balance_loss_clip": 1.03639007, + "balance_loss_mlp": 1.02402949, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 2.2078788133817437, + "language_loss": 0.6941635, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71539819, + "num_input_tokens_seen": 236493225, + "step": 10955, + "time_per_iteration": 2.506420135498047 + }, + { + "auxiliary_loss_clip": 0.01078707, + "auxiliary_loss_mlp": 0.01032871, + "balance_loss_clip": 1.03514743, + "balance_loss_mlp": 1.02122378, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 2.0094008943933144, + "language_loss": 0.80655712, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.8276729, + "num_input_tokens_seen": 236514420, + "step": 10956, + "time_per_iteration": 2.539672374725342 + }, + { + "auxiliary_loss_clip": 0.01095897, + "auxiliary_loss_mlp": 0.01035682, + "balance_loss_clip": 1.03782034, + "balance_loss_mlp": 1.02274764, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 3.2955298818046432, + "language_loss": 0.81297946, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83429521, + "num_input_tokens_seen": 236532785, + "step": 10957, + "time_per_iteration": 2.5078375339508057 + }, + { + "auxiliary_loss_clip": 0.01085278, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.03803444, + "balance_loss_mlp": 1.02090049, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.700017406455416, + "language_loss": 0.75681484, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.77798969, + "num_input_tokens_seen": 236553330, + "step": 10958, + "time_per_iteration": 2.7244529724121094 + }, + { + "auxiliary_loss_clip": 0.01066663, + "auxiliary_loss_mlp": 0.01041742, + "balance_loss_clip": 1.03698802, + "balance_loss_mlp": 1.02769351, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.6301790002542313, + "language_loss": 0.75097644, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.77206051, + "num_input_tokens_seen": 236572960, + "step": 10959, + "time_per_iteration": 2.553173303604126 + }, + { + "auxiliary_loss_clip": 0.01099147, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.03878355, + "balance_loss_mlp": 1.01620483, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 1.4609028550256833, + "language_loss": 0.65063661, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.67190957, + "num_input_tokens_seen": 236594090, + "step": 10960, + "time_per_iteration": 2.518650531768799 + }, + { + "auxiliary_loss_clip": 0.011138, + "auxiliary_loss_mlp": 0.01034029, + "balance_loss_clip": 1.03985524, + "balance_loss_mlp": 1.02090454, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 2.0655555520332776, + "language_loss": 0.81823611, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.83971441, + "num_input_tokens_seen": 236610190, + "step": 10961, + "time_per_iteration": 2.4724037647247314 + }, + { + "auxiliary_loss_clip": 0.0107201, + "auxiliary_loss_mlp": 0.01028116, + "balance_loss_clip": 1.0380106, + "balance_loss_mlp": 1.01555717, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 1.646902415267453, + "language_loss": 0.73463553, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.75563681, + "num_input_tokens_seen": 236631575, + "step": 10962, + "time_per_iteration": 3.992591142654419 + }, + { + "auxiliary_loss_clip": 0.01092617, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.03636169, + "balance_loss_mlp": 1.02105141, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 1.9481164769281438, + "language_loss": 0.79391396, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.81517303, + "num_input_tokens_seen": 236649815, + "step": 10963, + "time_per_iteration": 2.4816064834594727 + }, + { + "auxiliary_loss_clip": 0.0106407, + "auxiliary_loss_mlp": 0.00783774, + "balance_loss_clip": 1.03766823, + "balance_loss_mlp": 1.00918508, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 1.703441493194395, + "language_loss": 0.78184438, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.80032277, + "num_input_tokens_seen": 236668335, + "step": 10964, + "time_per_iteration": 2.5914885997772217 + }, + { + "auxiliary_loss_clip": 0.010701, + "auxiliary_loss_mlp": 0.01036323, + "balance_loss_clip": 1.03348804, + "balance_loss_mlp": 1.02393115, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 1.632598999353539, + "language_loss": 0.74086773, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.7619319, + "num_input_tokens_seen": 236688945, + "step": 10965, + "time_per_iteration": 2.5860815048217773 + }, + { + "auxiliary_loss_clip": 0.01077072, + "auxiliary_loss_mlp": 0.01039815, + "balance_loss_clip": 1.03657007, + "balance_loss_mlp": 1.0260818, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 2.0081459045722636, + "language_loss": 0.7359612, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.75713003, + "num_input_tokens_seen": 236707055, + "step": 10966, + "time_per_iteration": 2.5395796298980713 + }, + { + "auxiliary_loss_clip": 0.01098246, + "auxiliary_loss_mlp": 0.01030968, + "balance_loss_clip": 1.03718257, + "balance_loss_mlp": 1.018296, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.5918891907391046, + "language_loss": 0.76943457, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79072672, + "num_input_tokens_seen": 236725900, + "step": 10967, + "time_per_iteration": 2.531558036804199 + }, + { + "auxiliary_loss_clip": 0.01034609, + "auxiliary_loss_mlp": 0.01002903, + "balance_loss_clip": 1.02028227, + "balance_loss_mlp": 1.0016278, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 0.6972305191951089, + "language_loss": 0.48431277, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50468791, + "num_input_tokens_seen": 236788415, + "step": 10968, + "time_per_iteration": 3.092557191848755 + }, + { + "auxiliary_loss_clip": 0.01062771, + "auxiliary_loss_mlp": 0.01039081, + "balance_loss_clip": 1.03428519, + "balance_loss_mlp": 1.02474022, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 1.79877549657663, + "language_loss": 0.7931813, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.81419981, + "num_input_tokens_seen": 236805155, + "step": 10969, + "time_per_iteration": 2.597667694091797 + }, + { + "auxiliary_loss_clip": 0.01098388, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.03732538, + "balance_loss_mlp": 1.01818681, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 1.9422096989428075, + "language_loss": 0.65215999, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.67344868, + "num_input_tokens_seen": 236824360, + "step": 10970, + "time_per_iteration": 3.9430458545684814 + }, + { + "auxiliary_loss_clip": 0.01094226, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.03527927, + "balance_loss_mlp": 1.01595521, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.5786732364178528, + "language_loss": 0.76502132, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.7862463, + "num_input_tokens_seen": 236844640, + "step": 10971, + "time_per_iteration": 2.514813184738159 + }, + { + "auxiliary_loss_clip": 0.01043378, + "auxiliary_loss_mlp": 0.01044852, + "balance_loss_clip": 1.0335182, + "balance_loss_mlp": 1.03017163, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 3.5379035853757523, + "language_loss": 0.70206261, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72294486, + "num_input_tokens_seen": 236861160, + "step": 10972, + "time_per_iteration": 2.5806121826171875 + }, + { + "auxiliary_loss_clip": 0.01096231, + "auxiliary_loss_mlp": 0.01025676, + "balance_loss_clip": 1.03761888, + "balance_loss_mlp": 1.01261687, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 1.594671924616411, + "language_loss": 0.55900061, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.58021963, + "num_input_tokens_seen": 236880465, + "step": 10973, + "time_per_iteration": 4.008740186691284 + }, + { + "auxiliary_loss_clip": 0.01091342, + "auxiliary_loss_mlp": 0.01042363, + "balance_loss_clip": 1.03891706, + "balance_loss_mlp": 1.03013206, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 1.892908870824964, + "language_loss": 0.78921151, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.81054848, + "num_input_tokens_seen": 236897730, + "step": 10974, + "time_per_iteration": 2.502997398376465 + }, + { + "auxiliary_loss_clip": 0.01094204, + "auxiliary_loss_mlp": 0.01036404, + "balance_loss_clip": 1.03799474, + "balance_loss_mlp": 1.0244056, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.1299921944402223, + "language_loss": 0.68948513, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.71079123, + "num_input_tokens_seen": 236917300, + "step": 10975, + "time_per_iteration": 2.5102145671844482 + }, + { + "auxiliary_loss_clip": 0.01098361, + "auxiliary_loss_mlp": 0.01027822, + "balance_loss_clip": 1.03691709, + "balance_loss_mlp": 1.01613367, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 1.611104817529319, + "language_loss": 0.70939487, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.73065674, + "num_input_tokens_seen": 236935590, + "step": 10976, + "time_per_iteration": 2.5006821155548096 + }, + { + "auxiliary_loss_clip": 0.01078949, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.03654623, + "balance_loss_mlp": 1.02143455, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 1.9683526658730834, + "language_loss": 0.67455268, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69568229, + "num_input_tokens_seen": 236952830, + "step": 10977, + "time_per_iteration": 2.516997814178467 + }, + { + "auxiliary_loss_clip": 0.01075723, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.03831661, + "balance_loss_mlp": 1.01985645, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 2.5127972780395926, + "language_loss": 0.81452751, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83562243, + "num_input_tokens_seen": 236971930, + "step": 10978, + "time_per_iteration": 2.5371835231781006 + }, + { + "auxiliary_loss_clip": 0.01080499, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.03906441, + "balance_loss_mlp": 1.02377236, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 2.1973208368420263, + "language_loss": 0.67483437, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69600898, + "num_input_tokens_seen": 236989920, + "step": 10979, + "time_per_iteration": 2.510268449783325 + }, + { + "auxiliary_loss_clip": 0.01078525, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.03651333, + "balance_loss_mlp": 1.01692748, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 2.309782370268834, + "language_loss": 0.73387527, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.75496256, + "num_input_tokens_seen": 237006570, + "step": 10980, + "time_per_iteration": 2.5269112586975098 + }, + { + "auxiliary_loss_clip": 0.01068587, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.03577161, + "balance_loss_mlp": 1.01847219, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.539564115176228, + "language_loss": 0.72533095, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.74631172, + "num_input_tokens_seen": 237028415, + "step": 10981, + "time_per_iteration": 2.6143224239349365 + }, + { + "auxiliary_loss_clip": 0.01066544, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.04045057, + "balance_loss_mlp": 1.02226067, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 2.029004689529742, + "language_loss": 0.68557507, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.70658672, + "num_input_tokens_seen": 237046595, + "step": 10982, + "time_per_iteration": 2.671036720275879 + }, + { + "auxiliary_loss_clip": 0.01099073, + "auxiliary_loss_mlp": 0.01032544, + "balance_loss_clip": 1.039433, + "balance_loss_mlp": 1.02007449, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 1.535531817469894, + "language_loss": 0.69561803, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71693426, + "num_input_tokens_seen": 237066150, + "step": 10983, + "time_per_iteration": 2.4844815731048584 + }, + { + "auxiliary_loss_clip": 0.01099264, + "auxiliary_loss_mlp": 0.0103055, + "balance_loss_clip": 1.03642571, + "balance_loss_mlp": 1.0176934, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 1.7446803785531146, + "language_loss": 0.70136803, + "learning_rate": 1.092522205413239e-06, + "loss": 0.72266614, + "num_input_tokens_seen": 237087060, + "step": 10984, + "time_per_iteration": 2.595243215560913 + }, + { + "auxiliary_loss_clip": 0.01074969, + "auxiliary_loss_mlp": 0.01033822, + "balance_loss_clip": 1.03625524, + "balance_loss_mlp": 1.02173424, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.6302502547189304, + "language_loss": 0.8385613, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.85964918, + "num_input_tokens_seen": 237103825, + "step": 10985, + "time_per_iteration": 2.531421184539795 + }, + { + "auxiliary_loss_clip": 0.01099241, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.03824401, + "balance_loss_mlp": 1.01939893, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 2.035015959733069, + "language_loss": 0.74134773, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.76266241, + "num_input_tokens_seen": 237121740, + "step": 10986, + "time_per_iteration": 2.5107126235961914 + }, + { + "auxiliary_loss_clip": 0.01098411, + "auxiliary_loss_mlp": 0.01031647, + "balance_loss_clip": 1.03912783, + "balance_loss_mlp": 1.01910043, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 1.8256307821213793, + "language_loss": 0.79115045, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.812451, + "num_input_tokens_seen": 237139565, + "step": 10987, + "time_per_iteration": 2.4756245613098145 + }, + { + "auxiliary_loss_clip": 0.01017669, + "auxiliary_loss_mlp": 0.01002607, + "balance_loss_clip": 1.01724267, + "balance_loss_mlp": 1.00105715, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.8249888511591914, + "language_loss": 0.54088306, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56108594, + "num_input_tokens_seen": 237201055, + "step": 10988, + "time_per_iteration": 3.2313458919525146 + }, + { + "auxiliary_loss_clip": 0.01050108, + "auxiliary_loss_mlp": 0.0103833, + "balance_loss_clip": 1.03544879, + "balance_loss_mlp": 1.0265702, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 1.58414147408499, + "language_loss": 0.77414286, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79502726, + "num_input_tokens_seen": 237221805, + "step": 10989, + "time_per_iteration": 2.672549247741699 + }, + { + "auxiliary_loss_clip": 0.01086907, + "auxiliary_loss_mlp": 0.01035778, + "balance_loss_clip": 1.03882933, + "balance_loss_mlp": 1.0236845, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 3.8991220155516277, + "language_loss": 0.77416521, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.7953921, + "num_input_tokens_seen": 237238270, + "step": 10990, + "time_per_iteration": 2.5019192695617676 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01031841, + "balance_loss_clip": 1.03877926, + "balance_loss_mlp": 1.01921678, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 1.890721713600211, + "language_loss": 0.60847914, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.62991005, + "num_input_tokens_seen": 237255400, + "step": 10991, + "time_per_iteration": 2.426070213317871 + }, + { + "auxiliary_loss_clip": 0.01085545, + "auxiliary_loss_mlp": 0.01039386, + "balance_loss_clip": 1.03754497, + "balance_loss_mlp": 1.02638626, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 2.351906829554656, + "language_loss": 0.68451601, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70576537, + "num_input_tokens_seen": 237273105, + "step": 10992, + "time_per_iteration": 2.526390552520752 + }, + { + "auxiliary_loss_clip": 0.01097595, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.03999448, + "balance_loss_mlp": 1.01879311, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 1.692606113014454, + "language_loss": 0.87810957, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.89939868, + "num_input_tokens_seen": 237292650, + "step": 10993, + "time_per_iteration": 2.509467840194702 + }, + { + "auxiliary_loss_clip": 0.01106376, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.03956771, + "balance_loss_mlp": 1.01949263, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 1.586754515859154, + "language_loss": 0.6688354, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69023716, + "num_input_tokens_seen": 237312865, + "step": 10994, + "time_per_iteration": 2.5339748859405518 + }, + { + "auxiliary_loss_clip": 0.01075443, + "auxiliary_loss_mlp": 0.01036637, + "balance_loss_clip": 1.03810287, + "balance_loss_mlp": 1.02291608, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 1.6643150612945448, + "language_loss": 0.76312196, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.78424281, + "num_input_tokens_seen": 237331210, + "step": 10995, + "time_per_iteration": 2.6006290912628174 + }, + { + "auxiliary_loss_clip": 0.01088144, + "auxiliary_loss_mlp": 0.01028232, + "balance_loss_clip": 1.03749371, + "balance_loss_mlp": 1.01662111, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 1.8493694949902377, + "language_loss": 0.74405807, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76522183, + "num_input_tokens_seen": 237349455, + "step": 10996, + "time_per_iteration": 2.5358595848083496 + }, + { + "auxiliary_loss_clip": 0.01110505, + "auxiliary_loss_mlp": 0.01035082, + "balance_loss_clip": 1.04009736, + "balance_loss_mlp": 1.02275598, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.6543055538088762, + "language_loss": 0.68897629, + "learning_rate": 1.088013301487126e-06, + "loss": 0.71043217, + "num_input_tokens_seen": 237367100, + "step": 10997, + "time_per_iteration": 2.463078498840332 + }, + { + "auxiliary_loss_clip": 0.01089983, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.03695679, + "balance_loss_mlp": 1.02201223, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 2.4279057244130757, + "language_loss": 0.68274689, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.70398498, + "num_input_tokens_seen": 237384840, + "step": 10998, + "time_per_iteration": 2.513188600540161 + }, + { + "auxiliary_loss_clip": 0.01033112, + "auxiliary_loss_mlp": 0.01001796, + "balance_loss_clip": 1.01902902, + "balance_loss_mlp": 1.00069368, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.6504061895015423, + "language_loss": 0.51181149, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53216058, + "num_input_tokens_seen": 237443355, + "step": 10999, + "time_per_iteration": 3.0710206031799316 + }, + { + "auxiliary_loss_clip": 0.01112354, + "auxiliary_loss_mlp": 0.00784465, + "balance_loss_clip": 1.03849626, + "balance_loss_mlp": 1.00969279, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 2.3532159853796895, + "language_loss": 0.70549917, + "learning_rate": 1.086973614127679e-06, + "loss": 0.72446728, + "num_input_tokens_seen": 237459205, + "step": 11000, + "time_per_iteration": 2.4705307483673096 + }, + { + "auxiliary_loss_clip": 0.01084651, + "auxiliary_loss_mlp": 0.01036257, + "balance_loss_clip": 1.03694868, + "balance_loss_mlp": 1.02486074, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 1.4885447331513868, + "language_loss": 0.65230799, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67351711, + "num_input_tokens_seen": 237483580, + "step": 11001, + "time_per_iteration": 4.037245035171509 + }, + { + "auxiliary_loss_clip": 0.01107428, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.03702545, + "balance_loss_mlp": 1.01818371, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 1.8165935099746786, + "language_loss": 0.72964859, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75102627, + "num_input_tokens_seen": 237502860, + "step": 11002, + "time_per_iteration": 2.4819676876068115 + }, + { + "auxiliary_loss_clip": 0.01092983, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.0361222, + "balance_loss_mlp": 1.02177238, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 2.25590901041263, + "language_loss": 0.78587663, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.80715454, + "num_input_tokens_seen": 237521030, + "step": 11003, + "time_per_iteration": 2.5022425651550293 + }, + { + "auxiliary_loss_clip": 0.01099959, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.03900003, + "balance_loss_mlp": 1.0221833, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 1.859887957516743, + "language_loss": 0.68861866, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.70997298, + "num_input_tokens_seen": 237539585, + "step": 11004, + "time_per_iteration": 2.502073287963867 + }, + { + "auxiliary_loss_clip": 0.01100187, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.03811502, + "balance_loss_mlp": 1.02441776, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 2.132041449999643, + "language_loss": 0.69262278, + "learning_rate": 1.085241494478132e-06, + "loss": 0.71400309, + "num_input_tokens_seen": 237557655, + "step": 11005, + "time_per_iteration": 2.4770278930664062 + }, + { + "auxiliary_loss_clip": 0.01086684, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.03623819, + "balance_loss_mlp": 1.01651168, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 1.620151244122595, + "language_loss": 0.78177696, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80292898, + "num_input_tokens_seen": 237577000, + "step": 11006, + "time_per_iteration": 2.5543947219848633 + }, + { + "auxiliary_loss_clip": 0.01096718, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.03731656, + "balance_loss_mlp": 1.01916933, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 1.436141257845966, + "language_loss": 0.76022017, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78150177, + "num_input_tokens_seen": 237597960, + "step": 11007, + "time_per_iteration": 2.5301594734191895 + }, + { + "auxiliary_loss_clip": 0.01095554, + "auxiliary_loss_mlp": 0.01027678, + "balance_loss_clip": 1.03835046, + "balance_loss_mlp": 1.01569152, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.6843715653485822, + "language_loss": 0.78426909, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.8055014, + "num_input_tokens_seen": 237616385, + "step": 11008, + "time_per_iteration": 2.5143046379089355 + }, + { + "auxiliary_loss_clip": 0.01112487, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.0380621, + "balance_loss_mlp": 1.01972055, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 2.35417988748547, + "language_loss": 0.81828797, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.83974171, + "num_input_tokens_seen": 237634930, + "step": 11009, + "time_per_iteration": 3.8380331993103027 + }, + { + "auxiliary_loss_clip": 0.01024452, + "auxiliary_loss_mlp": 0.01005524, + "balance_loss_clip": 1.02895141, + "balance_loss_mlp": 1.00392616, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 0.9750546269005811, + "language_loss": 0.67333871, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.6936385, + "num_input_tokens_seen": 237693175, + "step": 11010, + "time_per_iteration": 3.09132719039917 + }, + { + "auxiliary_loss_clip": 0.01097992, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.03670752, + "balance_loss_mlp": 1.02033472, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.545539040673936, + "language_loss": 0.71293044, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73424703, + "num_input_tokens_seen": 237713160, + "step": 11011, + "time_per_iteration": 3.877072334289551 + }, + { + "auxiliary_loss_clip": 0.01099008, + "auxiliary_loss_mlp": 0.0103112, + "balance_loss_clip": 1.04056048, + "balance_loss_mlp": 1.01999807, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.5140888882876735, + "language_loss": 0.7239567, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74525791, + "num_input_tokens_seen": 237733600, + "step": 11012, + "time_per_iteration": 2.5689635276794434 + }, + { + "auxiliary_loss_clip": 0.01091247, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.03635955, + "balance_loss_mlp": 1.01941633, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.5496519389432861, + "language_loss": 0.79415476, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.81536877, + "num_input_tokens_seen": 237752135, + "step": 11013, + "time_per_iteration": 2.5208241939544678 + }, + { + "auxiliary_loss_clip": 0.01085436, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.03705502, + "balance_loss_mlp": 1.01847315, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 2.580843846316289, + "language_loss": 0.70719492, + "learning_rate": 1.082125865538971e-06, + "loss": 0.72835106, + "num_input_tokens_seen": 237770735, + "step": 11014, + "time_per_iteration": 2.5338551998138428 + }, + { + "auxiliary_loss_clip": 0.01078431, + "auxiliary_loss_mlp": 0.00781704, + "balance_loss_clip": 1.04077458, + "balance_loss_mlp": 1.00886154, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 1.7434265747445956, + "language_loss": 0.77246261, + "learning_rate": 1.081779858400137e-06, + "loss": 0.7910639, + "num_input_tokens_seen": 237789005, + "step": 11015, + "time_per_iteration": 2.538248300552368 + }, + { + "auxiliary_loss_clip": 0.01096627, + "auxiliary_loss_mlp": 0.00783727, + "balance_loss_clip": 1.03737009, + "balance_loss_mlp": 1.00835776, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 1.738101777870775, + "language_loss": 0.81962752, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.83843106, + "num_input_tokens_seen": 237807740, + "step": 11016, + "time_per_iteration": 2.5041842460632324 + }, + { + "auxiliary_loss_clip": 0.01092416, + "auxiliary_loss_mlp": 0.01033888, + "balance_loss_clip": 1.03647399, + "balance_loss_mlp": 1.02165079, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 1.986037931733407, + "language_loss": 0.69949079, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.72075379, + "num_input_tokens_seen": 237826340, + "step": 11017, + "time_per_iteration": 2.519240379333496 + }, + { + "auxiliary_loss_clip": 0.01078409, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.03462887, + "balance_loss_mlp": 1.02642465, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.7244389799511692, + "language_loss": 0.77387106, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79504204, + "num_input_tokens_seen": 237848305, + "step": 11018, + "time_per_iteration": 2.7897450923919678 + }, + { + "auxiliary_loss_clip": 0.01087543, + "auxiliary_loss_mlp": 0.01041037, + "balance_loss_clip": 1.03831017, + "balance_loss_mlp": 1.02868092, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 2.200270648644647, + "language_loss": 0.83262616, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.853912, + "num_input_tokens_seen": 237867020, + "step": 11019, + "time_per_iteration": 2.530020236968994 + }, + { + "auxiliary_loss_clip": 0.01090986, + "auxiliary_loss_mlp": 0.00786273, + "balance_loss_clip": 1.0362401, + "balance_loss_mlp": 1.009444, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 1.578391139759986, + "language_loss": 0.71927679, + "learning_rate": 1.080050345253328e-06, + "loss": 0.73804933, + "num_input_tokens_seen": 237886710, + "step": 11020, + "time_per_iteration": 2.504007339477539 + }, + { + "auxiliary_loss_clip": 0.0108644, + "auxiliary_loss_mlp": 0.01029977, + "balance_loss_clip": 1.03569579, + "balance_loss_mlp": 1.01604128, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 1.8676620977001952, + "language_loss": 0.72544914, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74661332, + "num_input_tokens_seen": 237904795, + "step": 11021, + "time_per_iteration": 2.541767120361328 + }, + { + "auxiliary_loss_clip": 0.01083561, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.03970146, + "balance_loss_mlp": 1.01971245, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 2.0392094865709107, + "language_loss": 0.83303922, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85419738, + "num_input_tokens_seen": 237921320, + "step": 11022, + "time_per_iteration": 2.5115621089935303 + }, + { + "auxiliary_loss_clip": 0.01091535, + "auxiliary_loss_mlp": 0.0103249, + "balance_loss_clip": 1.03603125, + "balance_loss_mlp": 1.01925802, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 2.2559057639958957, + "language_loss": 0.72427523, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.74551547, + "num_input_tokens_seen": 237933525, + "step": 11023, + "time_per_iteration": 2.4849395751953125 + }, + { + "auxiliary_loss_clip": 0.01071707, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.03268158, + "balance_loss_mlp": 1.02000427, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 1.7089693147990586, + "language_loss": 0.74997479, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.77101046, + "num_input_tokens_seen": 237953395, + "step": 11024, + "time_per_iteration": 2.554080009460449 + }, + { + "auxiliary_loss_clip": 0.0107883, + "auxiliary_loss_mlp": 0.01033115, + "balance_loss_clip": 1.03908741, + "balance_loss_mlp": 1.02029371, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.9515079458586246, + "language_loss": 0.69405419, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71517366, + "num_input_tokens_seen": 237971445, + "step": 11025, + "time_per_iteration": 2.5423531532287598 + }, + { + "auxiliary_loss_clip": 0.01109895, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.03895485, + "balance_loss_mlp": 1.02103114, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.5456591667701003, + "language_loss": 0.78641146, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.80784494, + "num_input_tokens_seen": 237989965, + "step": 11026, + "time_per_iteration": 2.4615278244018555 + }, + { + "auxiliary_loss_clip": 0.01097427, + "auxiliary_loss_mlp": 0.01032279, + "balance_loss_clip": 1.03814387, + "balance_loss_mlp": 1.02080536, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.7348636602386804, + "language_loss": 0.75763988, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.77893698, + "num_input_tokens_seen": 238006820, + "step": 11027, + "time_per_iteration": 2.508775234222412 + }, + { + "auxiliary_loss_clip": 0.01083159, + "auxiliary_loss_mlp": 0.01037634, + "balance_loss_clip": 1.03754854, + "balance_loss_mlp": 1.02429414, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 2.4881292857626076, + "language_loss": 0.70050722, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72171509, + "num_input_tokens_seen": 238022560, + "step": 11028, + "time_per_iteration": 2.508983612060547 + }, + { + "auxiliary_loss_clip": 0.01096318, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.03727007, + "balance_loss_mlp": 1.02498734, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 2.036728257625756, + "language_loss": 0.79152519, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.81284463, + "num_input_tokens_seen": 238041895, + "step": 11029, + "time_per_iteration": 2.5053725242614746 + }, + { + "auxiliary_loss_clip": 0.01109523, + "auxiliary_loss_mlp": 0.01034906, + "balance_loss_clip": 1.03683639, + "balance_loss_mlp": 1.02207947, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 2.0924422070046576, + "language_loss": 0.76275468, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.784199, + "num_input_tokens_seen": 238060445, + "step": 11030, + "time_per_iteration": 2.427781581878662 + }, + { + "auxiliary_loss_clip": 0.0110572, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.04026532, + "balance_loss_mlp": 1.01993585, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 2.484757292142552, + "language_loss": 0.74786896, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.76925427, + "num_input_tokens_seen": 238077080, + "step": 11031, + "time_per_iteration": 2.4635274410247803 + }, + { + "auxiliary_loss_clip": 0.01099601, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.03786683, + "balance_loss_mlp": 1.02336836, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 3.4373314916005393, + "language_loss": 0.75015366, + "learning_rate": 1.075903075048228e-06, + "loss": 0.77150607, + "num_input_tokens_seen": 238091045, + "step": 11032, + "time_per_iteration": 2.4324967861175537 + }, + { + "auxiliary_loss_clip": 0.01065898, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.03565323, + "balance_loss_mlp": 1.02117372, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 1.664783140786834, + "language_loss": 0.80556256, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82655191, + "num_input_tokens_seen": 238110220, + "step": 11033, + "time_per_iteration": 2.5921905040740967 + }, + { + "auxiliary_loss_clip": 0.01087237, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.03588367, + "balance_loss_mlp": 1.01797116, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 1.7960508557916515, + "language_loss": 0.80459327, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82576585, + "num_input_tokens_seen": 238130400, + "step": 11034, + "time_per_iteration": 2.5333621501922607 + }, + { + "auxiliary_loss_clip": 0.01094604, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.04068053, + "balance_loss_mlp": 1.01717472, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.5659032669100699, + "language_loss": 0.75768536, + "learning_rate": 1.074867045054166e-06, + "loss": 0.77891409, + "num_input_tokens_seen": 238148165, + "step": 11035, + "time_per_iteration": 2.515777349472046 + }, + { + "auxiliary_loss_clip": 0.01075631, + "auxiliary_loss_mlp": 0.0102893, + "balance_loss_clip": 1.03583193, + "balance_loss_mlp": 1.01657438, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 1.8389129377232585, + "language_loss": 0.83159602, + "learning_rate": 1.074521771867622e-06, + "loss": 0.85264158, + "num_input_tokens_seen": 238166360, + "step": 11036, + "time_per_iteration": 2.5286686420440674 + }, + { + "auxiliary_loss_clip": 0.01041688, + "auxiliary_loss_mlp": 0.0101001, + "balance_loss_clip": 1.0179472, + "balance_loss_mlp": 1.00891924, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.7784713802701722, + "language_loss": 0.52358103, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54409802, + "num_input_tokens_seen": 238227630, + "step": 11037, + "time_per_iteration": 3.066641092300415 + }, + { + "auxiliary_loss_clip": 0.01061705, + "auxiliary_loss_mlp": 0.01036086, + "balance_loss_clip": 1.03859568, + "balance_loss_mlp": 1.0231874, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 1.626875357304339, + "language_loss": 0.78969097, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.81066883, + "num_input_tokens_seen": 238248435, + "step": 11038, + "time_per_iteration": 2.6506268978118896 + }, + { + "auxiliary_loss_clip": 0.01076779, + "auxiliary_loss_mlp": 0.01039656, + "balance_loss_clip": 1.03708959, + "balance_loss_mlp": 1.02632225, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 2.178367291989814, + "language_loss": 0.64003748, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66120183, + "num_input_tokens_seen": 238268755, + "step": 11039, + "time_per_iteration": 2.708106279373169 + }, + { + "auxiliary_loss_clip": 0.01077766, + "auxiliary_loss_mlp": 0.01027431, + "balance_loss_clip": 1.03969169, + "balance_loss_mlp": 1.01515269, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 1.4294960898227393, + "language_loss": 0.63908958, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.66014159, + "num_input_tokens_seen": 238290120, + "step": 11040, + "time_per_iteration": 3.9671413898468018 + }, + { + "auxiliary_loss_clip": 0.01069837, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.03451109, + "balance_loss_mlp": 1.02609932, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 2.5208647684566343, + "language_loss": 0.71539247, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.73648739, + "num_input_tokens_seen": 238309290, + "step": 11041, + "time_per_iteration": 2.5506534576416016 + }, + { + "auxiliary_loss_clip": 0.01093377, + "auxiliary_loss_mlp": 0.0104439, + "balance_loss_clip": 1.03661895, + "balance_loss_mlp": 1.02985275, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 2.1595034504289075, + "language_loss": 0.62019277, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.64157045, + "num_input_tokens_seen": 238327280, + "step": 11042, + "time_per_iteration": 2.5637094974517822 + }, + { + "auxiliary_loss_clip": 0.01100338, + "auxiliary_loss_mlp": 0.0103117, + "balance_loss_clip": 1.03690696, + "balance_loss_mlp": 1.01724625, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 1.7953385432722393, + "language_loss": 0.68618917, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.70750421, + "num_input_tokens_seen": 238346330, + "step": 11043, + "time_per_iteration": 2.5357120037078857 + }, + { + "auxiliary_loss_clip": 0.01095319, + "auxiliary_loss_mlp": 0.01027001, + "balance_loss_clip": 1.0392592, + "balance_loss_mlp": 1.0164752, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.663911642267484, + "language_loss": 0.83953547, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.86075866, + "num_input_tokens_seen": 238364650, + "step": 11044, + "time_per_iteration": 2.5406494140625 + }, + { + "auxiliary_loss_clip": 0.01070382, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.03660154, + "balance_loss_mlp": 1.01787353, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 2.271812081657629, + "language_loss": 0.69502258, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71603417, + "num_input_tokens_seen": 238381630, + "step": 11045, + "time_per_iteration": 2.541288375854492 + }, + { + "auxiliary_loss_clip": 0.01099832, + "auxiliary_loss_mlp": 0.01028612, + "balance_loss_clip": 1.03987265, + "balance_loss_mlp": 1.01607108, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 1.5296019298084123, + "language_loss": 0.64230257, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.66358709, + "num_input_tokens_seen": 238402595, + "step": 11046, + "time_per_iteration": 2.5263280868530273 + }, + { + "auxiliary_loss_clip": 0.01078289, + "auxiliary_loss_mlp": 0.01029634, + "balance_loss_clip": 1.03760195, + "balance_loss_mlp": 1.01658607, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.4711934492700298, + "language_loss": 0.71332347, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73440266, + "num_input_tokens_seen": 238426860, + "step": 11047, + "time_per_iteration": 4.136700391769409 + }, + { + "auxiliary_loss_clip": 0.01051542, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.04150629, + "balance_loss_mlp": 1.02262414, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 1.9708782645392695, + "language_loss": 0.77218825, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79305273, + "num_input_tokens_seen": 238443990, + "step": 11048, + "time_per_iteration": 4.022372007369995 + }, + { + "auxiliary_loss_clip": 0.01017733, + "auxiliary_loss_mlp": 0.01005796, + "balance_loss_clip": 1.02388501, + "balance_loss_mlp": 1.00467575, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.7538568889327667, + "language_loss": 0.55069649, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57093179, + "num_input_tokens_seen": 238503045, + "step": 11049, + "time_per_iteration": 3.184821605682373 + }, + { + "auxiliary_loss_clip": 0.01098711, + "auxiliary_loss_mlp": 0.01026587, + "balance_loss_clip": 1.03860414, + "balance_loss_mlp": 1.01542914, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 1.6827277619146914, + "language_loss": 0.6425401, + "learning_rate": 1.069691638104648e-06, + "loss": 0.66379309, + "num_input_tokens_seen": 238527320, + "step": 11050, + "time_per_iteration": 2.5646986961364746 + }, + { + "auxiliary_loss_clip": 0.01107558, + "auxiliary_loss_mlp": 0.01027664, + "balance_loss_clip": 1.03929281, + "balance_loss_mlp": 1.01625538, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 1.9107587604432172, + "language_loss": 0.78804231, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.80939454, + "num_input_tokens_seen": 238546030, + "step": 11051, + "time_per_iteration": 3.8733208179473877 + }, + { + "auxiliary_loss_clip": 0.01087765, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.04077959, + "balance_loss_mlp": 1.02090096, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 1.6879672573824887, + "language_loss": 0.85401672, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87522209, + "num_input_tokens_seen": 238564175, + "step": 11052, + "time_per_iteration": 2.525153160095215 + }, + { + "auxiliary_loss_clip": 0.01062033, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.03534222, + "balance_loss_mlp": 1.02130437, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.2837350206064007, + "language_loss": 0.74819767, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.76916742, + "num_input_tokens_seen": 238581010, + "step": 11053, + "time_per_iteration": 2.5611822605133057 + }, + { + "auxiliary_loss_clip": 0.01086221, + "auxiliary_loss_mlp": 0.01026306, + "balance_loss_clip": 1.03843653, + "balance_loss_mlp": 1.01485026, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 1.5354267960615073, + "language_loss": 0.79526347, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81638873, + "num_input_tokens_seen": 238601365, + "step": 11054, + "time_per_iteration": 2.547175407409668 + }, + { + "auxiliary_loss_clip": 0.01061832, + "auxiliary_loss_mlp": 0.01027393, + "balance_loss_clip": 1.03673625, + "balance_loss_mlp": 1.01603222, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 1.5082398458964315, + "language_loss": 0.73926282, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76015508, + "num_input_tokens_seen": 238619850, + "step": 11055, + "time_per_iteration": 2.5629475116729736 + }, + { + "auxiliary_loss_clip": 0.01074713, + "auxiliary_loss_mlp": 0.01037022, + "balance_loss_clip": 1.03642583, + "balance_loss_mlp": 1.02378964, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 1.7578777973609383, + "language_loss": 0.73113978, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.75225711, + "num_input_tokens_seen": 238637635, + "step": 11056, + "time_per_iteration": 2.5518975257873535 + }, + { + "auxiliary_loss_clip": 0.01065975, + "auxiliary_loss_mlp": 0.01029609, + "balance_loss_clip": 1.03763461, + "balance_loss_mlp": 1.0172236, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 1.7264258080387986, + "language_loss": 0.69815093, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.71910679, + "num_input_tokens_seen": 238656200, + "step": 11057, + "time_per_iteration": 2.587862014770508 + }, + { + "auxiliary_loss_clip": 0.01098117, + "auxiliary_loss_mlp": 0.01030395, + "balance_loss_clip": 1.03796077, + "balance_loss_mlp": 1.01761603, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 1.8080603369582222, + "language_loss": 0.80389333, + "learning_rate": 1.066934663776291e-06, + "loss": 0.82517844, + "num_input_tokens_seen": 238675005, + "step": 11058, + "time_per_iteration": 2.5208160877227783 + }, + { + "auxiliary_loss_clip": 0.01017268, + "auxiliary_loss_mlp": 0.01000923, + "balance_loss_clip": 1.02322757, + "balance_loss_mlp": 0.9997074, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.8122905214309895, + "language_loss": 0.62603319, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64621508, + "num_input_tokens_seen": 238731425, + "step": 11059, + "time_per_iteration": 3.0559463500976562 + }, + { + "auxiliary_loss_clip": 0.01096188, + "auxiliary_loss_mlp": 0.01034315, + "balance_loss_clip": 1.03824413, + "balance_loss_mlp": 1.02291262, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 1.4885684902407454, + "language_loss": 0.78889346, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.81019849, + "num_input_tokens_seen": 238752020, + "step": 11060, + "time_per_iteration": 2.5351386070251465 + }, + { + "auxiliary_loss_clip": 0.01074799, + "auxiliary_loss_mlp": 0.01032418, + "balance_loss_clip": 1.03679013, + "balance_loss_mlp": 1.01965082, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 3.0884884290643306, + "language_loss": 0.78827119, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.80934334, + "num_input_tokens_seen": 238769665, + "step": 11061, + "time_per_iteration": 2.53368878364563 + }, + { + "auxiliary_loss_clip": 0.01085812, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.03967547, + "balance_loss_mlp": 1.01592684, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 2.1040815979927614, + "language_loss": 0.57230723, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.59343934, + "num_input_tokens_seen": 238782180, + "step": 11062, + "time_per_iteration": 2.4920263290405273 + }, + { + "auxiliary_loss_clip": 0.01097108, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.03653121, + "balance_loss_mlp": 1.01552808, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 1.7387067973288919, + "language_loss": 0.76328224, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.78456068, + "num_input_tokens_seen": 238800315, + "step": 11063, + "time_per_iteration": 2.4727320671081543 + }, + { + "auxiliary_loss_clip": 0.01050048, + "auxiliary_loss_mlp": 0.01039573, + "balance_loss_clip": 1.03824139, + "balance_loss_mlp": 1.02701473, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 1.3284605367771636, + "language_loss": 0.70547807, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72637433, + "num_input_tokens_seen": 238822250, + "step": 11064, + "time_per_iteration": 2.6677913665771484 + }, + { + "auxiliary_loss_clip": 0.01042667, + "auxiliary_loss_mlp": 0.01003371, + "balance_loss_clip": 1.018852, + "balance_loss_mlp": 1.00215495, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8461319635708201, + "language_loss": 0.63093621, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65139657, + "num_input_tokens_seen": 238877190, + "step": 11065, + "time_per_iteration": 3.047184705734253 + }, + { + "auxiliary_loss_clip": 0.0109385, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.03674412, + "balance_loss_mlp": 1.02041245, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 1.910241746635521, + "language_loss": 0.62350047, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64477682, + "num_input_tokens_seen": 238896010, + "step": 11066, + "time_per_iteration": 2.505260467529297 + }, + { + "auxiliary_loss_clip": 0.01071102, + "auxiliary_loss_mlp": 0.01035685, + "balance_loss_clip": 1.034127, + "balance_loss_mlp": 1.02080214, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.549308035522599, + "language_loss": 0.70160532, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72267318, + "num_input_tokens_seen": 238918990, + "step": 11067, + "time_per_iteration": 2.6121559143066406 + }, + { + "auxiliary_loss_clip": 0.01029928, + "auxiliary_loss_mlp": 0.01010654, + "balance_loss_clip": 1.02550006, + "balance_loss_mlp": 1.00906897, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.9272667256793777, + "language_loss": 0.72155225, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74195802, + "num_input_tokens_seen": 238975735, + "step": 11068, + "time_per_iteration": 3.097623586654663 + }, + { + "auxiliary_loss_clip": 0.01014911, + "auxiliary_loss_mlp": 0.01003074, + "balance_loss_clip": 1.01997912, + "balance_loss_mlp": 1.00184572, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.7059489001146961, + "language_loss": 0.57854605, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.59872591, + "num_input_tokens_seen": 239042360, + "step": 11069, + "time_per_iteration": 3.2743537425994873 + }, + { + "auxiliary_loss_clip": 0.01031491, + "auxiliary_loss_mlp": 0.01002015, + "balance_loss_clip": 1.01893866, + "balance_loss_mlp": 1.00082862, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7448897092041924, + "language_loss": 0.63509452, + "learning_rate": 1.062803450204029e-06, + "loss": 0.6554296, + "num_input_tokens_seen": 239109410, + "step": 11070, + "time_per_iteration": 3.1486968994140625 + }, + { + "auxiliary_loss_clip": 0.0110611, + "auxiliary_loss_mlp": 0.01024661, + "balance_loss_clip": 1.03591967, + "balance_loss_mlp": 1.0125258, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 2.326293851220407, + "language_loss": 0.58568168, + "learning_rate": 1.062459413096116e-06, + "loss": 0.60698938, + "num_input_tokens_seen": 239135345, + "step": 11071, + "time_per_iteration": 2.6182057857513428 + }, + { + "auxiliary_loss_clip": 0.01101404, + "auxiliary_loss_mlp": 0.01026133, + "balance_loss_clip": 1.04060376, + "balance_loss_mlp": 1.01470709, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 1.8650627524177608, + "language_loss": 0.72921616, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.7504915, + "num_input_tokens_seen": 239154340, + "step": 11072, + "time_per_iteration": 2.5002121925354004 + }, + { + "auxiliary_loss_clip": 0.01098142, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.03989959, + "balance_loss_mlp": 1.01642013, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 1.7615903543181435, + "language_loss": 0.70327383, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.72454822, + "num_input_tokens_seen": 239177815, + "step": 11073, + "time_per_iteration": 2.6457648277282715 + }, + { + "auxiliary_loss_clip": 0.01078691, + "auxiliary_loss_mlp": 0.0102946, + "balance_loss_clip": 1.03865123, + "balance_loss_mlp": 1.01669288, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 2.325918437425113, + "language_loss": 0.5627259, + "learning_rate": 1.061427515134354e-06, + "loss": 0.58380747, + "num_input_tokens_seen": 239195735, + "step": 11074, + "time_per_iteration": 2.539134979248047 + }, + { + "auxiliary_loss_clip": 0.01110839, + "auxiliary_loss_mlp": 0.00786026, + "balance_loss_clip": 1.04087174, + "balance_loss_mlp": 1.0130589, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 1.5143850287098277, + "language_loss": 0.72126311, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74023175, + "num_input_tokens_seen": 239217535, + "step": 11075, + "time_per_iteration": 2.583421468734741 + }, + { + "auxiliary_loss_clip": 0.01093605, + "auxiliary_loss_mlp": 0.01029748, + "balance_loss_clip": 1.03642213, + "balance_loss_mlp": 1.0184288, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.4088291285426227, + "language_loss": 0.65885389, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.68008739, + "num_input_tokens_seen": 239241975, + "step": 11076, + "time_per_iteration": 2.6276655197143555 + }, + { + "auxiliary_loss_clip": 0.01085386, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.03583133, + "balance_loss_mlp": 1.01777601, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 2.2159325474363185, + "language_loss": 0.75381356, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77497762, + "num_input_tokens_seen": 239262025, + "step": 11077, + "time_per_iteration": 2.557206392288208 + }, + { + "auxiliary_loss_clip": 0.01085709, + "auxiliary_loss_mlp": 0.01027927, + "balance_loss_clip": 1.03738892, + "balance_loss_mlp": 1.01607215, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 1.5968885040484282, + "language_loss": 0.66666043, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.68779683, + "num_input_tokens_seen": 239282775, + "step": 11078, + "time_per_iteration": 2.5548598766326904 + }, + { + "auxiliary_loss_clip": 0.01109768, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.03803039, + "balance_loss_mlp": 1.0168916, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 1.850906655953811, + "language_loss": 0.69527125, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.71666586, + "num_input_tokens_seen": 239299775, + "step": 11079, + "time_per_iteration": 3.8391008377075195 + }, + { + "auxiliary_loss_clip": 0.01084683, + "auxiliary_loss_mlp": 0.01026764, + "balance_loss_clip": 1.03612328, + "balance_loss_mlp": 1.01481938, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.4989782896625081, + "language_loss": 0.8039813, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.82509577, + "num_input_tokens_seen": 239319660, + "step": 11080, + "time_per_iteration": 2.5860471725463867 + }, + { + "auxiliary_loss_clip": 0.01072455, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.03668118, + "balance_loss_mlp": 1.01608622, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 1.72490820734142, + "language_loss": 0.78342462, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80441761, + "num_input_tokens_seen": 239339215, + "step": 11081, + "time_per_iteration": 2.5940327644348145 + }, + { + "auxiliary_loss_clip": 0.01071467, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.03891361, + "balance_loss_mlp": 1.02035475, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 2.251565942930042, + "language_loss": 0.80400741, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.82507259, + "num_input_tokens_seen": 239358545, + "step": 11082, + "time_per_iteration": 2.5947303771972656 + }, + { + "auxiliary_loss_clip": 0.01077123, + "auxiliary_loss_mlp": 0.01030396, + "balance_loss_clip": 1.03947341, + "balance_loss_mlp": 1.01901186, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.5600028838695947, + "language_loss": 0.84018499, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86126018, + "num_input_tokens_seen": 239376665, + "step": 11083, + "time_per_iteration": 2.6363441944122314 + }, + { + "auxiliary_loss_clip": 0.01077011, + "auxiliary_loss_mlp": 0.01030233, + "balance_loss_clip": 1.0433321, + "balance_loss_mlp": 1.01742387, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 2.426734344852017, + "language_loss": 0.85635322, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87742567, + "num_input_tokens_seen": 239394345, + "step": 11084, + "time_per_iteration": 2.5958895683288574 + }, + { + "auxiliary_loss_clip": 0.01091009, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.03904784, + "balance_loss_mlp": 1.01581621, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 2.3770810926584423, + "language_loss": 0.73197442, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.75317675, + "num_input_tokens_seen": 239410605, + "step": 11085, + "time_per_iteration": 2.534353256225586 + }, + { + "auxiliary_loss_clip": 0.01085899, + "auxiliary_loss_mlp": 0.01030976, + "balance_loss_clip": 1.03704047, + "balance_loss_mlp": 1.01878667, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 1.8523175316190241, + "language_loss": 0.80661798, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82778674, + "num_input_tokens_seen": 239427155, + "step": 11086, + "time_per_iteration": 3.9244210720062256 + }, + { + "auxiliary_loss_clip": 0.01082954, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.03553081, + "balance_loss_mlp": 1.02252233, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 1.9839264829117593, + "language_loss": 0.75059915, + "learning_rate": 1.056959663258702e-06, + "loss": 0.77177727, + "num_input_tokens_seen": 239445510, + "step": 11087, + "time_per_iteration": 3.9119529724121094 + }, + { + "auxiliary_loss_clip": 0.01096081, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.03673339, + "balance_loss_mlp": 1.01684153, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 1.69722909199316, + "language_loss": 0.6493963, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.67064899, + "num_input_tokens_seen": 239464805, + "step": 11088, + "time_per_iteration": 2.5081703662872314 + }, + { + "auxiliary_loss_clip": 0.01096813, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.03666997, + "balance_loss_mlp": 1.01424813, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 1.8340074928453254, + "language_loss": 0.63883829, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66008186, + "num_input_tokens_seen": 239483890, + "step": 11089, + "time_per_iteration": 3.9034316539764404 + }, + { + "auxiliary_loss_clip": 0.01107341, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.03779531, + "balance_loss_mlp": 1.01869845, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 2.19155075225952, + "language_loss": 0.80883372, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.8302108, + "num_input_tokens_seen": 239500080, + "step": 11090, + "time_per_iteration": 2.430241823196411 + }, + { + "auxiliary_loss_clip": 0.01088615, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.03666914, + "balance_loss_mlp": 1.0170784, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 2.0814703782893886, + "language_loss": 0.77926624, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.8004483, + "num_input_tokens_seen": 239517335, + "step": 11091, + "time_per_iteration": 2.498779058456421 + }, + { + "auxiliary_loss_clip": 0.01107187, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.03669667, + "balance_loss_mlp": 1.02038991, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 1.852239748468378, + "language_loss": 0.79544377, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81684566, + "num_input_tokens_seen": 239536240, + "step": 11092, + "time_per_iteration": 2.4648191928863525 + }, + { + "auxiliary_loss_clip": 0.01018619, + "auxiliary_loss_mlp": 0.01004018, + "balance_loss_clip": 1.02090716, + "balance_loss_mlp": 1.00262916, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.7547449259343832, + "language_loss": 0.57736588, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.59759223, + "num_input_tokens_seen": 239598000, + "step": 11093, + "time_per_iteration": 3.1754696369171143 + }, + { + "auxiliary_loss_clip": 0.01107076, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.03759003, + "balance_loss_mlp": 1.01833153, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 1.5235225362191867, + "language_loss": 0.76402473, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78540367, + "num_input_tokens_seen": 239617650, + "step": 11094, + "time_per_iteration": 2.5357391834259033 + }, + { + "auxiliary_loss_clip": 0.01109025, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.03803003, + "balance_loss_mlp": 1.01992083, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 2.0519537655987357, + "language_loss": 0.73284245, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75426763, + "num_input_tokens_seen": 239639825, + "step": 11095, + "time_per_iteration": 2.547170877456665 + }, + { + "auxiliary_loss_clip": 0.01097626, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.03813982, + "balance_loss_mlp": 1.02246368, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 2.158558505714864, + "language_loss": 0.73227227, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75359583, + "num_input_tokens_seen": 239656300, + "step": 11096, + "time_per_iteration": 2.5008816719055176 + }, + { + "auxiliary_loss_clip": 0.01066157, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.03745914, + "balance_loss_mlp": 1.01955879, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 1.8789209083310365, + "language_loss": 0.6456641, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66664493, + "num_input_tokens_seen": 239676655, + "step": 11097, + "time_per_iteration": 2.567314624786377 + }, + { + "auxiliary_loss_clip": 0.01096721, + "auxiliary_loss_mlp": 0.01034365, + "balance_loss_clip": 1.0366118, + "balance_loss_mlp": 1.02208662, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 2.0577321443636003, + "language_loss": 0.75588703, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77719796, + "num_input_tokens_seen": 239695430, + "step": 11098, + "time_per_iteration": 2.5067646503448486 + }, + { + "auxiliary_loss_clip": 0.01110228, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.03892994, + "balance_loss_mlp": 1.01721382, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.5330941094557409, + "language_loss": 0.74252093, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76391256, + "num_input_tokens_seen": 239717070, + "step": 11099, + "time_per_iteration": 2.5015766620635986 + }, + { + "auxiliary_loss_clip": 0.01095709, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.03664899, + "balance_loss_mlp": 1.02033794, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 2.0439684907817783, + "language_loss": 0.78070068, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80198306, + "num_input_tokens_seen": 239737105, + "step": 11100, + "time_per_iteration": 2.518545389175415 + }, + { + "auxiliary_loss_clip": 0.01106245, + "auxiliary_loss_mlp": 0.01038531, + "balance_loss_clip": 1.03745496, + "balance_loss_mlp": 1.02649665, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 2.4292090260575248, + "language_loss": 0.6000495, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62149727, + "num_input_tokens_seen": 239757835, + "step": 11101, + "time_per_iteration": 2.5018982887268066 + }, + { + "auxiliary_loss_clip": 0.01092704, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.03760481, + "balance_loss_mlp": 1.01976132, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 1.9021233140857503, + "language_loss": 0.71461767, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73587102, + "num_input_tokens_seen": 239775425, + "step": 11102, + "time_per_iteration": 2.5586724281311035 + }, + { + "auxiliary_loss_clip": 0.01098808, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.03743768, + "balance_loss_mlp": 1.01751864, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 1.7679899271873711, + "language_loss": 0.84191072, + "learning_rate": 1.051469068021034e-06, + "loss": 0.86319995, + "num_input_tokens_seen": 239794605, + "step": 11103, + "time_per_iteration": 2.4873523712158203 + }, + { + "auxiliary_loss_clip": 0.0108739, + "auxiliary_loss_mlp": 0.01026331, + "balance_loss_clip": 1.0367136, + "balance_loss_mlp": 1.01423693, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 1.9992358746769094, + "language_loss": 0.78408754, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.80522472, + "num_input_tokens_seen": 239812135, + "step": 11104, + "time_per_iteration": 2.5060653686523438 + }, + { + "auxiliary_loss_clip": 0.0106754, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.04049158, + "balance_loss_mlp": 1.01853347, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 1.6015175838503426, + "language_loss": 0.58029187, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60127389, + "num_input_tokens_seen": 239835845, + "step": 11105, + "time_per_iteration": 2.7381420135498047 + }, + { + "auxiliary_loss_clip": 0.01101919, + "auxiliary_loss_mlp": 0.01036095, + "balance_loss_clip": 1.03843141, + "balance_loss_mlp": 1.02258813, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 1.7268760904705092, + "language_loss": 0.73059368, + "learning_rate": 1.0504406049066e-06, + "loss": 0.75197375, + "num_input_tokens_seen": 239853820, + "step": 11106, + "time_per_iteration": 2.5066568851470947 + }, + { + "auxiliary_loss_clip": 0.01108837, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.03845429, + "balance_loss_mlp": 1.01892936, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 1.6293079266028472, + "language_loss": 0.7678858, + "learning_rate": 1.0500978558659e-06, + "loss": 0.78928888, + "num_input_tokens_seen": 239873365, + "step": 11107, + "time_per_iteration": 2.513930559158325 + }, + { + "auxiliary_loss_clip": 0.0108366, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.03562307, + "balance_loss_mlp": 1.01665592, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.1193948039654833, + "language_loss": 0.89753294, + "learning_rate": 1.049755142845583e-06, + "loss": 0.91866016, + "num_input_tokens_seen": 239891215, + "step": 11108, + "time_per_iteration": 2.533521890640259 + }, + { + "auxiliary_loss_clip": 0.01075058, + "auxiliary_loss_mlp": 0.01022584, + "balance_loss_clip": 1.0388515, + "balance_loss_mlp": 1.01208735, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.4242363988399898, + "language_loss": 0.82517898, + "learning_rate": 1.049412465858646e-06, + "loss": 0.84615535, + "num_input_tokens_seen": 239913490, + "step": 11109, + "time_per_iteration": 2.6941442489624023 + }, + { + "auxiliary_loss_clip": 0.01081675, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.03705513, + "balance_loss_mlp": 1.01865733, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 2.116081089081043, + "language_loss": 0.69058001, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.71171516, + "num_input_tokens_seen": 239931565, + "step": 11110, + "time_per_iteration": 2.605433225631714 + }, + { + "auxiliary_loss_clip": 0.01082559, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.04038692, + "balance_loss_mlp": 1.01716423, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 1.4191915130798025, + "language_loss": 0.73227501, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75340688, + "num_input_tokens_seen": 239952395, + "step": 11111, + "time_per_iteration": 2.594424247741699 + }, + { + "auxiliary_loss_clip": 0.01106634, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.03738177, + "balance_loss_mlp": 1.0181911, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 1.9700593284828976, + "language_loss": 0.64723182, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.6685974, + "num_input_tokens_seen": 239968910, + "step": 11112, + "time_per_iteration": 2.466550588607788 + }, + { + "auxiliary_loss_clip": 0.01082668, + "auxiliary_loss_mlp": 0.01029341, + "balance_loss_clip": 1.03659797, + "balance_loss_mlp": 1.01683021, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 2.1642911536883163, + "language_loss": 0.63146424, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65258431, + "num_input_tokens_seen": 239987680, + "step": 11113, + "time_per_iteration": 2.516191005706787 + }, + { + "auxiliary_loss_clip": 0.01065058, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.03824902, + "balance_loss_mlp": 1.01986599, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.7717385213509131, + "language_loss": 0.65932858, + "learning_rate": 1.047699621879422e-06, + "loss": 0.6802932, + "num_input_tokens_seen": 240005790, + "step": 11114, + "time_per_iteration": 2.608201026916504 + }, + { + "auxiliary_loss_clip": 0.01098472, + "auxiliary_loss_mlp": 0.0103635, + "balance_loss_clip": 1.03740478, + "balance_loss_mlp": 1.02400577, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.5397630694577742, + "language_loss": 0.78444195, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80579019, + "num_input_tokens_seen": 240025895, + "step": 11115, + "time_per_iteration": 2.5170388221740723 + }, + { + "auxiliary_loss_clip": 0.01056736, + "auxiliary_loss_mlp": 0.00786172, + "balance_loss_clip": 1.03332734, + "balance_loss_mlp": 1.00995076, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 1.7204507478249427, + "language_loss": 0.79509777, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81352687, + "num_input_tokens_seen": 240044880, + "step": 11116, + "time_per_iteration": 2.635437250137329 + }, + { + "auxiliary_loss_clip": 0.01078927, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.03794909, + "balance_loss_mlp": 1.01902342, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 1.654898173189274, + "language_loss": 0.79437375, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81548679, + "num_input_tokens_seen": 240065785, + "step": 11117, + "time_per_iteration": 2.614931106567383 + }, + { + "auxiliary_loss_clip": 0.01062651, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.03810024, + "balance_loss_mlp": 1.01727247, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 3.5624378159656036, + "language_loss": 0.65671599, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.67765278, + "num_input_tokens_seen": 240085130, + "step": 11118, + "time_per_iteration": 3.97074294090271 + }, + { + "auxiliary_loss_clip": 0.01086293, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.03691721, + "balance_loss_mlp": 1.01620054, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 1.5060460234290567, + "language_loss": 0.69316787, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.71430838, + "num_input_tokens_seen": 240105495, + "step": 11119, + "time_per_iteration": 2.642090082168579 + }, + { + "auxiliary_loss_clip": 0.01082906, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.03614581, + "balance_loss_mlp": 1.02185643, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 1.7088413246480656, + "language_loss": 0.6716072, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.69278169, + "num_input_tokens_seen": 240125455, + "step": 11120, + "time_per_iteration": 2.6110098361968994 + }, + { + "auxiliary_loss_clip": 0.01077055, + "auxiliary_loss_mlp": 0.01030823, + "balance_loss_clip": 1.03615975, + "balance_loss_mlp": 1.01772237, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 1.9286932427393801, + "language_loss": 0.7203868, + "learning_rate": 1.045303157347638e-06, + "loss": 0.74146557, + "num_input_tokens_seen": 240143870, + "step": 11121, + "time_per_iteration": 2.589672327041626 + }, + { + "auxiliary_loss_clip": 0.01087448, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.03637934, + "balance_loss_mlp": 1.02428746, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 2.592174320613198, + "language_loss": 0.69477034, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.71601439, + "num_input_tokens_seen": 240161020, + "step": 11122, + "time_per_iteration": 2.504709482192993 + }, + { + "auxiliary_loss_clip": 0.01050187, + "auxiliary_loss_mlp": 0.00788142, + "balance_loss_clip": 1.03421211, + "balance_loss_mlp": 1.01366878, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 2.153701545684978, + "language_loss": 0.71475536, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73313862, + "num_input_tokens_seen": 240179820, + "step": 11123, + "time_per_iteration": 2.652235746383667 + }, + { + "auxiliary_loss_clip": 0.01092825, + "auxiliary_loss_mlp": 0.01033687, + "balance_loss_clip": 1.04074097, + "balance_loss_mlp": 1.02096784, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 1.854922254012912, + "language_loss": 0.79357159, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81483674, + "num_input_tokens_seen": 240200130, + "step": 11124, + "time_per_iteration": 2.5575473308563232 + }, + { + "auxiliary_loss_clip": 0.01086132, + "auxiliary_loss_mlp": 0.01039241, + "balance_loss_clip": 1.04114175, + "balance_loss_mlp": 1.02678323, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 1.7887270666401778, + "language_loss": 0.74375939, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.7650131, + "num_input_tokens_seen": 240217945, + "step": 11125, + "time_per_iteration": 3.960653066635132 + }, + { + "auxiliary_loss_clip": 0.01082429, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.04057789, + "balance_loss_mlp": 1.02084816, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 1.7680983525976157, + "language_loss": 0.6638881, + "learning_rate": 1.043592482774116e-06, + "loss": 0.68504941, + "num_input_tokens_seen": 240237220, + "step": 11126, + "time_per_iteration": 3.9651849269866943 + }, + { + "auxiliary_loss_clip": 0.01090915, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.03428817, + "balance_loss_mlp": 1.01454997, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 1.79171889965827, + "language_loss": 0.70812583, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.72930592, + "num_input_tokens_seen": 240256000, + "step": 11127, + "time_per_iteration": 2.5190625190734863 + }, + { + "auxiliary_loss_clip": 0.01090194, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.03724146, + "balance_loss_mlp": 1.01904416, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 1.8406556428845449, + "language_loss": 0.80339694, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82462233, + "num_input_tokens_seen": 240275845, + "step": 11128, + "time_per_iteration": 3.926719903945923 + }, + { + "auxiliary_loss_clip": 0.01110719, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.03837144, + "balance_loss_mlp": 1.01971459, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 1.8637335035113767, + "language_loss": 0.81007028, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83149713, + "num_input_tokens_seen": 240294095, + "step": 11129, + "time_per_iteration": 2.5060718059539795 + }, + { + "auxiliary_loss_clip": 0.0108876, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.03524077, + "balance_loss_mlp": 1.02204108, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 1.5203831596751347, + "language_loss": 0.70229268, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72351635, + "num_input_tokens_seen": 240313460, + "step": 11130, + "time_per_iteration": 2.587913751602173 + }, + { + "auxiliary_loss_clip": 0.01084579, + "auxiliary_loss_mlp": 0.0103518, + "balance_loss_clip": 1.03711021, + "balance_loss_mlp": 1.023265, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 1.7452408168586244, + "language_loss": 0.70099163, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72218925, + "num_input_tokens_seen": 240333540, + "step": 11131, + "time_per_iteration": 2.5660438537597656 + }, + { + "auxiliary_loss_clip": 0.01099497, + "auxiliary_loss_mlp": 0.01031695, + "balance_loss_clip": 1.03750181, + "balance_loss_mlp": 1.01809371, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 2.3591237404501735, + "language_loss": 0.65898609, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.68029797, + "num_input_tokens_seen": 240350085, + "step": 11132, + "time_per_iteration": 2.4764678478240967 + }, + { + "auxiliary_loss_clip": 0.0109867, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.03604078, + "balance_loss_mlp": 1.02032971, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.7313026276371768, + "language_loss": 0.74369109, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.76501679, + "num_input_tokens_seen": 240370015, + "step": 11133, + "time_per_iteration": 2.511470317840576 + }, + { + "auxiliary_loss_clip": 0.01104507, + "auxiliary_loss_mlp": 0.01034093, + "balance_loss_clip": 1.04059196, + "balance_loss_mlp": 1.02056313, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 1.7945153149203978, + "language_loss": 0.66560668, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.68699265, + "num_input_tokens_seen": 240390770, + "step": 11134, + "time_per_iteration": 2.5532233715057373 + }, + { + "auxiliary_loss_clip": 0.01103437, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.04024124, + "balance_loss_mlp": 1.01973701, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 1.9922334018781445, + "language_loss": 0.77483219, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.7962079, + "num_input_tokens_seen": 240409590, + "step": 11135, + "time_per_iteration": 2.5475106239318848 + }, + { + "auxiliary_loss_clip": 0.01097916, + "auxiliary_loss_mlp": 0.0102759, + "balance_loss_clip": 1.0381211, + "balance_loss_mlp": 1.01518035, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.6280874162001817, + "language_loss": 0.74363661, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76489168, + "num_input_tokens_seen": 240428180, + "step": 11136, + "time_per_iteration": 2.4975924491882324 + }, + { + "auxiliary_loss_clip": 0.01103267, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.04021764, + "balance_loss_mlp": 1.01864886, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 1.6470798393037906, + "language_loss": 0.62345713, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.64480722, + "num_input_tokens_seen": 240447815, + "step": 11137, + "time_per_iteration": 2.535275459289551 + }, + { + "auxiliary_loss_clip": 0.01108315, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.03794074, + "balance_loss_mlp": 1.01772773, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 1.7154699088642178, + "language_loss": 0.65805918, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.67944688, + "num_input_tokens_seen": 240468635, + "step": 11138, + "time_per_iteration": 2.4959747791290283 + }, + { + "auxiliary_loss_clip": 0.01075254, + "auxiliary_loss_mlp": 0.0103293, + "balance_loss_clip": 1.03577018, + "balance_loss_mlp": 1.02035308, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.6244088113888004, + "language_loss": 0.72562897, + "learning_rate": 1.039148976175053e-06, + "loss": 0.74671084, + "num_input_tokens_seen": 240488550, + "step": 11139, + "time_per_iteration": 2.5799169540405273 + }, + { + "auxiliary_loss_clip": 0.01069867, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.03786778, + "balance_loss_mlp": 1.01923132, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 1.9194515136399346, + "language_loss": 0.70211923, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.72312498, + "num_input_tokens_seen": 240508330, + "step": 11140, + "time_per_iteration": 2.6189382076263428 + }, + { + "auxiliary_loss_clip": 0.01099209, + "auxiliary_loss_mlp": 0.01027442, + "balance_loss_clip": 1.03550708, + "balance_loss_mlp": 1.01453161, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 1.9749737472382, + "language_loss": 0.75714004, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.77840656, + "num_input_tokens_seen": 240528470, + "step": 11141, + "time_per_iteration": 2.5399861335754395 + }, + { + "auxiliary_loss_clip": 0.01099553, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.03879547, + "balance_loss_mlp": 1.01982749, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 1.7954899902479762, + "language_loss": 0.81750733, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.83882892, + "num_input_tokens_seen": 240547815, + "step": 11142, + "time_per_iteration": 2.5337319374084473 + }, + { + "auxiliary_loss_clip": 0.01061848, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.03777277, + "balance_loss_mlp": 1.01543319, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.5050005388668242, + "language_loss": 0.69980276, + "learning_rate": 1.037782980862959e-06, + "loss": 0.72070396, + "num_input_tokens_seen": 240567765, + "step": 11143, + "time_per_iteration": 2.594655752182007 + }, + { + "auxiliary_loss_clip": 0.01068609, + "auxiliary_loss_mlp": 0.00787856, + "balance_loss_clip": 1.03652573, + "balance_loss_mlp": 1.01380205, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.4494566555493236, + "language_loss": 0.70115811, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.71972275, + "num_input_tokens_seen": 240590750, + "step": 11144, + "time_per_iteration": 2.6190598011016846 + }, + { + "auxiliary_loss_clip": 0.01084731, + "auxiliary_loss_mlp": 0.01032379, + "balance_loss_clip": 1.03690815, + "balance_loss_mlp": 1.01941538, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.7250605210360679, + "language_loss": 0.74402547, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76519656, + "num_input_tokens_seen": 240608875, + "step": 11145, + "time_per_iteration": 2.546212673187256 + }, + { + "auxiliary_loss_clip": 0.01088137, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.03650618, + "balance_loss_mlp": 1.01631868, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 1.5400994755234974, + "language_loss": 0.70848072, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.72965389, + "num_input_tokens_seen": 240628565, + "step": 11146, + "time_per_iteration": 2.559746026992798 + }, + { + "auxiliary_loss_clip": 0.01103769, + "auxiliary_loss_mlp": 0.00784546, + "balance_loss_clip": 1.03633714, + "balance_loss_mlp": 1.01254535, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 2.2118845348916896, + "language_loss": 0.78795528, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.80683845, + "num_input_tokens_seen": 240646325, + "step": 11147, + "time_per_iteration": 2.4373953342437744 + }, + { + "auxiliary_loss_clip": 0.01099769, + "auxiliary_loss_mlp": 0.00786169, + "balance_loss_clip": 1.03881836, + "balance_loss_mlp": 1.01124632, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 1.669961269119579, + "language_loss": 0.70442593, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.72328532, + "num_input_tokens_seen": 240666145, + "step": 11148, + "time_per_iteration": 2.5195634365081787 + }, + { + "auxiliary_loss_clip": 0.01089226, + "auxiliary_loss_mlp": 0.01037473, + "balance_loss_clip": 1.03607345, + "balance_loss_mlp": 1.02494454, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 1.6725129498581126, + "language_loss": 0.70427907, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72554606, + "num_input_tokens_seen": 240685570, + "step": 11149, + "time_per_iteration": 2.5429177284240723 + }, + { + "auxiliary_loss_clip": 0.0108917, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.037588, + "balance_loss_mlp": 1.01982152, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 2.117238624376736, + "language_loss": 0.73890287, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.7601102, + "num_input_tokens_seen": 240706945, + "step": 11150, + "time_per_iteration": 2.600705146789551 + }, + { + "auxiliary_loss_clip": 0.01099246, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.04018855, + "balance_loss_mlp": 1.01926064, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 1.809909937030258, + "language_loss": 0.78517437, + "learning_rate": 1.035052742460671e-06, + "loss": 0.80648088, + "num_input_tokens_seen": 240727990, + "step": 11151, + "time_per_iteration": 2.535999298095703 + }, + { + "auxiliary_loss_clip": 0.01010281, + "auxiliary_loss_mlp": 0.01002854, + "balance_loss_clip": 1.0277344, + "balance_loss_mlp": 1.00166786, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.821624527455594, + "language_loss": 0.55449015, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.5746215, + "num_input_tokens_seen": 240790380, + "step": 11152, + "time_per_iteration": 3.3057069778442383 + }, + { + "auxiliary_loss_clip": 0.01083942, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.03544688, + "balance_loss_mlp": 1.0226357, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 1.7718278081132208, + "language_loss": 0.80944306, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.8306337, + "num_input_tokens_seen": 240811545, + "step": 11153, + "time_per_iteration": 2.570666551589966 + }, + { + "auxiliary_loss_clip": 0.0107097, + "auxiliary_loss_mlp": 0.007849, + "balance_loss_clip": 1.03694868, + "balance_loss_mlp": 1.00856972, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 1.4533893613368856, + "language_loss": 0.76416385, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.78272259, + "num_input_tokens_seen": 240831380, + "step": 11154, + "time_per_iteration": 2.573660135269165 + }, + { + "auxiliary_loss_clip": 0.01090807, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.03838861, + "balance_loss_mlp": 1.02220321, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.5533702824568059, + "language_loss": 0.76374054, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78500175, + "num_input_tokens_seen": 240851855, + "step": 11155, + "time_per_iteration": 2.5386762619018555 + }, + { + "auxiliary_loss_clip": 0.01111646, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.04007804, + "balance_loss_mlp": 1.024418, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 2.0616804940236624, + "language_loss": 0.81810671, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.83959305, + "num_input_tokens_seen": 240869980, + "step": 11156, + "time_per_iteration": 2.491286277770996 + }, + { + "auxiliary_loss_clip": 0.01107788, + "auxiliary_loss_mlp": 0.01029687, + "balance_loss_clip": 1.03756714, + "balance_loss_mlp": 1.01793313, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 2.452254137979995, + "language_loss": 0.7468152, + "learning_rate": 1.033006600114165e-06, + "loss": 0.76819003, + "num_input_tokens_seen": 240888680, + "step": 11157, + "time_per_iteration": 3.949521780014038 + }, + { + "auxiliary_loss_clip": 0.01102319, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.03972936, + "balance_loss_mlp": 1.02232265, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 1.5780989958744807, + "language_loss": 0.74441206, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76578927, + "num_input_tokens_seen": 240909050, + "step": 11158, + "time_per_iteration": 2.5311756134033203 + }, + { + "auxiliary_loss_clip": 0.01113754, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.04026949, + "balance_loss_mlp": 1.0197196, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 1.629136663987161, + "language_loss": 0.81752515, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.83898854, + "num_input_tokens_seen": 240930035, + "step": 11159, + "time_per_iteration": 2.5144901275634766 + }, + { + "auxiliary_loss_clip": 0.0108846, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.03810954, + "balance_loss_mlp": 1.01912379, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 1.690002783568212, + "language_loss": 0.76928449, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79048181, + "num_input_tokens_seen": 240948895, + "step": 11160, + "time_per_iteration": 2.541985273361206 + }, + { + "auxiliary_loss_clip": 0.01082295, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.03641176, + "balance_loss_mlp": 1.01755214, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 2.1949819872839424, + "language_loss": 0.73530984, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.75642562, + "num_input_tokens_seen": 240967770, + "step": 11161, + "time_per_iteration": 2.546900987625122 + }, + { + "auxiliary_loss_clip": 0.01089946, + "auxiliary_loss_mlp": 0.01037883, + "balance_loss_clip": 1.03725076, + "balance_loss_mlp": 1.02448392, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 1.7251392150168565, + "language_loss": 0.68347704, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70475537, + "num_input_tokens_seen": 240988985, + "step": 11162, + "time_per_iteration": 2.573518753051758 + }, + { + "auxiliary_loss_clip": 0.01084979, + "auxiliary_loss_mlp": 0.01037204, + "balance_loss_clip": 1.03518832, + "balance_loss_mlp": 1.0250144, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 1.7983279135850545, + "language_loss": 0.70118272, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72240454, + "num_input_tokens_seen": 241005455, + "step": 11163, + "time_per_iteration": 2.50148868560791 + }, + { + "auxiliary_loss_clip": 0.01108389, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.04009414, + "balance_loss_mlp": 1.0182122, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 1.7311372854804425, + "language_loss": 0.75485975, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.77624452, + "num_input_tokens_seen": 241026175, + "step": 11164, + "time_per_iteration": 4.046183347702026 + }, + { + "auxiliary_loss_clip": 0.01108915, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.03876221, + "balance_loss_mlp": 1.02166545, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 2.891102490826458, + "language_loss": 0.64820254, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.66962826, + "num_input_tokens_seen": 241044040, + "step": 11165, + "time_per_iteration": 3.8313703536987305 + }, + { + "auxiliary_loss_clip": 0.0110735, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.03859735, + "balance_loss_mlp": 1.01962554, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 1.8073188172542927, + "language_loss": 0.71737725, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.7387681, + "num_input_tokens_seen": 241063615, + "step": 11166, + "time_per_iteration": 2.4835660457611084 + }, + { + "auxiliary_loss_clip": 0.01107072, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.03940558, + "balance_loss_mlp": 1.01815522, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 1.832026511197579, + "language_loss": 0.77227235, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79363537, + "num_input_tokens_seen": 241082520, + "step": 11167, + "time_per_iteration": 3.973959445953369 + }, + { + "auxiliary_loss_clip": 0.01096591, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.03569508, + "balance_loss_mlp": 1.02395487, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 1.7576613324708539, + "language_loss": 0.69122016, + "learning_rate": 1.029258769662629e-06, + "loss": 0.71254718, + "num_input_tokens_seen": 241103505, + "step": 11168, + "time_per_iteration": 2.6422994136810303 + }, + { + "auxiliary_loss_clip": 0.01079879, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.03839016, + "balance_loss_mlp": 1.02774119, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 2.07282971909186, + "language_loss": 0.73275113, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.75395942, + "num_input_tokens_seen": 241122885, + "step": 11169, + "time_per_iteration": 2.6094915866851807 + }, + { + "auxiliary_loss_clip": 0.01102115, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.03890109, + "balance_loss_mlp": 1.02139151, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 2.0675575403293234, + "language_loss": 0.76189041, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78325611, + "num_input_tokens_seen": 241140865, + "step": 11170, + "time_per_iteration": 2.482563018798828 + }, + { + "auxiliary_loss_clip": 0.01089138, + "auxiliary_loss_mlp": 0.01028099, + "balance_loss_clip": 1.03741002, + "balance_loss_mlp": 1.01529014, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 2.003326158799613, + "language_loss": 0.74310929, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.76428163, + "num_input_tokens_seen": 241158225, + "step": 11171, + "time_per_iteration": 2.535719871520996 + }, + { + "auxiliary_loss_clip": 0.01069851, + "auxiliary_loss_mlp": 0.01041668, + "balance_loss_clip": 1.03620553, + "balance_loss_mlp": 1.027601, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 1.5155629790740002, + "language_loss": 0.86344361, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.8845588, + "num_input_tokens_seen": 241175215, + "step": 11172, + "time_per_iteration": 2.5671091079711914 + }, + { + "auxiliary_loss_clip": 0.01094401, + "auxiliary_loss_mlp": 0.01033697, + "balance_loss_clip": 1.03556895, + "balance_loss_mlp": 1.02127504, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.655590981767189, + "language_loss": 0.63643694, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.65771782, + "num_input_tokens_seen": 241195250, + "step": 11173, + "time_per_iteration": 2.5429654121398926 + }, + { + "auxiliary_loss_clip": 0.01104758, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.03741825, + "balance_loss_mlp": 1.02170753, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2.294228763400917, + "language_loss": 0.71856713, + "learning_rate": 1.02721637475002e-06, + "loss": 0.73997062, + "num_input_tokens_seen": 241210720, + "step": 11174, + "time_per_iteration": 2.4690868854522705 + }, + { + "auxiliary_loss_clip": 0.0107672, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.03739667, + "balance_loss_mlp": 1.01505446, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 1.9065860058608657, + "language_loss": 0.69085479, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.71188927, + "num_input_tokens_seen": 241227395, + "step": 11175, + "time_per_iteration": 2.550529718399048 + }, + { + "auxiliary_loss_clip": 0.01087032, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.03828883, + "balance_loss_mlp": 1.02229929, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 3.1871664522555108, + "language_loss": 0.73600304, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.75721395, + "num_input_tokens_seen": 241246355, + "step": 11176, + "time_per_iteration": 2.521174192428589 + }, + { + "auxiliary_loss_clip": 0.01086752, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.03646684, + "balance_loss_mlp": 1.01799536, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 1.6028528203342292, + "language_loss": 0.73358011, + "learning_rate": 1.026195675108182e-06, + "loss": 0.75475746, + "num_input_tokens_seen": 241264180, + "step": 11177, + "time_per_iteration": 2.5350260734558105 + }, + { + "auxiliary_loss_clip": 0.0110925, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.03796065, + "balance_loss_mlp": 1.01836669, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 2.1925361146129725, + "language_loss": 0.76340181, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78480685, + "num_input_tokens_seen": 241282245, + "step": 11178, + "time_per_iteration": 2.5118415355682373 + }, + { + "auxiliary_loss_clip": 0.01101523, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.04026687, + "balance_loss_mlp": 1.02184832, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 1.7084218941072153, + "language_loss": 0.70352155, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.72487199, + "num_input_tokens_seen": 241300745, + "step": 11179, + "time_per_iteration": 2.4907569885253906 + }, + { + "auxiliary_loss_clip": 0.01063802, + "auxiliary_loss_mlp": 0.01027477, + "balance_loss_clip": 1.03762841, + "balance_loss_mlp": 1.01594996, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.7765959475979738, + "language_loss": 0.73905778, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.75997061, + "num_input_tokens_seen": 241319320, + "step": 11180, + "time_per_iteration": 2.6004762649536133 + }, + { + "auxiliary_loss_clip": 0.01088491, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.03749859, + "balance_loss_mlp": 1.01752067, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.390113056655007, + "language_loss": 0.75353789, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.7747187, + "num_input_tokens_seen": 241342225, + "step": 11181, + "time_per_iteration": 2.5657315254211426 + }, + { + "auxiliary_loss_clip": 0.01088386, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.03632402, + "balance_loss_mlp": 1.01505351, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 2.1507330573896506, + "language_loss": 0.7449019, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76605177, + "num_input_tokens_seen": 241358240, + "step": 11182, + "time_per_iteration": 2.4705393314361572 + }, + { + "auxiliary_loss_clip": 0.01093469, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.03639483, + "balance_loss_mlp": 1.01946795, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 2.140643625195713, + "language_loss": 0.69452417, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.7157681, + "num_input_tokens_seen": 241378420, + "step": 11183, + "time_per_iteration": 2.518209934234619 + }, + { + "auxiliary_loss_clip": 0.01061801, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.03899431, + "balance_loss_mlp": 1.02020192, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 1.6451151439031801, + "language_loss": 0.77900374, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79995084, + "num_input_tokens_seen": 241397185, + "step": 11184, + "time_per_iteration": 2.590519905090332 + }, + { + "auxiliary_loss_clip": 0.01091958, + "auxiliary_loss_mlp": 0.00789843, + "balance_loss_clip": 1.04020548, + "balance_loss_mlp": 1.01178253, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 1.9979396566701118, + "language_loss": 0.66266799, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.68148601, + "num_input_tokens_seen": 241415785, + "step": 11185, + "time_per_iteration": 2.534670829772949 + }, + { + "auxiliary_loss_clip": 0.01073227, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.0356158, + "balance_loss_mlp": 1.01966286, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 1.6997577867553462, + "language_loss": 0.80412382, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82518536, + "num_input_tokens_seen": 241437390, + "step": 11186, + "time_per_iteration": 2.617849349975586 + }, + { + "auxiliary_loss_clip": 0.0109758, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.03992486, + "balance_loss_mlp": 1.02151608, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 1.434669984834181, + "language_loss": 0.79968005, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82098323, + "num_input_tokens_seen": 241458085, + "step": 11187, + "time_per_iteration": 2.5392544269561768 + }, + { + "auxiliary_loss_clip": 0.01076437, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.04419088, + "balance_loss_mlp": 1.0173707, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 1.7464528252004903, + "language_loss": 0.70637846, + "learning_rate": 1.022455955762965e-06, + "loss": 0.72744966, + "num_input_tokens_seen": 241476880, + "step": 11188, + "time_per_iteration": 2.593278646469116 + }, + { + "auxiliary_loss_clip": 0.01044573, + "auxiliary_loss_mlp": 0.01030197, + "balance_loss_clip": 1.03801847, + "balance_loss_mlp": 1.0186162, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.6492729140266218, + "language_loss": 0.75602478, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.7767725, + "num_input_tokens_seen": 241496535, + "step": 11189, + "time_per_iteration": 2.880229949951172 + }, + { + "auxiliary_loss_clip": 0.01111935, + "auxiliary_loss_mlp": 0.01030487, + "balance_loss_clip": 1.03764439, + "balance_loss_mlp": 1.01632464, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 4.000289212610063, + "language_loss": 0.75086403, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.7722882, + "num_input_tokens_seen": 241513465, + "step": 11190, + "time_per_iteration": 2.9061520099639893 + }, + { + "auxiliary_loss_clip": 0.01053457, + "auxiliary_loss_mlp": 0.01033777, + "balance_loss_clip": 1.03577077, + "balance_loss_mlp": 1.02028871, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 2.0448653418114193, + "language_loss": 0.76895911, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.7898314, + "num_input_tokens_seen": 241534125, + "step": 11191, + "time_per_iteration": 2.6471197605133057 + }, + { + "auxiliary_loss_clip": 0.01106829, + "auxiliary_loss_mlp": 0.01033093, + "balance_loss_clip": 1.03835869, + "balance_loss_mlp": 1.02099955, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 1.9418401640012881, + "language_loss": 0.86514127, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88654047, + "num_input_tokens_seen": 241556340, + "step": 11192, + "time_per_iteration": 2.539520740509033 + }, + { + "auxiliary_loss_clip": 0.010992, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.03783131, + "balance_loss_mlp": 1.02059817, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 2.047186374413518, + "language_loss": 0.75897503, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.78030598, + "num_input_tokens_seen": 241575185, + "step": 11193, + "time_per_iteration": 2.514896869659424 + }, + { + "auxiliary_loss_clip": 0.01075128, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.04095149, + "balance_loss_mlp": 1.01752877, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 1.8418265841298234, + "language_loss": 0.79120541, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.81225371, + "num_input_tokens_seen": 241592970, + "step": 11194, + "time_per_iteration": 2.5387256145477295 + }, + { + "auxiliary_loss_clip": 0.01098367, + "auxiliary_loss_mlp": 0.01027926, + "balance_loss_clip": 1.03702855, + "balance_loss_mlp": 1.01619554, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 1.8973532738663774, + "language_loss": 0.89677441, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.9180373, + "num_input_tokens_seen": 241610245, + "step": 11195, + "time_per_iteration": 3.9281036853790283 + }, + { + "auxiliary_loss_clip": 0.01098807, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.03831482, + "balance_loss_mlp": 1.01892591, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 1.667219489195078, + "language_loss": 0.72454709, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74584293, + "num_input_tokens_seen": 241630350, + "step": 11196, + "time_per_iteration": 2.579883575439453 + }, + { + "auxiliary_loss_clip": 0.00993809, + "auxiliary_loss_mlp": 0.01002569, + "balance_loss_clip": 1.02653849, + "balance_loss_mlp": 1.00146055, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.775854963192077, + "language_loss": 0.56544685, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58541059, + "num_input_tokens_seen": 241692380, + "step": 11197, + "time_per_iteration": 3.1687936782836914 + }, + { + "auxiliary_loss_clip": 0.01086007, + "auxiliary_loss_mlp": 0.0102803, + "balance_loss_clip": 1.03831542, + "balance_loss_mlp": 1.0164609, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 2.033903051069441, + "language_loss": 0.75428402, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77542436, + "num_input_tokens_seen": 241710430, + "step": 11198, + "time_per_iteration": 2.541346311569214 + }, + { + "auxiliary_loss_clip": 0.01098282, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.0363934, + "balance_loss_mlp": 1.01990926, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 4.624830922006331, + "language_loss": 0.81814289, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.83945668, + "num_input_tokens_seen": 241724775, + "step": 11199, + "time_per_iteration": 2.4509060382843018 + }, + { + "auxiliary_loss_clip": 0.0105772, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.03935146, + "balance_loss_mlp": 1.02043951, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 1.9259791812715914, + "language_loss": 0.71202707, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73294175, + "num_input_tokens_seen": 241744440, + "step": 11200, + "time_per_iteration": 2.714695930480957 + }, + { + "auxiliary_loss_clip": 0.0111199, + "auxiliary_loss_mlp": 0.0103467, + "balance_loss_clip": 1.04087305, + "balance_loss_mlp": 1.02252269, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 1.6328925533307903, + "language_loss": 0.64701486, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66848141, + "num_input_tokens_seen": 241771705, + "step": 11201, + "time_per_iteration": 2.833658456802368 + }, + { + "auxiliary_loss_clip": 0.01088358, + "auxiliary_loss_mlp": 0.01032795, + "balance_loss_clip": 1.03687835, + "balance_loss_mlp": 1.02041554, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 1.579770557438319, + "language_loss": 0.63168699, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.65289849, + "num_input_tokens_seen": 241790830, + "step": 11202, + "time_per_iteration": 3.9499635696411133 + }, + { + "auxiliary_loss_clip": 0.01109855, + "auxiliary_loss_mlp": 0.01027862, + "balance_loss_clip": 1.03839087, + "balance_loss_mlp": 1.01599455, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 1.7293273690747857, + "language_loss": 0.74743617, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76881337, + "num_input_tokens_seen": 241808165, + "step": 11203, + "time_per_iteration": 4.1642467975616455 + }, + { + "auxiliary_loss_clip": 0.01092979, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.03863263, + "balance_loss_mlp": 1.01810956, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 1.6105073564340877, + "language_loss": 0.67384911, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.69510233, + "num_input_tokens_seen": 241826925, + "step": 11204, + "time_per_iteration": 2.5578625202178955 + }, + { + "auxiliary_loss_clip": 0.01105027, + "auxiliary_loss_mlp": 0.01033115, + "balance_loss_clip": 1.04087293, + "balance_loss_mlp": 1.01978779, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 1.5526410561168842, + "language_loss": 0.73993373, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76131511, + "num_input_tokens_seen": 241845525, + "step": 11205, + "time_per_iteration": 2.5272927284240723 + }, + { + "auxiliary_loss_clip": 0.01106327, + "auxiliary_loss_mlp": 0.01032327, + "balance_loss_clip": 1.03770375, + "balance_loss_mlp": 1.02038181, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.6012801619297645, + "language_loss": 0.71229994, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73368645, + "num_input_tokens_seen": 241866815, + "step": 11206, + "time_per_iteration": 4.160272836685181 + }, + { + "auxiliary_loss_clip": 0.01077571, + "auxiliary_loss_mlp": 0.00788087, + "balance_loss_clip": 1.04172087, + "balance_loss_mlp": 1.01261926, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 2.6293643485688274, + "language_loss": 0.67332584, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69198245, + "num_input_tokens_seen": 241887050, + "step": 11207, + "time_per_iteration": 2.630535364151001 + }, + { + "auxiliary_loss_clip": 0.01057752, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.03516972, + "balance_loss_mlp": 1.02081728, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 2.0532750887548135, + "language_loss": 0.73619837, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.75710773, + "num_input_tokens_seen": 241904280, + "step": 11208, + "time_per_iteration": 2.5767581462860107 + }, + { + "auxiliary_loss_clip": 0.01094087, + "auxiliary_loss_mlp": 0.01040581, + "balance_loss_clip": 1.0358007, + "balance_loss_mlp": 1.02476788, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 1.8552214927405155, + "language_loss": 0.75483179, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.77617854, + "num_input_tokens_seen": 241919190, + "step": 11209, + "time_per_iteration": 2.4877548217773438 + }, + { + "auxiliary_loss_clip": 0.01069849, + "auxiliary_loss_mlp": 0.01027791, + "balance_loss_clip": 1.0386343, + "balance_loss_mlp": 1.01668704, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 1.7746572513346497, + "language_loss": 0.66730881, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68828523, + "num_input_tokens_seen": 241940525, + "step": 11210, + "time_per_iteration": 2.6432743072509766 + }, + { + "auxiliary_loss_clip": 0.01103598, + "auxiliary_loss_mlp": 0.01028597, + "balance_loss_clip": 1.03594232, + "balance_loss_mlp": 1.01734376, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 2.1679953652701145, + "language_loss": 0.80335391, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82467592, + "num_input_tokens_seen": 241959290, + "step": 11211, + "time_per_iteration": 2.5106260776519775 + }, + { + "auxiliary_loss_clip": 0.01057759, + "auxiliary_loss_mlp": 0.01027955, + "balance_loss_clip": 1.03544784, + "balance_loss_mlp": 1.01611114, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.3414673890790885, + "language_loss": 0.76675403, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78761113, + "num_input_tokens_seen": 241980715, + "step": 11212, + "time_per_iteration": 2.634998083114624 + }, + { + "auxiliary_loss_clip": 0.01069544, + "auxiliary_loss_mlp": 0.00789201, + "balance_loss_clip": 1.03742933, + "balance_loss_mlp": 1.01390529, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 1.5627067309024232, + "language_loss": 0.78160793, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.8001954, + "num_input_tokens_seen": 241999985, + "step": 11213, + "time_per_iteration": 2.575955629348755 + }, + { + "auxiliary_loss_clip": 0.01067286, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.04294205, + "balance_loss_mlp": 1.01846242, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 1.888988724255275, + "language_loss": 0.67686701, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.69785523, + "num_input_tokens_seen": 242018990, + "step": 11214, + "time_per_iteration": 2.5863049030303955 + }, + { + "auxiliary_loss_clip": 0.01108122, + "auxiliary_loss_mlp": 0.00785562, + "balance_loss_clip": 1.03770804, + "balance_loss_mlp": 1.01265097, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 1.8826555093447497, + "language_loss": 0.72790879, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.7468456, + "num_input_tokens_seen": 242039340, + "step": 11215, + "time_per_iteration": 2.618163824081421 + }, + { + "auxiliary_loss_clip": 0.01099039, + "auxiliary_loss_mlp": 0.00783671, + "balance_loss_clip": 1.03703034, + "balance_loss_mlp": 1.00937867, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 1.7781193511213345, + "language_loss": 0.66751736, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.6863445, + "num_input_tokens_seen": 242062215, + "step": 11216, + "time_per_iteration": 2.6357555389404297 + }, + { + "auxiliary_loss_clip": 0.01040756, + "auxiliary_loss_mlp": 0.01001271, + "balance_loss_clip": 1.01703167, + "balance_loss_mlp": 1.0001564, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6726541678347203, + "language_loss": 0.56246388, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58288413, + "num_input_tokens_seen": 242131130, + "step": 11217, + "time_per_iteration": 3.2011559009552 + }, + { + "auxiliary_loss_clip": 0.01095697, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.03691816, + "balance_loss_mlp": 1.01786625, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 2.006648604304057, + "language_loss": 0.74572873, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76698899, + "num_input_tokens_seen": 242149720, + "step": 11218, + "time_per_iteration": 2.519341468811035 + }, + { + "auxiliary_loss_clip": 0.01076575, + "auxiliary_loss_mlp": 0.01045191, + "balance_loss_clip": 1.03749061, + "balance_loss_mlp": 1.03101707, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.6833081610275735, + "language_loss": 0.66056311, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68178082, + "num_input_tokens_seen": 242168875, + "step": 11219, + "time_per_iteration": 2.5805864334106445 + }, + { + "auxiliary_loss_clip": 0.01068136, + "auxiliary_loss_mlp": 0.01044542, + "balance_loss_clip": 1.03388619, + "balance_loss_mlp": 1.02952158, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 1.6769712672294836, + "language_loss": 0.74949503, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.77062178, + "num_input_tokens_seen": 242188465, + "step": 11220, + "time_per_iteration": 2.5654799938201904 + }, + { + "auxiliary_loss_clip": 0.01093699, + "auxiliary_loss_mlp": 0.01031658, + "balance_loss_clip": 1.03836036, + "balance_loss_mlp": 1.01894999, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 1.684097534818142, + "language_loss": 0.70311868, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72437227, + "num_input_tokens_seen": 242208675, + "step": 11221, + "time_per_iteration": 2.531670570373535 + }, + { + "auxiliary_loss_clip": 0.01074835, + "auxiliary_loss_mlp": 0.01026447, + "balance_loss_clip": 1.03820896, + "balance_loss_mlp": 1.01497889, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 1.8396711416520009, + "language_loss": 0.58010447, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60111725, + "num_input_tokens_seen": 242227440, + "step": 11222, + "time_per_iteration": 2.539881944656372 + }, + { + "auxiliary_loss_clip": 0.01095852, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.03996491, + "balance_loss_mlp": 1.01926422, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 1.6803421286531328, + "language_loss": 0.76848686, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.7897619, + "num_input_tokens_seen": 242245240, + "step": 11223, + "time_per_iteration": 2.524216890335083 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.03852725, + "balance_loss_mlp": 1.01780891, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 1.6622883997081865, + "language_loss": 0.75132096, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77263439, + "num_input_tokens_seen": 242263435, + "step": 11224, + "time_per_iteration": 2.493838310241699 + }, + { + "auxiliary_loss_clip": 0.010615, + "auxiliary_loss_mlp": 0.01025783, + "balance_loss_clip": 1.03717279, + "balance_loss_mlp": 1.01547194, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 1.5439312517237889, + "language_loss": 0.6325618, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65343463, + "num_input_tokens_seen": 242282765, + "step": 11225, + "time_per_iteration": 2.5922675132751465 + }, + { + "auxiliary_loss_clip": 0.01103167, + "auxiliary_loss_mlp": 0.00781354, + "balance_loss_clip": 1.03706169, + "balance_loss_mlp": 1.00684762, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 1.8251466787149069, + "language_loss": 0.64401162, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66285682, + "num_input_tokens_seen": 242298980, + "step": 11226, + "time_per_iteration": 2.4293322563171387 + }, + { + "auxiliary_loss_clip": 0.01098274, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.03761756, + "balance_loss_mlp": 1.01703525, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.197825593419333, + "language_loss": 0.71788269, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.73916119, + "num_input_tokens_seen": 242315420, + "step": 11227, + "time_per_iteration": 2.471691846847534 + }, + { + "auxiliary_loss_clip": 0.01080051, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.03549123, + "balance_loss_mlp": 1.01935267, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 3.047396895402804, + "language_loss": 0.70984793, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73097771, + "num_input_tokens_seen": 242332805, + "step": 11228, + "time_per_iteration": 2.5047240257263184 + }, + { + "auxiliary_loss_clip": 0.01024254, + "auxiliary_loss_mlp": 0.01003042, + "balance_loss_clip": 1.02404177, + "balance_loss_mlp": 1.00190973, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7554039387089345, + "language_loss": 0.53265351, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55292648, + "num_input_tokens_seen": 242396160, + "step": 11229, + "time_per_iteration": 3.2080209255218506 + }, + { + "auxiliary_loss_clip": 0.01095109, + "auxiliary_loss_mlp": 0.01025725, + "balance_loss_clip": 1.03643477, + "balance_loss_mlp": 1.0145191, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 1.699454475147527, + "language_loss": 0.80108643, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82229471, + "num_input_tokens_seen": 242414660, + "step": 11230, + "time_per_iteration": 2.5285768508911133 + }, + { + "auxiliary_loss_clip": 0.01076092, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.0347693, + "balance_loss_mlp": 1.0200808, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.4917026545505254, + "language_loss": 0.6578297, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.67890501, + "num_input_tokens_seen": 242434225, + "step": 11231, + "time_per_iteration": 2.537249803543091 + }, + { + "auxiliary_loss_clip": 0.01068856, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.03798318, + "balance_loss_mlp": 1.02221203, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 1.8353877667159697, + "language_loss": 0.66488421, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.68592876, + "num_input_tokens_seen": 242454355, + "step": 11232, + "time_per_iteration": 2.6465675830841064 + }, + { + "auxiliary_loss_clip": 0.01072187, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.03568149, + "balance_loss_mlp": 1.01588058, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 1.8952136019121177, + "language_loss": 0.72473449, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74573767, + "num_input_tokens_seen": 242474935, + "step": 11233, + "time_per_iteration": 2.5725321769714355 + }, + { + "auxiliary_loss_clip": 0.01097944, + "auxiliary_loss_mlp": 0.01031698, + "balance_loss_clip": 1.03702009, + "balance_loss_mlp": 1.01965809, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 2.2188958137715957, + "language_loss": 0.77114558, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.79244196, + "num_input_tokens_seen": 242495530, + "step": 11234, + "time_per_iteration": 3.921712875366211 + }, + { + "auxiliary_loss_clip": 0.01106449, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.03728724, + "balance_loss_mlp": 1.01972139, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.5365311895682032, + "language_loss": 0.75671494, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77810097, + "num_input_tokens_seen": 242514550, + "step": 11235, + "time_per_iteration": 2.497377634048462 + }, + { + "auxiliary_loss_clip": 0.01031891, + "auxiliary_loss_mlp": 0.01002204, + "balance_loss_clip": 1.01776671, + "balance_loss_mlp": 1.00104797, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7804714761872624, + "language_loss": 0.51341504, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53375596, + "num_input_tokens_seen": 242569200, + "step": 11236, + "time_per_iteration": 3.0710935592651367 + }, + { + "auxiliary_loss_clip": 0.01074998, + "auxiliary_loss_mlp": 0.01027878, + "balance_loss_clip": 1.03601551, + "balance_loss_mlp": 1.0141151, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 1.8812417926536926, + "language_loss": 0.75262332, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77365208, + "num_input_tokens_seen": 242586950, + "step": 11237, + "time_per_iteration": 2.5880773067474365 + }, + { + "auxiliary_loss_clip": 0.01083009, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.03868961, + "balance_loss_mlp": 1.02909994, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 1.6963738923401783, + "language_loss": 0.77451974, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79576063, + "num_input_tokens_seen": 242607380, + "step": 11238, + "time_per_iteration": 2.6080729961395264 + }, + { + "auxiliary_loss_clip": 0.01096538, + "auxiliary_loss_mlp": 0.01034803, + "balance_loss_clip": 1.03722048, + "balance_loss_mlp": 1.0215826, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 1.8825860175814748, + "language_loss": 0.66638398, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.68769735, + "num_input_tokens_seen": 242628025, + "step": 11239, + "time_per_iteration": 2.580176830291748 + }, + { + "auxiliary_loss_clip": 0.01083033, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.0383172, + "balance_loss_mlp": 1.01654768, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 1.8279893691017335, + "language_loss": 0.82793653, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.84905487, + "num_input_tokens_seen": 242643825, + "step": 11240, + "time_per_iteration": 2.515573024749756 + }, + { + "auxiliary_loss_clip": 0.01083185, + "auxiliary_loss_mlp": 0.01033271, + "balance_loss_clip": 1.0386827, + "balance_loss_mlp": 1.01850116, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 1.9926900253676532, + "language_loss": 0.74531102, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76647556, + "num_input_tokens_seen": 242661820, + "step": 11241, + "time_per_iteration": 3.920835018157959 + }, + { + "auxiliary_loss_clip": 0.01066617, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.03940296, + "balance_loss_mlp": 1.0219692, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 2.0795740830007174, + "language_loss": 0.79858404, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.81959331, + "num_input_tokens_seen": 242679890, + "step": 11242, + "time_per_iteration": 4.099387168884277 + }, + { + "auxiliary_loss_clip": 0.0109138, + "auxiliary_loss_mlp": 0.0103711, + "balance_loss_clip": 1.03500128, + "balance_loss_mlp": 1.02411628, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 1.8088287677359127, + "language_loss": 0.73073232, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.7520172, + "num_input_tokens_seen": 242699495, + "step": 11243, + "time_per_iteration": 2.5249197483062744 + }, + { + "auxiliary_loss_clip": 0.01097196, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.03774071, + "balance_loss_mlp": 1.02360046, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 1.5683994600089841, + "language_loss": 0.72768569, + "learning_rate": 1.003487287162221e-06, + "loss": 0.74901414, + "num_input_tokens_seen": 242719500, + "step": 11244, + "time_per_iteration": 3.8905811309814453 + }, + { + "auxiliary_loss_clip": 0.01109765, + "auxiliary_loss_mlp": 0.01037245, + "balance_loss_clip": 1.0386796, + "balance_loss_mlp": 1.0249424, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 1.971878203645556, + "language_loss": 0.85613072, + "learning_rate": 1.003149631190393e-06, + "loss": 0.87760085, + "num_input_tokens_seen": 242738325, + "step": 11245, + "time_per_iteration": 2.462027072906494 + }, + { + "auxiliary_loss_clip": 0.01111663, + "auxiliary_loss_mlp": 0.00785207, + "balance_loss_clip": 1.03794718, + "balance_loss_mlp": 1.0105319, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 1.7724238345245786, + "language_loss": 0.73446345, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.75343215, + "num_input_tokens_seen": 242756620, + "step": 11246, + "time_per_iteration": 2.4964733123779297 + }, + { + "auxiliary_loss_clip": 0.01093931, + "auxiliary_loss_mlp": 0.01024137, + "balance_loss_clip": 1.03896332, + "balance_loss_mlp": 1.01258612, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 1.7517141650394952, + "language_loss": 0.87737894, + "learning_rate": 1.002474432661539e-06, + "loss": 0.89855963, + "num_input_tokens_seen": 242774505, + "step": 11247, + "time_per_iteration": 2.4913222789764404 + }, + { + "auxiliary_loss_clip": 0.01022719, + "auxiliary_loss_mlp": 0.00999919, + "balance_loss_clip": 1.02173889, + "balance_loss_mlp": 0.99860191, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.827180242164474, + "language_loss": 0.54015899, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56038541, + "num_input_tokens_seen": 242828645, + "step": 11248, + "time_per_iteration": 3.166027069091797 + }, + { + "auxiliary_loss_clip": 0.01053103, + "auxiliary_loss_mlp": 0.01026683, + "balance_loss_clip": 1.03786385, + "balance_loss_mlp": 1.01540577, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 1.4603574843310045, + "language_loss": 0.73628283, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75708067, + "num_input_tokens_seen": 242850100, + "step": 11249, + "time_per_iteration": 2.705786943435669 + }, + { + "auxiliary_loss_clip": 0.01097202, + "auxiliary_loss_mlp": 0.01035333, + "balance_loss_clip": 1.03576481, + "balance_loss_mlp": 1.02255416, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 2.3171966899326395, + "language_loss": 0.74269789, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.76402324, + "num_input_tokens_seen": 242867775, + "step": 11250, + "time_per_iteration": 2.4809460639953613 + }, + { + "auxiliary_loss_clip": 0.01107524, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.03709388, + "balance_loss_mlp": 1.01509547, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 1.9202803592555908, + "language_loss": 0.74984312, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.7711913, + "num_input_tokens_seen": 242886865, + "step": 11251, + "time_per_iteration": 2.4777307510375977 + }, + { + "auxiliary_loss_clip": 0.01079672, + "auxiliary_loss_mlp": 0.01027935, + "balance_loss_clip": 1.03933764, + "balance_loss_mlp": 1.01590085, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 1.7684979797640559, + "language_loss": 0.7011463, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.72222239, + "num_input_tokens_seen": 242906705, + "step": 11252, + "time_per_iteration": 2.5613412857055664 + }, + { + "auxiliary_loss_clip": 0.01066767, + "auxiliary_loss_mlp": 0.01030242, + "balance_loss_clip": 1.03815317, + "balance_loss_mlp": 1.01848757, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 2.036885969286238, + "language_loss": 0.66977358, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.69074357, + "num_input_tokens_seen": 242925215, + "step": 11253, + "time_per_iteration": 2.663843870162964 + }, + { + "auxiliary_loss_clip": 0.01068672, + "auxiliary_loss_mlp": 0.00788535, + "balance_loss_clip": 1.03625953, + "balance_loss_mlp": 1.01045847, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 1.6113550405494432, + "language_loss": 0.77346969, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.79204178, + "num_input_tokens_seen": 242944750, + "step": 11254, + "time_per_iteration": 2.567986488342285 + }, + { + "auxiliary_loss_clip": 0.01096643, + "auxiliary_loss_mlp": 0.01028438, + "balance_loss_clip": 1.0365442, + "balance_loss_mlp": 1.01601648, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 2.360344964153428, + "language_loss": 0.72230399, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74355477, + "num_input_tokens_seen": 242963860, + "step": 11255, + "time_per_iteration": 2.520810604095459 + }, + { + "auxiliary_loss_clip": 0.0105826, + "auxiliary_loss_mlp": 0.00783077, + "balance_loss_clip": 1.03344357, + "balance_loss_mlp": 1.00534487, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 1.989044825054772, + "language_loss": 0.75307876, + "learning_rate": 9.994379131600828e-07, + "loss": 0.77149218, + "num_input_tokens_seen": 242983050, + "step": 11256, + "time_per_iteration": 2.6196107864379883 + }, + { + "auxiliary_loss_clip": 0.01099093, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.03954887, + "balance_loss_mlp": 1.01974297, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.4110915340058345, + "language_loss": 0.65320396, + "learning_rate": 9.991007116408965e-07, + "loss": 0.67451495, + "num_input_tokens_seen": 243001125, + "step": 11257, + "time_per_iteration": 2.4969475269317627 + }, + { + "auxiliary_loss_clip": 0.0105978, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.03739333, + "balance_loss_mlp": 1.01819921, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.561676436815102, + "language_loss": 0.75489664, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77578902, + "num_input_tokens_seen": 243021865, + "step": 11258, + "time_per_iteration": 2.5986268520355225 + }, + { + "auxiliary_loss_clip": 0.01083282, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.03721976, + "balance_loss_mlp": 1.02185917, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.7374839245482832, + "language_loss": 0.6713357, + "learning_rate": 9.984264224779127e-07, + "loss": 0.69249833, + "num_input_tokens_seen": 243042970, + "step": 11259, + "time_per_iteration": 2.635932445526123 + }, + { + "auxiliary_loss_clip": 0.01087147, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.03711748, + "balance_loss_mlp": 1.02165174, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.3103119216870254, + "language_loss": 0.85854387, + "learning_rate": 9.980893348596839e-07, + "loss": 0.87975585, + "num_input_tokens_seen": 243058470, + "step": 11260, + "time_per_iteration": 2.5188682079315186 + }, + { + "auxiliary_loss_clip": 0.01082256, + "auxiliary_loss_mlp": 0.01038412, + "balance_loss_clip": 1.03511262, + "balance_loss_mlp": 1.02519727, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 2.4979688850736275, + "language_loss": 0.77731162, + "learning_rate": 9.977522852340081e-07, + "loss": 0.7985183, + "num_input_tokens_seen": 243076630, + "step": 11261, + "time_per_iteration": 2.5241751670837402 + }, + { + "auxiliary_loss_clip": 0.01082731, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.03349948, + "balance_loss_mlp": 1.02192903, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 1.6817148993457622, + "language_loss": 0.87913311, + "learning_rate": 9.97415273613666e-07, + "loss": 0.90029681, + "num_input_tokens_seen": 243092260, + "step": 11262, + "time_per_iteration": 2.496824264526367 + }, + { + "auxiliary_loss_clip": 0.01085594, + "auxiliary_loss_mlp": 0.01030522, + "balance_loss_clip": 1.0404706, + "balance_loss_mlp": 1.01820183, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 2.8954232470334356, + "language_loss": 0.74345362, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76461482, + "num_input_tokens_seen": 243109405, + "step": 11263, + "time_per_iteration": 2.51190447807312 + }, + { + "auxiliary_loss_clip": 0.01101446, + "auxiliary_loss_mlp": 0.01034895, + "balance_loss_clip": 1.03728461, + "balance_loss_mlp": 1.02133501, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 2.460272594087343, + "language_loss": 0.67923069, + "learning_rate": 9.967413644401016e-07, + "loss": 0.70059407, + "num_input_tokens_seen": 243128135, + "step": 11264, + "time_per_iteration": 2.4950995445251465 + }, + { + "auxiliary_loss_clip": 0.01084964, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.03830028, + "balance_loss_mlp": 1.01872218, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 1.9464328432111242, + "language_loss": 0.73068166, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75184417, + "num_input_tokens_seen": 243146785, + "step": 11265, + "time_per_iteration": 2.5188872814178467 + }, + { + "auxiliary_loss_clip": 0.01067693, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.03675711, + "balance_loss_mlp": 1.02288306, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 4.948670490078943, + "language_loss": 0.61745793, + "learning_rate": 9.96067607441207e-07, + "loss": 0.6384939, + "num_input_tokens_seen": 243165275, + "step": 11266, + "time_per_iteration": 2.5451176166534424 + }, + { + "auxiliary_loss_clip": 0.01078076, + "auxiliary_loss_mlp": 0.01033687, + "balance_loss_clip": 1.03779149, + "balance_loss_mlp": 1.02128363, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 2.1612913539893635, + "language_loss": 0.70582259, + "learning_rate": 9.957307860391976e-07, + "loss": 0.72694021, + "num_input_tokens_seen": 243182845, + "step": 11267, + "time_per_iteration": 2.5539705753326416 + }, + { + "auxiliary_loss_clip": 0.01107455, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.03654552, + "balance_loss_mlp": 1.01895428, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 2.2445925100737596, + "language_loss": 0.71771944, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73910803, + "num_input_tokens_seen": 243201475, + "step": 11268, + "time_per_iteration": 2.4651012420654297 + }, + { + "auxiliary_loss_clip": 0.01087331, + "auxiliary_loss_mlp": 0.01037103, + "balance_loss_clip": 1.03935814, + "balance_loss_mlp": 1.02393007, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.6259732999444814, + "language_loss": 0.76856041, + "learning_rate": 9.950572574939194e-07, + "loss": 0.78980476, + "num_input_tokens_seen": 243221850, + "step": 11269, + "time_per_iteration": 2.5684173107147217 + }, + { + "auxiliary_loss_clip": 0.01077042, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.03705502, + "balance_loss_mlp": 1.02501512, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 1.863150776143917, + "language_loss": 0.74140954, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76256198, + "num_input_tokens_seen": 243239855, + "step": 11270, + "time_per_iteration": 2.5385191440582275 + }, + { + "auxiliary_loss_clip": 0.01062043, + "auxiliary_loss_mlp": 0.01037934, + "balance_loss_clip": 1.04216564, + "balance_loss_mlp": 1.02490413, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 1.7226179730345514, + "language_loss": 0.73134023, + "learning_rate": 9.94383881378756e-07, + "loss": 0.75233996, + "num_input_tokens_seen": 243260085, + "step": 11271, + "time_per_iteration": 2.64546537399292 + }, + { + "auxiliary_loss_clip": 0.01108033, + "auxiliary_loss_mlp": 0.01036523, + "balance_loss_clip": 1.03797531, + "balance_loss_mlp": 1.02462053, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 1.665639811972565, + "language_loss": 0.67906159, + "learning_rate": 9.94047250514387e-07, + "loss": 0.70050716, + "num_input_tokens_seen": 243280065, + "step": 11272, + "time_per_iteration": 3.9303812980651855 + }, + { + "auxiliary_loss_clip": 0.0110138, + "auxiliary_loss_mlp": 0.01035731, + "balance_loss_clip": 1.0423739, + "balance_loss_mlp": 1.02158046, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 1.8388325812392945, + "language_loss": 0.73865318, + "learning_rate": 9.937106577958481e-07, + "loss": 0.76002431, + "num_input_tokens_seen": 243297775, + "step": 11273, + "time_per_iteration": 2.4871222972869873 + }, + { + "auxiliary_loss_clip": 0.01098013, + "auxiliary_loss_mlp": 0.01042299, + "balance_loss_clip": 1.03939009, + "balance_loss_mlp": 1.02987742, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 1.9395464682107304, + "language_loss": 0.70598471, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72738785, + "num_input_tokens_seen": 243315760, + "step": 11274, + "time_per_iteration": 2.5556960105895996 + }, + { + "auxiliary_loss_clip": 0.0111118, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.038715, + "balance_loss_mlp": 1.02058089, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 1.658397391296205, + "language_loss": 0.65436578, + "learning_rate": 9.930375868473093e-07, + "loss": 0.67580932, + "num_input_tokens_seen": 243335715, + "step": 11275, + "time_per_iteration": 2.508690118789673 + }, + { + "auxiliary_loss_clip": 0.01100248, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.03983307, + "balance_loss_mlp": 1.02070642, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 2.5968849791762154, + "language_loss": 0.72783303, + "learning_rate": 9.927011086428335e-07, + "loss": 0.74915588, + "num_input_tokens_seen": 243356935, + "step": 11276, + "time_per_iteration": 2.5464112758636475 + }, + { + "auxiliary_loss_clip": 0.01083235, + "auxiliary_loss_mlp": 0.00783898, + "balance_loss_clip": 1.03691387, + "balance_loss_mlp": 1.00857639, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.7086017109518157, + "language_loss": 0.76773846, + "learning_rate": 9.923646686352317e-07, + "loss": 0.7864098, + "num_input_tokens_seen": 243375625, + "step": 11277, + "time_per_iteration": 2.5250627994537354 + }, + { + "auxiliary_loss_clip": 0.01088594, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.03731298, + "balance_loss_mlp": 1.01781034, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 2.6549513351788536, + "language_loss": 0.83685207, + "learning_rate": 9.920282668372627e-07, + "loss": 0.85804272, + "num_input_tokens_seen": 243390195, + "step": 11278, + "time_per_iteration": 2.4845290184020996 + }, + { + "auxiliary_loss_clip": 0.01077936, + "auxiliary_loss_mlp": 0.00781669, + "balance_loss_clip": 1.03755689, + "balance_loss_mlp": 1.00728488, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.6048465874229025, + "language_loss": 0.70756793, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72616398, + "num_input_tokens_seen": 243411690, + "step": 11279, + "time_per_iteration": 3.9578559398651123 + }, + { + "auxiliary_loss_clip": 0.01101721, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.04064119, + "balance_loss_mlp": 1.01906419, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 2.1978858686198333, + "language_loss": 0.73653269, + "learning_rate": 9.913555779212485e-07, + "loss": 0.75787246, + "num_input_tokens_seen": 243430280, + "step": 11280, + "time_per_iteration": 2.5262610912323 + }, + { + "auxiliary_loss_clip": 0.01098812, + "auxiliary_loss_mlp": 0.01031105, + "balance_loss_clip": 1.03679895, + "balance_loss_mlp": 1.01798582, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 1.7627693475441908, + "language_loss": 0.69765925, + "learning_rate": 9.910192908287104e-07, + "loss": 0.71895838, + "num_input_tokens_seen": 243448690, + "step": 11281, + "time_per_iteration": 3.85888671875 + }, + { + "auxiliary_loss_clip": 0.01105408, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.03735423, + "balance_loss_mlp": 1.01753139, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 1.5115776096943059, + "language_loss": 0.63709664, + "learning_rate": 9.906830419968217e-07, + "loss": 0.65844238, + "num_input_tokens_seen": 243470695, + "step": 11282, + "time_per_iteration": 2.547520399093628 + }, + { + "auxiliary_loss_clip": 0.01073788, + "auxiliary_loss_mlp": 0.01045916, + "balance_loss_clip": 1.03701115, + "balance_loss_mlp": 1.02998936, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 1.5575533717162613, + "language_loss": 0.7416662, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76286322, + "num_input_tokens_seen": 243493345, + "step": 11283, + "time_per_iteration": 4.0113184452056885 + }, + { + "auxiliary_loss_clip": 0.01099059, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.03813148, + "balance_loss_mlp": 1.01580453, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.5241142442773663, + "language_loss": 0.57079846, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59206665, + "num_input_tokens_seen": 243515670, + "step": 11284, + "time_per_iteration": 2.627899169921875 + }, + { + "auxiliary_loss_clip": 0.0108468, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.03672862, + "balance_loss_mlp": 1.01635873, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 2.264045487179026, + "language_loss": 0.75050914, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77163941, + "num_input_tokens_seen": 243533625, + "step": 11285, + "time_per_iteration": 2.496394634246826 + }, + { + "auxiliary_loss_clip": 0.01107334, + "auxiliary_loss_mlp": 0.01029987, + "balance_loss_clip": 1.03958416, + "balance_loss_mlp": 1.01804769, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 1.5404605155249622, + "language_loss": 0.66311276, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68448597, + "num_input_tokens_seen": 243553040, + "step": 11286, + "time_per_iteration": 2.5142822265625 + }, + { + "auxiliary_loss_clip": 0.01085951, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.03610098, + "balance_loss_mlp": 1.01659179, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 2.353817619532873, + "language_loss": 0.5237444, + "learning_rate": 9.890023721933447e-07, + "loss": 0.54489362, + "num_input_tokens_seen": 243572590, + "step": 11287, + "time_per_iteration": 2.5641238689422607 + }, + { + "auxiliary_loss_clip": 0.01064997, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.0366056, + "balance_loss_mlp": 1.02114117, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 1.632726162135347, + "language_loss": 0.77089953, + "learning_rate": 9.886663531930655e-07, + "loss": 0.79189008, + "num_input_tokens_seen": 243594140, + "step": 11288, + "time_per_iteration": 2.599918842315674 + }, + { + "auxiliary_loss_clip": 0.01101803, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.04069889, + "balance_loss_mlp": 1.01964998, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 2.6085445459847323, + "language_loss": 0.73474169, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75607538, + "num_input_tokens_seen": 243615170, + "step": 11289, + "time_per_iteration": 2.528892755508423 + }, + { + "auxiliary_loss_clip": 0.0111037, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.03953648, + "balance_loss_mlp": 1.0189395, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 2.633804296201977, + "language_loss": 0.8003493, + "learning_rate": 9.879944302548682e-07, + "loss": 0.8217653, + "num_input_tokens_seen": 243635675, + "step": 11290, + "time_per_iteration": 2.5173683166503906 + }, + { + "auxiliary_loss_clip": 0.0109449, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.0380677, + "balance_loss_mlp": 1.01853991, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 1.4910515264103732, + "language_loss": 0.75296432, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77421004, + "num_input_tokens_seen": 243654950, + "step": 11291, + "time_per_iteration": 2.5094945430755615 + }, + { + "auxiliary_loss_clip": 0.01087394, + "auxiliary_loss_mlp": 0.00783182, + "balance_loss_clip": 1.03688264, + "balance_loss_mlp": 1.00812459, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 1.729904044414138, + "language_loss": 0.75527298, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77397871, + "num_input_tokens_seen": 243674970, + "step": 11292, + "time_per_iteration": 2.5904951095581055 + }, + { + "auxiliary_loss_clip": 0.0106963, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.03809905, + "balance_loss_mlp": 1.0191009, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 1.8272803045658337, + "language_loss": 0.84175199, + "learning_rate": 9.869868336945556e-07, + "loss": 0.86276549, + "num_input_tokens_seen": 243693440, + "step": 11293, + "time_per_iteration": 2.5738327503204346 + }, + { + "auxiliary_loss_clip": 0.01115488, + "auxiliary_loss_mlp": 0.01038931, + "balance_loss_clip": 1.04129934, + "balance_loss_mlp": 1.02562761, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 2.377145510561933, + "language_loss": 0.7906661, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81221032, + "num_input_tokens_seen": 243710055, + "step": 11294, + "time_per_iteration": 2.488168239593506 + }, + { + "auxiliary_loss_clip": 0.0108498, + "auxiliary_loss_mlp": 0.01028211, + "balance_loss_clip": 1.03740191, + "balance_loss_mlp": 1.0166955, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.6404121332622654, + "language_loss": 0.78962719, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81075919, + "num_input_tokens_seen": 243728635, + "step": 11295, + "time_per_iteration": 2.5469017028808594 + }, + { + "auxiliary_loss_clip": 0.01080085, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.03869069, + "balance_loss_mlp": 1.01853025, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 1.73703517013498, + "language_loss": 0.71212947, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73322058, + "num_input_tokens_seen": 243748330, + "step": 11296, + "time_per_iteration": 2.5319128036499023 + }, + { + "auxiliary_loss_clip": 0.01096353, + "auxiliary_loss_mlp": 0.01029331, + "balance_loss_clip": 1.03779042, + "balance_loss_mlp": 1.01744008, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.5398975463998565, + "language_loss": 0.70905346, + "learning_rate": 9.856439094633949e-07, + "loss": 0.73031026, + "num_input_tokens_seen": 243769380, + "step": 11297, + "time_per_iteration": 2.535752058029175 + }, + { + "auxiliary_loss_clip": 0.01081402, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.03778899, + "balance_loss_mlp": 1.01838255, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 2.137310019739984, + "language_loss": 0.66094947, + "learning_rate": 9.853082745349918e-07, + "loss": 0.68207824, + "num_input_tokens_seen": 243785510, + "step": 11298, + "time_per_iteration": 2.5172343254089355 + }, + { + "auxiliary_loss_clip": 0.01101602, + "auxiliary_loss_mlp": 0.01027485, + "balance_loss_clip": 1.04063594, + "balance_loss_mlp": 1.01586175, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 2.148893226096472, + "language_loss": 0.71899366, + "learning_rate": 9.84972678083801e-07, + "loss": 0.7402845, + "num_input_tokens_seen": 243805545, + "step": 11299, + "time_per_iteration": 2.536752462387085 + }, + { + "auxiliary_loss_clip": 0.01110755, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.04001009, + "balance_loss_mlp": 1.01889205, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 1.353178559706865, + "language_loss": 0.77259094, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79401559, + "num_input_tokens_seen": 243825185, + "step": 11300, + "time_per_iteration": 2.5052132606506348 + }, + { + "auxiliary_loss_clip": 0.0109676, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.03802788, + "balance_loss_mlp": 1.0179112, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 1.7830687395494682, + "language_loss": 0.62994963, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65121937, + "num_input_tokens_seen": 243841600, + "step": 11301, + "time_per_iteration": 2.4741768836975098 + }, + { + "auxiliary_loss_clip": 0.01096666, + "auxiliary_loss_mlp": 0.01027887, + "balance_loss_clip": 1.03781223, + "balance_loss_mlp": 1.01597798, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.7008111347600436, + "language_loss": 0.8289358, + "learning_rate": 9.839661197207525e-07, + "loss": 0.85018134, + "num_input_tokens_seen": 243862250, + "step": 11302, + "time_per_iteration": 2.547041893005371 + }, + { + "auxiliary_loss_clip": 0.01100924, + "auxiliary_loss_mlp": 0.01032871, + "balance_loss_clip": 1.03853989, + "balance_loss_mlp": 1.0206995, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 1.8492822343041018, + "language_loss": 0.69587982, + "learning_rate": 9.83630677305654e-07, + "loss": 0.7172178, + "num_input_tokens_seen": 243880560, + "step": 11303, + "time_per_iteration": 2.4684371948242188 + }, + { + "auxiliary_loss_clip": 0.01079833, + "auxiliary_loss_mlp": 0.01030364, + "balance_loss_clip": 1.03808653, + "balance_loss_mlp": 1.01817465, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 1.9099078014163595, + "language_loss": 0.69987142, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72097337, + "num_input_tokens_seen": 243900635, + "step": 11304, + "time_per_iteration": 2.5655274391174316 + }, + { + "auxiliary_loss_clip": 0.01099787, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.04047132, + "balance_loss_mlp": 1.0191704, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 1.85155893041327, + "language_loss": 0.72529292, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74660993, + "num_input_tokens_seen": 243920160, + "step": 11305, + "time_per_iteration": 2.5372602939605713 + }, + { + "auxiliary_loss_clip": 0.01087784, + "auxiliary_loss_mlp": 0.01028124, + "balance_loss_clip": 1.03835177, + "balance_loss_mlp": 1.01537418, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 2.2138651303726786, + "language_loss": 0.65991563, + "learning_rate": 9.826245813561882e-07, + "loss": 0.68107462, + "num_input_tokens_seen": 243939015, + "step": 11306, + "time_per_iteration": 2.5896072387695312 + }, + { + "auxiliary_loss_clip": 0.01083843, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.0373795, + "balance_loss_mlp": 1.0157268, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 1.6580266369311318, + "language_loss": 0.80147004, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82259703, + "num_input_tokens_seen": 243958470, + "step": 11307, + "time_per_iteration": 2.553403615951538 + }, + { + "auxiliary_loss_clip": 0.01083212, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.03764081, + "balance_loss_mlp": 1.0182023, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 1.5159396369257339, + "language_loss": 0.89017034, + "learning_rate": 9.819540435969066e-07, + "loss": 0.91131777, + "num_input_tokens_seen": 243975450, + "step": 11308, + "time_per_iteration": 2.49739670753479 + }, + { + "auxiliary_loss_clip": 0.01074635, + "auxiliary_loss_mlp": 0.01036185, + "balance_loss_clip": 1.0372324, + "balance_loss_mlp": 1.02182078, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 1.8680959500455288, + "language_loss": 0.71436697, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73547518, + "num_input_tokens_seen": 243994355, + "step": 11309, + "time_per_iteration": 2.58166241645813 + }, + { + "auxiliary_loss_clip": 0.01079032, + "auxiliary_loss_mlp": 0.01037473, + "balance_loss_clip": 1.03870201, + "balance_loss_mlp": 1.02503943, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 1.6627840320014546, + "language_loss": 0.84366608, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86483121, + "num_input_tokens_seen": 244011620, + "step": 11310, + "time_per_iteration": 2.5842363834381104 + }, + { + "auxiliary_loss_clip": 0.01082058, + "auxiliary_loss_mlp": 0.01027157, + "balance_loss_clip": 1.04048085, + "balance_loss_mlp": 1.01548624, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 2.3346566499196224, + "language_loss": 0.82924855, + "learning_rate": 9.80948526522792e-07, + "loss": 0.85034072, + "num_input_tokens_seen": 244029925, + "step": 11311, + "time_per_iteration": 3.947524070739746 + }, + { + "auxiliary_loss_clip": 0.01069683, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.03477478, + "balance_loss_mlp": 1.01635337, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 1.9532074566577264, + "language_loss": 0.7633853, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78438842, + "num_input_tokens_seen": 244051225, + "step": 11312, + "time_per_iteration": 2.623706579208374 + }, + { + "auxiliary_loss_clip": 0.01040385, + "auxiliary_loss_mlp": 0.01002701, + "balance_loss_clip": 1.01683259, + "balance_loss_mlp": 1.00162244, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.6726405090063666, + "language_loss": 0.57192826, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59235913, + "num_input_tokens_seen": 244115930, + "step": 11313, + "time_per_iteration": 3.1924264430999756 + }, + { + "auxiliary_loss_clip": 0.01098393, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.03727269, + "balance_loss_mlp": 1.01570952, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 1.7953557966967988, + "language_loss": 0.68546128, + "learning_rate": 9.799433572314754e-07, + "loss": 0.70672899, + "num_input_tokens_seen": 244137320, + "step": 11314, + "time_per_iteration": 2.578650712966919 + }, + { + "auxiliary_loss_clip": 0.01095249, + "auxiliary_loss_mlp": 0.01030387, + "balance_loss_clip": 1.0357101, + "balance_loss_mlp": 1.01903844, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 1.7862059790901734, + "language_loss": 0.81197375, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83323014, + "num_input_tokens_seen": 244152755, + "step": 11315, + "time_per_iteration": 2.468475341796875 + }, + { + "auxiliary_loss_clip": 0.010682, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.03781319, + "balance_loss_mlp": 1.0175159, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 1.819915167524122, + "language_loss": 0.70187336, + "learning_rate": 9.792734377526718e-07, + "loss": 0.72285879, + "num_input_tokens_seen": 244171480, + "step": 11316, + "time_per_iteration": 2.606863021850586 + }, + { + "auxiliary_loss_clip": 0.01097202, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.0389272, + "balance_loss_mlp": 1.01730275, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 2.3346384910493065, + "language_loss": 0.66890556, + "learning_rate": 9.789385360660003e-07, + "loss": 0.6901691, + "num_input_tokens_seen": 244187920, + "step": 11317, + "time_per_iteration": 2.468327045440674 + }, + { + "auxiliary_loss_clip": 0.01100303, + "auxiliary_loss_mlp": 0.01041996, + "balance_loss_clip": 1.04138422, + "balance_loss_mlp": 1.02968729, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 1.4470042030747343, + "language_loss": 0.75017303, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77159607, + "num_input_tokens_seen": 244209565, + "step": 11318, + "time_per_iteration": 3.980802536010742 + }, + { + "auxiliary_loss_clip": 0.01078326, + "auxiliary_loss_mlp": 0.0102978, + "balance_loss_clip": 1.03511858, + "balance_loss_mlp": 1.01764417, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 1.752481762407116, + "language_loss": 0.67976326, + "learning_rate": 9.782688488616143e-07, + "loss": 0.70084429, + "num_input_tokens_seen": 244228015, + "step": 11319, + "time_per_iteration": 3.930938720703125 + }, + { + "auxiliary_loss_clip": 0.01066684, + "auxiliary_loss_mlp": 0.00783765, + "balance_loss_clip": 1.03812683, + "balance_loss_mlp": 1.01020217, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 1.5719463127360238, + "language_loss": 0.76486695, + "learning_rate": 9.779340633692945e-07, + "loss": 0.78337145, + "num_input_tokens_seen": 244245615, + "step": 11320, + "time_per_iteration": 2.617776393890381 + }, + { + "auxiliary_loss_clip": 0.01082389, + "auxiliary_loss_mlp": 0.01033109, + "balance_loss_clip": 1.03925347, + "balance_loss_mlp": 1.02062809, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 1.767672852683966, + "language_loss": 0.74750793, + "learning_rate": 9.77599316633817e-07, + "loss": 0.76866293, + "num_input_tokens_seen": 244263625, + "step": 11321, + "time_per_iteration": 2.5507652759552 + }, + { + "auxiliary_loss_clip": 0.01089101, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.03983974, + "balance_loss_mlp": 1.0228461, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 1.8064561006328692, + "language_loss": 0.73274028, + "learning_rate": 9.772646086678758e-07, + "loss": 0.75398099, + "num_input_tokens_seen": 244282745, + "step": 11322, + "time_per_iteration": 3.9771735668182373 + }, + { + "auxiliary_loss_clip": 0.01058416, + "auxiliary_loss_mlp": 0.00784286, + "balance_loss_clip": 1.03563952, + "balance_loss_mlp": 1.00806046, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 1.6378377296218878, + "language_loss": 0.78434718, + "learning_rate": 9.769299394841638e-07, + "loss": 0.80277425, + "num_input_tokens_seen": 244303770, + "step": 11323, + "time_per_iteration": 2.6198973655700684 + }, + { + "auxiliary_loss_clip": 0.01004988, + "auxiliary_loss_mlp": 0.01000746, + "balance_loss_clip": 1.01535845, + "balance_loss_mlp": 0.99944705, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.7797215423739108, + "language_loss": 0.57184803, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59190536, + "num_input_tokens_seen": 244355910, + "step": 11324, + "time_per_iteration": 2.9474036693573 + }, + { + "auxiliary_loss_clip": 0.01090824, + "auxiliary_loss_mlp": 0.01036381, + "balance_loss_clip": 1.03919363, + "balance_loss_mlp": 1.02360153, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 1.8917357527354923, + "language_loss": 0.67793894, + "learning_rate": 9.76260717514186e-07, + "loss": 0.699211, + "num_input_tokens_seen": 244376610, + "step": 11325, + "time_per_iteration": 2.584002733230591 + }, + { + "auxiliary_loss_clip": 0.01100804, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.03773487, + "balance_loss_mlp": 1.0175494, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 2.2234321127830112, + "language_loss": 0.70260054, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72392046, + "num_input_tokens_seen": 244393000, + "step": 11326, + "time_per_iteration": 2.468193769454956 + }, + { + "auxiliary_loss_clip": 0.01109435, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.03811026, + "balance_loss_mlp": 1.01783752, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.7641944298454715, + "language_loss": 0.72747242, + "learning_rate": 9.75591650825392e-07, + "loss": 0.7488693, + "num_input_tokens_seen": 244409515, + "step": 11327, + "time_per_iteration": 2.470832586288452 + }, + { + "auxiliary_loss_clip": 0.01095245, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.03701615, + "balance_loss_mlp": 1.01457763, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 2.2110267127136884, + "language_loss": 0.77548981, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79671234, + "num_input_tokens_seen": 244427165, + "step": 11328, + "time_per_iteration": 2.4907774925231934 + }, + { + "auxiliary_loss_clip": 0.0110952, + "auxiliary_loss_mlp": 0.01030277, + "balance_loss_clip": 1.03842664, + "balance_loss_mlp": 1.0177182, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 1.7972976167893433, + "language_loss": 0.64053321, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66193122, + "num_input_tokens_seen": 244445705, + "step": 11329, + "time_per_iteration": 2.463643789291382 + }, + { + "auxiliary_loss_clip": 0.01054594, + "auxiliary_loss_mlp": 0.0078534, + "balance_loss_clip": 1.03984284, + "balance_loss_mlp": 1.01065469, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 2.32060578602773, + "language_loss": 0.79223466, + "learning_rate": 9.745883421664096e-07, + "loss": 0.8106339, + "num_input_tokens_seen": 244460415, + "step": 11330, + "time_per_iteration": 2.5927722454071045 + }, + { + "auxiliary_loss_clip": 0.01097879, + "auxiliary_loss_mlp": 0.01029578, + "balance_loss_clip": 1.03905749, + "balance_loss_mlp": 1.01736474, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 1.8904605851726493, + "language_loss": 0.63908291, + "learning_rate": 9.742539836972665e-07, + "loss": 0.66035748, + "num_input_tokens_seen": 244480555, + "step": 11331, + "time_per_iteration": 2.545167922973633 + }, + { + "auxiliary_loss_clip": 0.01062222, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.03728533, + "balance_loss_mlp": 1.02191257, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.5712565503147384, + "language_loss": 0.72481, + "learning_rate": 9.739196641245148e-07, + "loss": 0.74579489, + "num_input_tokens_seen": 244498540, + "step": 11332, + "time_per_iteration": 2.5672414302825928 + }, + { + "auxiliary_loss_clip": 0.01099028, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.0414741, + "balance_loss_mlp": 1.01950085, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 2.198148990591329, + "language_loss": 0.74718726, + "learning_rate": 9.735853834608326e-07, + "loss": 0.76849961, + "num_input_tokens_seen": 244517015, + "step": 11333, + "time_per_iteration": 2.4991281032562256 + }, + { + "auxiliary_loss_clip": 0.01101331, + "auxiliary_loss_mlp": 0.01029407, + "balance_loss_clip": 1.03947449, + "balance_loss_mlp": 1.0164485, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 1.5281602425151177, + "language_loss": 0.72173041, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74303782, + "num_input_tokens_seen": 244537450, + "step": 11334, + "time_per_iteration": 2.5361852645874023 + }, + { + "auxiliary_loss_clip": 0.01093795, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.04000282, + "balance_loss_mlp": 1.01941502, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 1.8316761165276925, + "language_loss": 0.86018687, + "learning_rate": 9.729169389113791e-07, + "loss": 0.88143778, + "num_input_tokens_seen": 244555640, + "step": 11335, + "time_per_iteration": 2.495326519012451 + }, + { + "auxiliary_loss_clip": 0.01090138, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.03561282, + "balance_loss_mlp": 1.01862955, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 1.7131706427574538, + "language_loss": 0.82225895, + "learning_rate": 9.725827750509542e-07, + "loss": 0.8434605, + "num_input_tokens_seen": 244574005, + "step": 11336, + "time_per_iteration": 2.5380184650421143 + }, + { + "auxiliary_loss_clip": 0.01065252, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.03850031, + "balance_loss_mlp": 1.01767099, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 1.918348983684643, + "language_loss": 0.81621361, + "learning_rate": 9.72248650150294e-07, + "loss": 0.83715975, + "num_input_tokens_seen": 244591395, + "step": 11337, + "time_per_iteration": 2.564608097076416 + }, + { + "auxiliary_loss_clip": 0.0106218, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.03797662, + "balance_loss_mlp": 1.01690936, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 1.6496382397559601, + "language_loss": 0.7265414, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74745095, + "num_input_tokens_seen": 244610400, + "step": 11338, + "time_per_iteration": 2.583285331726074 + }, + { + "auxiliary_loss_clip": 0.01065471, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.03513169, + "balance_loss_mlp": 1.02502298, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 1.4971786125462128, + "language_loss": 0.77644396, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79747576, + "num_input_tokens_seen": 244630400, + "step": 11339, + "time_per_iteration": 2.5944504737854004 + }, + { + "auxiliary_loss_clip": 0.01075877, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.03746629, + "balance_loss_mlp": 1.01984739, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 2.107595286540718, + "language_loss": 0.70708972, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72817028, + "num_input_tokens_seen": 244649155, + "step": 11340, + "time_per_iteration": 2.603073835372925 + }, + { + "auxiliary_loss_clip": 0.01088816, + "auxiliary_loss_mlp": 0.01033854, + "balance_loss_clip": 1.03917527, + "balance_loss_mlp": 1.02212334, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 3.0919727721284187, + "language_loss": 0.83444768, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85567439, + "num_input_tokens_seen": 244665470, + "step": 11341, + "time_per_iteration": 2.5407416820526123 + }, + { + "auxiliary_loss_clip": 0.01075115, + "auxiliary_loss_mlp": 0.01034773, + "balance_loss_clip": 1.03949237, + "balance_loss_mlp": 1.02148724, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 1.7484711111410332, + "language_loss": 0.68129337, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70239222, + "num_input_tokens_seen": 244684390, + "step": 11342, + "time_per_iteration": 2.5858495235443115 + }, + { + "auxiliary_loss_clip": 0.01061678, + "auxiliary_loss_mlp": 0.01025698, + "balance_loss_clip": 1.03961182, + "balance_loss_mlp": 1.01389647, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 1.6705947512339279, + "language_loss": 0.75156009, + "learning_rate": 9.702447196107963e-07, + "loss": 0.77243388, + "num_input_tokens_seen": 244703370, + "step": 11343, + "time_per_iteration": 2.594759225845337 + }, + { + "auxiliary_loss_clip": 0.01069195, + "auxiliary_loss_mlp": 0.01040065, + "balance_loss_clip": 1.03855479, + "balance_loss_mlp": 1.02624261, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 1.5452535094416928, + "language_loss": 0.79770076, + "learning_rate": 9.699108677831639e-07, + "loss": 0.81879342, + "num_input_tokens_seen": 244723325, + "step": 11344, + "time_per_iteration": 2.6296212673187256 + }, + { + "auxiliary_loss_clip": 0.01072494, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.03777766, + "balance_loss_mlp": 1.02190471, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 2.079376972446784, + "language_loss": 0.6657576, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68682516, + "num_input_tokens_seen": 244745650, + "step": 11345, + "time_per_iteration": 2.6608028411865234 + }, + { + "auxiliary_loss_clip": 0.01088922, + "auxiliary_loss_mlp": 0.01034172, + "balance_loss_clip": 1.03829432, + "balance_loss_mlp": 1.02187562, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.2982905693772193, + "language_loss": 0.65017307, + "learning_rate": 9.692432813238054e-07, + "loss": 0.671404, + "num_input_tokens_seen": 244760270, + "step": 11346, + "time_per_iteration": 2.5151944160461426 + }, + { + "auxiliary_loss_clip": 0.01046362, + "auxiliary_loss_mlp": 0.00788847, + "balance_loss_clip": 1.03304911, + "balance_loss_mlp": 1.01023161, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.6266741612945521, + "language_loss": 0.78512639, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80347848, + "num_input_tokens_seen": 244779565, + "step": 11347, + "time_per_iteration": 2.6327075958251953 + }, + { + "auxiliary_loss_clip": 0.01034309, + "auxiliary_loss_mlp": 0.01007326, + "balance_loss_clip": 1.02003956, + "balance_loss_mlp": 1.00626516, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7972561555383708, + "language_loss": 0.52531981, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54573619, + "num_input_tokens_seen": 244838480, + "step": 11348, + "time_per_iteration": 3.0992941856384277 + }, + { + "auxiliary_loss_clip": 0.01106358, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.03746831, + "balance_loss_mlp": 1.02195454, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.7735176642726107, + "language_loss": 0.79889691, + "learning_rate": 9.682421948143873e-07, + "loss": 0.82029319, + "num_input_tokens_seen": 244855265, + "step": 11349, + "time_per_iteration": 2.4684245586395264 + }, + { + "auxiliary_loss_clip": 0.01103746, + "auxiliary_loss_mlp": 0.01028953, + "balance_loss_clip": 1.03963041, + "balance_loss_mlp": 1.01445687, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 1.738595501565156, + "language_loss": 0.74083048, + "learning_rate": 9.67908577543096e-07, + "loss": 0.76215744, + "num_input_tokens_seen": 244875555, + "step": 11350, + "time_per_iteration": 4.031501054763794 + }, + { + "auxiliary_loss_clip": 0.01109196, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.03981709, + "balance_loss_mlp": 1.01803982, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 2.6012320625598915, + "language_loss": 0.79583561, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81723452, + "num_input_tokens_seen": 244895270, + "step": 11351, + "time_per_iteration": 2.503495931625366 + }, + { + "auxiliary_loss_clip": 0.01095326, + "auxiliary_loss_mlp": 0.01032304, + "balance_loss_clip": 1.037503, + "balance_loss_mlp": 1.02043688, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 1.742169009480748, + "language_loss": 0.7348603, + "learning_rate": 9.672414604241954e-07, + "loss": 0.75613666, + "num_input_tokens_seen": 244914535, + "step": 11352, + "time_per_iteration": 2.515620231628418 + }, + { + "auxiliary_loss_clip": 0.01064416, + "auxiliary_loss_mlp": 0.01036509, + "balance_loss_clip": 1.03673625, + "balance_loss_mlp": 1.02307463, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 1.4123191336238536, + "language_loss": 0.79971373, + "learning_rate": 9.669079606018814e-07, + "loss": 0.820723, + "num_input_tokens_seen": 244936095, + "step": 11353, + "time_per_iteration": 2.6720385551452637 + }, + { + "auxiliary_loss_clip": 0.01099518, + "auxiliary_loss_mlp": 0.01025603, + "balance_loss_clip": 1.03843141, + "balance_loss_mlp": 1.01349711, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.8151097952738282, + "language_loss": 0.78500664, + "learning_rate": 9.665744999545218e-07, + "loss": 0.80625784, + "num_input_tokens_seen": 244955290, + "step": 11354, + "time_per_iteration": 2.519303321838379 + }, + { + "auxiliary_loss_clip": 0.01048615, + "auxiliary_loss_mlp": 0.01025831, + "balance_loss_clip": 1.03794158, + "balance_loss_mlp": 1.01429737, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 3.093361099137183, + "language_loss": 0.62438875, + "learning_rate": 9.662410784947599e-07, + "loss": 0.6451332, + "num_input_tokens_seen": 244972935, + "step": 11355, + "time_per_iteration": 2.6056294441223145 + }, + { + "auxiliary_loss_clip": 0.01056004, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.03393865, + "balance_loss_mlp": 1.01874971, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 2.297275313866363, + "language_loss": 0.82370067, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84457302, + "num_input_tokens_seen": 244989440, + "step": 11356, + "time_per_iteration": 2.599545478820801 + }, + { + "auxiliary_loss_clip": 0.01090238, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.03895962, + "balance_loss_mlp": 1.01721799, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 1.8048899478883387, + "language_loss": 0.78528112, + "learning_rate": 9.655743531886052e-07, + "loss": 0.80647922, + "num_input_tokens_seen": 245007830, + "step": 11357, + "time_per_iteration": 3.951115131378174 + }, + { + "auxiliary_loss_clip": 0.01026009, + "auxiliary_loss_mlp": 0.01015501, + "balance_loss_clip": 1.02150726, + "balance_loss_mlp": 1.01396322, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8253304017308701, + "language_loss": 0.59631574, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61673081, + "num_input_tokens_seen": 245070720, + "step": 11358, + "time_per_iteration": 4.559441804885864 + }, + { + "auxiliary_loss_clip": 0.01073793, + "auxiliary_loss_mlp": 0.01043988, + "balance_loss_clip": 1.03598976, + "balance_loss_mlp": 1.02833617, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 1.6460912612489027, + "language_loss": 0.78148782, + "learning_rate": 9.64907784784544e-07, + "loss": 0.80266565, + "num_input_tokens_seen": 245089070, + "step": 11359, + "time_per_iteration": 2.547093152999878 + }, + { + "auxiliary_loss_clip": 0.01097999, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.03795624, + "balance_loss_mlp": 1.02295542, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 1.9944399875935386, + "language_loss": 0.81711262, + "learning_rate": 9.645745594523958e-07, + "loss": 0.83844286, + "num_input_tokens_seen": 245106500, + "step": 11360, + "time_per_iteration": 2.5164954662323 + }, + { + "auxiliary_loss_clip": 0.01098589, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.04038978, + "balance_loss_mlp": 1.0273509, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 1.7543633563900936, + "language_loss": 0.75226271, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77366495, + "num_input_tokens_seen": 245125260, + "step": 11361, + "time_per_iteration": 3.8951685428619385 + }, + { + "auxiliary_loss_clip": 0.01026905, + "auxiliary_loss_mlp": 0.01001758, + "balance_loss_clip": 1.02968693, + "balance_loss_mlp": 1.00010133, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.8736122754526313, + "language_loss": 0.59712279, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61740941, + "num_input_tokens_seen": 245188730, + "step": 11362, + "time_per_iteration": 3.2457966804504395 + }, + { + "auxiliary_loss_clip": 0.01084027, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.03762841, + "balance_loss_mlp": 1.01829326, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 2.013118795762214, + "language_loss": 0.75559527, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77674854, + "num_input_tokens_seen": 245205065, + "step": 11363, + "time_per_iteration": 2.5156185626983643 + }, + { + "auxiliary_loss_clip": 0.01085886, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.03690422, + "balance_loss_mlp": 1.02457058, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 2.3034784906298116, + "language_loss": 0.88843584, + "learning_rate": 9.632420508845063e-07, + "loss": 0.90967065, + "num_input_tokens_seen": 245224265, + "step": 11364, + "time_per_iteration": 2.6027309894561768 + }, + { + "auxiliary_loss_clip": 0.01085488, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.03705812, + "balance_loss_mlp": 1.01824105, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 1.829993237633201, + "language_loss": 0.87884098, + "learning_rate": 9.629090219958697e-07, + "loss": 0.89999592, + "num_input_tokens_seen": 245243360, + "step": 11365, + "time_per_iteration": 2.5539305210113525 + }, + { + "auxiliary_loss_clip": 0.010746, + "auxiliary_loss_mlp": 0.0104208, + "balance_loss_clip": 1.03959465, + "balance_loss_mlp": 1.02779245, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.160290711893958, + "language_loss": 0.80838954, + "learning_rate": 9.625760324338272e-07, + "loss": 0.82955635, + "num_input_tokens_seen": 245256350, + "step": 11366, + "time_per_iteration": 2.59409499168396 + }, + { + "auxiliary_loss_clip": 0.01087494, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.0372858, + "balance_loss_mlp": 1.01573873, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 1.3894193812855193, + "language_loss": 0.765728, + "learning_rate": 9.622430822110062e-07, + "loss": 0.78687888, + "num_input_tokens_seen": 245277575, + "step": 11367, + "time_per_iteration": 2.580702543258667 + }, + { + "auxiliary_loss_clip": 0.01083542, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.04022753, + "balance_loss_mlp": 1.02219093, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 1.4337130130469145, + "language_loss": 0.68759072, + "learning_rate": 9.619101713400312e-07, + "loss": 0.70877516, + "num_input_tokens_seen": 245296615, + "step": 11368, + "time_per_iteration": 2.5280230045318604 + }, + { + "auxiliary_loss_clip": 0.01072724, + "auxiliary_loss_mlp": 0.01034831, + "balance_loss_clip": 1.03652501, + "balance_loss_mlp": 1.02256465, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 1.7418089827258016, + "language_loss": 0.73126447, + "learning_rate": 9.615772998335261e-07, + "loss": 0.75234008, + "num_input_tokens_seen": 245316275, + "step": 11369, + "time_per_iteration": 2.6257810592651367 + }, + { + "auxiliary_loss_clip": 0.01094768, + "auxiliary_loss_mlp": 0.01029793, + "balance_loss_clip": 1.03916466, + "balance_loss_mlp": 1.01758575, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 1.9916695327197576, + "language_loss": 0.79240727, + "learning_rate": 9.612444677041138e-07, + "loss": 0.81365281, + "num_input_tokens_seen": 245334595, + "step": 11370, + "time_per_iteration": 2.512848377227783 + }, + { + "auxiliary_loss_clip": 0.01033342, + "auxiliary_loss_mlp": 0.01000816, + "balance_loss_clip": 1.01909912, + "balance_loss_mlp": 0.99958223, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7455881973849126, + "language_loss": 0.59805, + "learning_rate": 9.609116749644162e-07, + "loss": 0.61839163, + "num_input_tokens_seen": 245389750, + "step": 11371, + "time_per_iteration": 3.042795181274414 + }, + { + "auxiliary_loss_clip": 0.01080399, + "auxiliary_loss_mlp": 0.0102586, + "balance_loss_clip": 1.03813672, + "balance_loss_mlp": 1.01443934, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 2.2867942593909842, + "language_loss": 0.63596565, + "learning_rate": 9.605789216270511e-07, + "loss": 0.6570282, + "num_input_tokens_seen": 245407530, + "step": 11372, + "time_per_iteration": 2.5345940589904785 + }, + { + "auxiliary_loss_clip": 0.01096055, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.04106116, + "balance_loss_mlp": 1.01366496, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 1.5448013075167855, + "language_loss": 0.71649814, + "learning_rate": 9.602462077046375e-07, + "loss": 0.73771703, + "num_input_tokens_seen": 245427000, + "step": 11373, + "time_per_iteration": 2.5215413570404053 + }, + { + "auxiliary_loss_clip": 0.01016624, + "auxiliary_loss_mlp": 0.01002736, + "balance_loss_clip": 1.02275836, + "balance_loss_mlp": 1.00150192, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.181953799033402, + "language_loss": 0.56667852, + "learning_rate": 9.599135332097935e-07, + "loss": 0.5868721, + "num_input_tokens_seen": 245491620, + "step": 11374, + "time_per_iteration": 3.328202247619629 + }, + { + "auxiliary_loss_clip": 0.01100091, + "auxiliary_loss_mlp": 0.01024921, + "balance_loss_clip": 1.03909492, + "balance_loss_mlp": 1.01192713, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 1.5320337329056335, + "language_loss": 0.73462367, + "learning_rate": 9.595808981551312e-07, + "loss": 0.7558738, + "num_input_tokens_seen": 245511285, + "step": 11375, + "time_per_iteration": 2.540921449661255 + }, + { + "auxiliary_loss_clip": 0.01087643, + "auxiliary_loss_mlp": 0.01029436, + "balance_loss_clip": 1.03876781, + "balance_loss_mlp": 1.01765227, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 1.7868558021118364, + "language_loss": 0.7068814, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72805214, + "num_input_tokens_seen": 245532910, + "step": 11376, + "time_per_iteration": 2.6083621978759766 + }, + { + "auxiliary_loss_clip": 0.01111528, + "auxiliary_loss_mlp": 0.01031669, + "balance_loss_clip": 1.03941512, + "balance_loss_mlp": 1.01911676, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 2.024560998065436, + "language_loss": 0.74704027, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76847225, + "num_input_tokens_seen": 245550540, + "step": 11377, + "time_per_iteration": 2.512059211730957 + }, + { + "auxiliary_loss_clip": 0.0102507, + "auxiliary_loss_mlp": 0.01001488, + "balance_loss_clip": 1.01996422, + "balance_loss_mlp": 1.00016451, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7432249475106598, + "language_loss": 0.56819725, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58846283, + "num_input_tokens_seen": 245619570, + "step": 11378, + "time_per_iteration": 3.2503654956817627 + }, + { + "auxiliary_loss_clip": 0.01110055, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.03868163, + "balance_loss_mlp": 1.02085805, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.829940400253596, + "language_loss": 0.7836839, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80511594, + "num_input_tokens_seen": 245637980, + "step": 11379, + "time_per_iteration": 2.462859869003296 + }, + { + "auxiliary_loss_clip": 0.01104389, + "auxiliary_loss_mlp": 0.01026321, + "balance_loss_clip": 1.03928351, + "balance_loss_mlp": 1.01545465, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 1.7543472145137915, + "language_loss": 0.68883812, + "learning_rate": 9.57918314925988e-07, + "loss": 0.71014524, + "num_input_tokens_seen": 245655690, + "step": 11380, + "time_per_iteration": 2.4678092002868652 + }, + { + "auxiliary_loss_clip": 0.01081955, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.03509045, + "balance_loss_mlp": 1.02004695, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 1.8137124533758013, + "language_loss": 0.78094637, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80208939, + "num_input_tokens_seen": 245671525, + "step": 11381, + "time_per_iteration": 2.517038345336914 + }, + { + "auxiliary_loss_clip": 0.01030891, + "auxiliary_loss_mlp": 0.01000374, + "balance_loss_clip": 1.01793683, + "balance_loss_mlp": 0.99906886, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.9011887082202911, + "language_loss": 0.671359, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69167167, + "num_input_tokens_seen": 245724115, + "step": 11382, + "time_per_iteration": 2.966581344604492 + }, + { + "auxiliary_loss_clip": 0.01034154, + "auxiliary_loss_mlp": 0.01002146, + "balance_loss_clip": 1.02082193, + "balance_loss_mlp": 1.00106132, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.8162837053982014, + "language_loss": 0.58098197, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60134494, + "num_input_tokens_seen": 245789245, + "step": 11383, + "time_per_iteration": 3.1445367336273193 + }, + { + "auxiliary_loss_clip": 0.01057001, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.03648496, + "balance_loss_mlp": 1.01663613, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 1.7851111497915635, + "language_loss": 0.79878062, + "learning_rate": 9.565889595521517e-07, + "loss": 0.81964594, + "num_input_tokens_seen": 245812420, + "step": 11384, + "time_per_iteration": 2.646496295928955 + }, + { + "auxiliary_loss_clip": 0.0109685, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.03658664, + "balance_loss_mlp": 1.02141654, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 2.0098304209651845, + "language_loss": 0.77383798, + "learning_rate": 9.562567195928187e-07, + "loss": 0.79514205, + "num_input_tokens_seen": 245829135, + "step": 11385, + "time_per_iteration": 2.463932991027832 + }, + { + "auxiliary_loss_clip": 0.01076556, + "auxiliary_loss_mlp": 0.01035219, + "balance_loss_clip": 1.03820896, + "balance_loss_mlp": 1.02159929, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 2.1078684767727665, + "language_loss": 0.84392172, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86503947, + "num_input_tokens_seen": 245847140, + "step": 11386, + "time_per_iteration": 2.5560219287872314 + }, + { + "auxiliary_loss_clip": 0.01098611, + "auxiliary_loss_mlp": 0.01038926, + "balance_loss_clip": 1.03967607, + "balance_loss_mlp": 1.02738023, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 1.9916059577906498, + "language_loss": 0.83388048, + "learning_rate": 9.555923584232984e-07, + "loss": 0.85525584, + "num_input_tokens_seen": 245862855, + "step": 11387, + "time_per_iteration": 2.489928722381592 + }, + { + "auxiliary_loss_clip": 0.01089618, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.03529549, + "balance_loss_mlp": 1.0175674, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.8323405784119928, + "language_loss": 0.72747052, + "learning_rate": 9.552602372383047e-07, + "loss": 0.7486636, + "num_input_tokens_seen": 245885415, + "step": 11388, + "time_per_iteration": 2.6355738639831543 + }, + { + "auxiliary_loss_clip": 0.01090608, + "auxiliary_loss_mlp": 0.01023893, + "balance_loss_clip": 1.03869319, + "balance_loss_mlp": 1.01250851, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 1.775987397619916, + "language_loss": 0.62148428, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64262927, + "num_input_tokens_seen": 245906285, + "step": 11389, + "time_per_iteration": 4.121963977813721 + }, + { + "auxiliary_loss_clip": 0.01019581, + "auxiliary_loss_mlp": 0.01004076, + "balance_loss_clip": 1.01684594, + "balance_loss_mlp": 1.00279999, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7275726256563138, + "language_loss": 0.55985212, + "learning_rate": 9.54596113730818e-07, + "loss": 0.58008873, + "num_input_tokens_seen": 245967620, + "step": 11390, + "time_per_iteration": 3.2232506275177 + }, + { + "auxiliary_loss_clip": 0.01072564, + "auxiliary_loss_mlp": 0.00784662, + "balance_loss_clip": 1.03897607, + "balance_loss_mlp": 1.01127028, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 2.0468234556533, + "language_loss": 0.8801896, + "learning_rate": 9.542641114335109e-07, + "loss": 0.89876187, + "num_input_tokens_seen": 245985075, + "step": 11391, + "time_per_iteration": 2.5837225914001465 + }, + { + "auxiliary_loss_clip": 0.01065401, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.03754306, + "balance_loss_mlp": 1.02037835, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 1.6152753449798716, + "language_loss": 0.79395598, + "learning_rate": 9.539321487906117e-07, + "loss": 0.8149358, + "num_input_tokens_seen": 246003560, + "step": 11392, + "time_per_iteration": 2.6332597732543945 + }, + { + "auxiliary_loss_clip": 0.01083457, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.0365659, + "balance_loss_mlp": 1.01779032, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 2.35059530763193, + "language_loss": 0.70904553, + "learning_rate": 9.536002258147104e-07, + "loss": 0.73017609, + "num_input_tokens_seen": 246019600, + "step": 11393, + "time_per_iteration": 2.502363443374634 + }, + { + "auxiliary_loss_clip": 0.01068315, + "auxiliary_loss_mlp": 0.01029452, + "balance_loss_clip": 1.03562844, + "balance_loss_mlp": 1.01657748, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 1.5830588230547482, + "language_loss": 0.64385206, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66482973, + "num_input_tokens_seen": 246038920, + "step": 11394, + "time_per_iteration": 2.636575698852539 + }, + { + "auxiliary_loss_clip": 0.01081125, + "auxiliary_loss_mlp": 0.00786992, + "balance_loss_clip": 1.03684044, + "balance_loss_mlp": 1.01286519, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 2.940442165797074, + "language_loss": 0.80673003, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82541126, + "num_input_tokens_seen": 246060490, + "step": 11395, + "time_per_iteration": 3.9998552799224854 + }, + { + "auxiliary_loss_clip": 0.01076742, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.03838277, + "balance_loss_mlp": 1.01743436, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 1.8595842335394746, + "language_loss": 0.73059464, + "learning_rate": 9.526046950148527e-07, + "loss": 0.75166488, + "num_input_tokens_seen": 246081465, + "step": 11396, + "time_per_iteration": 4.009026050567627 + }, + { + "auxiliary_loss_clip": 0.01078479, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.03799772, + "balance_loss_mlp": 1.01641679, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 2.269817428417604, + "language_loss": 0.78728217, + "learning_rate": 9.522729308327931e-07, + "loss": 0.80836272, + "num_input_tokens_seen": 246096110, + "step": 11397, + "time_per_iteration": 2.5394840240478516 + }, + { + "auxiliary_loss_clip": 0.01031734, + "auxiliary_loss_mlp": 0.01031483, + "balance_loss_clip": 1.0340718, + "balance_loss_mlp": 1.01889491, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 1.7676110547601929, + "language_loss": 0.71246088, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73309302, + "num_input_tokens_seen": 246114785, + "step": 11398, + "time_per_iteration": 2.6638059616088867 + }, + { + "auxiliary_loss_clip": 0.01060211, + "auxiliary_loss_mlp": 0.01028504, + "balance_loss_clip": 1.03939176, + "balance_loss_mlp": 1.01724505, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 1.617733669740887, + "language_loss": 0.7059083, + "learning_rate": 9.516095216709996e-07, + "loss": 0.72679543, + "num_input_tokens_seen": 246136375, + "step": 11399, + "time_per_iteration": 2.667738199234009 + }, + { + "auxiliary_loss_clip": 0.01093752, + "auxiliary_loss_mlp": 0.01032957, + "balance_loss_clip": 1.03763115, + "balance_loss_mlp": 1.02054763, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 1.5863679957489545, + "language_loss": 0.70315838, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72442555, + "num_input_tokens_seen": 246155090, + "step": 11400, + "time_per_iteration": 4.011098384857178 + }, + { + "auxiliary_loss_clip": 0.01071444, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.04054964, + "balance_loss_mlp": 1.01917708, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 1.9814019869484805, + "language_loss": 0.7803458, + "learning_rate": 9.509462715294927e-07, + "loss": 0.80140018, + "num_input_tokens_seen": 246172645, + "step": 11401, + "time_per_iteration": 2.6040945053100586 + }, + { + "auxiliary_loss_clip": 0.01106112, + "auxiliary_loss_mlp": 0.01034656, + "balance_loss_clip": 1.03719592, + "balance_loss_mlp": 1.02287817, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 1.7749016626328709, + "language_loss": 0.7542643, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77567196, + "num_input_tokens_seen": 246189055, + "step": 11402, + "time_per_iteration": 2.458620309829712 + }, + { + "auxiliary_loss_clip": 0.01093512, + "auxiliary_loss_mlp": 0.01034954, + "balance_loss_clip": 1.03650677, + "balance_loss_mlp": 1.02157283, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 1.6999621881828242, + "language_loss": 0.72720325, + "learning_rate": 9.502831805088742e-07, + "loss": 0.74848795, + "num_input_tokens_seen": 246207990, + "step": 11403, + "time_per_iteration": 2.520113229751587 + }, + { + "auxiliary_loss_clip": 0.01105597, + "auxiliary_loss_mlp": 0.01032573, + "balance_loss_clip": 1.03764164, + "balance_loss_mlp": 1.02079546, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 2.536807438699413, + "language_loss": 0.81293035, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83431196, + "num_input_tokens_seen": 246221595, + "step": 11404, + "time_per_iteration": 2.4568464756011963 + }, + { + "auxiliary_loss_clip": 0.01081239, + "auxiliary_loss_mlp": 0.01033805, + "balance_loss_clip": 1.03802299, + "balance_loss_mlp": 1.02190208, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.4723023974445661, + "language_loss": 0.77854437, + "learning_rate": 9.496202487097222e-07, + "loss": 0.79969478, + "num_input_tokens_seen": 246242970, + "step": 11405, + "time_per_iteration": 2.578655481338501 + }, + { + "auxiliary_loss_clip": 0.01033966, + "auxiliary_loss_mlp": 0.01000903, + "balance_loss_clip": 1.02004254, + "balance_loss_mlp": 0.99963301, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.7898547458298292, + "language_loss": 0.61001587, + "learning_rate": 9.492888425496199e-07, + "loss": 0.6303646, + "num_input_tokens_seen": 246300405, + "step": 11406, + "time_per_iteration": 3.1653146743774414 + }, + { + "auxiliary_loss_clip": 0.01075611, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.03691149, + "balance_loss_mlp": 1.02046216, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 1.8719105601890103, + "language_loss": 0.77124083, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79233682, + "num_input_tokens_seen": 246318780, + "step": 11407, + "time_per_iteration": 2.56085467338562 + }, + { + "auxiliary_loss_clip": 0.01086802, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.03794515, + "balance_loss_mlp": 1.02019405, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 2.225081083741925, + "language_loss": 0.71079099, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73199284, + "num_input_tokens_seen": 246339405, + "step": 11408, + "time_per_iteration": 2.640377998352051 + }, + { + "auxiliary_loss_clip": 0.01100329, + "auxiliary_loss_mlp": 0.01028856, + "balance_loss_clip": 1.03923047, + "balance_loss_mlp": 1.01620746, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 1.7751810859494537, + "language_loss": 0.70194495, + "learning_rate": 9.482948631780087e-07, + "loss": 0.72323674, + "num_input_tokens_seen": 246357055, + "step": 11409, + "time_per_iteration": 2.55047345161438 + }, + { + "auxiliary_loss_clip": 0.0106277, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.03840387, + "balance_loss_mlp": 1.01808286, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 1.7040722363644618, + "language_loss": 0.7829141, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80383474, + "num_input_tokens_seen": 246374050, + "step": 11410, + "time_per_iteration": 2.6055891513824463 + }, + { + "auxiliary_loss_clip": 0.01101767, + "auxiliary_loss_mlp": 0.01037662, + "balance_loss_clip": 1.03859377, + "balance_loss_mlp": 1.02454865, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 1.9950751465373529, + "language_loss": 0.71431071, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73570502, + "num_input_tokens_seen": 246392910, + "step": 11411, + "time_per_iteration": 2.558617115020752 + }, + { + "auxiliary_loss_clip": 0.01056707, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.03479791, + "balance_loss_mlp": 1.01996493, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 2.064182975961849, + "language_loss": 0.70096773, + "learning_rate": 9.473012427332654e-07, + "loss": 0.72187859, + "num_input_tokens_seen": 246411540, + "step": 11412, + "time_per_iteration": 2.682128667831421 + }, + { + "auxiliary_loss_clip": 0.01110037, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.03911066, + "balance_loss_mlp": 1.02067947, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 5.795879312976152, + "language_loss": 0.71915996, + "learning_rate": 9.469701157384919e-07, + "loss": 0.74059659, + "num_input_tokens_seen": 246423295, + "step": 11413, + "time_per_iteration": 2.458012819290161 + }, + { + "auxiliary_loss_clip": 0.01097792, + "auxiliary_loss_mlp": 0.01030015, + "balance_loss_clip": 1.03783751, + "balance_loss_mlp": 1.01827264, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 1.7930772299805322, + "language_loss": 0.73785454, + "learning_rate": 9.466390286747164e-07, + "loss": 0.75913262, + "num_input_tokens_seen": 246441045, + "step": 11414, + "time_per_iteration": 2.4923696517944336 + }, + { + "auxiliary_loss_clip": 0.01086172, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.04000449, + "balance_loss_mlp": 1.02067029, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 2.7877135287137427, + "language_loss": 0.86407208, + "learning_rate": 9.46307981554495e-07, + "loss": 0.88526678, + "num_input_tokens_seen": 246456905, + "step": 11415, + "time_per_iteration": 2.538278341293335 + }, + { + "auxiliary_loss_clip": 0.01100452, + "auxiliary_loss_mlp": 0.01033772, + "balance_loss_clip": 1.03888345, + "balance_loss_mlp": 1.02102852, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.9061608587414378, + "language_loss": 0.67228067, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69362295, + "num_input_tokens_seen": 246477545, + "step": 11416, + "time_per_iteration": 2.553169012069702 + }, + { + "auxiliary_loss_clip": 0.01083689, + "auxiliary_loss_mlp": 0.0103632, + "balance_loss_clip": 1.03695393, + "balance_loss_mlp": 1.02345753, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.3836163866428348, + "language_loss": 0.75890708, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78010714, + "num_input_tokens_seen": 246496705, + "step": 11417, + "time_per_iteration": 2.5242466926574707 + }, + { + "auxiliary_loss_clip": 0.01083122, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.03656077, + "balance_loss_mlp": 1.02145481, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 1.7836321650040894, + "language_loss": 0.76978844, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79097229, + "num_input_tokens_seen": 246514860, + "step": 11418, + "time_per_iteration": 2.524371862411499 + }, + { + "auxiliary_loss_clip": 0.01062194, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.03855324, + "balance_loss_mlp": 1.01848185, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 1.7082498959988437, + "language_loss": 0.76234138, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78326404, + "num_input_tokens_seen": 246536145, + "step": 11419, + "time_per_iteration": 2.6247496604919434 + }, + { + "auxiliary_loss_clip": 0.01107955, + "auxiliary_loss_mlp": 0.010373, + "balance_loss_clip": 1.03886867, + "balance_loss_mlp": 1.02554655, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 1.6983114077629229, + "language_loss": 0.71529216, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73674476, + "num_input_tokens_seen": 246553265, + "step": 11420, + "time_per_iteration": 2.4778966903686523 + }, + { + "auxiliary_loss_clip": 0.01068991, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.03464687, + "balance_loss_mlp": 1.01856935, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 2.193146275849193, + "language_loss": 0.74469376, + "learning_rate": 9.443225383506712e-07, + "loss": 0.76569867, + "num_input_tokens_seen": 246575130, + "step": 11421, + "time_per_iteration": 2.66667103767395 + }, + { + "auxiliary_loss_clip": 0.01092991, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.03733706, + "balance_loss_mlp": 1.01999342, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 1.777018691523352, + "language_loss": 0.7735886, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79483521, + "num_input_tokens_seen": 246593095, + "step": 11422, + "time_per_iteration": 2.5080108642578125 + }, + { + "auxiliary_loss_clip": 0.01100764, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.03911531, + "balance_loss_mlp": 1.02383876, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 1.7994108828596531, + "language_loss": 0.76967907, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79105651, + "num_input_tokens_seen": 246612165, + "step": 11423, + "time_per_iteration": 2.5217063426971436 + }, + { + "auxiliary_loss_clip": 0.01078917, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.03899741, + "balance_loss_mlp": 1.01953125, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1.524977571288621, + "language_loss": 0.73016912, + "learning_rate": 9.433303570032129e-07, + "loss": 0.75127614, + "num_input_tokens_seen": 246632065, + "step": 11424, + "time_per_iteration": 2.6006581783294678 + }, + { + "auxiliary_loss_clip": 0.01087787, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.03919303, + "balance_loss_mlp": 1.01761305, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 1.9642174724006458, + "language_loss": 0.64932722, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67049521, + "num_input_tokens_seen": 246651245, + "step": 11425, + "time_per_iteration": 2.569391965866089 + }, + { + "auxiliary_loss_clip": 0.01071264, + "auxiliary_loss_mlp": 0.01028106, + "balance_loss_clip": 1.04019499, + "balance_loss_mlp": 1.01622033, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.4142545289190422, + "language_loss": 0.71945751, + "learning_rate": 9.426691030957657e-07, + "loss": 0.74045122, + "num_input_tokens_seen": 246672225, + "step": 11426, + "time_per_iteration": 2.5821170806884766 + }, + { + "auxiliary_loss_clip": 0.01057033, + "auxiliary_loss_mlp": 0.01030521, + "balance_loss_clip": 1.03673804, + "balance_loss_mlp": 1.0175333, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 2.0454233430095763, + "language_loss": 0.84965444, + "learning_rate": 9.423385362769136e-07, + "loss": 0.87052995, + "num_input_tokens_seen": 246688385, + "step": 11427, + "time_per_iteration": 2.5578720569610596 + }, + { + "auxiliary_loss_clip": 0.01097558, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.04009426, + "balance_loss_mlp": 1.02188277, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.445908983231136, + "language_loss": 0.76231742, + "learning_rate": 9.420080095646909e-07, + "loss": 0.78363085, + "num_input_tokens_seen": 246710730, + "step": 11428, + "time_per_iteration": 3.9582669734954834 + }, + { + "auxiliary_loss_clip": 0.01075587, + "auxiliary_loss_mlp": 0.01034424, + "balance_loss_clip": 1.03871548, + "balance_loss_mlp": 1.02173972, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 1.9062096245793856, + "language_loss": 0.73327291, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75437301, + "num_input_tokens_seen": 246730350, + "step": 11429, + "time_per_iteration": 2.5858376026153564 + }, + { + "auxiliary_loss_clip": 0.01087148, + "auxiliary_loss_mlp": 0.01024392, + "balance_loss_clip": 1.03928888, + "balance_loss_mlp": 1.01250052, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 1.662589406254346, + "language_loss": 0.83055973, + "learning_rate": 9.413470765102643e-07, + "loss": 0.85167515, + "num_input_tokens_seen": 246751700, + "step": 11430, + "time_per_iteration": 2.598278522491455 + }, + { + "auxiliary_loss_clip": 0.01098133, + "auxiliary_loss_mlp": 0.01032419, + "balance_loss_clip": 1.03750777, + "balance_loss_mlp": 1.0205934, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 2.0104257300578623, + "language_loss": 0.70134217, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72264767, + "num_input_tokens_seen": 246769860, + "step": 11431, + "time_per_iteration": 2.526407480239868 + }, + { + "auxiliary_loss_clip": 0.0108811, + "auxiliary_loss_mlp": 0.00787454, + "balance_loss_clip": 1.03745556, + "balance_loss_mlp": 1.01696467, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 1.824118707970035, + "language_loss": 0.79818785, + "learning_rate": 9.406863040327355e-07, + "loss": 0.81694353, + "num_input_tokens_seen": 246789905, + "step": 11432, + "time_per_iteration": 2.570183038711548 + }, + { + "auxiliary_loss_clip": 0.01083503, + "auxiliary_loss_mlp": 0.01026461, + "balance_loss_clip": 1.0376724, + "balance_loss_mlp": 1.01515985, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 1.579242325458712, + "language_loss": 0.67906314, + "learning_rate": 9.403559780416295e-07, + "loss": 0.70016277, + "num_input_tokens_seen": 246808815, + "step": 11433, + "time_per_iteration": 2.5659139156341553 + }, + { + "auxiliary_loss_clip": 0.01100434, + "auxiliary_loss_mlp": 0.01036137, + "balance_loss_clip": 1.04089642, + "balance_loss_mlp": 1.0239954, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 1.95080507916852, + "language_loss": 0.73071945, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75208509, + "num_input_tokens_seen": 246829775, + "step": 11434, + "time_per_iteration": 5.479167699813843 + }, + { + "auxiliary_loss_clip": 0.01072833, + "auxiliary_loss_mlp": 0.0102573, + "balance_loss_clip": 1.03958762, + "balance_loss_mlp": 1.01397049, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 1.5768000847545227, + "language_loss": 0.80319405, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82417971, + "num_input_tokens_seen": 246848045, + "step": 11435, + "time_per_iteration": 2.544827699661255 + }, + { + "auxiliary_loss_clip": 0.01108312, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.03763103, + "balance_loss_mlp": 1.01811004, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 2.0596846662916657, + "language_loss": 0.81089199, + "learning_rate": 9.393652412092538e-07, + "loss": 0.83228093, + "num_input_tokens_seen": 246866095, + "step": 11436, + "time_per_iteration": 2.4881951808929443 + }, + { + "auxiliary_loss_clip": 0.01063329, + "auxiliary_loss_mlp": 0.01034274, + "balance_loss_clip": 1.03495896, + "balance_loss_mlp": 1.02287722, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 1.7494839947163467, + "language_loss": 0.82151163, + "learning_rate": 9.390350760205183e-07, + "loss": 0.84248769, + "num_input_tokens_seen": 246883975, + "step": 11437, + "time_per_iteration": 2.586728096008301 + }, + { + "auxiliary_loss_clip": 0.01090755, + "auxiliary_loss_mlp": 0.0103588, + "balance_loss_clip": 1.03755188, + "balance_loss_mlp": 1.02252257, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 2.6163446464936597, + "language_loss": 0.78465873, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80592507, + "num_input_tokens_seen": 246901560, + "step": 11438, + "time_per_iteration": 3.9343626499176025 + }, + { + "auxiliary_loss_clip": 0.01103305, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.03793025, + "balance_loss_mlp": 1.01960468, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.6861873561798344, + "language_loss": 0.7256217, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74696481, + "num_input_tokens_seen": 246922655, + "step": 11439, + "time_per_iteration": 2.5388436317443848 + }, + { + "auxiliary_loss_clip": 0.01095734, + "auxiliary_loss_mlp": 0.01024314, + "balance_loss_clip": 1.03848076, + "balance_loss_mlp": 1.01272082, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 2.8440063880928834, + "language_loss": 0.75213927, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77333975, + "num_input_tokens_seen": 246940100, + "step": 11440, + "time_per_iteration": 2.4714014530181885 + }, + { + "auxiliary_loss_clip": 0.01062046, + "auxiliary_loss_mlp": 0.0103617, + "balance_loss_clip": 1.03529191, + "balance_loss_mlp": 1.02452874, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 1.6414942417203626, + "language_loss": 0.71753949, + "learning_rate": 9.377148177097167e-07, + "loss": 0.73852164, + "num_input_tokens_seen": 246958545, + "step": 11441, + "time_per_iteration": 2.571171760559082 + }, + { + "auxiliary_loss_clip": 0.01072869, + "auxiliary_loss_mlp": 0.01038817, + "balance_loss_clip": 1.03888178, + "balance_loss_mlp": 1.02421975, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.6860979342043239, + "language_loss": 0.67043531, + "learning_rate": 9.373848538056317e-07, + "loss": 0.69155216, + "num_input_tokens_seen": 246974805, + "step": 11442, + "time_per_iteration": 2.564851999282837 + }, + { + "auxiliary_loss_clip": 0.01094096, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.03994882, + "balance_loss_mlp": 1.01913345, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 1.8966687516498697, + "language_loss": 0.69875681, + "learning_rate": 9.370549301960189e-07, + "loss": 0.72000146, + "num_input_tokens_seen": 246992505, + "step": 11443, + "time_per_iteration": 2.528290033340454 + }, + { + "auxiliary_loss_clip": 0.01091565, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.04010141, + "balance_loss_mlp": 1.01885724, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 1.4390044685152852, + "language_loss": 0.76349401, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78472823, + "num_input_tokens_seen": 247013370, + "step": 11444, + "time_per_iteration": 2.5917370319366455 + }, + { + "auxiliary_loss_clip": 0.01105579, + "auxiliary_loss_mlp": 0.01028441, + "balance_loss_clip": 1.03835213, + "balance_loss_mlp": 1.01742601, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 2.353068624582353, + "language_loss": 0.77054465, + "learning_rate": 9.363952039102536e-07, + "loss": 0.79188484, + "num_input_tokens_seen": 247029855, + "step": 11445, + "time_per_iteration": 2.477593421936035 + }, + { + "auxiliary_loss_clip": 0.01033652, + "auxiliary_loss_mlp": 0.01009225, + "balance_loss_clip": 1.01920426, + "balance_loss_mlp": 1.00802124, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.8159958908223361, + "language_loss": 0.58371258, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60414135, + "num_input_tokens_seen": 247085030, + "step": 11446, + "time_per_iteration": 3.138810634613037 + }, + { + "auxiliary_loss_clip": 0.01096145, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.03529334, + "balance_loss_mlp": 1.01658213, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.4991421741977153, + "language_loss": 0.75608331, + "learning_rate": 9.357356389524886e-07, + "loss": 0.77733386, + "num_input_tokens_seen": 247104840, + "step": 11447, + "time_per_iteration": 2.563877582550049 + }, + { + "auxiliary_loss_clip": 0.01088315, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.03789973, + "balance_loss_mlp": 1.02148247, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 2.11551184388984, + "language_loss": 0.73368597, + "learning_rate": 9.354059170028705e-07, + "loss": 0.75490558, + "num_input_tokens_seen": 247121905, + "step": 11448, + "time_per_iteration": 2.5566985607147217 + }, + { + "auxiliary_loss_clip": 0.01096727, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.03699541, + "balance_loss_mlp": 1.02445889, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 2.06113313338257, + "language_loss": 0.74604321, + "learning_rate": 9.350762354227673e-07, + "loss": 0.76739323, + "num_input_tokens_seen": 247142375, + "step": 11449, + "time_per_iteration": 2.5545544624328613 + }, + { + "auxiliary_loss_clip": 0.01105536, + "auxiliary_loss_mlp": 0.0103598, + "balance_loss_clip": 1.03778148, + "balance_loss_mlp": 1.02476811, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 1.7323562048981278, + "language_loss": 0.7006005, + "learning_rate": 9.34746594224679e-07, + "loss": 0.72201574, + "num_input_tokens_seen": 247161095, + "step": 11450, + "time_per_iteration": 2.4812815189361572 + }, + { + "auxiliary_loss_clip": 0.01079645, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.04131794, + "balance_loss_mlp": 1.02066302, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 1.9053156360853918, + "language_loss": 0.76227933, + "learning_rate": 9.344169934211068e-07, + "loss": 0.78341061, + "num_input_tokens_seen": 247178565, + "step": 11451, + "time_per_iteration": 2.564218759536743 + }, + { + "auxiliary_loss_clip": 0.01095961, + "auxiliary_loss_mlp": 0.01027679, + "balance_loss_clip": 1.04069829, + "balance_loss_mlp": 1.01589513, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 1.347276968038282, + "language_loss": 0.69318259, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71441901, + "num_input_tokens_seen": 247202345, + "step": 11452, + "time_per_iteration": 2.5687992572784424 + }, + { + "auxiliary_loss_clip": 0.01108843, + "auxiliary_loss_mlp": 0.01033018, + "balance_loss_clip": 1.03970218, + "balance_loss_mlp": 1.01935613, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 1.648134840434037, + "language_loss": 0.71994674, + "learning_rate": 9.337579130475042e-07, + "loss": 0.74136537, + "num_input_tokens_seen": 247219240, + "step": 11453, + "time_per_iteration": 2.4747517108917236 + }, + { + "auxiliary_loss_clip": 0.01035385, + "auxiliary_loss_mlp": 0.00768007, + "balance_loss_clip": 1.02127814, + "balance_loss_mlp": 1.00858808, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.784916887746031, + "language_loss": 0.50684935, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52488333, + "num_input_tokens_seen": 247272010, + "step": 11454, + "time_per_iteration": 2.9891676902770996 + }, + { + "auxiliary_loss_clip": 0.01095286, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.03927326, + "balance_loss_mlp": 1.01999569, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 1.83434119553909, + "language_loss": 0.75101876, + "learning_rate": 9.330989944019263e-07, + "loss": 0.77228737, + "num_input_tokens_seen": 247290630, + "step": 11455, + "time_per_iteration": 2.5094077587127686 + }, + { + "auxiliary_loss_clip": 0.01086757, + "auxiliary_loss_mlp": 0.0103474, + "balance_loss_clip": 1.03523481, + "balance_loss_mlp": 1.02131057, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.985845080615853, + "language_loss": 0.72799158, + "learning_rate": 9.327695957583803e-07, + "loss": 0.74920654, + "num_input_tokens_seen": 247304800, + "step": 11456, + "time_per_iteration": 2.503765821456909 + }, + { + "auxiliary_loss_clip": 0.01084597, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.03876019, + "balance_loss_mlp": 1.02443588, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 1.5929865752865773, + "language_loss": 0.80839884, + "learning_rate": 9.32440237584319e-07, + "loss": 0.8296054, + "num_input_tokens_seen": 247323450, + "step": 11457, + "time_per_iteration": 2.5366861820220947 + }, + { + "auxiliary_loss_clip": 0.01101335, + "auxiliary_loss_mlp": 0.00786439, + "balance_loss_clip": 1.04080307, + "balance_loss_mlp": 1.01491094, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.5730700375361752, + "language_loss": 0.76262349, + "learning_rate": 9.321109198922301e-07, + "loss": 0.78150117, + "num_input_tokens_seen": 247343845, + "step": 11458, + "time_per_iteration": 2.543712854385376 + }, + { + "auxiliary_loss_clip": 0.01108561, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.03920674, + "balance_loss_mlp": 1.02136064, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 2.8012430757822546, + "language_loss": 0.68008316, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70149338, + "num_input_tokens_seen": 247356650, + "step": 11459, + "time_per_iteration": 2.4367411136627197 + }, + { + "auxiliary_loss_clip": 0.0106729, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.03747118, + "balance_loss_mlp": 1.0205524, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 1.5504796504736178, + "language_loss": 0.68578011, + "learning_rate": 9.314524060039221e-07, + "loss": 0.70676893, + "num_input_tokens_seen": 247377340, + "step": 11460, + "time_per_iteration": 2.617654800415039 + }, + { + "auxiliary_loss_clip": 0.01078716, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.03686678, + "balance_loss_mlp": 1.01571012, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 1.719543490378219, + "language_loss": 0.76796889, + "learning_rate": 9.311232098326731e-07, + "loss": 0.78904343, + "num_input_tokens_seen": 247395805, + "step": 11461, + "time_per_iteration": 2.581594705581665 + }, + { + "auxiliary_loss_clip": 0.01085678, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.03688693, + "balance_loss_mlp": 1.02212918, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 1.626075294528417, + "language_loss": 0.6937964, + "learning_rate": 9.307940541933401e-07, + "loss": 0.71499753, + "num_input_tokens_seen": 247413165, + "step": 11462, + "time_per_iteration": 2.5369086265563965 + }, + { + "auxiliary_loss_clip": 0.01098153, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.03927135, + "balance_loss_mlp": 1.01575506, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.5177456877688893, + "language_loss": 0.8743223, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89558148, + "num_input_tokens_seen": 247433140, + "step": 11463, + "time_per_iteration": 2.531630039215088 + }, + { + "auxiliary_loss_clip": 0.01056129, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.03694463, + "balance_loss_mlp": 1.01515388, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 1.5516166483067624, + "language_loss": 0.68284923, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70366764, + "num_input_tokens_seen": 247451265, + "step": 11464, + "time_per_iteration": 2.620295524597168 + }, + { + "auxiliary_loss_clip": 0.01098104, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.03891134, + "balance_loss_mlp": 1.02078032, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 2.0434623909115106, + "language_loss": 0.65135026, + "learning_rate": 9.298068305916373e-07, + "loss": 0.67266238, + "num_input_tokens_seen": 247471645, + "step": 11465, + "time_per_iteration": 2.573216438293457 + }, + { + "auxiliary_loss_clip": 0.01099285, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.03826201, + "balance_loss_mlp": 1.02178788, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 1.4113955974712737, + "language_loss": 0.72682059, + "learning_rate": 9.294778372047649e-07, + "loss": 0.74814653, + "num_input_tokens_seen": 247491170, + "step": 11466, + "time_per_iteration": 2.564704418182373 + }, + { + "auxiliary_loss_clip": 0.01109742, + "auxiliary_loss_mlp": 0.01030109, + "balance_loss_clip": 1.03976667, + "balance_loss_mlp": 1.0185163, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 1.619239188073845, + "language_loss": 0.71228349, + "learning_rate": 9.291488844121995e-07, + "loss": 0.73368204, + "num_input_tokens_seen": 247509005, + "step": 11467, + "time_per_iteration": 3.8964946269989014 + }, + { + "auxiliary_loss_clip": 0.01088983, + "auxiliary_loss_mlp": 0.01031787, + "balance_loss_clip": 1.03825259, + "balance_loss_mlp": 1.01835203, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 1.8735746515695006, + "language_loss": 0.80700499, + "learning_rate": 9.288199722264156e-07, + "loss": 0.82821268, + "num_input_tokens_seen": 247527050, + "step": 11468, + "time_per_iteration": 2.530848741531372 + }, + { + "auxiliary_loss_clip": 0.011108, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.03990638, + "balance_loss_mlp": 1.02287912, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.4993232749910217, + "language_loss": 0.66019428, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68165249, + "num_input_tokens_seen": 247547765, + "step": 11469, + "time_per_iteration": 2.5818252563476562 + }, + { + "auxiliary_loss_clip": 0.01032964, + "auxiliary_loss_mlp": 0.01000552, + "balance_loss_clip": 1.01902914, + "balance_loss_mlp": 0.99922925, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.8020540980364241, + "language_loss": 0.55178946, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57212466, + "num_input_tokens_seen": 247603515, + "step": 11470, + "time_per_iteration": 3.0206611156463623 + }, + { + "auxiliary_loss_clip": 0.01093128, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.03805709, + "balance_loss_mlp": 1.02043438, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 1.8052213816998894, + "language_loss": 0.78427488, + "learning_rate": 9.278334794344715e-07, + "loss": 0.80551034, + "num_input_tokens_seen": 247622110, + "step": 11471, + "time_per_iteration": 2.5166096687316895 + }, + { + "auxiliary_loss_clip": 0.01084252, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.03623652, + "balance_loss_mlp": 1.01750708, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 2.3083258065226606, + "language_loss": 0.78290164, + "learning_rate": 9.275047298005232e-07, + "loss": 0.80404091, + "num_input_tokens_seen": 247641905, + "step": 11472, + "time_per_iteration": 2.5432848930358887 + }, + { + "auxiliary_loss_clip": 0.01085399, + "auxiliary_loss_mlp": 0.01029241, + "balance_loss_clip": 1.03692961, + "balance_loss_mlp": 1.01779723, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.5446035072598734, + "language_loss": 0.76133192, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78247833, + "num_input_tokens_seen": 247660945, + "step": 11473, + "time_per_iteration": 5.38184118270874 + }, + { + "auxiliary_loss_clip": 0.01067694, + "auxiliary_loss_mlp": 0.01043327, + "balance_loss_clip": 1.03552461, + "balance_loss_mlp": 1.02788973, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 2.169301010265059, + "language_loss": 0.75581002, + "learning_rate": 9.268473525524751e-07, + "loss": 0.7769202, + "num_input_tokens_seen": 247678395, + "step": 11474, + "time_per_iteration": 2.56606388092041 + }, + { + "auxiliary_loss_clip": 0.01057735, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.03975761, + "balance_loss_mlp": 1.0169661, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.5968351408096138, + "language_loss": 0.74305242, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76391613, + "num_input_tokens_seen": 247698380, + "step": 11475, + "time_per_iteration": 2.6367831230163574 + }, + { + "auxiliary_loss_clip": 0.01076173, + "auxiliary_loss_mlp": 0.01029732, + "balance_loss_clip": 1.03456497, + "balance_loss_mlp": 1.01714337, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 2.2658212986996524, + "language_loss": 0.88487005, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90592903, + "num_input_tokens_seen": 247716370, + "step": 11476, + "time_per_iteration": 2.5360498428344727 + }, + { + "auxiliary_loss_clip": 0.01106591, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.03791142, + "balance_loss_mlp": 1.02009022, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.350912154783926, + "language_loss": 0.70155728, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72294247, + "num_input_tokens_seen": 247737335, + "step": 11477, + "time_per_iteration": 3.9897637367248535 + }, + { + "auxiliary_loss_clip": 0.01101457, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.03915668, + "balance_loss_mlp": 1.02430379, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.0402804301591315, + "language_loss": 0.68282974, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70421576, + "num_input_tokens_seen": 247756680, + "step": 11478, + "time_per_iteration": 2.5227434635162354 + }, + { + "auxiliary_loss_clip": 0.01099613, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.04012632, + "balance_loss_mlp": 1.01992822, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 1.9277083020336874, + "language_loss": 0.76375258, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78506696, + "num_input_tokens_seen": 247774265, + "step": 11479, + "time_per_iteration": 2.4884185791015625 + }, + { + "auxiliary_loss_clip": 0.01098143, + "auxiliary_loss_mlp": 0.01028102, + "balance_loss_clip": 1.03714812, + "balance_loss_mlp": 1.01502442, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 1.7504323783780453, + "language_loss": 0.78582793, + "learning_rate": 9.248761978643856e-07, + "loss": 0.80709034, + "num_input_tokens_seen": 247792395, + "step": 11480, + "time_per_iteration": 2.493065357208252 + }, + { + "auxiliary_loss_clip": 0.01069474, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.03823185, + "balance_loss_mlp": 1.01706147, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 1.7087624689519594, + "language_loss": 0.75639343, + "learning_rate": 9.245478147011885e-07, + "loss": 0.7773847, + "num_input_tokens_seen": 247811985, + "step": 11481, + "time_per_iteration": 2.64522123336792 + }, + { + "auxiliary_loss_clip": 0.01071827, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.0383749, + "balance_loss_mlp": 1.0142473, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.6449690653431126, + "language_loss": 0.69518256, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71616518, + "num_input_tokens_seen": 247831880, + "step": 11482, + "time_per_iteration": 2.599090576171875 + }, + { + "auxiliary_loss_clip": 0.01107959, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.03889692, + "balance_loss_mlp": 1.01492596, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 2.2324430710717484, + "language_loss": 0.83057004, + "learning_rate": 9.238911707310096e-07, + "loss": 0.85191488, + "num_input_tokens_seen": 247851170, + "step": 11483, + "time_per_iteration": 2.499938726425171 + }, + { + "auxiliary_loss_clip": 0.01109281, + "auxiliary_loss_mlp": 0.01026648, + "balance_loss_clip": 1.03849447, + "balance_loss_mlp": 1.01506126, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 1.9324833726294723, + "language_loss": 0.65330982, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67466915, + "num_input_tokens_seen": 247868950, + "step": 11484, + "time_per_iteration": 2.510108709335327 + }, + { + "auxiliary_loss_clip": 0.01074964, + "auxiliary_loss_mlp": 0.01038129, + "balance_loss_clip": 1.03615117, + "balance_loss_mlp": 1.02524233, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.5246888137201462, + "language_loss": 0.73463404, + "learning_rate": 9.232346899854479e-07, + "loss": 0.7557649, + "num_input_tokens_seen": 247889805, + "step": 11485, + "time_per_iteration": 2.6824488639831543 + }, + { + "auxiliary_loss_clip": 0.01096005, + "auxiliary_loss_mlp": 0.00786125, + "balance_loss_clip": 1.04018223, + "balance_loss_mlp": 1.01482916, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 1.8083237321300252, + "language_loss": 0.85021484, + "learning_rate": 9.22906510853017e-07, + "loss": 0.86903608, + "num_input_tokens_seen": 247908585, + "step": 11486, + "time_per_iteration": 2.5284829139709473 + }, + { + "auxiliary_loss_clip": 0.01047712, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.03719711, + "balance_loss_mlp": 1.02093041, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.4919931089704883, + "language_loss": 0.72719777, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74800378, + "num_input_tokens_seen": 247928480, + "step": 11487, + "time_per_iteration": 2.677279233932495 + }, + { + "auxiliary_loss_clip": 0.01025064, + "auxiliary_loss_mlp": 0.01007333, + "balance_loss_clip": 1.02179337, + "balance_loss_mlp": 1.00610542, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.8997483272382313, + "language_loss": 0.66656947, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68689346, + "num_input_tokens_seen": 247988855, + "step": 11488, + "time_per_iteration": 3.141378879547119 + }, + { + "auxiliary_loss_clip": 0.01085145, + "auxiliary_loss_mlp": 0.01029974, + "balance_loss_clip": 1.03876352, + "balance_loss_mlp": 1.01665831, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 1.7555176107071115, + "language_loss": 0.74565029, + "learning_rate": 9.219222185664519e-07, + "loss": 0.76680148, + "num_input_tokens_seen": 248007685, + "step": 11489, + "time_per_iteration": 2.5758278369903564 + }, + { + "auxiliary_loss_clip": 0.0109822, + "auxiliary_loss_mlp": 0.01035625, + "balance_loss_clip": 1.03867936, + "balance_loss_mlp": 1.02264345, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 1.9774004828026746, + "language_loss": 0.62383074, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64516914, + "num_input_tokens_seen": 248025145, + "step": 11490, + "time_per_iteration": 2.491811752319336 + }, + { + "auxiliary_loss_clip": 0.01085356, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.03801596, + "balance_loss_mlp": 1.01579905, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 1.6320383516224128, + "language_loss": 0.72620809, + "learning_rate": 9.212662280920937e-07, + "loss": 0.74733597, + "num_input_tokens_seen": 248043750, + "step": 11491, + "time_per_iteration": 2.537949800491333 + }, + { + "auxiliary_loss_clip": 0.01082294, + "auxiliary_loss_mlp": 0.00784719, + "balance_loss_clip": 1.03577888, + "balance_loss_mlp": 1.01184583, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.3337280727263037, + "language_loss": 0.70180809, + "learning_rate": 9.20938294207235e-07, + "loss": 0.7204783, + "num_input_tokens_seen": 248065765, + "step": 11492, + "time_per_iteration": 2.62387752532959 + }, + { + "auxiliary_loss_clip": 0.01069976, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.03867435, + "balance_loss_mlp": 1.01834333, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.781710254864284, + "language_loss": 0.7487042, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76971066, + "num_input_tokens_seen": 248083810, + "step": 11493, + "time_per_iteration": 2.6055917739868164 + }, + { + "auxiliary_loss_clip": 0.0110836, + "auxiliary_loss_mlp": 0.01026955, + "balance_loss_clip": 1.03904271, + "balance_loss_mlp": 1.01478982, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 1.7515749065651012, + "language_loss": 0.74460083, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76595396, + "num_input_tokens_seen": 248103185, + "step": 11494, + "time_per_iteration": 2.4650652408599854 + }, + { + "auxiliary_loss_clip": 0.0108188, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.03782964, + "balance_loss_mlp": 1.01735795, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.6060273319418366, + "language_loss": 0.68092096, + "learning_rate": 9.19954738111161e-07, + "loss": 0.7020309, + "num_input_tokens_seen": 248125665, + "step": 11495, + "time_per_iteration": 2.602530002593994 + }, + { + "auxiliary_loss_clip": 0.01086733, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.03678966, + "balance_loss_mlp": 1.01601052, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 1.8540005339578358, + "language_loss": 0.74233568, + "learning_rate": 9.196269679734119e-07, + "loss": 0.76348341, + "num_input_tokens_seen": 248142545, + "step": 11496, + "time_per_iteration": 2.5139498710632324 + }, + { + "auxiliary_loss_clip": 0.01073489, + "auxiliary_loss_mlp": 0.01029773, + "balance_loss_clip": 1.03654218, + "balance_loss_mlp": 1.01822186, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 1.6634619754334028, + "language_loss": 0.80280024, + "learning_rate": 9.19299238803515e-07, + "loss": 0.82383287, + "num_input_tokens_seen": 248160225, + "step": 11497, + "time_per_iteration": 2.595149517059326 + }, + { + "auxiliary_loss_clip": 0.01070507, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.03659368, + "balance_loss_mlp": 1.02026153, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.5666459166352902, + "language_loss": 0.80494916, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82598054, + "num_input_tokens_seen": 248180430, + "step": 11498, + "time_per_iteration": 2.593575954437256 + }, + { + "auxiliary_loss_clip": 0.01094559, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.03701878, + "balance_loss_mlp": 1.01856613, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.6360571499419487, + "language_loss": 0.85776275, + "learning_rate": 9.186439034169915e-07, + "loss": 0.879017, + "num_input_tokens_seen": 248202365, + "step": 11499, + "time_per_iteration": 2.614140033721924 + }, + { + "auxiliary_loss_clip": 0.01083387, + "auxiliary_loss_mlp": 0.00786161, + "balance_loss_clip": 1.04025352, + "balance_loss_mlp": 1.01319003, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.6576742317174742, + "language_loss": 0.75895166, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77764714, + "num_input_tokens_seen": 248221750, + "step": 11500, + "time_per_iteration": 2.542088031768799 + }, + { + "auxiliary_loss_clip": 0.01050551, + "auxiliary_loss_mlp": 0.0103942, + "balance_loss_clip": 1.03364122, + "balance_loss_mlp": 1.02534711, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 1.9332684901417738, + "language_loss": 0.77126652, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79216623, + "num_input_tokens_seen": 248239535, + "step": 11501, + "time_per_iteration": 2.6261849403381348 + }, + { + "auxiliary_loss_clip": 0.01094944, + "auxiliary_loss_mlp": 0.01036617, + "balance_loss_clip": 1.03756201, + "balance_loss_mlp": 1.02380168, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.7147280278722756, + "language_loss": 0.73538828, + "learning_rate": 9.176612079067458e-07, + "loss": 0.75670385, + "num_input_tokens_seen": 248259055, + "step": 11502, + "time_per_iteration": 2.5268783569335938 + }, + { + "auxiliary_loss_clip": 0.01039332, + "auxiliary_loss_mlp": 0.01038969, + "balance_loss_clip": 1.03506398, + "balance_loss_mlp": 1.02438414, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 1.7539512755822142, + "language_loss": 0.7345463, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75532937, + "num_input_tokens_seen": 248276765, + "step": 11503, + "time_per_iteration": 2.6470236778259277 + }, + { + "auxiliary_loss_clip": 0.01093247, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.03653204, + "balance_loss_mlp": 1.02107167, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 1.6932377836640133, + "language_loss": 0.77073497, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79200852, + "num_input_tokens_seen": 248295310, + "step": 11504, + "time_per_iteration": 2.5143537521362305 + }, + { + "auxiliary_loss_clip": 0.01065735, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.03413653, + "balance_loss_mlp": 1.01576948, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 1.7519993769180897, + "language_loss": 0.73731923, + "learning_rate": 9.166788817780499e-07, + "loss": 0.75827312, + "num_input_tokens_seen": 248315230, + "step": 11505, + "time_per_iteration": 2.593613862991333 + }, + { + "auxiliary_loss_clip": 0.01048753, + "auxiliary_loss_mlp": 0.00783879, + "balance_loss_clip": 1.03420949, + "balance_loss_mlp": 1.00903058, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 1.8026245567771824, + "language_loss": 0.87746751, + "learning_rate": 9.163515218778886e-07, + "loss": 0.8957938, + "num_input_tokens_seen": 248332980, + "step": 11506, + "time_per_iteration": 4.057895183563232 + }, + { + "auxiliary_loss_clip": 0.01081745, + "auxiliary_loss_mlp": 0.01026309, + "balance_loss_clip": 1.03855765, + "balance_loss_mlp": 1.01452518, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 1.9780133252791468, + "language_loss": 0.69949859, + "learning_rate": 9.160242030697856e-07, + "loss": 0.72057915, + "num_input_tokens_seen": 248352865, + "step": 11507, + "time_per_iteration": 2.597958564758301 + }, + { + "auxiliary_loss_clip": 0.01090087, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.03736949, + "balance_loss_mlp": 1.0217973, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 1.880003189312143, + "language_loss": 0.76902592, + "learning_rate": 9.156969253661538e-07, + "loss": 0.79027092, + "num_input_tokens_seen": 248371125, + "step": 11508, + "time_per_iteration": 2.535619020462036 + }, + { + "auxiliary_loss_clip": 0.01091399, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.03742683, + "balance_loss_mlp": 1.01637757, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 1.6071026767318808, + "language_loss": 0.75224471, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77343506, + "num_input_tokens_seen": 248390455, + "step": 11509, + "time_per_iteration": 2.532285213470459 + }, + { + "auxiliary_loss_clip": 0.01058346, + "auxiliary_loss_mlp": 0.01031165, + "balance_loss_clip": 1.04072499, + "balance_loss_mlp": 1.01964355, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 1.6680142996265412, + "language_loss": 0.63952225, + "learning_rate": 9.150424933219425e-07, + "loss": 0.66041744, + "num_input_tokens_seen": 248411305, + "step": 11510, + "time_per_iteration": 2.6466453075408936 + }, + { + "auxiliary_loss_clip": 0.0107867, + "auxiliary_loss_mlp": 0.01032557, + "balance_loss_clip": 1.03913891, + "balance_loss_mlp": 1.01890159, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 2.0849651324754226, + "language_loss": 0.75363463, + "learning_rate": 9.147153390061788e-07, + "loss": 0.77474689, + "num_input_tokens_seen": 248430190, + "step": 11511, + "time_per_iteration": 2.5598580837249756 + }, + { + "auxiliary_loss_clip": 0.01076386, + "auxiliary_loss_mlp": 0.01027077, + "balance_loss_clip": 1.03788114, + "balance_loss_mlp": 1.0158236, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 1.5764586035969197, + "language_loss": 0.62677753, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64781219, + "num_input_tokens_seen": 248450830, + "step": 11512, + "time_per_iteration": 4.0212719440460205 + }, + { + "auxiliary_loss_clip": 0.01076562, + "auxiliary_loss_mlp": 0.01031216, + "balance_loss_clip": 1.03709912, + "balance_loss_mlp": 1.01877666, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 1.8368351142741068, + "language_loss": 0.83071059, + "learning_rate": 9.140611538493666e-07, + "loss": 0.8517884, + "num_input_tokens_seen": 248468585, + "step": 11513, + "time_per_iteration": 2.534562110900879 + }, + { + "auxiliary_loss_clip": 0.01052205, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.03649378, + "balance_loss_mlp": 1.0188117, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 2.2776837305804483, + "language_loss": 0.78419566, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80501878, + "num_input_tokens_seen": 248490535, + "step": 11514, + "time_per_iteration": 2.6435012817382812 + }, + { + "auxiliary_loss_clip": 0.01062341, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.03468084, + "balance_loss_mlp": 1.01869404, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 2.148732784216196, + "language_loss": 0.74760127, + "learning_rate": 9.134071334081907e-07, + "loss": 0.76853085, + "num_input_tokens_seen": 248508575, + "step": 11515, + "time_per_iteration": 2.5892364978790283 + }, + { + "auxiliary_loss_clip": 0.01061557, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.03609776, + "balance_loss_mlp": 1.01927888, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 2.154242519472449, + "language_loss": 0.53908622, + "learning_rate": 9.130801849869694e-07, + "loss": 0.56001556, + "num_input_tokens_seen": 248527025, + "step": 11516, + "time_per_iteration": 4.123652935028076 + }, + { + "auxiliary_loss_clip": 0.01092494, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.03654683, + "balance_loss_mlp": 1.02220249, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.964842651787121, + "language_loss": 0.73093152, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75220144, + "num_input_tokens_seen": 248544275, + "step": 11517, + "time_per_iteration": 2.4755313396453857 + }, + { + "auxiliary_loss_clip": 0.01109496, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.03839397, + "balance_loss_mlp": 1.02141595, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.7210671489724525, + "language_loss": 0.7633394, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78477198, + "num_input_tokens_seen": 248561870, + "step": 11518, + "time_per_iteration": 2.460892677307129 + }, + { + "auxiliary_loss_clip": 0.01102218, + "auxiliary_loss_mlp": 0.01033521, + "balance_loss_clip": 1.0394969, + "balance_loss_mlp": 1.02003217, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 1.3275083687108058, + "language_loss": 0.64251351, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66387093, + "num_input_tokens_seen": 248588190, + "step": 11519, + "time_per_iteration": 2.648158550262451 + }, + { + "auxiliary_loss_clip": 0.01076527, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.03673315, + "balance_loss_mlp": 1.01984429, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 1.732336126451384, + "language_loss": 0.62479293, + "learning_rate": 9.117728035871212e-07, + "loss": 0.64588082, + "num_input_tokens_seen": 248606460, + "step": 11520, + "time_per_iteration": 2.539802074432373 + }, + { + "auxiliary_loss_clip": 0.01072514, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.03715491, + "balance_loss_mlp": 1.0202328, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 2.089066612918736, + "language_loss": 0.78004092, + "learning_rate": 9.114460613703887e-07, + "loss": 0.80110937, + "num_input_tokens_seen": 248623715, + "step": 11521, + "time_per_iteration": 2.5447330474853516 + }, + { + "auxiliary_loss_clip": 0.0109783, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.03801024, + "balance_loss_mlp": 1.0234983, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 2.12506696948796, + "language_loss": 0.81809753, + "learning_rate": 9.111193604317304e-07, + "loss": 0.83945823, + "num_input_tokens_seen": 248640575, + "step": 11522, + "time_per_iteration": 2.4877171516418457 + }, + { + "auxiliary_loss_clip": 0.01093234, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.04046619, + "balance_loss_mlp": 1.01786947, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.3680752820117759, + "language_loss": 0.76819265, + "learning_rate": 9.107927007835361e-07, + "loss": 0.78942072, + "num_input_tokens_seen": 248663535, + "step": 11523, + "time_per_iteration": 2.5879862308502197 + }, + { + "auxiliary_loss_clip": 0.01076897, + "auxiliary_loss_mlp": 0.01033836, + "balance_loss_clip": 1.03781271, + "balance_loss_mlp": 1.02240419, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 1.6828766233132744, + "language_loss": 0.6805048, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70161211, + "num_input_tokens_seen": 248681125, + "step": 11524, + "time_per_iteration": 2.5742835998535156 + }, + { + "auxiliary_loss_clip": 0.01074975, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.03701997, + "balance_loss_mlp": 1.0172776, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.566347873841233, + "language_loss": 0.64769542, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66874218, + "num_input_tokens_seen": 248700555, + "step": 11525, + "time_per_iteration": 2.57802677154541 + }, + { + "auxiliary_loss_clip": 0.0106811, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.03706932, + "balance_loss_mlp": 1.0238539, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 2.1958125975303218, + "language_loss": 0.70275247, + "learning_rate": 9.098129697055907e-07, + "loss": 0.72379375, + "num_input_tokens_seen": 248716095, + "step": 11526, + "time_per_iteration": 2.5674993991851807 + }, + { + "auxiliary_loss_clip": 0.01083358, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.03500605, + "balance_loss_mlp": 1.01619065, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 1.503721439238539, + "language_loss": 0.76036912, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78148317, + "num_input_tokens_seen": 248735330, + "step": 11527, + "time_per_iteration": 2.51962947845459 + }, + { + "auxiliary_loss_clip": 0.01080973, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.03536916, + "balance_loss_mlp": 1.0208776, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 1.4546127968777582, + "language_loss": 0.79153526, + "learning_rate": 9.091600223329952e-07, + "loss": 0.8126772, + "num_input_tokens_seen": 248754530, + "step": 11528, + "time_per_iteration": 2.5327298641204834 + }, + { + "auxiliary_loss_clip": 0.01093706, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.03776288, + "balance_loss_mlp": 1.01863217, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.4104602640715094, + "language_loss": 0.76036894, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78161132, + "num_input_tokens_seen": 248775825, + "step": 11529, + "time_per_iteration": 2.539283275604248 + }, + { + "auxiliary_loss_clip": 0.01106062, + "auxiliary_loss_mlp": 0.00782841, + "balance_loss_clip": 1.03866982, + "balance_loss_mlp": 1.0085938, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 6.982333751388168, + "language_loss": 0.71928287, + "learning_rate": 9.085072404194436e-07, + "loss": 0.73817194, + "num_input_tokens_seen": 248796180, + "step": 11530, + "time_per_iteration": 2.5521209239959717 + }, + { + "auxiliary_loss_clip": 0.01093341, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.04072857, + "balance_loss_mlp": 1.01924264, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 1.9515254673676938, + "language_loss": 0.78686053, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80812573, + "num_input_tokens_seen": 248814735, + "step": 11531, + "time_per_iteration": 2.5200512409210205 + }, + { + "auxiliary_loss_clip": 0.01092199, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.03893888, + "balance_loss_mlp": 1.0242753, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 1.4085295446256927, + "language_loss": 0.69592738, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71719885, + "num_input_tokens_seen": 248839140, + "step": 11532, + "time_per_iteration": 2.58559513092041 + }, + { + "auxiliary_loss_clip": 0.01084519, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.03865266, + "balance_loss_mlp": 1.01858234, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 1.554673290477012, + "language_loss": 0.67017949, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69133836, + "num_input_tokens_seen": 248858300, + "step": 11533, + "time_per_iteration": 2.5110485553741455 + }, + { + "auxiliary_loss_clip": 0.01085987, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.04090405, + "balance_loss_mlp": 1.01711404, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 2.977107955291198, + "language_loss": 0.59072703, + "learning_rate": 9.072021733655007e-07, + "loss": 0.61188698, + "num_input_tokens_seen": 248876310, + "step": 11534, + "time_per_iteration": 2.52600359916687 + }, + { + "auxiliary_loss_clip": 0.01073416, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.03744996, + "balance_loss_mlp": 1.01847553, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 2.0324317405649754, + "language_loss": 0.70779502, + "learning_rate": 9.068760101685971e-07, + "loss": 0.72884357, + "num_input_tokens_seen": 248895650, + "step": 11535, + "time_per_iteration": 2.5498600006103516 + }, + { + "auxiliary_loss_clip": 0.01021595, + "auxiliary_loss_mlp": 0.01001887, + "balance_loss_clip": 1.0176003, + "balance_loss_mlp": 1.00069451, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.7241843174833122, + "language_loss": 0.59044254, + "learning_rate": 9.065498884230638e-07, + "loss": 0.6106773, + "num_input_tokens_seen": 248963920, + "step": 11536, + "time_per_iteration": 3.255309581756592 + }, + { + "auxiliary_loss_clip": 0.01096492, + "auxiliary_loss_mlp": 0.00784736, + "balance_loss_clip": 1.03987789, + "balance_loss_mlp": 1.01075459, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 1.7529541804176498, + "language_loss": 0.72382271, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74263501, + "num_input_tokens_seen": 248983380, + "step": 11537, + "time_per_iteration": 2.5206539630889893 + }, + { + "auxiliary_loss_clip": 0.01031541, + "auxiliary_loss_mlp": 0.00766933, + "balance_loss_clip": 1.01679254, + "balance_loss_mlp": 1.00851095, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7528622964845968, + "language_loss": 0.55615675, + "learning_rate": 9.058977693355767e-07, + "loss": 0.5741415, + "num_input_tokens_seen": 249044680, + "step": 11538, + "time_per_iteration": 3.119452714920044 + }, + { + "auxiliary_loss_clip": 0.01093215, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.0377512, + "balance_loss_mlp": 1.01763177, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.5262308818576538, + "language_loss": 0.77768296, + "learning_rate": 9.055717720183505e-07, + "loss": 0.79890287, + "num_input_tokens_seen": 249061060, + "step": 11539, + "time_per_iteration": 2.5113086700439453 + }, + { + "auxiliary_loss_clip": 0.01084979, + "auxiliary_loss_mlp": 0.01027042, + "balance_loss_clip": 1.03773904, + "balance_loss_mlp": 1.0156281, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 1.5878029127671371, + "language_loss": 0.63956314, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66068333, + "num_input_tokens_seen": 249081430, + "step": 11540, + "time_per_iteration": 2.5757639408111572 + }, + { + "auxiliary_loss_clip": 0.01071618, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.03737032, + "balance_loss_mlp": 1.02079487, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 1.4489586272488857, + "language_loss": 0.86628324, + "learning_rate": 9.049199018987437e-07, + "loss": 0.88732469, + "num_input_tokens_seen": 249103020, + "step": 11541, + "time_per_iteration": 2.6268303394317627 + }, + { + "auxiliary_loss_clip": 0.01109956, + "auxiliary_loss_mlp": 0.00783026, + "balance_loss_clip": 1.03911757, + "balance_loss_mlp": 1.00755906, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 1.766015316804286, + "language_loss": 0.841021, + "learning_rate": 9.04594029121081e-07, + "loss": 0.8599509, + "num_input_tokens_seen": 249120810, + "step": 11542, + "time_per_iteration": 2.4829230308532715 + }, + { + "auxiliary_loss_clip": 0.01096475, + "auxiliary_loss_mlp": 0.01030197, + "balance_loss_clip": 1.03753901, + "balance_loss_mlp": 1.01699436, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 1.8431457194930394, + "language_loss": 0.75162065, + "learning_rate": 9.04268197881323e-07, + "loss": 0.77288735, + "num_input_tokens_seen": 249138050, + "step": 11543, + "time_per_iteration": 2.5082197189331055 + }, + { + "auxiliary_loss_clip": 0.01085049, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.03581154, + "balance_loss_mlp": 1.0199337, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 1.6546455438981238, + "language_loss": 0.76035595, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78152525, + "num_input_tokens_seen": 249155570, + "step": 11544, + "time_per_iteration": 2.518556594848633 + }, + { + "auxiliary_loss_clip": 0.0105658, + "auxiliary_loss_mlp": 0.01038495, + "balance_loss_clip": 1.03617811, + "balance_loss_mlp": 1.02508414, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 1.7018896824648146, + "language_loss": 0.70961523, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73056597, + "num_input_tokens_seen": 249172960, + "step": 11545, + "time_per_iteration": 3.967756748199463 + }, + { + "auxiliary_loss_clip": 0.01093874, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.03851128, + "balance_loss_mlp": 1.01778531, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.8222998799391918, + "language_loss": 0.79489124, + "learning_rate": 9.0329095351302e-07, + "loss": 0.81611782, + "num_input_tokens_seen": 249192450, + "step": 11546, + "time_per_iteration": 2.4853761196136475 + }, + { + "auxiliary_loss_clip": 0.01073808, + "auxiliary_loss_mlp": 0.01028575, + "balance_loss_clip": 1.03572989, + "balance_loss_mlp": 1.01664805, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 1.3841970929035312, + "language_loss": 0.78597176, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80699551, + "num_input_tokens_seen": 249214320, + "step": 11547, + "time_per_iteration": 2.5980873107910156 + }, + { + "auxiliary_loss_clip": 0.01085946, + "auxiliary_loss_mlp": 0.00783726, + "balance_loss_clip": 1.04119778, + "balance_loss_mlp": 1.00895524, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 1.9956712134191052, + "language_loss": 0.80624211, + "learning_rate": 9.026396651834834e-07, + "loss": 0.82493889, + "num_input_tokens_seen": 249230925, + "step": 11548, + "time_per_iteration": 2.5110929012298584 + }, + { + "auxiliary_loss_clip": 0.01040014, + "auxiliary_loss_mlp": 0.00767949, + "balance_loss_clip": 1.01642752, + "balance_loss_mlp": 1.00862718, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.693627695318321, + "language_loss": 0.53716421, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55524385, + "num_input_tokens_seen": 249293975, + "step": 11549, + "time_per_iteration": 3.0799429416656494 + }, + { + "auxiliary_loss_clip": 0.01090059, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.03443229, + "balance_loss_mlp": 1.02002859, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.3328138422694822, + "language_loss": 0.73501015, + "learning_rate": 9.01988543302e-07, + "loss": 0.75623608, + "num_input_tokens_seen": 249315285, + "step": 11550, + "time_per_iteration": 3.981412410736084 + }, + { + "auxiliary_loss_clip": 0.0108495, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.04017138, + "balance_loss_mlp": 1.01856518, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 1.8918275700491303, + "language_loss": 0.74127704, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76243436, + "num_input_tokens_seen": 249333505, + "step": 11551, + "time_per_iteration": 2.5003764629364014 + }, + { + "auxiliary_loss_clip": 0.01110293, + "auxiliary_loss_mlp": 0.01033751, + "balance_loss_clip": 1.03992248, + "balance_loss_mlp": 1.02130008, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 1.480116436208681, + "language_loss": 0.84529865, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86673903, + "num_input_tokens_seen": 249354180, + "step": 11552, + "time_per_iteration": 2.510984182357788 + }, + { + "auxiliary_loss_clip": 0.01107875, + "auxiliary_loss_mlp": 0.01035585, + "balance_loss_clip": 1.03833151, + "balance_loss_mlp": 1.02324045, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 1.456649241300002, + "language_loss": 0.67661858, + "learning_rate": 9.010121727859117e-07, + "loss": 0.69805324, + "num_input_tokens_seen": 249377035, + "step": 11553, + "time_per_iteration": 2.56299090385437 + }, + { + "auxiliary_loss_clip": 0.01091482, + "auxiliary_loss_mlp": 0.01030902, + "balance_loss_clip": 1.03878379, + "balance_loss_mlp": 1.01799822, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 1.5602166679508476, + "language_loss": 0.79334867, + "learning_rate": 9.006867992782195e-07, + "loss": 0.81457257, + "num_input_tokens_seen": 249396155, + "step": 11554, + "time_per_iteration": 2.522529125213623 + }, + { + "auxiliary_loss_clip": 0.01099624, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.03815126, + "balance_loss_mlp": 1.01539516, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 1.8916059669058414, + "language_loss": 0.72587615, + "learning_rate": 9.003614674565934e-07, + "loss": 0.74714702, + "num_input_tokens_seen": 249414555, + "step": 11555, + "time_per_iteration": 3.9766833782196045 + }, + { + "auxiliary_loss_clip": 0.01071789, + "auxiliary_loss_mlp": 0.01028035, + "balance_loss_clip": 1.03719246, + "balance_loss_mlp": 1.01648962, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 1.6388796685923996, + "language_loss": 0.78356063, + "learning_rate": 9.000361773333705e-07, + "loss": 0.80455887, + "num_input_tokens_seen": 249433570, + "step": 11556, + "time_per_iteration": 2.601313829421997 + }, + { + "auxiliary_loss_clip": 0.01048784, + "auxiliary_loss_mlp": 0.01036342, + "balance_loss_clip": 1.03508961, + "balance_loss_mlp": 1.02427769, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 1.9986140100262613, + "language_loss": 0.60326982, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62412107, + "num_input_tokens_seen": 249453735, + "step": 11557, + "time_per_iteration": 2.6899044513702393 + }, + { + "auxiliary_loss_clip": 0.01081671, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.03883576, + "balance_loss_mlp": 1.02695775, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 1.9502714952820943, + "language_loss": 0.85429251, + "learning_rate": 8.993857222314752e-07, + "loss": 0.8755033, + "num_input_tokens_seen": 249470805, + "step": 11558, + "time_per_iteration": 2.5189859867095947 + }, + { + "auxiliary_loss_clip": 0.01099455, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.0372684, + "balance_loss_mlp": 1.01921415, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 1.6264404527466336, + "language_loss": 0.70318854, + "learning_rate": 8.990605572774664e-07, + "loss": 0.7245059, + "num_input_tokens_seen": 249491150, + "step": 11559, + "time_per_iteration": 2.5682179927825928 + }, + { + "auxiliary_loss_clip": 0.01072851, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.03717589, + "balance_loss_mlp": 1.01955581, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 1.502713768165792, + "language_loss": 0.78759617, + "learning_rate": 8.987354340711921e-07, + "loss": 0.80863613, + "num_input_tokens_seen": 249511560, + "step": 11560, + "time_per_iteration": 2.5688703060150146 + }, + { + "auxiliary_loss_clip": 0.01083954, + "auxiliary_loss_mlp": 0.0103489, + "balance_loss_clip": 1.03869629, + "balance_loss_mlp": 1.02327943, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 1.5939421772320905, + "language_loss": 0.77200353, + "learning_rate": 8.9841035262498e-07, + "loss": 0.79319191, + "num_input_tokens_seen": 249531910, + "step": 11561, + "time_per_iteration": 2.5506646633148193 + }, + { + "auxiliary_loss_clip": 0.01105125, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.03588986, + "balance_loss_mlp": 1.01931643, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 1.9766727316345976, + "language_loss": 0.78556663, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80694163, + "num_input_tokens_seen": 249550300, + "step": 11562, + "time_per_iteration": 2.434783697128296 + }, + { + "auxiliary_loss_clip": 0.01098796, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.03601718, + "balance_loss_mlp": 1.01979017, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 2.4021135688952255, + "language_loss": 0.69444847, + "learning_rate": 8.977603150620515e-07, + "loss": 0.71575904, + "num_input_tokens_seen": 249567740, + "step": 11563, + "time_per_iteration": 2.5006840229034424 + }, + { + "auxiliary_loss_clip": 0.0109366, + "auxiliary_loss_mlp": 0.01026818, + "balance_loss_clip": 1.037328, + "balance_loss_mlp": 1.01570773, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 2.1051244499605395, + "language_loss": 0.73340476, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75460958, + "num_input_tokens_seen": 249582700, + "step": 11564, + "time_per_iteration": 2.4636423587799072 + }, + { + "auxiliary_loss_clip": 0.0108898, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.04445517, + "balance_loss_mlp": 1.02017379, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 1.9197100866343377, + "language_loss": 0.71899599, + "learning_rate": 8.971104446872785e-07, + "loss": 0.74023688, + "num_input_tokens_seen": 249602920, + "step": 11565, + "time_per_iteration": 2.6331241130828857 + }, + { + "auxiliary_loss_clip": 0.01022016, + "auxiliary_loss_mlp": 0.01001067, + "balance_loss_clip": 1.0166868, + "balance_loss_mlp": 0.99992907, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9210811614800977, + "language_loss": 0.58498913, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60521996, + "num_input_tokens_seen": 249660400, + "step": 11566, + "time_per_iteration": 2.984909772872925 + }, + { + "auxiliary_loss_clip": 0.01072804, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.0366025, + "balance_loss_mlp": 1.0192641, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 1.942226661943873, + "language_loss": 0.73774338, + "learning_rate": 8.964607415992338e-07, + "loss": 0.7587887, + "num_input_tokens_seen": 249679335, + "step": 11567, + "time_per_iteration": 2.576402425765991 + }, + { + "auxiliary_loss_clip": 0.01077205, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.03492141, + "balance_loss_mlp": 1.02175689, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.2375882145837898, + "language_loss": 0.7700156, + "learning_rate": 8.961359528185313e-07, + "loss": 0.79113525, + "num_input_tokens_seen": 249701805, + "step": 11568, + "time_per_iteration": 2.571838140487671 + }, + { + "auxiliary_loss_clip": 0.01100896, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.0418129, + "balance_loss_mlp": 1.02191293, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 1.6531010744445864, + "language_loss": 0.72638625, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74773085, + "num_input_tokens_seen": 249720550, + "step": 11569, + "time_per_iteration": 2.506361961364746 + }, + { + "auxiliary_loss_clip": 0.01085143, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.03961325, + "balance_loss_mlp": 1.01775467, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 1.9059175120258178, + "language_loss": 0.77295828, + "learning_rate": 8.954865008453471e-07, + "loss": 0.79411232, + "num_input_tokens_seen": 249740325, + "step": 11570, + "time_per_iteration": 2.5600571632385254 + }, + { + "auxiliary_loss_clip": 0.01098457, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.03717446, + "balance_loss_mlp": 1.01830602, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 1.7048480305460711, + "language_loss": 0.74396443, + "learning_rate": 8.95161837677493e-07, + "loss": 0.76525694, + "num_input_tokens_seen": 249760570, + "step": 11571, + "time_per_iteration": 2.537386417388916 + }, + { + "auxiliary_loss_clip": 0.01092564, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.03711987, + "balance_loss_mlp": 1.02169919, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 1.8242420376776582, + "language_loss": 0.74985242, + "learning_rate": 8.948372164052118e-07, + "loss": 0.77111757, + "num_input_tokens_seen": 249778290, + "step": 11572, + "time_per_iteration": 2.4695069789886475 + }, + { + "auxiliary_loss_clip": 0.01084639, + "auxiliary_loss_mlp": 0.01026366, + "balance_loss_clip": 1.03598559, + "balance_loss_mlp": 1.01436734, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 1.716612678357441, + "language_loss": 0.70222354, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72333366, + "num_input_tokens_seen": 249800925, + "step": 11573, + "time_per_iteration": 2.6503615379333496 + }, + { + "auxiliary_loss_clip": 0.01090087, + "auxiliary_loss_mlp": 0.01034883, + "balance_loss_clip": 1.04286349, + "balance_loss_mlp": 1.02230668, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 1.729779742476007, + "language_loss": 0.74797636, + "learning_rate": 8.941880995966095e-07, + "loss": 0.76922613, + "num_input_tokens_seen": 249820500, + "step": 11574, + "time_per_iteration": 2.5210659503936768 + }, + { + "auxiliary_loss_clip": 0.01074309, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.03454471, + "balance_loss_mlp": 1.02037668, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 1.6066623123268264, + "language_loss": 0.74561888, + "learning_rate": 8.938636040849014e-07, + "loss": 0.76668692, + "num_input_tokens_seen": 249839845, + "step": 11575, + "time_per_iteration": 2.549417734146118 + }, + { + "auxiliary_loss_clip": 0.01099038, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.03831816, + "balance_loss_mlp": 1.01856947, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 1.8538208113472099, + "language_loss": 0.78650939, + "learning_rate": 8.935391505179966e-07, + "loss": 0.80781603, + "num_input_tokens_seen": 249857400, + "step": 11576, + "time_per_iteration": 2.48110032081604 + }, + { + "auxiliary_loss_clip": 0.01066454, + "auxiliary_loss_mlp": 0.01032171, + "balance_loss_clip": 1.036865, + "balance_loss_mlp": 1.02029741, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.1154820446153177, + "language_loss": 0.56747109, + "learning_rate": 8.932147389081985e-07, + "loss": 0.58845735, + "num_input_tokens_seen": 249871645, + "step": 11577, + "time_per_iteration": 2.546812057495117 + }, + { + "auxiliary_loss_clip": 0.01028256, + "auxiliary_loss_mlp": 0.01025362, + "balance_loss_clip": 1.03237867, + "balance_loss_mlp": 1.01451969, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.297726010261835, + "language_loss": 0.76719797, + "learning_rate": 8.928903692678081e-07, + "loss": 0.78773415, + "num_input_tokens_seen": 249894215, + "step": 11578, + "time_per_iteration": 2.885678291320801 + }, + { + "auxiliary_loss_clip": 0.01074987, + "auxiliary_loss_mlp": 0.01035105, + "balance_loss_clip": 1.03776205, + "balance_loss_mlp": 1.02268338, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 1.9584420358393861, + "language_loss": 0.79627872, + "learning_rate": 8.925660416091254e-07, + "loss": 0.81737959, + "num_input_tokens_seen": 249912850, + "step": 11579, + "time_per_iteration": 2.7108285427093506 + }, + { + "auxiliary_loss_clip": 0.01067143, + "auxiliary_loss_mlp": 0.0102794, + "balance_loss_clip": 1.03409719, + "balance_loss_mlp": 1.01533961, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 1.7330315618683318, + "language_loss": 0.73211539, + "learning_rate": 8.922417559444502e-07, + "loss": 0.75306618, + "num_input_tokens_seen": 249932650, + "step": 11580, + "time_per_iteration": 2.577279567718506 + }, + { + "auxiliary_loss_clip": 0.01090181, + "auxiliary_loss_mlp": 0.01032286, + "balance_loss_clip": 1.0374825, + "balance_loss_mlp": 1.01890469, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 2.0967796874827416, + "language_loss": 0.65886384, + "learning_rate": 8.919175122860787e-07, + "loss": 0.68008846, + "num_input_tokens_seen": 249951205, + "step": 11581, + "time_per_iteration": 2.5310490131378174 + }, + { + "auxiliary_loss_clip": 0.01110171, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.03931451, + "balance_loss_mlp": 1.01875687, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 2.0552883552307475, + "language_loss": 0.76492858, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78633636, + "num_input_tokens_seen": 249967045, + "step": 11582, + "time_per_iteration": 2.4396276473999023 + }, + { + "auxiliary_loss_clip": 0.01083505, + "auxiliary_loss_mlp": 0.01029433, + "balance_loss_clip": 1.03555989, + "balance_loss_mlp": 1.01787579, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 1.887196774370862, + "language_loss": 0.70021725, + "learning_rate": 8.91269151037425e-07, + "loss": 0.72134668, + "num_input_tokens_seen": 249984565, + "step": 11583, + "time_per_iteration": 2.5314180850982666 + }, + { + "auxiliary_loss_clip": 0.01078262, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.03958964, + "balance_loss_mlp": 1.02225995, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 1.6615103163418148, + "language_loss": 0.82128561, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84241521, + "num_input_tokens_seen": 250004235, + "step": 11584, + "time_per_iteration": 3.9758424758911133 + }, + { + "auxiliary_loss_clip": 0.01054117, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.03500772, + "balance_loss_mlp": 1.02348447, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 2.1972421121543135, + "language_loss": 0.79556167, + "learning_rate": 8.906209579615107e-07, + "loss": 0.81647068, + "num_input_tokens_seen": 250017645, + "step": 11585, + "time_per_iteration": 2.6209864616394043 + }, + { + "auxiliary_loss_clip": 0.01105688, + "auxiliary_loss_mlp": 0.01028836, + "balance_loss_clip": 1.03845549, + "balance_loss_mlp": 1.01750565, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.6149166186203592, + "language_loss": 0.77747899, + "learning_rate": 8.90296924519055e-07, + "loss": 0.79882419, + "num_input_tokens_seen": 250037640, + "step": 11586, + "time_per_iteration": 2.464665174484253 + }, + { + "auxiliary_loss_clip": 0.01092575, + "auxiliary_loss_mlp": 0.01031165, + "balance_loss_clip": 1.03671098, + "balance_loss_mlp": 1.01974487, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 1.6118720034053964, + "language_loss": 0.78739834, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80863577, + "num_input_tokens_seen": 250056490, + "step": 11587, + "time_per_iteration": 2.485600233078003 + }, + { + "auxiliary_loss_clip": 0.01084437, + "auxiliary_loss_mlp": 0.01034282, + "balance_loss_clip": 1.03889596, + "balance_loss_mlp": 1.02200902, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 1.871355851597638, + "language_loss": 0.72749972, + "learning_rate": 8.896489838865857e-07, + "loss": 0.74868691, + "num_input_tokens_seen": 250074285, + "step": 11588, + "time_per_iteration": 4.004549026489258 + }, + { + "auxiliary_loss_clip": 0.01081705, + "auxiliary_loss_mlp": 0.01026241, + "balance_loss_clip": 1.03766048, + "balance_loss_mlp": 1.01523161, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 1.7589821407936792, + "language_loss": 0.75029171, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77137113, + "num_input_tokens_seen": 250093350, + "step": 11589, + "time_per_iteration": 4.230475187301636 + }, + { + "auxiliary_loss_clip": 0.01087643, + "auxiliary_loss_mlp": 0.01033031, + "balance_loss_clip": 1.03818691, + "balance_loss_mlp": 1.02108061, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 1.9310478595577625, + "language_loss": 0.63802624, + "learning_rate": 8.890012116726012e-07, + "loss": 0.65923297, + "num_input_tokens_seen": 250114170, + "step": 11590, + "time_per_iteration": 2.6473636627197266 + }, + { + "auxiliary_loss_clip": 0.0100175, + "auxiliary_loss_mlp": 0.01006028, + "balance_loss_clip": 1.02569675, + "balance_loss_mlp": 1.00447798, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.755309384811812, + "language_loss": 0.61233872, + "learning_rate": 8.88677388753248e-07, + "loss": 0.63241649, + "num_input_tokens_seen": 250178250, + "step": 11591, + "time_per_iteration": 3.332343578338623 + }, + { + "auxiliary_loss_clip": 0.01061291, + "auxiliary_loss_mlp": 0.00784282, + "balance_loss_clip": 1.04065645, + "balance_loss_mlp": 1.01007962, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 1.4878111979042927, + "language_loss": 0.69142789, + "learning_rate": 8.883536079753582e-07, + "loss": 0.70988357, + "num_input_tokens_seen": 250198420, + "step": 11592, + "time_per_iteration": 2.696342945098877 + }, + { + "auxiliary_loss_clip": 0.01074144, + "auxiliary_loss_mlp": 0.0103007, + "balance_loss_clip": 1.03810918, + "balance_loss_mlp": 1.01860237, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 2.1586066381260367, + "language_loss": 0.61897862, + "learning_rate": 8.880298693512109e-07, + "loss": 0.64002073, + "num_input_tokens_seen": 250220650, + "step": 11593, + "time_per_iteration": 4.039926052093506 + }, + { + "auxiliary_loss_clip": 0.01082229, + "auxiliary_loss_mlp": 0.01026232, + "balance_loss_clip": 1.03727734, + "balance_loss_mlp": 1.01521659, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 1.6113703907142711, + "language_loss": 0.54399586, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56508052, + "num_input_tokens_seen": 250241750, + "step": 11594, + "time_per_iteration": 2.5867154598236084 + }, + { + "auxiliary_loss_clip": 0.01096877, + "auxiliary_loss_mlp": 0.01029105, + "balance_loss_clip": 1.03802431, + "balance_loss_mlp": 1.01773202, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 1.9689699017760949, + "language_loss": 0.76842809, + "learning_rate": 8.87382518613248e-07, + "loss": 0.78968793, + "num_input_tokens_seen": 250259445, + "step": 11595, + "time_per_iteration": 2.4944686889648438 + }, + { + "auxiliary_loss_clip": 0.01088432, + "auxiliary_loss_mlp": 0.00784121, + "balance_loss_clip": 1.04163313, + "balance_loss_mlp": 1.00978994, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 2.360032110718637, + "language_loss": 0.71422029, + "learning_rate": 8.870589065239793e-07, + "loss": 0.7329458, + "num_input_tokens_seen": 250275640, + "step": 11596, + "time_per_iteration": 2.513915538787842 + }, + { + "auxiliary_loss_clip": 0.01111856, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.04144597, + "balance_loss_mlp": 1.02019596, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.614574847703069, + "language_loss": 0.76117587, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78262019, + "num_input_tokens_seen": 250296435, + "step": 11597, + "time_per_iteration": 2.495518684387207 + }, + { + "auxiliary_loss_clip": 0.01098346, + "auxiliary_loss_mlp": 0.01035012, + "balance_loss_clip": 1.03905272, + "balance_loss_mlp": 1.02273369, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.7916977176158162, + "language_loss": 0.75067943, + "learning_rate": 8.864118089662267e-07, + "loss": 0.77201301, + "num_input_tokens_seen": 250314035, + "step": 11598, + "time_per_iteration": 2.4761149883270264 + }, + { + "auxiliary_loss_clip": 0.01092342, + "auxiliary_loss_mlp": 0.01035458, + "balance_loss_clip": 1.03887093, + "balance_loss_mlp": 1.0220468, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 2.004033220693446, + "language_loss": 0.8987689, + "learning_rate": 8.860883235222791e-07, + "loss": 0.92004699, + "num_input_tokens_seen": 250332995, + "step": 11599, + "time_per_iteration": 2.5988433361053467 + }, + { + "auxiliary_loss_clip": 0.01103858, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_clip": 1.0388124, + "balance_loss_mlp": 1.02531934, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.0039679646590995, + "language_loss": 0.69956863, + "learning_rate": 8.85764880317974e-07, + "loss": 0.72099686, + "num_input_tokens_seen": 250352120, + "step": 11600, + "time_per_iteration": 2.5185418128967285 + }, + { + "auxiliary_loss_clip": 0.01067837, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.03533983, + "balance_loss_mlp": 1.02106142, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 1.8179634263588829, + "language_loss": 0.76815355, + "learning_rate": 8.854414793655771e-07, + "loss": 0.78916514, + "num_input_tokens_seen": 250371705, + "step": 11601, + "time_per_iteration": 2.648249864578247 + }, + { + "auxiliary_loss_clip": 0.01090894, + "auxiliary_loss_mlp": 0.00781846, + "balance_loss_clip": 1.0363965, + "balance_loss_mlp": 1.00815308, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.7901274720395772, + "language_loss": 0.72351599, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74224341, + "num_input_tokens_seen": 250390485, + "step": 11602, + "time_per_iteration": 2.4818081855773926 + }, + { + "auxiliary_loss_clip": 0.01087237, + "auxiliary_loss_mlp": 0.00783236, + "balance_loss_clip": 1.0370779, + "balance_loss_mlp": 1.0086174, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 1.9797595866147253, + "language_loss": 0.76548946, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78419423, + "num_input_tokens_seen": 250407020, + "step": 11603, + "time_per_iteration": 2.5223965644836426 + }, + { + "auxiliary_loss_clip": 0.01061024, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.03567362, + "balance_loss_mlp": 1.01982713, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 1.4872085981213494, + "language_loss": 0.6234889, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64441955, + "num_input_tokens_seen": 250425880, + "step": 11604, + "time_per_iteration": 2.587207078933716 + }, + { + "auxiliary_loss_clip": 0.0110062, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.03902102, + "balance_loss_mlp": 1.01891124, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.70760858262613, + "language_loss": 0.81257117, + "learning_rate": 8.841482983203057e-07, + "loss": 0.83390039, + "num_input_tokens_seen": 250442925, + "step": 11605, + "time_per_iteration": 2.523991346359253 + }, + { + "auxiliary_loss_clip": 0.01096993, + "auxiliary_loss_mlp": 0.01034336, + "balance_loss_clip": 1.03790116, + "balance_loss_mlp": 1.02259374, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 1.827302283519282, + "language_loss": 0.703269, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72458231, + "num_input_tokens_seen": 250461220, + "step": 11606, + "time_per_iteration": 2.489455461502075 + }, + { + "auxiliary_loss_clip": 0.01088456, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.03850365, + "balance_loss_mlp": 1.01974845, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 1.9199621148293204, + "language_loss": 0.82697958, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84818423, + "num_input_tokens_seen": 250480975, + "step": 11607, + "time_per_iteration": 2.5460731983184814 + }, + { + "auxiliary_loss_clip": 0.01089748, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.04107738, + "balance_loss_mlp": 1.02070999, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 1.9845516772369873, + "language_loss": 0.7886852, + "learning_rate": 8.831788567821265e-07, + "loss": 0.80991787, + "num_input_tokens_seen": 250497980, + "step": 11608, + "time_per_iteration": 2.5225706100463867 + }, + { + "auxiliary_loss_clip": 0.01087328, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.03504729, + "balance_loss_mlp": 1.0183531, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 1.9927316823391503, + "language_loss": 0.89995676, + "learning_rate": 8.828557942863357e-07, + "loss": 0.92113459, + "num_input_tokens_seen": 250511910, + "step": 11609, + "time_per_iteration": 2.4873039722442627 + }, + { + "auxiliary_loss_clip": 0.01074088, + "auxiliary_loss_mlp": 0.01029503, + "balance_loss_clip": 1.03751016, + "balance_loss_mlp": 1.0172956, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 1.7984799746843998, + "language_loss": 0.6438781, + "learning_rate": 8.82532774152765e-07, + "loss": 0.66491401, + "num_input_tokens_seen": 250531090, + "step": 11610, + "time_per_iteration": 2.5700743198394775 + }, + { + "auxiliary_loss_clip": 0.01074583, + "auxiliary_loss_mlp": 0.01032269, + "balance_loss_clip": 1.03653002, + "balance_loss_mlp": 1.02085459, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 1.7783151681090366, + "language_loss": 0.8446793, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86574781, + "num_input_tokens_seen": 250551565, + "step": 11611, + "time_per_iteration": 2.6626999378204346 + }, + { + "auxiliary_loss_clip": 0.01099727, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.03826952, + "balance_loss_mlp": 1.02264535, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 1.97151326710403, + "language_loss": 0.708148, + "learning_rate": 8.818868610212793e-07, + "loss": 0.7294966, + "num_input_tokens_seen": 250569625, + "step": 11612, + "time_per_iteration": 2.478031635284424 + }, + { + "auxiliary_loss_clip": 0.01089094, + "auxiliary_loss_mlp": 0.01036026, + "balance_loss_clip": 1.03435469, + "balance_loss_mlp": 1.02365208, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 1.7458639871337391, + "language_loss": 0.80888253, + "learning_rate": 8.815639680478573e-07, + "loss": 0.83013374, + "num_input_tokens_seen": 250586960, + "step": 11613, + "time_per_iteration": 2.50899076461792 + }, + { + "auxiliary_loss_clip": 0.01096035, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.03709435, + "balance_loss_mlp": 1.020504, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 2.0393271133890734, + "language_loss": 0.75293213, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77421045, + "num_input_tokens_seen": 250605080, + "step": 11614, + "time_per_iteration": 2.5243375301361084 + }, + { + "auxiliary_loss_clip": 0.01043805, + "auxiliary_loss_mlp": 0.0103306, + "balance_loss_clip": 1.03833961, + "balance_loss_mlp": 1.02082944, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 2.4297123627585506, + "language_loss": 0.77038515, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79115379, + "num_input_tokens_seen": 250623965, + "step": 11615, + "time_per_iteration": 2.6455461978912354 + }, + { + "auxiliary_loss_clip": 0.01077781, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.03699136, + "balance_loss_mlp": 1.01662147, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 2.0366886690334507, + "language_loss": 0.73017669, + "learning_rate": 8.80595543643797e-07, + "loss": 0.75123811, + "num_input_tokens_seen": 250640675, + "step": 11616, + "time_per_iteration": 2.4994442462921143 + }, + { + "auxiliary_loss_clip": 0.01108978, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.04022694, + "balance_loss_mlp": 1.02301383, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 1.5659592354012795, + "language_loss": 0.84289134, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86433226, + "num_input_tokens_seen": 250660295, + "step": 11617, + "time_per_iteration": 2.5050692558288574 + }, + { + "auxiliary_loss_clip": 0.01070137, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.03556848, + "balance_loss_mlp": 1.02777004, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 2.334105077529399, + "language_loss": 0.59075326, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61186016, + "num_input_tokens_seen": 250678155, + "step": 11618, + "time_per_iteration": 2.559399127960205 + }, + { + "auxiliary_loss_clip": 0.01084384, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.03866911, + "balance_loss_mlp": 1.02270818, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 1.744411002999002, + "language_loss": 0.83011043, + "learning_rate": 8.796275012710903e-07, + "loss": 0.85129642, + "num_input_tokens_seen": 250697230, + "step": 11619, + "time_per_iteration": 2.5530714988708496 + }, + { + "auxiliary_loss_clip": 0.01091857, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.03619719, + "balance_loss_mlp": 1.02008021, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 4.131222918843272, + "language_loss": 0.67416215, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69538814, + "num_input_tokens_seen": 250719865, + "step": 11620, + "time_per_iteration": 2.6566836833953857 + }, + { + "auxiliary_loss_clip": 0.01056215, + "auxiliary_loss_mlp": 0.01032177, + "balance_loss_clip": 1.03419876, + "balance_loss_mlp": 1.01934993, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 3.386079483907327, + "language_loss": 0.72860217, + "learning_rate": 8.789823520920794e-07, + "loss": 0.74948603, + "num_input_tokens_seen": 250736565, + "step": 11621, + "time_per_iteration": 2.6057660579681396 + }, + { + "auxiliary_loss_clip": 0.010628, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.03512406, + "balance_loss_mlp": 1.02367437, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 1.635852918600558, + "language_loss": 0.68306255, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70405424, + "num_input_tokens_seen": 250757235, + "step": 11622, + "time_per_iteration": 4.02208948135376 + }, + { + "auxiliary_loss_clip": 0.01039789, + "auxiliary_loss_mlp": 0.01025178, + "balance_loss_clip": 1.03490174, + "balance_loss_mlp": 1.01380539, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 1.8856548991659343, + "language_loss": 0.6310088, + "learning_rate": 8.783373729494721e-07, + "loss": 0.65165842, + "num_input_tokens_seen": 250775585, + "step": 11623, + "time_per_iteration": 2.5960323810577393 + }, + { + "auxiliary_loss_clip": 0.01110971, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.03726506, + "balance_loss_mlp": 1.01662874, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 1.9906199048573243, + "language_loss": 0.60603845, + "learning_rate": 8.780149471723932e-07, + "loss": 0.62744045, + "num_input_tokens_seen": 250795725, + "step": 11624, + "time_per_iteration": 2.6303582191467285 + }, + { + "auxiliary_loss_clip": 0.01097084, + "auxiliary_loss_mlp": 0.01040616, + "balance_loss_clip": 1.03489172, + "balance_loss_mlp": 1.02775919, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 1.6049260550650024, + "language_loss": 0.78129572, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80267268, + "num_input_tokens_seen": 250814555, + "step": 11625, + "time_per_iteration": 2.4975905418395996 + }, + { + "auxiliary_loss_clip": 0.01071339, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.03620565, + "balance_loss_mlp": 1.02314305, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 1.8297947250522166, + "language_loss": 0.65934289, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68040156, + "num_input_tokens_seen": 250833105, + "step": 11626, + "time_per_iteration": 2.571974515914917 + }, + { + "auxiliary_loss_clip": 0.01089188, + "auxiliary_loss_mlp": 0.00784795, + "balance_loss_clip": 1.03971827, + "balance_loss_mlp": 1.01098335, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 1.6576301174758237, + "language_loss": 0.69948518, + "learning_rate": 8.770479251647697e-07, + "loss": 0.71822494, + "num_input_tokens_seen": 250852570, + "step": 11627, + "time_per_iteration": 3.964367151260376 + }, + { + "auxiliary_loss_clip": 0.01105442, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.03888559, + "balance_loss_mlp": 1.01857734, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 1.6714802433606066, + "language_loss": 0.62302589, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64437222, + "num_input_tokens_seen": 250870500, + "step": 11628, + "time_per_iteration": 3.8639156818389893 + }, + { + "auxiliary_loss_clip": 0.01098835, + "auxiliary_loss_mlp": 0.01038672, + "balance_loss_clip": 1.03770101, + "balance_loss_mlp": 1.02633405, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.075455389954821, + "language_loss": 0.68541098, + "learning_rate": 8.764034567182581e-07, + "loss": 0.70678598, + "num_input_tokens_seen": 250892745, + "step": 11629, + "time_per_iteration": 2.602429151535034 + }, + { + "auxiliary_loss_clip": 0.01106748, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.03843009, + "balance_loss_mlp": 1.01967442, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.6571688404362803, + "language_loss": 0.72301197, + "learning_rate": 8.760812863992337e-07, + "loss": 0.74440175, + "num_input_tokens_seen": 250910225, + "step": 11630, + "time_per_iteration": 2.4353671073913574 + }, + { + "auxiliary_loss_clip": 0.01107934, + "auxiliary_loss_mlp": 0.01036637, + "balance_loss_clip": 1.03946304, + "balance_loss_mlp": 1.02487659, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 1.6046770057801882, + "language_loss": 0.7403295, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76177526, + "num_input_tokens_seen": 250929715, + "step": 11631, + "time_per_iteration": 2.4698269367218018 + }, + { + "auxiliary_loss_clip": 0.01102011, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.04084122, + "balance_loss_mlp": 1.02035189, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 2.0372618385698607, + "language_loss": 0.89238816, + "learning_rate": 8.7543707363073e-07, + "loss": 0.9137429, + "num_input_tokens_seen": 250944230, + "step": 11632, + "time_per_iteration": 3.8528854846954346 + }, + { + "auxiliary_loss_clip": 0.01090129, + "auxiliary_loss_mlp": 0.01039243, + "balance_loss_clip": 1.04033828, + "balance_loss_mlp": 1.02744675, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 1.8449444165268079, + "language_loss": 0.80279648, + "learning_rate": 8.751150312056792e-07, + "loss": 0.82409018, + "num_input_tokens_seen": 250961865, + "step": 11633, + "time_per_iteration": 2.5351340770721436 + }, + { + "auxiliary_loss_clip": 0.01112415, + "auxiliary_loss_mlp": 0.01033316, + "balance_loss_clip": 1.03859079, + "balance_loss_mlp": 1.02010202, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 2.0023965752864195, + "language_loss": 0.67186904, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69332635, + "num_input_tokens_seen": 250982025, + "step": 11634, + "time_per_iteration": 2.5170748233795166 + }, + { + "auxiliary_loss_clip": 0.01017975, + "auxiliary_loss_mlp": 0.01003663, + "balance_loss_clip": 1.02216136, + "balance_loss_mlp": 1.00244069, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.6824868185738696, + "language_loss": 0.53154755, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55176395, + "num_input_tokens_seen": 251046900, + "step": 11635, + "time_per_iteration": 3.3090882301330566 + }, + { + "auxiliary_loss_clip": 0.01081746, + "auxiliary_loss_mlp": 0.01032081, + "balance_loss_clip": 1.03831732, + "balance_loss_mlp": 1.01969481, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 1.543247371669746, + "language_loss": 0.82134199, + "learning_rate": 8.741491599138726e-07, + "loss": 0.8424803, + "num_input_tokens_seen": 251065050, + "step": 11636, + "time_per_iteration": 2.525515079498291 + }, + { + "auxiliary_loss_clip": 0.01109311, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.03839111, + "balance_loss_mlp": 1.01721358, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 1.9641883971246108, + "language_loss": 0.8291384, + "learning_rate": 8.738272881850801e-07, + "loss": 0.85052204, + "num_input_tokens_seen": 251083355, + "step": 11637, + "time_per_iteration": 2.4900054931640625 + }, + { + "auxiliary_loss_clip": 0.01058112, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.03711534, + "balance_loss_mlp": 1.02257836, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 1.9820674585982874, + "language_loss": 0.67660356, + "learning_rate": 8.735054591608704e-07, + "loss": 0.69752985, + "num_input_tokens_seen": 251096420, + "step": 11638, + "time_per_iteration": 2.55896258354187 + }, + { + "auxiliary_loss_clip": 0.01103546, + "auxiliary_loss_mlp": 0.01036508, + "balance_loss_clip": 1.03947139, + "balance_loss_mlp": 1.02331161, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 1.8776661858203898, + "language_loss": 0.78070623, + "learning_rate": 8.731836728534459e-07, + "loss": 0.8021068, + "num_input_tokens_seen": 251115410, + "step": 11639, + "time_per_iteration": 2.5807676315307617 + }, + { + "auxiliary_loss_clip": 0.01084789, + "auxiliary_loss_mlp": 0.01040051, + "balance_loss_clip": 1.03858697, + "balance_loss_mlp": 1.02697408, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 2.441823037389954, + "language_loss": 0.82474792, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84599632, + "num_input_tokens_seen": 251133530, + "step": 11640, + "time_per_iteration": 2.5432631969451904 + }, + { + "auxiliary_loss_clip": 0.01074823, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.03623557, + "balance_loss_mlp": 1.02018583, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 2.118550322597399, + "language_loss": 0.7536903, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77475846, + "num_input_tokens_seen": 251153985, + "step": 11641, + "time_per_iteration": 2.618746042251587 + }, + { + "auxiliary_loss_clip": 0.01085111, + "auxiliary_loss_mlp": 0.01026572, + "balance_loss_clip": 1.03744984, + "balance_loss_mlp": 1.01359606, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 1.9298636216157266, + "language_loss": 0.77513671, + "learning_rate": 8.722185703539022e-07, + "loss": 0.79625356, + "num_input_tokens_seen": 251173225, + "step": 11642, + "time_per_iteration": 2.5321662425994873 + }, + { + "auxiliary_loss_clip": 0.01104789, + "auxiliary_loss_mlp": 0.01034561, + "balance_loss_clip": 1.03960967, + "balance_loss_mlp": 1.02089918, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 2.2469556344545776, + "language_loss": 0.74993038, + "learning_rate": 8.718969550356266e-07, + "loss": 0.77132386, + "num_input_tokens_seen": 251192485, + "step": 11643, + "time_per_iteration": 2.5436928272247314 + }, + { + "auxiliary_loss_clip": 0.0107878, + "auxiliary_loss_mlp": 0.01027858, + "balance_loss_clip": 1.04129004, + "balance_loss_mlp": 1.01531088, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 1.5583622312659202, + "language_loss": 0.60165536, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62272179, + "num_input_tokens_seen": 251214965, + "step": 11644, + "time_per_iteration": 2.6340270042419434 + }, + { + "auxiliary_loss_clip": 0.01096185, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.0373342, + "balance_loss_mlp": 1.0169189, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 1.6306492229985121, + "language_loss": 0.81637901, + "learning_rate": 8.712538527446119e-07, + "loss": 0.83762813, + "num_input_tokens_seen": 251234500, + "step": 11645, + "time_per_iteration": 2.4953136444091797 + }, + { + "auxiliary_loss_clip": 0.01097437, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.03786159, + "balance_loss_mlp": 1.0184617, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 2.384354512229564, + "language_loss": 0.68038946, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70166922, + "num_input_tokens_seen": 251254360, + "step": 11646, + "time_per_iteration": 2.583320379257202 + }, + { + "auxiliary_loss_clip": 0.01095032, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_clip": 1.03914225, + "balance_loss_mlp": 1.02916622, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 1.5193886731794732, + "language_loss": 0.71137947, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73275012, + "num_input_tokens_seen": 251274790, + "step": 11647, + "time_per_iteration": 2.6119205951690674 + }, + { + "auxiliary_loss_clip": 0.01098602, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.03871202, + "balance_loss_mlp": 1.02173173, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 2.7755211511756297, + "language_loss": 0.71841234, + "learning_rate": 8.702895203548155e-07, + "loss": 0.73974562, + "num_input_tokens_seen": 251296275, + "step": 11648, + "time_per_iteration": 2.7204642295837402 + }, + { + "auxiliary_loss_clip": 0.01054779, + "auxiliary_loss_mlp": 0.01036175, + "balance_loss_clip": 1.0332917, + "balance_loss_mlp": 1.02332377, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 1.9746677341631502, + "language_loss": 0.77012253, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79103208, + "num_input_tokens_seen": 251317375, + "step": 11649, + "time_per_iteration": 2.651092290878296 + }, + { + "auxiliary_loss_clip": 0.01085065, + "auxiliary_loss_mlp": 0.01030477, + "balance_loss_clip": 1.03721607, + "balance_loss_mlp": 1.01865149, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 1.6312078217325998, + "language_loss": 0.7837671, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80492258, + "num_input_tokens_seen": 251333570, + "step": 11650, + "time_per_iteration": 2.5105443000793457 + }, + { + "auxiliary_loss_clip": 0.01085424, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.03742421, + "balance_loss_mlp": 1.02167726, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 2.090413522621314, + "language_loss": 0.78522408, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80641365, + "num_input_tokens_seen": 251351070, + "step": 11651, + "time_per_iteration": 2.5129969120025635 + }, + { + "auxiliary_loss_clip": 0.01074062, + "auxiliary_loss_mlp": 0.01042029, + "balance_loss_clip": 1.03705049, + "balance_loss_mlp": 1.02880239, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.699463331329315, + "language_loss": 0.69598293, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71714383, + "num_input_tokens_seen": 251370005, + "step": 11652, + "time_per_iteration": 2.561394214630127 + }, + { + "auxiliary_loss_clip": 0.01099391, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.03844905, + "balance_loss_mlp": 1.01701581, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.3816660927355262, + "language_loss": 0.74408352, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76537085, + "num_input_tokens_seen": 251391210, + "step": 11653, + "time_per_iteration": 2.526205062866211 + }, + { + "auxiliary_loss_clip": 0.01089243, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.03834093, + "balance_loss_mlp": 1.01783299, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 1.8296665532719785, + "language_loss": 0.70239884, + "learning_rate": 8.68362012550003e-07, + "loss": 0.72360253, + "num_input_tokens_seen": 251411505, + "step": 11654, + "time_per_iteration": 2.527838945388794 + }, + { + "auxiliary_loss_clip": 0.01062811, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.0373013, + "balance_loss_mlp": 1.01972246, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 2.508125494193808, + "language_loss": 0.72440171, + "learning_rate": 8.680409113695453e-07, + "loss": 0.74536043, + "num_input_tokens_seen": 251428975, + "step": 11655, + "time_per_iteration": 2.5665245056152344 + }, + { + "auxiliary_loss_clip": 0.0110636, + "auxiliary_loss_mlp": 0.01039345, + "balance_loss_clip": 1.04095531, + "balance_loss_mlp": 1.02532601, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 2.177742683359031, + "language_loss": 0.70460486, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72606194, + "num_input_tokens_seen": 251446940, + "step": 11656, + "time_per_iteration": 2.507297992706299 + }, + { + "auxiliary_loss_clip": 0.01061795, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.03479767, + "balance_loss_mlp": 1.01907587, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.4807869480691012, + "language_loss": 0.77707064, + "learning_rate": 8.673988377928092e-07, + "loss": 0.79799461, + "num_input_tokens_seen": 251466205, + "step": 11657, + "time_per_iteration": 2.6491570472717285 + }, + { + "auxiliary_loss_clip": 0.01114083, + "auxiliary_loss_mlp": 0.01038219, + "balance_loss_clip": 1.04003406, + "balance_loss_mlp": 1.02424729, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 2.0914776754255757, + "language_loss": 0.77994049, + "learning_rate": 8.670778654208797e-07, + "loss": 0.80146348, + "num_input_tokens_seen": 251484820, + "step": 11658, + "time_per_iteration": 2.483335018157959 + }, + { + "auxiliary_loss_clip": 0.01080669, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.03424621, + "balance_loss_mlp": 1.01876473, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 2.11980527845079, + "language_loss": 0.83149624, + "learning_rate": 8.667569360094713e-07, + "loss": 0.85261345, + "num_input_tokens_seen": 251502670, + "step": 11659, + "time_per_iteration": 2.5163156986236572 + }, + { + "auxiliary_loss_clip": 0.01073173, + "auxiliary_loss_mlp": 0.01027319, + "balance_loss_clip": 1.03734148, + "balance_loss_mlp": 1.01537406, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 1.9684346535858375, + "language_loss": 0.69493812, + "learning_rate": 8.664360495707526e-07, + "loss": 0.7159431, + "num_input_tokens_seen": 251521630, + "step": 11660, + "time_per_iteration": 2.6095449924468994 + }, + { + "auxiliary_loss_clip": 0.0111057, + "auxiliary_loss_mlp": 0.01033319, + "balance_loss_clip": 1.03764462, + "balance_loss_mlp": 1.0200454, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 2.273328612861046, + "language_loss": 0.8073343, + "learning_rate": 8.661152061168924e-07, + "loss": 0.8287732, + "num_input_tokens_seen": 251540105, + "step": 11661, + "time_per_iteration": 3.8412692546844482 + }, + { + "auxiliary_loss_clip": 0.01095854, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.03607786, + "balance_loss_mlp": 1.02002239, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 1.5535228028985364, + "language_loss": 0.78975725, + "learning_rate": 8.657944056600579e-07, + "loss": 0.81103319, + "num_input_tokens_seen": 251560530, + "step": 11662, + "time_per_iteration": 2.55916166305542 + }, + { + "auxiliary_loss_clip": 0.0109587, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.03703821, + "balance_loss_mlp": 1.01949024, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 2.0019490148212906, + "language_loss": 0.83651692, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85781288, + "num_input_tokens_seen": 251577930, + "step": 11663, + "time_per_iteration": 2.486645460128784 + }, + { + "auxiliary_loss_clip": 0.01026993, + "auxiliary_loss_mlp": 0.00999185, + "balance_loss_clip": 1.01534092, + "balance_loss_mlp": 0.99792749, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8249261101150267, + "language_loss": 0.53831387, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55857563, + "num_input_tokens_seen": 251638820, + "step": 11664, + "time_per_iteration": 3.086350202560425 + }, + { + "auxiliary_loss_clip": 0.01088866, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.03787446, + "balance_loss_mlp": 1.01917219, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 1.908641428435087, + "language_loss": 0.78739572, + "learning_rate": 8.64832262393344e-07, + "loss": 0.80860949, + "num_input_tokens_seen": 251658070, + "step": 11665, + "time_per_iteration": 3.9786338806152344 + }, + { + "auxiliary_loss_clip": 0.01093263, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.0356977, + "balance_loss_mlp": 1.01892674, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 2.069274616445226, + "language_loss": 0.76798058, + "learning_rate": 8.645116340462404e-07, + "loss": 0.78922617, + "num_input_tokens_seen": 251671575, + "step": 11666, + "time_per_iteration": 3.823756694793701 + }, + { + "auxiliary_loss_clip": 0.01095496, + "auxiliary_loss_mlp": 0.01031778, + "balance_loss_clip": 1.03868997, + "balance_loss_mlp": 1.01955938, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 1.7658689542971027, + "language_loss": 0.80991936, + "learning_rate": 8.641910487569695e-07, + "loss": 0.83119214, + "num_input_tokens_seen": 251689350, + "step": 11667, + "time_per_iteration": 2.5267486572265625 + }, + { + "auxiliary_loss_clip": 0.01074155, + "auxiliary_loss_mlp": 0.01037938, + "balance_loss_clip": 1.03484488, + "balance_loss_mlp": 1.02529597, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 2.2202330773476597, + "language_loss": 0.65010554, + "learning_rate": 8.638705065376879e-07, + "loss": 0.6712265, + "num_input_tokens_seen": 251704635, + "step": 11668, + "time_per_iteration": 2.592855930328369 + }, + { + "auxiliary_loss_clip": 0.01089247, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.03752637, + "balance_loss_mlp": 1.01414132, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 1.8397464349538428, + "language_loss": 0.76577854, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78693962, + "num_input_tokens_seen": 251723035, + "step": 11669, + "time_per_iteration": 2.5408968925476074 + }, + { + "auxiliary_loss_clip": 0.01021863, + "auxiliary_loss_mlp": 0.01002902, + "balance_loss_clip": 1.01728368, + "balance_loss_mlp": 1.00166798, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.6949909298262044, + "language_loss": 0.54486209, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56510973, + "num_input_tokens_seen": 251791630, + "step": 11670, + "time_per_iteration": 3.255687952041626 + }, + { + "auxiliary_loss_clip": 0.01082778, + "auxiliary_loss_mlp": 0.01033905, + "balance_loss_clip": 1.03899777, + "balance_loss_mlp": 1.02170396, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.6434150961952654, + "language_loss": 0.81707668, + "learning_rate": 8.629091384213218e-07, + "loss": 0.83824348, + "num_input_tokens_seen": 251809840, + "step": 11671, + "time_per_iteration": 4.001113176345825 + }, + { + "auxiliary_loss_clip": 0.01103205, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.04144382, + "balance_loss_mlp": 1.02291822, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 1.9030743572461828, + "language_loss": 0.75307971, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77447259, + "num_input_tokens_seen": 251827550, + "step": 11672, + "time_per_iteration": 2.480471134185791 + }, + { + "auxiliary_loss_clip": 0.01091748, + "auxiliary_loss_mlp": 0.01033033, + "balance_loss_clip": 1.03535175, + "balance_loss_mlp": 1.02026534, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 1.5541167687171387, + "language_loss": 0.8667959, + "learning_rate": 8.622684419164883e-07, + "loss": 0.88804376, + "num_input_tokens_seen": 251844880, + "step": 11673, + "time_per_iteration": 2.4832005500793457 + }, + { + "auxiliary_loss_clip": 0.01092337, + "auxiliary_loss_mlp": 0.01027328, + "balance_loss_clip": 1.0356307, + "balance_loss_mlp": 1.01497769, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 1.8475501416767877, + "language_loss": 0.73217845, + "learning_rate": 8.619481583723399e-07, + "loss": 0.75337505, + "num_input_tokens_seen": 251861025, + "step": 11674, + "time_per_iteration": 2.4956910610198975 + }, + { + "auxiliary_loss_clip": 0.01099386, + "auxiliary_loss_mlp": 0.00783174, + "balance_loss_clip": 1.04171383, + "balance_loss_mlp": 1.01021194, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 1.6197329247009413, + "language_loss": 0.71774125, + "learning_rate": 8.616279179832329e-07, + "loss": 0.73656684, + "num_input_tokens_seen": 251880175, + "step": 11675, + "time_per_iteration": 2.5394179821014404 + }, + { + "auxiliary_loss_clip": 0.01072048, + "auxiliary_loss_mlp": 0.01026924, + "balance_loss_clip": 1.03800023, + "balance_loss_mlp": 1.01421034, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 7.93405941555717, + "language_loss": 0.51435649, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53534621, + "num_input_tokens_seen": 251899005, + "step": 11676, + "time_per_iteration": 2.561394453048706 + }, + { + "auxiliary_loss_clip": 0.01015733, + "auxiliary_loss_mlp": 0.00762028, + "balance_loss_clip": 1.01460743, + "balance_loss_mlp": 0.99878138, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7270107810398855, + "language_loss": 0.59218633, + "learning_rate": 8.609875667187079e-07, + "loss": 0.60996389, + "num_input_tokens_seen": 251966790, + "step": 11677, + "time_per_iteration": 3.2434308528900146 + }, + { + "auxiliary_loss_clip": 0.01095461, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.03669035, + "balance_loss_mlp": 1.01965332, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 1.9263400276737335, + "language_loss": 0.62514853, + "learning_rate": 8.606674558675737e-07, + "loss": 0.64643687, + "num_input_tokens_seen": 251989315, + "step": 11678, + "time_per_iteration": 2.590395927429199 + }, + { + "auxiliary_loss_clip": 0.01107546, + "auxiliary_loss_mlp": 0.0103387, + "balance_loss_clip": 1.03823817, + "balance_loss_mlp": 1.02162766, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 7.773691393850617, + "language_loss": 0.79587078, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81728494, + "num_input_tokens_seen": 252006620, + "step": 11679, + "time_per_iteration": 2.4781200885772705 + }, + { + "auxiliary_loss_clip": 0.01081602, + "auxiliary_loss_mlp": 0.01040635, + "balance_loss_clip": 1.03721821, + "balance_loss_mlp": 1.02743888, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.071870755534729, + "language_loss": 0.70324391, + "learning_rate": 8.600273637882567e-07, + "loss": 0.72446632, + "num_input_tokens_seen": 252024570, + "step": 11680, + "time_per_iteration": 2.516818046569824 + }, + { + "auxiliary_loss_clip": 0.01070358, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.0345521, + "balance_loss_mlp": 1.02066422, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 1.624028475717048, + "language_loss": 0.74861097, + "learning_rate": 8.597073825843446e-07, + "loss": 0.76966202, + "num_input_tokens_seen": 252042775, + "step": 11681, + "time_per_iteration": 2.5234322547912598 + }, + { + "auxiliary_loss_clip": 0.01089602, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.0375675, + "balance_loss_mlp": 1.01980436, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 1.4866793340277638, + "language_loss": 0.76763314, + "learning_rate": 8.593874446204434e-07, + "loss": 0.7888428, + "num_input_tokens_seen": 252063690, + "step": 11682, + "time_per_iteration": 2.5828285217285156 + }, + { + "auxiliary_loss_clip": 0.01076574, + "auxiliary_loss_mlp": 0.00784565, + "balance_loss_clip": 1.03744698, + "balance_loss_mlp": 1.00867653, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 1.973403372435661, + "language_loss": 0.73600554, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75461692, + "num_input_tokens_seen": 252080335, + "step": 11683, + "time_per_iteration": 2.53600811958313 + }, + { + "auxiliary_loss_clip": 0.01076331, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.04144514, + "balance_loss_mlp": 1.01821363, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 1.9916999784303735, + "language_loss": 0.71512711, + "learning_rate": 8.587476984611976e-07, + "loss": 0.73620003, + "num_input_tokens_seen": 252101075, + "step": 11684, + "time_per_iteration": 2.617619514465332 + }, + { + "auxiliary_loss_clip": 0.01100817, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.03923774, + "balance_loss_mlp": 1.02007461, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 1.8944183372659253, + "language_loss": 0.72068226, + "learning_rate": 8.584278902901128e-07, + "loss": 0.74202263, + "num_input_tokens_seen": 252120510, + "step": 11685, + "time_per_iteration": 2.562440872192383 + }, + { + "auxiliary_loss_clip": 0.01095737, + "auxiliary_loss_mlp": 0.01027164, + "balance_loss_clip": 1.03666508, + "balance_loss_mlp": 1.0153327, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 1.6239189378195842, + "language_loss": 0.84483153, + "learning_rate": 8.581081254075582e-07, + "loss": 0.8660605, + "num_input_tokens_seen": 252137590, + "step": 11686, + "time_per_iteration": 2.5414278507232666 + }, + { + "auxiliary_loss_clip": 0.01026341, + "auxiliary_loss_mlp": 0.0100339, + "balance_loss_clip": 1.01655436, + "balance_loss_mlp": 1.00209641, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.9773861699920862, + "language_loss": 0.69951403, + "learning_rate": 8.577884038256566e-07, + "loss": 0.71981132, + "num_input_tokens_seen": 252199830, + "step": 11687, + "time_per_iteration": 3.255021572113037 + }, + { + "auxiliary_loss_clip": 0.01074159, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.03629589, + "balance_loss_mlp": 1.0155766, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 2.0138503158810908, + "language_loss": 0.7737844, + "learning_rate": 8.574687255565329e-07, + "loss": 0.79481232, + "num_input_tokens_seen": 252217200, + "step": 11688, + "time_per_iteration": 2.581228017807007 + }, + { + "auxiliary_loss_clip": 0.01107949, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.03732574, + "balance_loss_mlp": 1.02083063, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.0928053511229026, + "language_loss": 0.68636906, + "learning_rate": 8.571490906123107e-07, + "loss": 0.70778036, + "num_input_tokens_seen": 252236105, + "step": 11689, + "time_per_iteration": 2.4807448387145996 + }, + { + "auxiliary_loss_clip": 0.01088128, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.03803945, + "balance_loss_mlp": 1.02376103, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 1.8761650720900458, + "language_loss": 0.79821563, + "learning_rate": 8.568294990051086e-07, + "loss": 0.81946778, + "num_input_tokens_seen": 252253315, + "step": 11690, + "time_per_iteration": 2.540001392364502 + }, + { + "auxiliary_loss_clip": 0.01108758, + "auxiliary_loss_mlp": 0.01035851, + "balance_loss_clip": 1.03877854, + "balance_loss_mlp": 1.02338195, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 1.6497035030885268, + "language_loss": 0.76026654, + "learning_rate": 8.56509950747047e-07, + "loss": 0.78171265, + "num_input_tokens_seen": 252272765, + "step": 11691, + "time_per_iteration": 2.463104486465454 + }, + { + "auxiliary_loss_clip": 0.0108517, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.03843546, + "balance_loss_mlp": 1.01619363, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 1.9719776900687946, + "language_loss": 0.81780684, + "learning_rate": 8.561904458502429e-07, + "loss": 0.83894509, + "num_input_tokens_seen": 252290510, + "step": 11692, + "time_per_iteration": 2.553443670272827 + }, + { + "auxiliary_loss_clip": 0.01083435, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.038625, + "balance_loss_mlp": 1.01615584, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 2.33299099524304, + "language_loss": 0.76412767, + "learning_rate": 8.558709843268111e-07, + "loss": 0.7852484, + "num_input_tokens_seen": 252309365, + "step": 11693, + "time_per_iteration": 2.5329227447509766 + }, + { + "auxiliary_loss_clip": 0.01082942, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.03969002, + "balance_loss_mlp": 1.02198577, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 1.451107548712621, + "language_loss": 0.68589211, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70706558, + "num_input_tokens_seen": 252333010, + "step": 11694, + "time_per_iteration": 2.671213388442993 + }, + { + "auxiliary_loss_clip": 0.01108932, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.03763795, + "balance_loss_mlp": 1.01708341, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 1.9852122729820922, + "language_loss": 0.75640815, + "learning_rate": 8.552321914485203e-07, + "loss": 0.77779412, + "num_input_tokens_seen": 252351330, + "step": 11695, + "time_per_iteration": 2.4433882236480713 + }, + { + "auxiliary_loss_clip": 0.01086093, + "auxiliary_loss_mlp": 0.01034236, + "balance_loss_clip": 1.03988075, + "balance_loss_mlp": 1.02103329, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 1.9603002272708652, + "language_loss": 0.73376334, + "learning_rate": 8.549128601178852e-07, + "loss": 0.75496668, + "num_input_tokens_seen": 252369580, + "step": 11696, + "time_per_iteration": 2.5174925327301025 + }, + { + "auxiliary_loss_clip": 0.01091487, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.0377785, + "balance_loss_mlp": 1.0180527, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.5598199007359645, + "language_loss": 0.75203633, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77326298, + "num_input_tokens_seen": 252390525, + "step": 11697, + "time_per_iteration": 2.6158759593963623 + }, + { + "auxiliary_loss_clip": 0.01066269, + "auxiliary_loss_mlp": 0.01039776, + "balance_loss_clip": 1.03967977, + "balance_loss_mlp": 1.02488685, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 1.827019582886223, + "language_loss": 0.80849636, + "learning_rate": 8.542743277341793e-07, + "loss": 0.82955682, + "num_input_tokens_seen": 252407470, + "step": 11698, + "time_per_iteration": 2.592144012451172 + }, + { + "auxiliary_loss_clip": 0.01081467, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.03769171, + "balance_loss_mlp": 1.02419066, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 1.3880417450218534, + "language_loss": 0.8449133, + "learning_rate": 8.539551267053222e-07, + "loss": 0.86611021, + "num_input_tokens_seen": 252427025, + "step": 11699, + "time_per_iteration": 3.9582715034484863 + }, + { + "auxiliary_loss_clip": 0.01094181, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.03830743, + "balance_loss_mlp": 1.01815975, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 1.9729835915935523, + "language_loss": 0.78932315, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81058282, + "num_input_tokens_seen": 252445410, + "step": 11700, + "time_per_iteration": 2.5399725437164307 + }, + { + "auxiliary_loss_clip": 0.01098447, + "auxiliary_loss_mlp": 0.0102865, + "balance_loss_clip": 1.03803396, + "balance_loss_mlp": 1.01548386, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 1.6551720302184092, + "language_loss": 0.7440753, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76534629, + "num_input_tokens_seen": 252463905, + "step": 11701, + "time_per_iteration": 2.6103675365448 + }, + { + "auxiliary_loss_clip": 0.01103597, + "auxiliary_loss_mlp": 0.01032213, + "balance_loss_clip": 1.03967512, + "balance_loss_mlp": 1.01829004, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 5.2130018392353, + "language_loss": 0.83875316, + "learning_rate": 8.529977844159769e-07, + "loss": 0.8601113, + "num_input_tokens_seen": 252478655, + "step": 11702, + "time_per_iteration": 2.452625274658203 + }, + { + "auxiliary_loss_clip": 0.01109592, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.03842723, + "balance_loss_mlp": 1.02629089, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 1.9073911807117558, + "language_loss": 0.61312211, + "learning_rate": 8.526787572922738e-07, + "loss": 0.63461173, + "num_input_tokens_seen": 252498740, + "step": 11703, + "time_per_iteration": 2.5002970695495605 + }, + { + "auxiliary_loss_clip": 0.01107788, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.03621066, + "balance_loss_mlp": 1.01782322, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 2.036441512749634, + "language_loss": 0.61324269, + "learning_rate": 8.523597736751067e-07, + "loss": 0.63462901, + "num_input_tokens_seen": 252517800, + "step": 11704, + "time_per_iteration": 3.936185836791992 + }, + { + "auxiliary_loss_clip": 0.01089586, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.03697801, + "balance_loss_mlp": 1.0186851, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 1.7626100065574815, + "language_loss": 0.70869005, + "learning_rate": 8.520408335765719e-07, + "loss": 0.72988665, + "num_input_tokens_seen": 252539620, + "step": 11705, + "time_per_iteration": 3.9462175369262695 + }, + { + "auxiliary_loss_clip": 0.01096185, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.03791797, + "balance_loss_mlp": 1.01966059, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 1.8838781764608052, + "language_loss": 0.6192103, + "learning_rate": 8.517219370087645e-07, + "loss": 0.64049065, + "num_input_tokens_seen": 252557300, + "step": 11706, + "time_per_iteration": 2.537309408187866 + }, + { + "auxiliary_loss_clip": 0.01101443, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_clip": 1.03947043, + "balance_loss_mlp": 1.01806164, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 2.0897908155901432, + "language_loss": 0.68060553, + "learning_rate": 8.514030839837756e-07, + "loss": 0.70192736, + "num_input_tokens_seen": 252576715, + "step": 11707, + "time_per_iteration": 2.495288848876953 + }, + { + "auxiliary_loss_clip": 0.01105921, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.03768575, + "balance_loss_mlp": 1.01680267, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 1.9709128433853522, + "language_loss": 0.76417005, + "learning_rate": 8.510842745136974e-07, + "loss": 0.78551579, + "num_input_tokens_seen": 252596190, + "step": 11708, + "time_per_iteration": 2.5281362533569336 + }, + { + "auxiliary_loss_clip": 0.01087273, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.03888965, + "balance_loss_mlp": 1.01783419, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 1.7007020195476077, + "language_loss": 0.72190034, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74307442, + "num_input_tokens_seen": 252613410, + "step": 11709, + "time_per_iteration": 3.92002534866333 + }, + { + "auxiliary_loss_clip": 0.01094312, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.03575802, + "balance_loss_mlp": 1.01814353, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.049738401494441, + "language_loss": 0.78564483, + "learning_rate": 8.504467862866267e-07, + "loss": 0.80688882, + "num_input_tokens_seen": 252629150, + "step": 11710, + "time_per_iteration": 2.469845771789551 + }, + { + "auxiliary_loss_clip": 0.01100555, + "auxiliary_loss_mlp": 0.01035073, + "balance_loss_clip": 1.03912258, + "balance_loss_mlp": 1.02189994, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.6663176282873677, + "language_loss": 0.77239931, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79375559, + "num_input_tokens_seen": 252648225, + "step": 11711, + "time_per_iteration": 2.5155739784240723 + }, + { + "auxiliary_loss_clip": 0.01073004, + "auxiliary_loss_mlp": 0.01033657, + "balance_loss_clip": 1.03606319, + "balance_loss_mlp": 1.02232051, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 3.5009015602312306, + "language_loss": 0.73658901, + "learning_rate": 8.498094724242457e-07, + "loss": 0.75765562, + "num_input_tokens_seen": 252665380, + "step": 11712, + "time_per_iteration": 2.5495121479034424 + }, + { + "auxiliary_loss_clip": 0.01006631, + "auxiliary_loss_mlp": 0.01003699, + "balance_loss_clip": 1.01611829, + "balance_loss_mlp": 1.00241113, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.9275569670744976, + "language_loss": 0.64700055, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66710389, + "num_input_tokens_seen": 252727950, + "step": 11713, + "time_per_iteration": 3.2103207111358643 + }, + { + "auxiliary_loss_clip": 0.01093392, + "auxiliary_loss_mlp": 0.01029812, + "balance_loss_clip": 1.03576803, + "balance_loss_mlp": 1.0179621, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 2.2262515812528036, + "language_loss": 0.72744685, + "learning_rate": 8.49172333023225e-07, + "loss": 0.74867892, + "num_input_tokens_seen": 252746770, + "step": 11714, + "time_per_iteration": 2.566638708114624 + }, + { + "auxiliary_loss_clip": 0.01081928, + "auxiliary_loss_mlp": 0.0078598, + "balance_loss_clip": 1.03789282, + "balance_loss_mlp": 1.01079726, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 1.7394690244270359, + "language_loss": 0.79933918, + "learning_rate": 8.488538287759248e-07, + "loss": 0.8180182, + "num_input_tokens_seen": 252765610, + "step": 11715, + "time_per_iteration": 2.5400609970092773 + }, + { + "auxiliary_loss_clip": 0.01079747, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.03748035, + "balance_loss_mlp": 1.02124715, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 2.4840332909708844, + "language_loss": 0.71195287, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73309267, + "num_input_tokens_seen": 252781610, + "step": 11716, + "time_per_iteration": 2.53765869140625 + }, + { + "auxiliary_loss_clip": 0.01075572, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.04001689, + "balance_loss_mlp": 1.01978314, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 1.8183806938266367, + "language_loss": 0.66098332, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68206179, + "num_input_tokens_seen": 252800600, + "step": 11717, + "time_per_iteration": 2.690755844116211 + }, + { + "auxiliary_loss_clip": 0.01108715, + "auxiliary_loss_mlp": 0.01032406, + "balance_loss_clip": 1.03860378, + "balance_loss_mlp": 1.02054453, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 1.4860496105974812, + "language_loss": 0.74098718, + "learning_rate": 8.478985779917967e-07, + "loss": 0.7623983, + "num_input_tokens_seen": 252822310, + "step": 11718, + "time_per_iteration": 2.507378339767456 + }, + { + "auxiliary_loss_clip": 0.01095746, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.03792727, + "balance_loss_mlp": 1.01970792, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 1.940278222231708, + "language_loss": 0.79780775, + "learning_rate": 8.475802484232606e-07, + "loss": 0.81907958, + "num_input_tokens_seen": 252842355, + "step": 11719, + "time_per_iteration": 2.53147292137146 + }, + { + "auxiliary_loss_clip": 0.01098667, + "auxiliary_loss_mlp": 0.01038546, + "balance_loss_clip": 1.03910041, + "balance_loss_mlp": 1.02576637, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 1.7784401798791458, + "language_loss": 0.65803373, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67940587, + "num_input_tokens_seen": 252866785, + "step": 11720, + "time_per_iteration": 2.6626977920532227 + }, + { + "auxiliary_loss_clip": 0.01090455, + "auxiliary_loss_mlp": 0.0103214, + "balance_loss_clip": 1.03945065, + "balance_loss_mlp": 1.01939011, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 2.593472882636684, + "language_loss": 0.80224824, + "learning_rate": 8.46943720397872e-07, + "loss": 0.82347417, + "num_input_tokens_seen": 252881870, + "step": 11721, + "time_per_iteration": 2.4858405590057373 + }, + { + "auxiliary_loss_clip": 0.01014677, + "auxiliary_loss_mlp": 0.01003405, + "balance_loss_clip": 1.02508867, + "balance_loss_mlp": 1.00205755, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7634690391024583, + "language_loss": 0.64797199, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66815281, + "num_input_tokens_seen": 252951300, + "step": 11722, + "time_per_iteration": 3.277519464492798 + }, + { + "auxiliary_loss_clip": 0.01089366, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.03956354, + "balance_loss_mlp": 1.01966119, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 1.6053386007587653, + "language_loss": 0.65612215, + "learning_rate": 8.463073672685211e-07, + "loss": 0.67733145, + "num_input_tokens_seen": 252971400, + "step": 11723, + "time_per_iteration": 2.54923677444458 + }, + { + "auxiliary_loss_clip": 0.01080984, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.03960061, + "balance_loss_mlp": 1.01739645, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.9030844166031275, + "language_loss": 0.80619681, + "learning_rate": 8.459892563200235e-07, + "loss": 0.82730925, + "num_input_tokens_seen": 252989475, + "step": 11724, + "time_per_iteration": 2.5637145042419434 + }, + { + "auxiliary_loss_clip": 0.01096083, + "auxiliary_loss_mlp": 0.01036562, + "balance_loss_clip": 1.03602719, + "balance_loss_mlp": 1.02429521, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 1.5905229290765486, + "language_loss": 0.73085213, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75217855, + "num_input_tokens_seen": 253007220, + "step": 11725, + "time_per_iteration": 2.51859188079834 + }, + { + "auxiliary_loss_clip": 0.01064955, + "auxiliary_loss_mlp": 0.01029019, + "balance_loss_clip": 1.03554225, + "balance_loss_mlp": 1.01511919, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 2.09625082182208, + "language_loss": 0.78043294, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80137277, + "num_input_tokens_seen": 253025410, + "step": 11726, + "time_per_iteration": 2.564790725708008 + }, + { + "auxiliary_loss_clip": 0.01087936, + "auxiliary_loss_mlp": 0.01031773, + "balance_loss_clip": 1.0375284, + "balance_loss_mlp": 1.01978111, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 1.7348314044740911, + "language_loss": 0.70631778, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72751486, + "num_input_tokens_seen": 253043305, + "step": 11727, + "time_per_iteration": 2.516083240509033 + }, + { + "auxiliary_loss_clip": 0.01100409, + "auxiliary_loss_mlp": 0.00781516, + "balance_loss_clip": 1.03520465, + "balance_loss_mlp": 1.00799775, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 1.645166540566181, + "language_loss": 0.68970931, + "learning_rate": 8.44717250248668e-07, + "loss": 0.70852852, + "num_input_tokens_seen": 253062790, + "step": 11728, + "time_per_iteration": 2.5329418182373047 + }, + { + "auxiliary_loss_clip": 0.01075751, + "auxiliary_loss_mlp": 0.0078375, + "balance_loss_clip": 1.0375483, + "balance_loss_mlp": 1.00812995, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 1.886471842304787, + "language_loss": 0.73166412, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75025916, + "num_input_tokens_seen": 253082055, + "step": 11729, + "time_per_iteration": 2.6186745166778564 + }, + { + "auxiliary_loss_clip": 0.01095193, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.03856039, + "balance_loss_mlp": 1.02088773, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.678978843799601, + "language_loss": 0.78334993, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80464733, + "num_input_tokens_seen": 253102575, + "step": 11730, + "time_per_iteration": 2.5716500282287598 + }, + { + "auxiliary_loss_clip": 0.01107593, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.03676224, + "balance_loss_mlp": 1.02141285, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 2.100382514042608, + "language_loss": 0.62752306, + "learning_rate": 8.437637056415359e-07, + "loss": 0.64893794, + "num_input_tokens_seen": 253121290, + "step": 11731, + "time_per_iteration": 2.479445695877075 + }, + { + "auxiliary_loss_clip": 0.01059838, + "auxiliary_loss_mlp": 0.01029828, + "balance_loss_clip": 1.03720498, + "balance_loss_mlp": 1.0165062, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 2.1499882554655665, + "language_loss": 0.7428602, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76375687, + "num_input_tokens_seen": 253139720, + "step": 11732, + "time_per_iteration": 2.580887794494629 + }, + { + "auxiliary_loss_clip": 0.01095905, + "auxiliary_loss_mlp": 0.01028627, + "balance_loss_clip": 1.03817809, + "balance_loss_mlp": 1.01681876, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 1.572719301647992, + "language_loss": 0.71157181, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73281717, + "num_input_tokens_seen": 253160250, + "step": 11733, + "time_per_iteration": 2.5111806392669678 + }, + { + "auxiliary_loss_clip": 0.01069874, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.03671587, + "balance_loss_mlp": 1.01994014, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 2.0239998012713856, + "language_loss": 0.73311627, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75413525, + "num_input_tokens_seen": 253178710, + "step": 11734, + "time_per_iteration": 2.566929578781128 + }, + { + "auxiliary_loss_clip": 0.01074013, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_clip": 1.03641403, + "balance_loss_mlp": 1.03048158, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.3707147072257184, + "language_loss": 0.6883027, + "learning_rate": 8.424929267125829e-07, + "loss": 0.70949113, + "num_input_tokens_seen": 253194805, + "step": 11735, + "time_per_iteration": 2.520636558532715 + }, + { + "auxiliary_loss_clip": 0.01081642, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.03612125, + "balance_loss_mlp": 1.02184129, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 2.099644841805034, + "language_loss": 0.72407782, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74525809, + "num_input_tokens_seen": 253213895, + "step": 11736, + "time_per_iteration": 2.5788166522979736 + }, + { + "auxiliary_loss_clip": 0.01084986, + "auxiliary_loss_mlp": 0.01025215, + "balance_loss_clip": 1.03614426, + "balance_loss_mlp": 1.01371121, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 1.6443687621179437, + "language_loss": 0.69059753, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71169949, + "num_input_tokens_seen": 253231620, + "step": 11737, + "time_per_iteration": 4.0410637855529785 + }, + { + "auxiliary_loss_clip": 0.01074524, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.03916621, + "balance_loss_mlp": 1.01990008, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 2.4056486582375793, + "language_loss": 0.67776322, + "learning_rate": 8.415403033479332e-07, + "loss": 0.69883358, + "num_input_tokens_seen": 253249590, + "step": 11738, + "time_per_iteration": 2.5653176307678223 + }, + { + "auxiliary_loss_clip": 0.01108116, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.03806901, + "balance_loss_mlp": 1.02125001, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 1.6822044444000621, + "language_loss": 0.74834973, + "learning_rate": 8.41222850068145e-07, + "loss": 0.76977336, + "num_input_tokens_seen": 253273870, + "step": 11739, + "time_per_iteration": 2.74135422706604 + }, + { + "auxiliary_loss_clip": 0.01079474, + "auxiliary_loss_mlp": 0.00784056, + "balance_loss_clip": 1.03554964, + "balance_loss_mlp": 1.00883055, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 2.133805715702363, + "language_loss": 0.71574008, + "learning_rate": 8.409054407293032e-07, + "loss": 0.73437542, + "num_input_tokens_seen": 253293720, + "step": 11740, + "time_per_iteration": 2.5713467597961426 + }, + { + "auxiliary_loss_clip": 0.01075772, + "auxiliary_loss_mlp": 0.01026328, + "balance_loss_clip": 1.03785539, + "balance_loss_mlp": 1.01503849, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.6867974234924925, + "language_loss": 0.81837296, + "learning_rate": 8.405880753434434e-07, + "loss": 0.83939397, + "num_input_tokens_seen": 253313700, + "step": 11741, + "time_per_iteration": 2.558472156524658 + }, + { + "auxiliary_loss_clip": 0.01083992, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.03586745, + "balance_loss_mlp": 1.01877344, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 2.6628635799343794, + "language_loss": 0.7812252, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80237496, + "num_input_tokens_seen": 253332425, + "step": 11742, + "time_per_iteration": 2.5442867279052734 + }, + { + "auxiliary_loss_clip": 0.01110491, + "auxiliary_loss_mlp": 0.01031524, + "balance_loss_clip": 1.03789842, + "balance_loss_mlp": 1.01853657, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 1.5722572089306288, + "language_loss": 0.64324683, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66466701, + "num_input_tokens_seen": 253353620, + "step": 11743, + "time_per_iteration": 3.9510343074798584 + }, + { + "auxiliary_loss_clip": 0.01081626, + "auxiliary_loss_mlp": 0.01030667, + "balance_loss_clip": 1.03437364, + "balance_loss_mlp": 1.0174284, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 23.628967158589028, + "language_loss": 0.65466368, + "learning_rate": 8.396362430240902e-07, + "loss": 0.67578661, + "num_input_tokens_seen": 253370930, + "step": 11744, + "time_per_iteration": 2.5341358184814453 + }, + { + "auxiliary_loss_clip": 0.01094277, + "auxiliary_loss_mlp": 0.01029577, + "balance_loss_clip": 1.03605366, + "balance_loss_mlp": 1.0175606, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 1.8873769020853095, + "language_loss": 0.63876891, + "learning_rate": 8.393190535704857e-07, + "loss": 0.66000742, + "num_input_tokens_seen": 253389810, + "step": 11745, + "time_per_iteration": 2.508784055709839 + }, + { + "auxiliary_loss_clip": 0.01074613, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.03648329, + "balance_loss_mlp": 1.02226973, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 1.884017002957585, + "language_loss": 0.71712196, + "learning_rate": 8.390019081300188e-07, + "loss": 0.73821664, + "num_input_tokens_seen": 253408685, + "step": 11746, + "time_per_iteration": 2.613478183746338 + }, + { + "auxiliary_loss_clip": 0.01053227, + "auxiliary_loss_mlp": 0.01030466, + "balance_loss_clip": 1.03926897, + "balance_loss_mlp": 1.01847911, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 1.7902915060111233, + "language_loss": 0.79313517, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81397212, + "num_input_tokens_seen": 253429685, + "step": 11747, + "time_per_iteration": 2.6816179752349854 + }, + { + "auxiliary_loss_clip": 0.01093136, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.03552139, + "balance_loss_mlp": 1.01861334, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 1.8581009844236147, + "language_loss": 0.65167099, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67290354, + "num_input_tokens_seen": 253448260, + "step": 11748, + "time_per_iteration": 3.88802433013916 + }, + { + "auxiliary_loss_clip": 0.01064998, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.03788066, + "balance_loss_mlp": 1.02226686, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 2.843370998842864, + "language_loss": 0.79533505, + "learning_rate": 8.380507360077003e-07, + "loss": 0.81633282, + "num_input_tokens_seen": 253467725, + "step": 11749, + "time_per_iteration": 2.59216570854187 + }, + { + "auxiliary_loss_clip": 0.01038315, + "auxiliary_loss_mlp": 0.01004633, + "balance_loss_clip": 1.0147779, + "balance_loss_mlp": 1.0034951, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.7908088262345453, + "language_loss": 0.54062223, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56105173, + "num_input_tokens_seen": 253526940, + "step": 11750, + "time_per_iteration": 3.053283452987671 + }, + { + "auxiliary_loss_clip": 0.01085283, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.03646743, + "balance_loss_mlp": 1.02056873, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 1.8886910707641753, + "language_loss": 0.78826952, + "learning_rate": 8.37416841545612e-07, + "loss": 0.80945694, + "num_input_tokens_seen": 253546160, + "step": 11751, + "time_per_iteration": 2.5615971088409424 + }, + { + "auxiliary_loss_clip": 0.01069679, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.03490472, + "balance_loss_mlp": 1.01984441, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 2.140202270237752, + "language_loss": 0.68199325, + "learning_rate": 8.370999604364634e-07, + "loss": 0.7030021, + "num_input_tokens_seen": 253565505, + "step": 11752, + "time_per_iteration": 2.6132891178131104 + }, + { + "auxiliary_loss_clip": 0.01051679, + "auxiliary_loss_mlp": 0.007845, + "balance_loss_clip": 1.03809214, + "balance_loss_mlp": 1.01100874, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 1.8909861484710473, + "language_loss": 0.76385832, + "learning_rate": 8.367831234246025e-07, + "loss": 0.78222013, + "num_input_tokens_seen": 253585125, + "step": 11753, + "time_per_iteration": 2.7270426750183105 + }, + { + "auxiliary_loss_clip": 0.01073342, + "auxiliary_loss_mlp": 0.00785004, + "balance_loss_clip": 1.0347265, + "balance_loss_mlp": 1.01207876, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.4932652205863637, + "language_loss": 0.70989978, + "learning_rate": 8.364663305220405e-07, + "loss": 0.7284832, + "num_input_tokens_seen": 253604815, + "step": 11754, + "time_per_iteration": 2.643204689025879 + }, + { + "auxiliary_loss_clip": 0.0107381, + "auxiliary_loss_mlp": 0.0103763, + "balance_loss_clip": 1.03607035, + "balance_loss_mlp": 1.02372456, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 1.7785395898334453, + "language_loss": 0.89349473, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91460913, + "num_input_tokens_seen": 253622855, + "step": 11755, + "time_per_iteration": 2.5736193656921387 + }, + { + "auxiliary_loss_clip": 0.01082476, + "auxiliary_loss_mlp": 0.00782447, + "balance_loss_clip": 1.0361644, + "balance_loss_mlp": 1.00743508, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 1.5269057761188012, + "language_loss": 0.79689479, + "learning_rate": 8.358328770928678e-07, + "loss": 0.81554401, + "num_input_tokens_seen": 253642760, + "step": 11756, + "time_per_iteration": 2.585432767868042 + }, + { + "auxiliary_loss_clip": 0.0100527, + "auxiliary_loss_mlp": 0.01004182, + "balance_loss_clip": 1.02044058, + "balance_loss_mlp": 1.00290668, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 0.8574023488957492, + "language_loss": 0.60406053, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62415504, + "num_input_tokens_seen": 253695685, + "step": 11757, + "time_per_iteration": 2.9820353984832764 + }, + { + "auxiliary_loss_clip": 0.0107363, + "auxiliary_loss_mlp": 0.01033049, + "balance_loss_clip": 1.03798866, + "balance_loss_mlp": 1.02071655, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 1.8213451879188125, + "language_loss": 0.80529422, + "learning_rate": 8.351996002450307e-07, + "loss": 0.826361, + "num_input_tokens_seen": 253713305, + "step": 11758, + "time_per_iteration": 2.5374510288238525 + }, + { + "auxiliary_loss_clip": 0.01072002, + "auxiliary_loss_mlp": 0.00785076, + "balance_loss_clip": 1.03570902, + "balance_loss_mlp": 1.011199, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 1.730482523339282, + "language_loss": 0.77776068, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79633147, + "num_input_tokens_seen": 253736100, + "step": 11759, + "time_per_iteration": 2.7453253269195557 + }, + { + "auxiliary_loss_clip": 0.01096337, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.03594518, + "balance_loss_mlp": 1.01679528, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 2.681303411552605, + "language_loss": 0.68027902, + "learning_rate": 8.34566500074583e-07, + "loss": 0.70153326, + "num_input_tokens_seen": 253757350, + "step": 11760, + "time_per_iteration": 2.540365219116211 + }, + { + "auxiliary_loss_clip": 0.01079297, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.04006934, + "balance_loss_mlp": 1.02058494, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 1.8235027358405613, + "language_loss": 0.80528826, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82640457, + "num_input_tokens_seen": 253772855, + "step": 11761, + "time_per_iteration": 2.5380539894104004 + }, + { + "auxiliary_loss_clip": 0.01082145, + "auxiliary_loss_mlp": 0.01038786, + "balance_loss_clip": 1.03504276, + "balance_loss_mlp": 1.02412307, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.3139277819693453, + "language_loss": 0.75076562, + "learning_rate": 8.33933576677553e-07, + "loss": 0.77197492, + "num_input_tokens_seen": 253790360, + "step": 11762, + "time_per_iteration": 2.5037145614624023 + }, + { + "auxiliary_loss_clip": 0.01083745, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.03664887, + "balance_loss_mlp": 1.02190483, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 1.6690803003598065, + "language_loss": 0.76951879, + "learning_rate": 8.336171812990724e-07, + "loss": 0.79068917, + "num_input_tokens_seen": 253810585, + "step": 11763, + "time_per_iteration": 2.5493457317352295 + }, + { + "auxiliary_loss_clip": 0.01075866, + "auxiliary_loss_mlp": 0.00785916, + "balance_loss_clip": 1.03565979, + "balance_loss_mlp": 1.01069307, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 2.2169997222184334, + "language_loss": 0.78759825, + "learning_rate": 8.333008301499453e-07, + "loss": 0.80621612, + "num_input_tokens_seen": 253829080, + "step": 11764, + "time_per_iteration": 2.592029094696045 + }, + { + "auxiliary_loss_clip": 0.01061989, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.03582239, + "balance_loss_mlp": 1.01939774, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.5838489283731232, + "language_loss": 0.79637909, + "learning_rate": 8.32984523242167e-07, + "loss": 0.81732488, + "num_input_tokens_seen": 253846780, + "step": 11765, + "time_per_iteration": 2.652137279510498 + }, + { + "auxiliary_loss_clip": 0.01104192, + "auxiliary_loss_mlp": 0.01024895, + "balance_loss_clip": 1.03707469, + "balance_loss_mlp": 1.01370692, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 1.6262590107812513, + "language_loss": 0.68390685, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70519769, + "num_input_tokens_seen": 253867075, + "step": 11766, + "time_per_iteration": 2.5330007076263428 + }, + { + "auxiliary_loss_clip": 0.01085234, + "auxiliary_loss_mlp": 0.01037371, + "balance_loss_clip": 1.03614569, + "balance_loss_mlp": 1.02407885, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 1.8637518776668605, + "language_loss": 0.64006382, + "learning_rate": 8.323520421986352e-07, + "loss": 0.66128993, + "num_input_tokens_seen": 253885790, + "step": 11767, + "time_per_iteration": 2.5549161434173584 + }, + { + "auxiliary_loss_clip": 0.01094544, + "auxiliary_loss_mlp": 0.01024941, + "balance_loss_clip": 1.03521633, + "balance_loss_mlp": 1.01260853, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 1.9748919106062652, + "language_loss": 0.52758294, + "learning_rate": 8.320358680868646e-07, + "loss": 0.54877782, + "num_input_tokens_seen": 253907070, + "step": 11768, + "time_per_iteration": 2.557574987411499 + }, + { + "auxiliary_loss_clip": 0.01083107, + "auxiliary_loss_mlp": 0.00783384, + "balance_loss_clip": 1.03648615, + "balance_loss_mlp": 1.00769377, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 1.8294972946890684, + "language_loss": 0.75557309, + "learning_rate": 8.317197382644119e-07, + "loss": 0.77423799, + "num_input_tokens_seen": 253927290, + "step": 11769, + "time_per_iteration": 2.557758092880249 + }, + { + "auxiliary_loss_clip": 0.01021914, + "auxiliary_loss_mlp": 0.01007352, + "balance_loss_clip": 1.01784515, + "balance_loss_mlp": 1.00601125, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8495030966364372, + "language_loss": 0.62002724, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64031994, + "num_input_tokens_seen": 253983440, + "step": 11770, + "time_per_iteration": 3.073887348175049 + }, + { + "auxiliary_loss_clip": 0.01073644, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.03441691, + "balance_loss_mlp": 1.0205977, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 1.7094072526073503, + "language_loss": 0.76054037, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78160906, + "num_input_tokens_seen": 254003825, + "step": 11771, + "time_per_iteration": 2.5860376358032227 + }, + { + "auxiliary_loss_clip": 0.0109128, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.03465319, + "balance_loss_mlp": 1.01721907, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.6088683127666719, + "language_loss": 0.71301168, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73420882, + "num_input_tokens_seen": 254023345, + "step": 11772, + "time_per_iteration": 2.496386766433716 + }, + { + "auxiliary_loss_clip": 0.01066219, + "auxiliary_loss_mlp": 0.01027038, + "balance_loss_clip": 1.03573573, + "balance_loss_mlp": 1.01420522, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 2.1593412016459355, + "language_loss": 0.6966899, + "learning_rate": 8.30455662107496e-07, + "loss": 0.71762252, + "num_input_tokens_seen": 254041815, + "step": 11773, + "time_per_iteration": 2.5877084732055664 + }, + { + "auxiliary_loss_clip": 0.01096354, + "auxiliary_loss_mlp": 0.01034478, + "balance_loss_clip": 1.03592932, + "balance_loss_mlp": 1.02252698, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 1.4667405218579366, + "language_loss": 0.70309919, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72440749, + "num_input_tokens_seen": 254062065, + "step": 11774, + "time_per_iteration": 2.528050661087036 + }, + { + "auxiliary_loss_clip": 0.01082864, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.03866959, + "balance_loss_mlp": 1.01574612, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.4512599272490367, + "language_loss": 0.74325287, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76435733, + "num_input_tokens_seen": 254080605, + "step": 11775, + "time_per_iteration": 2.541327953338623 + }, + { + "auxiliary_loss_clip": 0.01077382, + "auxiliary_loss_mlp": 0.00783649, + "balance_loss_clip": 1.03799796, + "balance_loss_mlp": 1.00809169, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 1.6262815112444449, + "language_loss": 0.8675217, + "learning_rate": 8.295080706148665e-07, + "loss": 0.88613206, + "num_input_tokens_seen": 254098710, + "step": 11776, + "time_per_iteration": 3.9424960613250732 + }, + { + "auxiliary_loss_clip": 0.01087989, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.034235, + "balance_loss_mlp": 1.02172065, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.4666147076005813, + "language_loss": 0.74810511, + "learning_rate": 8.291922955383641e-07, + "loss": 0.76931828, + "num_input_tokens_seen": 254117200, + "step": 11777, + "time_per_iteration": 2.4756884574890137 + }, + { + "auxiliary_loss_clip": 0.01090502, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.04117811, + "balance_loss_mlp": 1.02126408, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 2.5244873900628697, + "language_loss": 0.81826872, + "learning_rate": 8.288765648590066e-07, + "loss": 0.83951414, + "num_input_tokens_seen": 254132115, + "step": 11778, + "time_per_iteration": 2.5068929195404053 + }, + { + "auxiliary_loss_clip": 0.01078624, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.03473139, + "balance_loss_mlp": 1.02104592, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.5017814693634735, + "language_loss": 0.84945858, + "learning_rate": 8.285608785887673e-07, + "loss": 0.87056464, + "num_input_tokens_seen": 254152285, + "step": 11779, + "time_per_iteration": 2.535895824432373 + }, + { + "auxiliary_loss_clip": 0.01082955, + "auxiliary_loss_mlp": 0.01033405, + "balance_loss_clip": 1.03695011, + "balance_loss_mlp": 1.02127576, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 2.2331249118925895, + "language_loss": 0.71925896, + "learning_rate": 8.28245236739618e-07, + "loss": 0.74042261, + "num_input_tokens_seen": 254172805, + "step": 11780, + "time_per_iteration": 2.6861252784729004 + }, + { + "auxiliary_loss_clip": 0.01064549, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.0365622, + "balance_loss_mlp": 1.01675165, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 1.4513691112671563, + "language_loss": 0.73076469, + "learning_rate": 8.279296393235256e-07, + "loss": 0.75169325, + "num_input_tokens_seen": 254191890, + "step": 11781, + "time_per_iteration": 2.581162929534912 + }, + { + "auxiliary_loss_clip": 0.01093519, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.03640628, + "balance_loss_mlp": 1.01784563, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 1.5744545022046663, + "language_loss": 0.77138209, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79261178, + "num_input_tokens_seen": 254210150, + "step": 11782, + "time_per_iteration": 5.248617649078369 + }, + { + "auxiliary_loss_clip": 0.01080739, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.03518724, + "balance_loss_mlp": 1.01834559, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 1.4298268694176417, + "language_loss": 0.69667482, + "learning_rate": 8.272985778383828e-07, + "loss": 0.71777511, + "num_input_tokens_seen": 254233015, + "step": 11783, + "time_per_iteration": 2.5956382751464844 + }, + { + "auxiliary_loss_clip": 0.01069598, + "auxiliary_loss_mlp": 0.01028866, + "balance_loss_clip": 1.03777146, + "balance_loss_mlp": 1.01669478, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 1.5843600085134995, + "language_loss": 0.79068208, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81166667, + "num_input_tokens_seen": 254251345, + "step": 11784, + "time_per_iteration": 2.5822486877441406 + }, + { + "auxiliary_loss_clip": 0.01105168, + "auxiliary_loss_mlp": 0.01029712, + "balance_loss_clip": 1.03690481, + "balance_loss_mlp": 1.01790476, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 1.7391549636840924, + "language_loss": 0.77021039, + "learning_rate": 8.266676942290609e-07, + "loss": 0.79155916, + "num_input_tokens_seen": 254269905, + "step": 11785, + "time_per_iteration": 2.5559744834899902 + }, + { + "auxiliary_loss_clip": 0.01080647, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.03644383, + "balance_loss_mlp": 1.01901209, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 1.5380593344271858, + "language_loss": 0.78051555, + "learning_rate": 8.26352319157738e-07, + "loss": 0.80163634, + "num_input_tokens_seen": 254289990, + "step": 11786, + "time_per_iteration": 2.570871591567993 + }, + { + "auxiliary_loss_clip": 0.01106305, + "auxiliary_loss_mlp": 0.01028955, + "balance_loss_clip": 1.03630805, + "balance_loss_mlp": 1.01662898, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 2.9181617068764836, + "language_loss": 0.78732586, + "learning_rate": 8.260369885912526e-07, + "loss": 0.80867839, + "num_input_tokens_seen": 254309085, + "step": 11787, + "time_per_iteration": 3.886140823364258 + }, + { + "auxiliary_loss_clip": 0.01096342, + "auxiliary_loss_mlp": 0.01027359, + "balance_loss_clip": 1.03754616, + "balance_loss_mlp": 1.01572466, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 1.8951355388829545, + "language_loss": 0.76805693, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78929394, + "num_input_tokens_seen": 254327045, + "step": 11788, + "time_per_iteration": 2.5009024143218994 + }, + { + "auxiliary_loss_clip": 0.01070801, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.03607392, + "balance_loss_mlp": 1.02303433, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 2.42268286824252, + "language_loss": 0.68139791, + "learning_rate": 8.254064610206212e-07, + "loss": 0.70249116, + "num_input_tokens_seen": 254344585, + "step": 11789, + "time_per_iteration": 2.5427772998809814 + }, + { + "auxiliary_loss_clip": 0.01051101, + "auxiliary_loss_mlp": 0.01028895, + "balance_loss_clip": 1.03739977, + "balance_loss_mlp": 1.01679528, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 1.5526868780707694, + "language_loss": 0.77620441, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79700434, + "num_input_tokens_seen": 254362470, + "step": 11790, + "time_per_iteration": 2.6356847286224365 + }, + { + "auxiliary_loss_clip": 0.01087626, + "auxiliary_loss_mlp": 0.01027092, + "balance_loss_clip": 1.03649497, + "balance_loss_mlp": 1.01427078, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 1.6718298735321997, + "language_loss": 0.70836681, + "learning_rate": 8.247761116128085e-07, + "loss": 0.729514, + "num_input_tokens_seen": 254383190, + "step": 11791, + "time_per_iteration": 2.5816049575805664 + }, + { + "auxiliary_loss_clip": 0.01096124, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.03756809, + "balance_loss_mlp": 1.02128756, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 1.8999428669554808, + "language_loss": 0.81973851, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84103489, + "num_input_tokens_seen": 254403115, + "step": 11792, + "time_per_iteration": 2.5651743412017822 + }, + { + "auxiliary_loss_clip": 0.01073021, + "auxiliary_loss_mlp": 0.01028474, + "balance_loss_clip": 1.03711462, + "balance_loss_mlp": 1.01649308, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 2.5508410370732313, + "language_loss": 0.6502139, + "learning_rate": 8.241459404634232e-07, + "loss": 0.67122889, + "num_input_tokens_seen": 254421875, + "step": 11793, + "time_per_iteration": 2.593559741973877 + }, + { + "auxiliary_loss_clip": 0.01090906, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.03546977, + "balance_loss_mlp": 1.01872087, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 2.497946969333162, + "language_loss": 0.70430225, + "learning_rate": 8.238309217655133e-07, + "loss": 0.72552824, + "num_input_tokens_seen": 254440765, + "step": 11794, + "time_per_iteration": 2.5262818336486816 + }, + { + "auxiliary_loss_clip": 0.01088519, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.04117095, + "balance_loss_mlp": 1.01780736, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 1.869664078333111, + "language_loss": 0.75351489, + "learning_rate": 8.23515947668052e-07, + "loss": 0.77468848, + "num_input_tokens_seen": 254459480, + "step": 11795, + "time_per_iteration": 2.5282771587371826 + }, + { + "auxiliary_loss_clip": 0.01070049, + "auxiliary_loss_mlp": 0.01030747, + "balance_loss_clip": 1.03758407, + "balance_loss_mlp": 1.01929724, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 2.709712184618782, + "language_loss": 0.75185621, + "learning_rate": 8.232010181829838e-07, + "loss": 0.77286416, + "num_input_tokens_seen": 254473985, + "step": 11796, + "time_per_iteration": 2.5336527824401855 + }, + { + "auxiliary_loss_clip": 0.0109888, + "auxiliary_loss_mlp": 0.01040536, + "balance_loss_clip": 1.03872907, + "balance_loss_mlp": 1.02514577, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 1.5567548151617336, + "language_loss": 0.74274707, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76414126, + "num_input_tokens_seen": 254492135, + "step": 11797, + "time_per_iteration": 2.500070333480835 + }, + { + "auxiliary_loss_clip": 0.0106308, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.03985226, + "balance_loss_mlp": 1.01958323, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.595048094160187, + "language_loss": 0.79609418, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81704175, + "num_input_tokens_seen": 254512865, + "step": 11798, + "time_per_iteration": 2.6384851932525635 + }, + { + "auxiliary_loss_clip": 0.01076029, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.0350647, + "balance_loss_mlp": 1.02492833, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 1.7185255966250672, + "language_loss": 0.66528994, + "learning_rate": 8.222564975215529e-07, + "loss": 0.68642765, + "num_input_tokens_seen": 254532605, + "step": 11799, + "time_per_iteration": 2.5415475368499756 + }, + { + "auxiliary_loss_clip": 0.0110682, + "auxiliary_loss_mlp": 0.01027262, + "balance_loss_clip": 1.03691745, + "balance_loss_mlp": 1.01465523, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 1.6727735975361675, + "language_loss": 0.8149184, + "learning_rate": 8.219417466054622e-07, + "loss": 0.83625925, + "num_input_tokens_seen": 254553780, + "step": 11800, + "time_per_iteration": 2.5287108421325684 + }, + { + "auxiliary_loss_clip": 0.0108172, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.03490257, + "balance_loss_mlp": 1.01817727, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 1.779697027967369, + "language_loss": 0.86577427, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88688368, + "num_input_tokens_seen": 254567510, + "step": 11801, + "time_per_iteration": 2.480809450149536 + }, + { + "auxiliary_loss_clip": 0.01106804, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.0369041, + "balance_loss_mlp": 1.0220803, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 1.874459133629234, + "language_loss": 0.75855446, + "learning_rate": 8.213123788014758e-07, + "loss": 0.7799598, + "num_input_tokens_seen": 254585565, + "step": 11802, + "time_per_iteration": 2.46742582321167 + }, + { + "auxiliary_loss_clip": 0.01093842, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_clip": 1.03737068, + "balance_loss_mlp": 1.03023946, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 1.983154845812974, + "language_loss": 0.81480432, + "learning_rate": 8.209977619374462e-07, + "loss": 0.83616769, + "num_input_tokens_seen": 254603465, + "step": 11803, + "time_per_iteration": 2.4947409629821777 + }, + { + "auxiliary_loss_clip": 0.01107663, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.03627753, + "balance_loss_mlp": 1.01635647, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 2.4515657432602866, + "language_loss": 0.67676228, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69813347, + "num_input_tokens_seen": 254620500, + "step": 11804, + "time_per_iteration": 2.4461069107055664 + }, + { + "auxiliary_loss_clip": 0.01093427, + "auxiliary_loss_mlp": 0.01026395, + "balance_loss_clip": 1.03755891, + "balance_loss_mlp": 1.01544011, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 1.897949680464105, + "language_loss": 0.77906889, + "learning_rate": 8.203686623449637e-07, + "loss": 0.8002671, + "num_input_tokens_seen": 254638565, + "step": 11805, + "time_per_iteration": 2.5589823722839355 + }, + { + "auxiliary_loss_clip": 0.01084555, + "auxiliary_loss_mlp": 0.00785677, + "balance_loss_clip": 1.03506815, + "balance_loss_mlp": 1.01206911, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 2.4471816686872505, + "language_loss": 0.78567314, + "learning_rate": 8.200541796403667e-07, + "loss": 0.80437547, + "num_input_tokens_seen": 254657505, + "step": 11806, + "time_per_iteration": 2.5408620834350586 + }, + { + "auxiliary_loss_clip": 0.01076941, + "auxiliary_loss_mlp": 0.01034891, + "balance_loss_clip": 1.03621745, + "balance_loss_mlp": 1.02205253, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 2.177272877487699, + "language_loss": 0.56475133, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58586967, + "num_input_tokens_seen": 254674730, + "step": 11807, + "time_per_iteration": 2.522702932357788 + }, + { + "auxiliary_loss_clip": 0.01109393, + "auxiliary_loss_mlp": 0.01035938, + "balance_loss_clip": 1.03614485, + "balance_loss_mlp": 1.02402294, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 2.5389195494465717, + "language_loss": 0.68648064, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70793396, + "num_input_tokens_seen": 254691665, + "step": 11808, + "time_per_iteration": 2.468324661254883 + }, + { + "auxiliary_loss_clip": 0.01093992, + "auxiliary_loss_mlp": 0.01029173, + "balance_loss_clip": 1.03763556, + "balance_loss_mlp": 1.01772344, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 2.6876997439282464, + "language_loss": 0.71508497, + "learning_rate": 8.191110000362513e-07, + "loss": 0.73631662, + "num_input_tokens_seen": 254711610, + "step": 11809, + "time_per_iteration": 2.4962143898010254 + }, + { + "auxiliary_loss_clip": 0.01036507, + "auxiliary_loss_mlp": 0.01001069, + "balance_loss_clip": 1.01284981, + "balance_loss_mlp": 0.99986535, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.7483510495460608, + "language_loss": 0.5945394, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61491513, + "num_input_tokens_seen": 254772615, + "step": 11810, + "time_per_iteration": 3.169534683227539 + }, + { + "auxiliary_loss_clip": 0.01039605, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.0354991, + "balance_loss_mlp": 1.02996242, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 1.5628443904461748, + "language_loss": 0.74144542, + "learning_rate": 8.18482437510784e-07, + "loss": 0.76227903, + "num_input_tokens_seen": 254791375, + "step": 11811, + "time_per_iteration": 2.6607277393341064 + }, + { + "auxiliary_loss_clip": 0.01073759, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.03779316, + "balance_loss_mlp": 1.01615262, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 1.8342083935579412, + "language_loss": 0.83482438, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85584188, + "num_input_tokens_seen": 254809300, + "step": 11812, + "time_per_iteration": 2.5695459842681885 + }, + { + "auxiliary_loss_clip": 0.01109164, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.03775239, + "balance_loss_mlp": 1.01718855, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 2.045256692905346, + "language_loss": 0.7014035, + "learning_rate": 8.178540541983716e-07, + "loss": 0.72279805, + "num_input_tokens_seen": 254829325, + "step": 11813, + "time_per_iteration": 2.5160768032073975 + }, + { + "auxiliary_loss_clip": 0.01103267, + "auxiliary_loss_mlp": 0.01024456, + "balance_loss_clip": 1.03558064, + "balance_loss_mlp": 1.01321435, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 1.7146182990175622, + "language_loss": 0.81607974, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83735693, + "num_input_tokens_seen": 254847690, + "step": 11814, + "time_per_iteration": 2.4950411319732666 + }, + { + "auxiliary_loss_clip": 0.01104892, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.03693318, + "balance_loss_mlp": 1.0165168, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 1.7360616123628676, + "language_loss": 0.76018602, + "learning_rate": 8.172258501943301e-07, + "loss": 0.78152001, + "num_input_tokens_seen": 254865960, + "step": 11815, + "time_per_iteration": 3.8388211727142334 + }, + { + "auxiliary_loss_clip": 0.01064508, + "auxiliary_loss_mlp": 0.01033115, + "balance_loss_clip": 1.03775096, + "balance_loss_mlp": 1.02106917, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 1.8152900184516274, + "language_loss": 0.78410369, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80507994, + "num_input_tokens_seen": 254882815, + "step": 11816, + "time_per_iteration": 2.5539920330047607 + }, + { + "auxiliary_loss_clip": 0.01080519, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.03673339, + "balance_loss_mlp": 1.01962662, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 1.774224886354809, + "language_loss": 0.86314046, + "learning_rate": 8.165978255939426e-07, + "loss": 0.8842631, + "num_input_tokens_seen": 254898705, + "step": 11817, + "time_per_iteration": 2.5200963020324707 + }, + { + "auxiliary_loss_clip": 0.01057021, + "auxiliary_loss_mlp": 0.01027428, + "balance_loss_clip": 1.03649855, + "balance_loss_mlp": 1.01615119, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 3.029342511072785, + "language_loss": 0.84540069, + "learning_rate": 8.162838805998897e-07, + "loss": 0.86624521, + "num_input_tokens_seen": 254913665, + "step": 11818, + "time_per_iteration": 2.545146942138672 + }, + { + "auxiliary_loss_clip": 0.01105989, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.03595114, + "balance_loss_mlp": 1.01715291, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 2.0980833699804404, + "language_loss": 0.75530052, + "learning_rate": 8.159699804924709e-07, + "loss": 0.77665842, + "num_input_tokens_seen": 254932140, + "step": 11819, + "time_per_iteration": 2.464923143386841 + }, + { + "auxiliary_loss_clip": 0.01063071, + "auxiliary_loss_mlp": 0.01031666, + "balance_loss_clip": 1.03433299, + "balance_loss_mlp": 1.01755786, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.6720868736826446, + "language_loss": 0.7086072, + "learning_rate": 8.156561252835883e-07, + "loss": 0.72955453, + "num_input_tokens_seen": 254951580, + "step": 11820, + "time_per_iteration": 3.9713776111602783 + }, + { + "auxiliary_loss_clip": 0.01094728, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.03714085, + "balance_loss_mlp": 1.01519907, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 1.8393880804519012, + "language_loss": 0.75470376, + "learning_rate": 8.153423149851449e-07, + "loss": 0.7759183, + "num_input_tokens_seen": 254969425, + "step": 11821, + "time_per_iteration": 3.8560032844543457 + }, + { + "auxiliary_loss_clip": 0.01005554, + "auxiliary_loss_mlp": 0.01000658, + "balance_loss_clip": 1.02657866, + "balance_loss_mlp": 0.99923342, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.7624554414622751, + "language_loss": 0.55088925, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57095134, + "num_input_tokens_seen": 255032680, + "step": 11822, + "time_per_iteration": 3.2205963134765625 + }, + { + "auxiliary_loss_clip": 0.01090015, + "auxiliary_loss_mlp": 0.01027214, + "balance_loss_clip": 1.03647017, + "balance_loss_mlp": 1.01507294, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 2.0107513586868864, + "language_loss": 0.59872448, + "learning_rate": 8.147148291671688e-07, + "loss": 0.61989677, + "num_input_tokens_seen": 255054400, + "step": 11823, + "time_per_iteration": 2.5446133613586426 + }, + { + "auxiliary_loss_clip": 0.01094459, + "auxiliary_loss_mlp": 0.01030182, + "balance_loss_clip": 1.03660488, + "balance_loss_mlp": 1.01879203, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 2.480542282973698, + "language_loss": 0.71025974, + "learning_rate": 8.144011536714322e-07, + "loss": 0.73150617, + "num_input_tokens_seen": 255072785, + "step": 11824, + "time_per_iteration": 2.485163688659668 + }, + { + "auxiliary_loss_clip": 0.01073271, + "auxiliary_loss_mlp": 0.00785804, + "balance_loss_clip": 1.03370345, + "balance_loss_mlp": 1.00998855, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 1.6959991386346989, + "language_loss": 0.7262789, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74486959, + "num_input_tokens_seen": 255091820, + "step": 11825, + "time_per_iteration": 2.5357978343963623 + }, + { + "auxiliary_loss_clip": 0.0108435, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.03866196, + "balance_loss_mlp": 1.01993287, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 1.7127081326772016, + "language_loss": 0.79757535, + "learning_rate": 8.137739375659321e-07, + "loss": 0.81873703, + "num_input_tokens_seen": 255111720, + "step": 11826, + "time_per_iteration": 4.0725390911102295 + }, + { + "auxiliary_loss_clip": 0.0109324, + "auxiliary_loss_mlp": 0.01030281, + "balance_loss_clip": 1.03673542, + "balance_loss_mlp": 1.01910472, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 1.5462702664498276, + "language_loss": 0.83024204, + "learning_rate": 8.134603969799527e-07, + "loss": 0.85147727, + "num_input_tokens_seen": 255133495, + "step": 11827, + "time_per_iteration": 2.559511661529541 + }, + { + "auxiliary_loss_clip": 0.01074178, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.03546691, + "balance_loss_mlp": 1.01910865, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 1.5420453511531356, + "language_loss": 0.62318075, + "learning_rate": 8.131469013876748e-07, + "loss": 0.64424241, + "num_input_tokens_seen": 255156880, + "step": 11828, + "time_per_iteration": 2.609049081802368 + }, + { + "auxiliary_loss_clip": 0.01104588, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.03608727, + "balance_loss_mlp": 1.01801646, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 1.6134600151247893, + "language_loss": 0.7210629, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74241096, + "num_input_tokens_seen": 255178920, + "step": 11829, + "time_per_iteration": 2.512749433517456 + }, + { + "auxiliary_loss_clip": 0.0110395, + "auxiliary_loss_mlp": 0.01030233, + "balance_loss_clip": 1.03586268, + "balance_loss_mlp": 1.01892567, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 2.2646881047557756, + "language_loss": 0.80222052, + "learning_rate": 8.125200452317697e-07, + "loss": 0.82356232, + "num_input_tokens_seen": 255198095, + "step": 11830, + "time_per_iteration": 2.496009111404419 + }, + { + "auxiliary_loss_clip": 0.01092872, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.03572667, + "balance_loss_mlp": 1.02273822, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 1.874368782057851, + "language_loss": 0.84069848, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86197484, + "num_input_tokens_seen": 255215860, + "step": 11831, + "time_per_iteration": 2.5103979110717773 + }, + { + "auxiliary_loss_clip": 0.01084066, + "auxiliary_loss_mlp": 0.01028375, + "balance_loss_clip": 1.03553617, + "balance_loss_mlp": 1.01665688, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 2.4048252376882253, + "language_loss": 0.7734983, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79462272, + "num_input_tokens_seen": 255235425, + "step": 11832, + "time_per_iteration": 2.522078514099121 + }, + { + "auxiliary_loss_clip": 0.01026809, + "auxiliary_loss_mlp": 0.01002407, + "balance_loss_clip": 1.01400328, + "balance_loss_mlp": 1.00123906, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7473647088820893, + "language_loss": 0.56620133, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58649349, + "num_input_tokens_seen": 255291680, + "step": 11833, + "time_per_iteration": 3.0380194187164307 + }, + { + "auxiliary_loss_clip": 0.0105636, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.03495419, + "balance_loss_mlp": 1.02041054, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 1.8646495189767185, + "language_loss": 0.70823872, + "learning_rate": 8.11266873367315e-07, + "loss": 0.72912312, + "num_input_tokens_seen": 255313880, + "step": 11834, + "time_per_iteration": 2.6403307914733887 + }, + { + "auxiliary_loss_clip": 0.01107827, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.03704464, + "balance_loss_mlp": 1.01949906, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 1.861218453935047, + "language_loss": 0.79392874, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81532401, + "num_input_tokens_seen": 255332390, + "step": 11835, + "time_per_iteration": 2.4613046646118164 + }, + { + "auxiliary_loss_clip": 0.01092018, + "auxiliary_loss_mlp": 0.01029997, + "balance_loss_clip": 1.0357089, + "balance_loss_mlp": 1.01895785, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.4523913423713999, + "language_loss": 0.75679755, + "learning_rate": 8.10640557848848e-07, + "loss": 0.7780177, + "num_input_tokens_seen": 255354025, + "step": 11836, + "time_per_iteration": 2.571061611175537 + }, + { + "auxiliary_loss_clip": 0.01034412, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.03492141, + "balance_loss_mlp": 1.01766098, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.6158199173897911, + "language_loss": 0.70127606, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72191739, + "num_input_tokens_seen": 255371400, + "step": 11837, + "time_per_iteration": 2.6663031578063965 + }, + { + "auxiliary_loss_clip": 0.01098959, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.03818679, + "balance_loss_mlp": 1.02466011, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 12.682156190045957, + "language_loss": 0.61593491, + "learning_rate": 8.100144227328958e-07, + "loss": 0.63730025, + "num_input_tokens_seen": 255390710, + "step": 11838, + "time_per_iteration": 2.543238401412964 + }, + { + "auxiliary_loss_clip": 0.01094955, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.03737485, + "balance_loss_mlp": 1.01805031, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.22765909212795, + "language_loss": 0.68396914, + "learning_rate": 8.097014228555426e-07, + "loss": 0.7052176, + "num_input_tokens_seen": 255408790, + "step": 11839, + "time_per_iteration": 2.5329294204711914 + }, + { + "auxiliary_loss_clip": 0.0110666, + "auxiliary_loss_mlp": 0.01032533, + "balance_loss_clip": 1.03773117, + "balance_loss_mlp": 1.02114224, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 1.933846206407439, + "language_loss": 0.84085697, + "learning_rate": 8.093884681144305e-07, + "loss": 0.8622489, + "num_input_tokens_seen": 255426280, + "step": 11840, + "time_per_iteration": 2.4671452045440674 + }, + { + "auxiliary_loss_clip": 0.01083932, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.03714299, + "balance_loss_mlp": 1.01730323, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 1.866741654406252, + "language_loss": 0.7684772, + "learning_rate": 8.090755585214277e-07, + "loss": 0.78960991, + "num_input_tokens_seen": 255442935, + "step": 11841, + "time_per_iteration": 2.4922056198120117 + }, + { + "auxiliary_loss_clip": 0.01089062, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.03655291, + "balance_loss_mlp": 1.0213865, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 2.00249006511256, + "language_loss": 0.75072569, + "learning_rate": 8.087626940883994e-07, + "loss": 0.7719537, + "num_input_tokens_seen": 255460925, + "step": 11842, + "time_per_iteration": 2.5219979286193848 + }, + { + "auxiliary_loss_clip": 0.01031893, + "auxiliary_loss_mlp": 0.01004313, + "balance_loss_clip": 1.02496755, + "balance_loss_mlp": 1.0026679, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.7894660375181115, + "language_loss": 0.61636162, + "learning_rate": 8.084498748272082e-07, + "loss": 0.6367237, + "num_input_tokens_seen": 255521360, + "step": 11843, + "time_per_iteration": 3.072843551635742 + }, + { + "auxiliary_loss_clip": 0.01104189, + "auxiliary_loss_mlp": 0.01026271, + "balance_loss_clip": 1.03663766, + "balance_loss_mlp": 1.01468945, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 1.7311346008422905, + "language_loss": 0.80057162, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82187617, + "num_input_tokens_seen": 255541435, + "step": 11844, + "time_per_iteration": 2.513951063156128 + }, + { + "auxiliary_loss_clip": 0.01054041, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.0336287, + "balance_loss_mlp": 1.01640785, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.303612782063063, + "language_loss": 0.78724492, + "learning_rate": 8.078243718677873e-07, + "loss": 0.80807841, + "num_input_tokens_seen": 255558505, + "step": 11845, + "time_per_iteration": 2.5456557273864746 + }, + { + "auxiliary_loss_clip": 0.01087584, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.0350672, + "balance_loss_mlp": 1.02158749, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 1.8720681526840515, + "language_loss": 0.7750122, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79622501, + "num_input_tokens_seen": 255577815, + "step": 11846, + "time_per_iteration": 2.5881800651550293 + }, + { + "auxiliary_loss_clip": 0.01095822, + "auxiliary_loss_mlp": 0.01031201, + "balance_loss_clip": 1.03732419, + "balance_loss_mlp": 1.01884484, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.8885479218310322, + "language_loss": 0.58531195, + "learning_rate": 8.071990497380421e-07, + "loss": 0.60658216, + "num_input_tokens_seen": 255595885, + "step": 11847, + "time_per_iteration": 2.4930670261383057 + }, + { + "auxiliary_loss_clip": 0.01090091, + "auxiliary_loss_mlp": 0.00784673, + "balance_loss_clip": 1.03572762, + "balance_loss_mlp": 1.0121789, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 1.543318032956219, + "language_loss": 0.71290123, + "learning_rate": 8.068864565139395e-07, + "loss": 0.7316488, + "num_input_tokens_seen": 255616750, + "step": 11848, + "time_per_iteration": 2.5611441135406494 + }, + { + "auxiliary_loss_clip": 0.01026213, + "auxiliary_loss_mlp": 0.0100091, + "balance_loss_clip": 1.01165271, + "balance_loss_mlp": 0.9998129, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8457736018533026, + "language_loss": 0.62973934, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65001059, + "num_input_tokens_seen": 255677900, + "step": 11849, + "time_per_iteration": 3.0652496814727783 + }, + { + "auxiliary_loss_clip": 0.01078561, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.03540254, + "balance_loss_mlp": 1.02107918, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 1.4900401305365265, + "language_loss": 0.64076561, + "learning_rate": 8.0626140580654e-07, + "loss": 0.6618824, + "num_input_tokens_seen": 255699140, + "step": 11850, + "time_per_iteration": 2.6940503120422363 + }, + { + "auxiliary_loss_clip": 0.01095926, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.03738248, + "balance_loss_mlp": 1.01761723, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.5865269787958802, + "language_loss": 0.70058656, + "learning_rate": 8.05948948346946e-07, + "loss": 0.72184247, + "num_input_tokens_seen": 255719640, + "step": 11851, + "time_per_iteration": 2.5767931938171387 + }, + { + "auxiliary_loss_clip": 0.01094576, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.03691983, + "balance_loss_mlp": 1.02331805, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 1.447539260732319, + "language_loss": 0.83216786, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85345322, + "num_input_tokens_seen": 255740450, + "step": 11852, + "time_per_iteration": 2.5376980304718018 + }, + { + "auxiliary_loss_clip": 0.01094221, + "auxiliary_loss_mlp": 0.00784218, + "balance_loss_clip": 1.03471673, + "balance_loss_mlp": 1.00823605, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.247147444011058, + "language_loss": 0.73505586, + "learning_rate": 8.053241692752126e-07, + "loss": 0.75384027, + "num_input_tokens_seen": 255758070, + "step": 11853, + "time_per_iteration": 2.4619317054748535 + }, + { + "auxiliary_loss_clip": 0.01064247, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.03320742, + "balance_loss_mlp": 1.02280164, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 2.0096426133941754, + "language_loss": 0.92386901, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94485486, + "num_input_tokens_seen": 255775685, + "step": 11854, + "time_per_iteration": 3.9513845443725586 + }, + { + "auxiliary_loss_clip": 0.01092261, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.03584743, + "balance_loss_mlp": 1.01803911, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 1.8176076335420885, + "language_loss": 0.79688942, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81810522, + "num_input_tokens_seen": 255794750, + "step": 11855, + "time_per_iteration": 2.499260663986206 + }, + { + "auxiliary_loss_clip": 0.01057293, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.03393352, + "balance_loss_mlp": 1.01972425, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 1.8737781497438626, + "language_loss": 0.72817171, + "learning_rate": 8.043873404639192e-07, + "loss": 0.74907172, + "num_input_tokens_seen": 255813325, + "step": 11856, + "time_per_iteration": 2.576819658279419 + }, + { + "auxiliary_loss_clip": 0.01096066, + "auxiliary_loss_mlp": 0.01029329, + "balance_loss_clip": 1.03684235, + "balance_loss_mlp": 1.01756287, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 1.6067167456492044, + "language_loss": 0.69833457, + "learning_rate": 8.040751548532046e-07, + "loss": 0.71958852, + "num_input_tokens_seen": 255832470, + "step": 11857, + "time_per_iteration": 2.524803638458252 + }, + { + "auxiliary_loss_clip": 0.01091206, + "auxiliary_loss_mlp": 0.01027345, + "balance_loss_clip": 1.03372598, + "balance_loss_mlp": 1.01538277, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.691411406228035, + "language_loss": 0.84989041, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87107587, + "num_input_tokens_seen": 255849740, + "step": 11858, + "time_per_iteration": 2.4561057090759277 + }, + { + "auxiliary_loss_clip": 0.01111246, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.03905129, + "balance_loss_mlp": 1.0192368, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 1.694803481926178, + "language_loss": 0.80377996, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82520676, + "num_input_tokens_seen": 255866975, + "step": 11859, + "time_per_iteration": 3.843292236328125 + }, + { + "auxiliary_loss_clip": 0.01080829, + "auxiliary_loss_mlp": 0.01030176, + "balance_loss_clip": 1.03532851, + "balance_loss_mlp": 1.01836848, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 1.1831192337888181, + "language_loss": 0.68725646, + "learning_rate": 8.031388701659456e-07, + "loss": 0.70836651, + "num_input_tokens_seen": 255892915, + "step": 11860, + "time_per_iteration": 4.219373464584351 + }, + { + "auxiliary_loss_clip": 0.01092856, + "auxiliary_loss_mlp": 0.01030615, + "balance_loss_clip": 1.03511167, + "balance_loss_mlp": 1.01783574, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 1.9842520926471592, + "language_loss": 0.64796263, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66919732, + "num_input_tokens_seen": 255911480, + "step": 11861, + "time_per_iteration": 2.4956915378570557 + }, + { + "auxiliary_loss_clip": 0.01089276, + "auxiliary_loss_mlp": 0.0102885, + "balance_loss_clip": 1.03841305, + "balance_loss_mlp": 1.01632655, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 1.949185867684346, + "language_loss": 0.67191565, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69309688, + "num_input_tokens_seen": 255931140, + "step": 11862, + "time_per_iteration": 2.5770859718322754 + }, + { + "auxiliary_loss_clip": 0.01079411, + "auxiliary_loss_mlp": 0.01035825, + "balance_loss_clip": 1.03638506, + "balance_loss_mlp": 1.0248282, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 1.8990135900784915, + "language_loss": 0.66636598, + "learning_rate": 8.022029939445214e-07, + "loss": 0.6875183, + "num_input_tokens_seen": 255951665, + "step": 11863, + "time_per_iteration": 2.5878512859344482 + }, + { + "auxiliary_loss_clip": 0.0106513, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.03751254, + "balance_loss_mlp": 1.02701211, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 2.3910207723614754, + "language_loss": 0.65615326, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67720985, + "num_input_tokens_seen": 255970055, + "step": 11864, + "time_per_iteration": 2.5974202156066895 + }, + { + "auxiliary_loss_clip": 0.01097885, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.03635299, + "balance_loss_mlp": 1.02013707, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 2.0563613836936465, + "language_loss": 0.85554898, + "learning_rate": 8.015793035467697e-07, + "loss": 0.87685382, + "num_input_tokens_seen": 255987720, + "step": 11865, + "time_per_iteration": 3.9420690536499023 + }, + { + "auxiliary_loss_clip": 0.01066798, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.03292727, + "balance_loss_mlp": 1.01873946, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 2.0721396734448825, + "language_loss": 0.74694479, + "learning_rate": 8.012675265083304e-07, + "loss": 0.76793242, + "num_input_tokens_seen": 256005490, + "step": 11866, + "time_per_iteration": 2.534437656402588 + }, + { + "auxiliary_loss_clip": 0.01077016, + "auxiliary_loss_mlp": 0.01030303, + "balance_loss_clip": 1.03772712, + "balance_loss_mlp": 1.0176909, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 3.4130435227312184, + "language_loss": 0.7049039, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72597706, + "num_input_tokens_seen": 256026030, + "step": 11867, + "time_per_iteration": 2.592672348022461 + }, + { + "auxiliary_loss_clip": 0.01092016, + "auxiliary_loss_mlp": 0.01027192, + "balance_loss_clip": 1.0363214, + "balance_loss_mlp": 1.01621854, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 4.665538270106113, + "language_loss": 0.72006929, + "learning_rate": 8.006441088114397e-07, + "loss": 0.74126142, + "num_input_tokens_seen": 256043680, + "step": 11868, + "time_per_iteration": 2.4610776901245117 + }, + { + "auxiliary_loss_clip": 0.01058804, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.03747511, + "balance_loss_mlp": 1.01667285, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.2766069515922123, + "language_loss": 0.66072744, + "learning_rate": 8.003324681766286e-07, + "loss": 0.68162495, + "num_input_tokens_seen": 256059705, + "step": 11869, + "time_per_iteration": 2.5713844299316406 + }, + { + "auxiliary_loss_clip": 0.01079261, + "auxiliary_loss_mlp": 0.01023946, + "balance_loss_clip": 1.0330894, + "balance_loss_mlp": 1.01216269, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 1.6070053236537025, + "language_loss": 0.77950412, + "learning_rate": 8.000208730333298e-07, + "loss": 0.80053616, + "num_input_tokens_seen": 256079785, + "step": 11870, + "time_per_iteration": 2.574803352355957 + }, + { + "auxiliary_loss_clip": 0.01056852, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.03575695, + "balance_loss_mlp": 1.01929331, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.6566413940146572, + "language_loss": 0.8109538, + "learning_rate": 7.997093233933597e-07, + "loss": 0.83184028, + "num_input_tokens_seen": 256099000, + "step": 11871, + "time_per_iteration": 2.632986307144165 + }, + { + "auxiliary_loss_clip": 0.01073702, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.03506637, + "balance_loss_mlp": 1.02367902, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 1.6219934602651103, + "language_loss": 0.78686517, + "learning_rate": 7.993978192685331e-07, + "loss": 0.8079648, + "num_input_tokens_seen": 256117985, + "step": 11872, + "time_per_iteration": 2.5503997802734375 + }, + { + "auxiliary_loss_clip": 0.01095935, + "auxiliary_loss_mlp": 0.0102694, + "balance_loss_clip": 1.03575158, + "balance_loss_mlp": 1.01429737, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 2.447171832310374, + "language_loss": 0.83954954, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86077827, + "num_input_tokens_seen": 256134350, + "step": 11873, + "time_per_iteration": 2.5009546279907227 + }, + { + "auxiliary_loss_clip": 0.01065996, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.03228652, + "balance_loss_mlp": 1.02116418, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 1.9116131856114693, + "language_loss": 0.85802627, + "learning_rate": 7.987749476115539e-07, + "loss": 0.87900573, + "num_input_tokens_seen": 256150610, + "step": 11874, + "time_per_iteration": 2.5309700965881348 + }, + { + "auxiliary_loss_clip": 0.01094896, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.03430212, + "balance_loss_mlp": 1.01755941, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 1.832195227066355, + "language_loss": 0.83300328, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85425079, + "num_input_tokens_seen": 256168620, + "step": 11875, + "time_per_iteration": 2.4892821311950684 + }, + { + "auxiliary_loss_clip": 0.01085866, + "auxiliary_loss_mlp": 0.01040109, + "balance_loss_clip": 1.03569484, + "balance_loss_mlp": 1.02513015, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 1.913690450430302, + "language_loss": 0.69006503, + "learning_rate": 7.981522581568721e-07, + "loss": 0.71132481, + "num_input_tokens_seen": 256186700, + "step": 11876, + "time_per_iteration": 2.5290708541870117 + }, + { + "auxiliary_loss_clip": 0.01108248, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.03723371, + "balance_loss_mlp": 1.02377415, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 1.914859379315155, + "language_loss": 0.77591717, + "learning_rate": 7.978409817849079e-07, + "loss": 0.79736018, + "num_input_tokens_seen": 256205390, + "step": 11877, + "time_per_iteration": 2.4598898887634277 + }, + { + "auxiliary_loss_clip": 0.01093202, + "auxiliary_loss_mlp": 0.01030669, + "balance_loss_clip": 1.03598809, + "balance_loss_mlp": 1.01986909, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 2.2572516141584784, + "language_loss": 0.69643414, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71767288, + "num_input_tokens_seen": 256224575, + "step": 11878, + "time_per_iteration": 2.491856575012207 + }, + { + "auxiliary_loss_clip": 0.01069122, + "auxiliary_loss_mlp": 0.01031759, + "balance_loss_clip": 1.03823733, + "balance_loss_mlp": 1.02105379, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 2.0550238121408735, + "language_loss": 0.67554271, + "learning_rate": 7.972185658107535e-07, + "loss": 0.69655156, + "num_input_tokens_seen": 256242130, + "step": 11879, + "time_per_iteration": 2.6008684635162354 + }, + { + "auxiliary_loss_clip": 0.01054414, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.0356257, + "balance_loss_mlp": 1.02488256, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 1.7002622285179256, + "language_loss": 0.69038773, + "learning_rate": 7.969074262321646e-07, + "loss": 0.71131736, + "num_input_tokens_seen": 256261920, + "step": 11880, + "time_per_iteration": 2.5862131118774414 + }, + { + "auxiliary_loss_clip": 0.01079027, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.03326225, + "balance_loss_mlp": 1.02536178, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 2.0631736125291624, + "language_loss": 0.80819881, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82936859, + "num_input_tokens_seen": 256277970, + "step": 11881, + "time_per_iteration": 2.528226613998413 + }, + { + "auxiliary_loss_clip": 0.01073091, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.03494263, + "balance_loss_mlp": 1.01909375, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 1.7965439155116674, + "language_loss": 0.63635337, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65739036, + "num_input_tokens_seen": 256298205, + "step": 11882, + "time_per_iteration": 2.581442356109619 + }, + { + "auxiliary_loss_clip": 0.01108301, + "auxiliary_loss_mlp": 0.0103197, + "balance_loss_clip": 1.03696358, + "balance_loss_mlp": 1.0202632, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 1.8266771586439765, + "language_loss": 0.68850267, + "learning_rate": 7.959742812719304e-07, + "loss": 0.70990539, + "num_input_tokens_seen": 256316685, + "step": 11883, + "time_per_iteration": 2.4749042987823486 + }, + { + "auxiliary_loss_clip": 0.01094861, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.03595853, + "balance_loss_mlp": 1.0211314, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 2.0740513718359, + "language_loss": 0.77720529, + "learning_rate": 7.956633242496788e-07, + "loss": 0.79849732, + "num_input_tokens_seen": 256334205, + "step": 11884, + "time_per_iteration": 2.484001874923706 + }, + { + "auxiliary_loss_clip": 0.01101532, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.03768301, + "balance_loss_mlp": 1.0182296, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 1.990302923190699, + "language_loss": 0.73748326, + "learning_rate": 7.953524128959954e-07, + "loss": 0.75882322, + "num_input_tokens_seen": 256353340, + "step": 11885, + "time_per_iteration": 2.492763042449951 + }, + { + "auxiliary_loss_clip": 0.0101581, + "auxiliary_loss_mlp": 0.01002063, + "balance_loss_clip": 1.01223159, + "balance_loss_mlp": 1.00094819, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8919310366882793, + "language_loss": 0.66400403, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68418276, + "num_input_tokens_seen": 256411550, + "step": 11886, + "time_per_iteration": 3.0921013355255127 + }, + { + "auxiliary_loss_clip": 0.01064035, + "auxiliary_loss_mlp": 0.0102448, + "balance_loss_clip": 1.0365144, + "balance_loss_mlp": 1.01279795, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 1.7895413634252861, + "language_loss": 0.7466644, + "learning_rate": 7.947307272414874e-07, + "loss": 0.76754957, + "num_input_tokens_seen": 256430360, + "step": 11887, + "time_per_iteration": 2.578949213027954 + }, + { + "auxiliary_loss_clip": 0.01094653, + "auxiliary_loss_mlp": 0.01026772, + "balance_loss_clip": 1.03624165, + "balance_loss_mlp": 1.01498199, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.5797823712738683, + "language_loss": 0.71481609, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73603034, + "num_input_tokens_seen": 256449750, + "step": 11888, + "time_per_iteration": 2.4761970043182373 + }, + { + "auxiliary_loss_clip": 0.01090796, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.03321862, + "balance_loss_mlp": 1.02160621, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 1.8447439981754719, + "language_loss": 0.84303319, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86428314, + "num_input_tokens_seen": 256467330, + "step": 11889, + "time_per_iteration": 2.523054361343384 + }, + { + "auxiliary_loss_clip": 0.01057389, + "auxiliary_loss_mlp": 0.01023255, + "balance_loss_clip": 1.03502536, + "balance_loss_mlp": 1.01141715, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 1.8364966417839728, + "language_loss": 0.75516546, + "learning_rate": 7.937985415686695e-07, + "loss": 0.77597183, + "num_input_tokens_seen": 256485705, + "step": 11890, + "time_per_iteration": 2.614504337310791 + }, + { + "auxiliary_loss_clip": 0.01065813, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.03362501, + "balance_loss_mlp": 1.01907182, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 1.535683416527105, + "language_loss": 0.74108684, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76205242, + "num_input_tokens_seen": 256504755, + "step": 11891, + "time_per_iteration": 2.602437973022461 + }, + { + "auxiliary_loss_clip": 0.01063185, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.03751791, + "balance_loss_mlp": 1.02013874, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 1.8894274865163574, + "language_loss": 0.67594242, + "learning_rate": 7.931773131302211e-07, + "loss": 0.69689548, + "num_input_tokens_seen": 256523670, + "step": 11892, + "time_per_iteration": 2.5746917724609375 + }, + { + "auxiliary_loss_clip": 0.01071957, + "auxiliary_loss_mlp": 0.01031614, + "balance_loss_clip": 1.03711224, + "balance_loss_mlp": 1.0182035, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 2.0549812982612794, + "language_loss": 0.74275911, + "learning_rate": 7.928667675493632e-07, + "loss": 0.76379478, + "num_input_tokens_seen": 256542225, + "step": 11893, + "time_per_iteration": 3.954145908355713 + }, + { + "auxiliary_loss_clip": 0.0111004, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.03733444, + "balance_loss_mlp": 1.01714611, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 2.2244024088954855, + "language_loss": 0.66541582, + "learning_rate": 7.925562677431185e-07, + "loss": 0.68681604, + "num_input_tokens_seen": 256560730, + "step": 11894, + "time_per_iteration": 2.4796295166015625 + }, + { + "auxiliary_loss_clip": 0.01072546, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.03764808, + "balance_loss_mlp": 1.0192188, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 1.780738127185801, + "language_loss": 0.7779547, + "learning_rate": 7.922458137232613e-07, + "loss": 0.79898959, + "num_input_tokens_seen": 256580505, + "step": 11895, + "time_per_iteration": 2.6147029399871826 + }, + { + "auxiliary_loss_clip": 0.0109602, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.03577352, + "balance_loss_mlp": 1.01910257, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 1.7696653270613458, + "language_loss": 0.69489056, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71617329, + "num_input_tokens_seen": 256597330, + "step": 11896, + "time_per_iteration": 2.481764554977417 + }, + { + "auxiliary_loss_clip": 0.0108372, + "auxiliary_loss_mlp": 0.01043568, + "balance_loss_clip": 1.03410304, + "balance_loss_mlp": 1.03040719, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 1.7559405298288877, + "language_loss": 0.86517429, + "learning_rate": 7.91625043089798e-07, + "loss": 0.88644719, + "num_input_tokens_seen": 256616030, + "step": 11897, + "time_per_iteration": 2.5246143341064453 + }, + { + "auxiliary_loss_clip": 0.0108311, + "auxiliary_loss_mlp": 0.01031906, + "balance_loss_clip": 1.03514028, + "balance_loss_mlp": 1.02007425, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 2.184864142292303, + "language_loss": 0.78063875, + "learning_rate": 7.913147264997304e-07, + "loss": 0.80178893, + "num_input_tokens_seen": 256635570, + "step": 11898, + "time_per_iteration": 3.9438400268554688 + }, + { + "auxiliary_loss_clip": 0.01084592, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.03584361, + "balance_loss_mlp": 1.01650119, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 1.8065302332067816, + "language_loss": 0.72755057, + "learning_rate": 7.910044557431302e-07, + "loss": 0.74869424, + "num_input_tokens_seen": 256655290, + "step": 11899, + "time_per_iteration": 3.915525197982788 + }, + { + "auxiliary_loss_clip": 0.01095671, + "auxiliary_loss_mlp": 0.01034876, + "balance_loss_clip": 1.03631747, + "balance_loss_mlp": 1.0218997, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 2.0182848483493445, + "language_loss": 0.75700736, + "learning_rate": 7.906942308317614e-07, + "loss": 0.77831286, + "num_input_tokens_seen": 256671605, + "step": 11900, + "time_per_iteration": 2.495344877243042 + }, + { + "auxiliary_loss_clip": 0.01096633, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.03725648, + "balance_loss_mlp": 1.01905012, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 1.855831343502479, + "language_loss": 0.80745786, + "learning_rate": 7.903840517773886e-07, + "loss": 0.82873297, + "num_input_tokens_seen": 256689680, + "step": 11901, + "time_per_iteration": 2.4897193908691406 + }, + { + "auxiliary_loss_clip": 0.01075414, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.03567767, + "balance_loss_mlp": 1.02233982, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 1.6869585099267845, + "language_loss": 0.81097108, + "learning_rate": 7.900739185917744e-07, + "loss": 0.8320787, + "num_input_tokens_seen": 256707760, + "step": 11902, + "time_per_iteration": 2.5315418243408203 + }, + { + "auxiliary_loss_clip": 0.01064875, + "auxiliary_loss_mlp": 0.01028391, + "balance_loss_clip": 1.03545594, + "balance_loss_mlp": 1.01625574, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 2.1659083018660046, + "language_loss": 0.68158323, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70251596, + "num_input_tokens_seen": 256724150, + "step": 11903, + "time_per_iteration": 2.543220043182373 + }, + { + "auxiliary_loss_clip": 0.01066087, + "auxiliary_loss_mlp": 0.01028473, + "balance_loss_clip": 1.03474689, + "balance_loss_mlp": 1.01713622, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 2.086185697869419, + "language_loss": 0.75846875, + "learning_rate": 7.894537898738589e-07, + "loss": 0.77941442, + "num_input_tokens_seen": 256742780, + "step": 11904, + "time_per_iteration": 3.9726293087005615 + }, + { + "auxiliary_loss_clip": 0.01083958, + "auxiliary_loss_mlp": 0.0103717, + "balance_loss_clip": 1.03550279, + "balance_loss_mlp": 1.02415848, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 1.8797725905315583, + "language_loss": 0.72079659, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74200791, + "num_input_tokens_seen": 256761355, + "step": 11905, + "time_per_iteration": 2.5121965408325195 + }, + { + "auxiliary_loss_clip": 0.010706, + "auxiliary_loss_mlp": 0.01030552, + "balance_loss_clip": 1.0352273, + "balance_loss_mlp": 1.01890564, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.5288461622524643, + "language_loss": 0.78202069, + "learning_rate": 7.88833844772076e-07, + "loss": 0.80303228, + "num_input_tokens_seen": 256781335, + "step": 11906, + "time_per_iteration": 2.589752435684204 + }, + { + "auxiliary_loss_clip": 0.01017114, + "auxiliary_loss_mlp": 0.0100149, + "balance_loss_clip": 1.01495051, + "balance_loss_mlp": 1.00045335, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.7356091262984021, + "language_loss": 0.55318624, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57337219, + "num_input_tokens_seen": 256838890, + "step": 11907, + "time_per_iteration": 3.05844783782959 + }, + { + "auxiliary_loss_clip": 0.01088971, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.03285444, + "balance_loss_mlp": 1.02376008, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 1.761849679914604, + "language_loss": 0.69494903, + "learning_rate": 7.882140833804593e-07, + "loss": 0.71620393, + "num_input_tokens_seen": 256858145, + "step": 11908, + "time_per_iteration": 2.499251365661621 + }, + { + "auxiliary_loss_clip": 0.01063117, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.03309298, + "balance_loss_mlp": 1.01854181, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 1.6657865461485866, + "language_loss": 0.71263754, + "learning_rate": 7.879042716053415e-07, + "loss": 0.73359239, + "num_input_tokens_seen": 256878545, + "step": 11909, + "time_per_iteration": 2.5751895904541016 + }, + { + "auxiliary_loss_clip": 0.01095261, + "auxiliary_loss_mlp": 0.01028239, + "balance_loss_clip": 1.03943181, + "balance_loss_mlp": 1.01643777, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 1.452694043151886, + "language_loss": 0.7526508, + "learning_rate": 7.875945057930144e-07, + "loss": 0.77388579, + "num_input_tokens_seen": 256899920, + "step": 11910, + "time_per_iteration": 2.5689001083374023 + }, + { + "auxiliary_loss_clip": 0.01080746, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.03654635, + "balance_loss_mlp": 1.01920056, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 2.289138568065848, + "language_loss": 0.7694726, + "learning_rate": 7.872847859552251e-07, + "loss": 0.79058027, + "num_input_tokens_seen": 256918460, + "step": 11911, + "time_per_iteration": 2.521012306213379 + }, + { + "auxiliary_loss_clip": 0.01071177, + "auxiliary_loss_mlp": 0.0103926, + "balance_loss_clip": 1.03565681, + "balance_loss_mlp": 1.02482414, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 1.96050940371369, + "language_loss": 0.58668876, + "learning_rate": 7.869751121037192e-07, + "loss": 0.60779309, + "num_input_tokens_seen": 256942015, + "step": 11912, + "time_per_iteration": 2.9391982555389404 + }, + { + "auxiliary_loss_clip": 0.01095743, + "auxiliary_loss_mlp": 0.01034948, + "balance_loss_clip": 1.03753757, + "balance_loss_mlp": 1.02249646, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 1.8204868442911681, + "language_loss": 0.78144741, + "learning_rate": 7.866654842502376e-07, + "loss": 0.8027544, + "num_input_tokens_seen": 256961065, + "step": 11913, + "time_per_iteration": 2.499769687652588 + }, + { + "auxiliary_loss_clip": 0.01081474, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.03529704, + "balance_loss_mlp": 1.01678944, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.6600341929572915, + "language_loss": 0.74030364, + "learning_rate": 7.863559024065234e-07, + "loss": 0.76139492, + "num_input_tokens_seen": 256982165, + "step": 11914, + "time_per_iteration": 2.575164556503296 + }, + { + "auxiliary_loss_clip": 0.0107013, + "auxiliary_loss_mlp": 0.0103154, + "balance_loss_clip": 1.03499794, + "balance_loss_mlp": 1.01939893, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 1.8111072853554178, + "language_loss": 0.74018669, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76120341, + "num_input_tokens_seen": 256999825, + "step": 11915, + "time_per_iteration": 2.5423030853271484 + }, + { + "auxiliary_loss_clip": 0.01105887, + "auxiliary_loss_mlp": 0.01028124, + "balance_loss_clip": 1.03528428, + "balance_loss_mlp": 1.01636422, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 2.532321004061479, + "language_loss": 0.81086397, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83220398, + "num_input_tokens_seen": 257017450, + "step": 11916, + "time_per_iteration": 2.459557056427002 + }, + { + "auxiliary_loss_clip": 0.01031688, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.03626668, + "balance_loss_mlp": 1.01691329, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 1.8201878594680159, + "language_loss": 0.68475401, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70536017, + "num_input_tokens_seen": 257035465, + "step": 11917, + "time_per_iteration": 2.6407663822174072 + }, + { + "auxiliary_loss_clip": 0.01082482, + "auxiliary_loss_mlp": 0.01027083, + "balance_loss_clip": 1.03440392, + "balance_loss_mlp": 1.0145005, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 1.6352184773263274, + "language_loss": 0.76114947, + "learning_rate": 7.851180353640896e-07, + "loss": 0.78224516, + "num_input_tokens_seen": 257053750, + "step": 11918, + "time_per_iteration": 2.5341713428497314 + }, + { + "auxiliary_loss_clip": 0.01015531, + "auxiliary_loss_mlp": 0.01002811, + "balance_loss_clip": 1.01218748, + "balance_loss_mlp": 1.00179768, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6264180327898039, + "language_loss": 0.53935421, + "learning_rate": 7.848086837452639e-07, + "loss": 0.55953759, + "num_input_tokens_seen": 257121215, + "step": 11919, + "time_per_iteration": 3.1593003273010254 + }, + { + "auxiliary_loss_clip": 0.01086793, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.03838396, + "balance_loss_mlp": 1.02109051, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 2.023797999723082, + "language_loss": 0.68727535, + "learning_rate": 7.844993782066132e-07, + "loss": 0.70846856, + "num_input_tokens_seen": 257143370, + "step": 11920, + "time_per_iteration": 2.569462537765503 + }, + { + "auxiliary_loss_clip": 0.01086441, + "auxiliary_loss_mlp": 0.01035859, + "balance_loss_clip": 1.03462398, + "balance_loss_mlp": 1.02350247, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 2.008898626318707, + "language_loss": 0.74751419, + "learning_rate": 7.841901187598678e-07, + "loss": 0.7687372, + "num_input_tokens_seen": 257162160, + "step": 11921, + "time_per_iteration": 2.5921475887298584 + }, + { + "auxiliary_loss_clip": 0.01080461, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.03792822, + "balance_loss_mlp": 1.01931977, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 2.2047923633807276, + "language_loss": 0.75202829, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77316833, + "num_input_tokens_seen": 257179300, + "step": 11922, + "time_per_iteration": 2.5345499515533447 + }, + { + "auxiliary_loss_clip": 0.01018658, + "auxiliary_loss_mlp": 0.01001639, + "balance_loss_clip": 1.013906, + "balance_loss_mlp": 1.00050008, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.7566472549204487, + "language_loss": 0.55167407, + "learning_rate": 7.83571738189001e-07, + "loss": 0.571877, + "num_input_tokens_seen": 257235470, + "step": 11923, + "time_per_iteration": 2.9644393920898438 + }, + { + "auxiliary_loss_clip": 0.01074464, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.03556681, + "balance_loss_mlp": 1.02249002, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.4626887072055013, + "language_loss": 0.77142984, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79252177, + "num_input_tokens_seen": 257255850, + "step": 11924, + "time_per_iteration": 2.575505495071411 + }, + { + "auxiliary_loss_clip": 0.01073734, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.03700185, + "balance_loss_mlp": 1.01868749, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 1.7291360149024588, + "language_loss": 0.68022263, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70125997, + "num_input_tokens_seen": 257275425, + "step": 11925, + "time_per_iteration": 2.5726964473724365 + }, + { + "auxiliary_loss_clip": 0.01074099, + "auxiliary_loss_mlp": 0.01028597, + "balance_loss_clip": 1.03459764, + "balance_loss_mlp": 1.0176121, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 1.9775866566987244, + "language_loss": 0.77234817, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79337513, + "num_input_tokens_seen": 257295740, + "step": 11926, + "time_per_iteration": 2.574920892715454 + }, + { + "auxiliary_loss_clip": 0.01094547, + "auxiliary_loss_mlp": 0.00783269, + "balance_loss_clip": 1.03597498, + "balance_loss_mlp": 1.00650382, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 2.0433559573548212, + "language_loss": 0.7709229, + "learning_rate": 7.823355306660093e-07, + "loss": 0.7897011, + "num_input_tokens_seen": 257315970, + "step": 11927, + "time_per_iteration": 2.5493104457855225 + }, + { + "auxiliary_loss_clip": 0.01095649, + "auxiliary_loss_mlp": 0.01029021, + "balance_loss_clip": 1.03817272, + "balance_loss_mlp": 1.01665914, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.6086582198331, + "language_loss": 0.69142753, + "learning_rate": 7.820265941908642e-07, + "loss": 0.71267426, + "num_input_tokens_seen": 257334230, + "step": 11928, + "time_per_iteration": 2.501865863800049 + }, + { + "auxiliary_loss_clip": 0.01055872, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.0337249, + "balance_loss_mlp": 1.02308655, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 1.7555819902799286, + "language_loss": 0.65029305, + "learning_rate": 7.817177039013931e-07, + "loss": 0.67119873, + "num_input_tokens_seen": 257352145, + "step": 11929, + "time_per_iteration": 2.6298696994781494 + }, + { + "auxiliary_loss_clip": 0.01082374, + "auxiliary_loss_mlp": 0.0102927, + "balance_loss_clip": 1.03439498, + "balance_loss_mlp": 1.01658034, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 3.2089638946706063, + "language_loss": 0.70039117, + "learning_rate": 7.81408859809308e-07, + "loss": 0.72150767, + "num_input_tokens_seen": 257371460, + "step": 11930, + "time_per_iteration": 2.5198023319244385 + }, + { + "auxiliary_loss_clip": 0.01071388, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.03288949, + "balance_loss_mlp": 1.01873779, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 2.7680510460882433, + "language_loss": 0.8053897, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82641077, + "num_input_tokens_seen": 257390800, + "step": 11931, + "time_per_iteration": 3.936819076538086 + }, + { + "auxiliary_loss_clip": 0.01091791, + "auxiliary_loss_mlp": 0.01027576, + "balance_loss_clip": 1.03471351, + "balance_loss_mlp": 1.01675797, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 2.080520246818637, + "language_loss": 0.78385794, + "learning_rate": 7.80791310264143e-07, + "loss": 0.80505162, + "num_input_tokens_seen": 257407495, + "step": 11932, + "time_per_iteration": 2.4642670154571533 + }, + { + "auxiliary_loss_clip": 0.01088343, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.03453326, + "balance_loss_mlp": 1.01635909, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 1.4173414863590335, + "language_loss": 0.74876332, + "learning_rate": 7.804826048344803e-07, + "loss": 0.76992917, + "num_input_tokens_seen": 257429675, + "step": 11933, + "time_per_iteration": 2.535809278488159 + }, + { + "auxiliary_loss_clip": 0.01111608, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.03723192, + "balance_loss_mlp": 1.02234423, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 2.4870275276948006, + "language_loss": 0.68987137, + "learning_rate": 7.801739456490388e-07, + "loss": 0.71135664, + "num_input_tokens_seen": 257442765, + "step": 11934, + "time_per_iteration": 2.4210188388824463 + }, + { + "auxiliary_loss_clip": 0.01096084, + "auxiliary_loss_mlp": 0.0103114, + "balance_loss_clip": 1.0358572, + "balance_loss_mlp": 1.01911783, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 5.497518538818899, + "language_loss": 0.86571723, + "learning_rate": 7.798653327195237e-07, + "loss": 0.88698947, + "num_input_tokens_seen": 257459310, + "step": 11935, + "time_per_iteration": 2.5081276893615723 + }, + { + "auxiliary_loss_clip": 0.01072066, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.03450167, + "balance_loss_mlp": 1.01880431, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 1.3725472031103507, + "language_loss": 0.73713285, + "learning_rate": 7.795567660576388e-07, + "loss": 0.75816816, + "num_input_tokens_seen": 257484750, + "step": 11936, + "time_per_iteration": 4.097309589385986 + }, + { + "auxiliary_loss_clip": 0.01033596, + "auxiliary_loss_mlp": 0.010013, + "balance_loss_clip": 1.00995016, + "balance_loss_mlp": 1.00019729, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7666390825232345, + "language_loss": 0.5592922, + "learning_rate": 7.79248245675082e-07, + "loss": 0.57964122, + "num_input_tokens_seen": 257543110, + "step": 11937, + "time_per_iteration": 4.443563938140869 + }, + { + "auxiliary_loss_clip": 0.01098663, + "auxiliary_loss_mlp": 0.01033259, + "balance_loss_clip": 1.03763294, + "balance_loss_mlp": 1.01959193, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 2.271688120780419, + "language_loss": 0.54557323, + "learning_rate": 7.789397715835542e-07, + "loss": 0.56689239, + "num_input_tokens_seen": 257567410, + "step": 11938, + "time_per_iteration": 2.576124429702759 + }, + { + "auxiliary_loss_clip": 0.01091387, + "auxiliary_loss_mlp": 0.01025214, + "balance_loss_clip": 1.03489923, + "balance_loss_mlp": 1.01377034, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.5892759121295195, + "language_loss": 0.7644465, + "learning_rate": 7.786313437947527e-07, + "loss": 0.78561252, + "num_input_tokens_seen": 257586270, + "step": 11939, + "time_per_iteration": 2.5021820068359375 + }, + { + "auxiliary_loss_clip": 0.01016396, + "auxiliary_loss_mlp": 0.01000559, + "balance_loss_clip": 1.01326609, + "balance_loss_mlp": 0.99925405, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.7549039688059097, + "language_loss": 0.61367393, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63384348, + "num_input_tokens_seen": 257647415, + "step": 11940, + "time_per_iteration": 3.084197759628296 + }, + { + "auxiliary_loss_clip": 0.01067936, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.03505886, + "balance_loss_mlp": 1.01734543, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 1.5919596117727126, + "language_loss": 0.58824259, + "learning_rate": 7.780146271721097e-07, + "loss": 0.60921013, + "num_input_tokens_seen": 257669795, + "step": 11941, + "time_per_iteration": 2.6211977005004883 + }, + { + "auxiliary_loss_clip": 0.01083134, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.0357852, + "balance_loss_mlp": 1.0166049, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 2.0447078731059753, + "language_loss": 0.7928949, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81401014, + "num_input_tokens_seen": 257687415, + "step": 11942, + "time_per_iteration": 4.0934977531433105 + }, + { + "auxiliary_loss_clip": 0.01095392, + "auxiliary_loss_mlp": 0.01039686, + "balance_loss_clip": 1.03605509, + "balance_loss_mlp": 1.02699018, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 1.8302745906112567, + "language_loss": 0.66460538, + "learning_rate": 7.773980959006968e-07, + "loss": 0.68595618, + "num_input_tokens_seen": 257706215, + "step": 11943, + "time_per_iteration": 2.4753940105438232 + }, + { + "auxiliary_loss_clip": 0.01103664, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.03550446, + "balance_loss_mlp": 1.01656985, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1.7739858808267672, + "language_loss": 0.7895906, + "learning_rate": 7.770898998009254e-07, + "loss": 0.8109138, + "num_input_tokens_seen": 257724740, + "step": 11944, + "time_per_iteration": 2.4623310565948486 + }, + { + "auxiliary_loss_clip": 0.01089342, + "auxiliary_loss_mlp": 0.00785549, + "balance_loss_clip": 1.0379163, + "balance_loss_mlp": 1.01027071, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 2.036852574355583, + "language_loss": 0.63525385, + "learning_rate": 7.767817500740277e-07, + "loss": 0.65400279, + "num_input_tokens_seen": 257742060, + "step": 11945, + "time_per_iteration": 2.5050928592681885 + }, + { + "auxiliary_loss_clip": 0.01023159, + "auxiliary_loss_mlp": 0.01001693, + "balance_loss_clip": 1.01315904, + "balance_loss_mlp": 1.00036383, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.698944392168418, + "language_loss": 0.51087195, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53112048, + "num_input_tokens_seen": 257802250, + "step": 11946, + "time_per_iteration": 3.0180633068084717 + }, + { + "auxiliary_loss_clip": 0.01075111, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.03638554, + "balance_loss_mlp": 1.02191496, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 1.6275578141030953, + "language_loss": 0.7479611, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76907182, + "num_input_tokens_seen": 257821155, + "step": 11947, + "time_per_iteration": 2.546682596206665 + }, + { + "auxiliary_loss_clip": 0.01060375, + "auxiliary_loss_mlp": 0.00783189, + "balance_loss_clip": 1.03242838, + "balance_loss_mlp": 1.00876665, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.4652449489507813, + "language_loss": 0.72349209, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74192774, + "num_input_tokens_seen": 257839905, + "step": 11948, + "time_per_iteration": 2.5727756023406982 + }, + { + "auxiliary_loss_clip": 0.01087136, + "auxiliary_loss_mlp": 0.01042832, + "balance_loss_clip": 1.0364722, + "balance_loss_mlp": 1.02890825, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 1.5105876666756677, + "language_loss": 0.71491194, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73621166, + "num_input_tokens_seen": 257860055, + "step": 11949, + "time_per_iteration": 2.529378652572632 + }, + { + "auxiliary_loss_clip": 0.01104122, + "auxiliary_loss_mlp": 0.00783372, + "balance_loss_clip": 1.03601885, + "balance_loss_mlp": 1.00932324, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 2.0280337450901578, + "language_loss": 0.76404715, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78292221, + "num_input_tokens_seen": 257879315, + "step": 11950, + "time_per_iteration": 2.514943838119507 + }, + { + "auxiliary_loss_clip": 0.01109008, + "auxiliary_loss_mlp": 0.01031684, + "balance_loss_clip": 1.03807354, + "balance_loss_mlp": 1.0187434, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 4.127421701277714, + "language_loss": 0.67468417, + "learning_rate": 7.749338261972282e-07, + "loss": 0.69609106, + "num_input_tokens_seen": 257896570, + "step": 11951, + "time_per_iteration": 2.436288595199585 + }, + { + "auxiliary_loss_clip": 0.01088086, + "auxiliary_loss_mlp": 0.01037747, + "balance_loss_clip": 1.0376842, + "balance_loss_mlp": 1.02410376, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.7098013833672452, + "language_loss": 0.78060764, + "learning_rate": 7.746260014075286e-07, + "loss": 0.80186599, + "num_input_tokens_seen": 257916855, + "step": 11952, + "time_per_iteration": 2.561812162399292 + }, + { + "auxiliary_loss_clip": 0.01101387, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.03797626, + "balance_loss_mlp": 1.01994944, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 1.9238958633197314, + "language_loss": 0.74790251, + "learning_rate": 7.743182230841352e-07, + "loss": 0.76924354, + "num_input_tokens_seen": 257937140, + "step": 11953, + "time_per_iteration": 2.5227158069610596 + }, + { + "auxiliary_loss_clip": 0.0109612, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.03425157, + "balance_loss_mlp": 1.0156548, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 1.7677469913819763, + "language_loss": 0.72929847, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75054216, + "num_input_tokens_seen": 257956785, + "step": 11954, + "time_per_iteration": 2.504838705062866 + }, + { + "auxiliary_loss_clip": 0.01087634, + "auxiliary_loss_mlp": 0.0103513, + "balance_loss_clip": 1.03904963, + "balance_loss_mlp": 1.02281022, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.744630523984479, + "language_loss": 0.74574912, + "learning_rate": 7.737028058829425e-07, + "loss": 0.76697671, + "num_input_tokens_seen": 257975455, + "step": 11955, + "time_per_iteration": 2.5056939125061035 + }, + { + "auxiliary_loss_clip": 0.01073534, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.03618169, + "balance_loss_mlp": 1.01850998, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.5533836704141994, + "language_loss": 0.73488235, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75592697, + "num_input_tokens_seen": 257996850, + "step": 11956, + "time_per_iteration": 2.6486711502075195 + }, + { + "auxiliary_loss_clip": 0.01023921, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.03080606, + "balance_loss_mlp": 1.02239561, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 1.6516988276465094, + "language_loss": 0.70909607, + "learning_rate": 7.730875746869987e-07, + "loss": 0.7297045, + "num_input_tokens_seen": 258016145, + "step": 11957, + "time_per_iteration": 2.692265510559082 + }, + { + "auxiliary_loss_clip": 0.01058149, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.03375614, + "balance_loss_mlp": 1.02246952, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 1.8997440984303826, + "language_loss": 0.7322818, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75322258, + "num_input_tokens_seen": 258035420, + "step": 11958, + "time_per_iteration": 2.6634421348571777 + }, + { + "auxiliary_loss_clip": 0.01092577, + "auxiliary_loss_mlp": 0.01040512, + "balance_loss_clip": 1.03554463, + "balance_loss_mlp": 1.02738082, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 1.5645846052535453, + "language_loss": 0.83934444, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86067533, + "num_input_tokens_seen": 258053520, + "step": 11959, + "time_per_iteration": 2.5061912536621094 + }, + { + "auxiliary_loss_clip": 0.01112787, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.03966784, + "balance_loss_mlp": 1.01658893, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 1.5769344893515211, + "language_loss": 0.81988418, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84131163, + "num_input_tokens_seen": 258073020, + "step": 11960, + "time_per_iteration": 2.502078056335449 + }, + { + "auxiliary_loss_clip": 0.01080888, + "auxiliary_loss_mlp": 0.01035717, + "balance_loss_clip": 1.03750098, + "balance_loss_mlp": 1.02279496, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 1.6282973881142366, + "language_loss": 0.78032422, + "learning_rate": 7.718576706841013e-07, + "loss": 0.80149019, + "num_input_tokens_seen": 258093155, + "step": 11961, + "time_per_iteration": 2.5713517665863037 + }, + { + "auxiliary_loss_clip": 0.01089553, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.03739953, + "balance_loss_mlp": 1.01836967, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 1.344376399097129, + "language_loss": 0.75017726, + "learning_rate": 7.715503110824326e-07, + "loss": 0.77137107, + "num_input_tokens_seen": 258113905, + "step": 11962, + "time_per_iteration": 2.5273945331573486 + }, + { + "auxiliary_loss_clip": 0.01096078, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.03594208, + "balance_loss_mlp": 1.01756918, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 2.100076080534987, + "language_loss": 0.75496048, + "learning_rate": 7.712429980637001e-07, + "loss": 0.77623308, + "num_input_tokens_seen": 258132820, + "step": 11963, + "time_per_iteration": 2.5283877849578857 + }, + { + "auxiliary_loss_clip": 0.01078361, + "auxiliary_loss_mlp": 0.01037671, + "balance_loss_clip": 1.03812051, + "balance_loss_mlp": 1.02446294, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 2.2933348881577182, + "language_loss": 0.80852747, + "learning_rate": 7.709357316395564e-07, + "loss": 0.82968783, + "num_input_tokens_seen": 258148055, + "step": 11964, + "time_per_iteration": 2.556147336959839 + }, + { + "auxiliary_loss_clip": 0.01095073, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.03599191, + "balance_loss_mlp": 1.01940441, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 1.8639765448097971, + "language_loss": 0.74940085, + "learning_rate": 7.70628511821652e-07, + "loss": 0.77067143, + "num_input_tokens_seen": 258165995, + "step": 11965, + "time_per_iteration": 2.476600170135498 + }, + { + "auxiliary_loss_clip": 0.01081713, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.037673, + "balance_loss_mlp": 1.01983798, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.925133362027158, + "language_loss": 0.77662426, + "learning_rate": 7.703213386216377e-07, + "loss": 0.79776943, + "num_input_tokens_seen": 258186165, + "step": 11966, + "time_per_iteration": 2.5590975284576416 + }, + { + "auxiliary_loss_clip": 0.01082997, + "auxiliary_loss_mlp": 0.0103501, + "balance_loss_clip": 1.03593802, + "balance_loss_mlp": 1.02129447, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 1.8400521531104652, + "language_loss": 0.72888118, + "learning_rate": 7.700142120511619e-07, + "loss": 0.75006121, + "num_input_tokens_seen": 258204595, + "step": 11967, + "time_per_iteration": 2.5277252197265625 + }, + { + "auxiliary_loss_clip": 0.01080502, + "auxiliary_loss_mlp": 0.01028038, + "balance_loss_clip": 1.03939724, + "balance_loss_mlp": 1.01711869, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 1.9907374907859772, + "language_loss": 0.8167311, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83781654, + "num_input_tokens_seen": 258223110, + "step": 11968, + "time_per_iteration": 2.5159270763397217 + }, + { + "auxiliary_loss_clip": 0.01083858, + "auxiliary_loss_mlp": 0.01028345, + "balance_loss_clip": 1.03673291, + "balance_loss_mlp": 1.01616788, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 2.3722066231361922, + "language_loss": 0.76219612, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78331816, + "num_input_tokens_seen": 258242660, + "step": 11969, + "time_per_iteration": 3.935575485229492 + }, + { + "auxiliary_loss_clip": 0.01066707, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.03221524, + "balance_loss_mlp": 1.01812685, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 2.8591538340745632, + "language_loss": 0.70530272, + "learning_rate": 7.69093112233417e-07, + "loss": 0.72628617, + "num_input_tokens_seen": 258261850, + "step": 11970, + "time_per_iteration": 2.5201539993286133 + }, + { + "auxiliary_loss_clip": 0.01017196, + "auxiliary_loss_mlp": 0.01002668, + "balance_loss_clip": 1.01321507, + "balance_loss_mlp": 1.00159538, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9178475229041617, + "language_loss": 0.60893989, + "learning_rate": 7.68786172297538e-07, + "loss": 0.62913859, + "num_input_tokens_seen": 258312570, + "step": 11971, + "time_per_iteration": 3.0158333778381348 + }, + { + "auxiliary_loss_clip": 0.01112753, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.0379045, + "balance_loss_mlp": 1.02103555, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 1.9598894525282011, + "language_loss": 0.80275059, + "learning_rate": 7.684792790494105e-07, + "loss": 0.8242228, + "num_input_tokens_seen": 258331600, + "step": 11972, + "time_per_iteration": 2.477065324783325 + }, + { + "auxiliary_loss_clip": 0.01088517, + "auxiliary_loss_mlp": 0.01036442, + "balance_loss_clip": 1.03619552, + "balance_loss_mlp": 1.0236032, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 1.5136043354465274, + "language_loss": 0.75390255, + "learning_rate": 7.681724325006733e-07, + "loss": 0.77515221, + "num_input_tokens_seen": 258351785, + "step": 11973, + "time_per_iteration": 2.5443708896636963 + }, + { + "auxiliary_loss_clip": 0.01000675, + "auxiliary_loss_mlp": 0.01001318, + "balance_loss_clip": 1.01600599, + "balance_loss_mlp": 1.00006068, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8523903830877986, + "language_loss": 0.57264346, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59266341, + "num_input_tokens_seen": 258404035, + "step": 11974, + "time_per_iteration": 4.425535440444946 + }, + { + "auxiliary_loss_clip": 0.01080529, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.03489065, + "balance_loss_mlp": 1.01887488, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 2.108964309678405, + "language_loss": 0.62172365, + "learning_rate": 7.675588795479062e-07, + "loss": 0.64284551, + "num_input_tokens_seen": 258424850, + "step": 11975, + "time_per_iteration": 2.5879931449890137 + }, + { + "auxiliary_loss_clip": 0.01093522, + "auxiliary_loss_mlp": 0.01028939, + "balance_loss_clip": 1.03317308, + "balance_loss_mlp": 1.01698792, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 1.819074630101109, + "language_loss": 0.67485356, + "learning_rate": 7.672521731671425e-07, + "loss": 0.69607818, + "num_input_tokens_seen": 258445485, + "step": 11976, + "time_per_iteration": 3.925523281097412 + }, + { + "auxiliary_loss_clip": 0.01076926, + "auxiliary_loss_mlp": 0.01029167, + "balance_loss_clip": 1.03639579, + "balance_loss_mlp": 1.01729941, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 1.744707228433334, + "language_loss": 0.66999346, + "learning_rate": 7.669455135323004e-07, + "loss": 0.69105434, + "num_input_tokens_seen": 258464505, + "step": 11977, + "time_per_iteration": 2.5600762367248535 + }, + { + "auxiliary_loss_clip": 0.01085726, + "auxiliary_loss_mlp": 0.01033241, + "balance_loss_clip": 1.035079, + "balance_loss_mlp": 1.02092052, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.7173216383780043, + "language_loss": 0.75373375, + "learning_rate": 7.666389006550074e-07, + "loss": 0.77492344, + "num_input_tokens_seen": 258487190, + "step": 11978, + "time_per_iteration": 2.5942816734313965 + }, + { + "auxiliary_loss_clip": 0.01104049, + "auxiliary_loss_mlp": 0.0102955, + "balance_loss_clip": 1.03595078, + "balance_loss_mlp": 1.01661026, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 2.235893175719401, + "language_loss": 0.78588408, + "learning_rate": 7.663323345468908e-07, + "loss": 0.8072201, + "num_input_tokens_seen": 258503790, + "step": 11979, + "time_per_iteration": 2.518784761428833 + }, + { + "auxiliary_loss_clip": 0.01094984, + "auxiliary_loss_mlp": 0.01027738, + "balance_loss_clip": 1.0360558, + "balance_loss_mlp": 1.01507258, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 1.8854244847471613, + "language_loss": 0.64800066, + "learning_rate": 7.660258152195767e-07, + "loss": 0.6692279, + "num_input_tokens_seen": 258527335, + "step": 11980, + "time_per_iteration": 4.045369386672974 + }, + { + "auxiliary_loss_clip": 0.01099664, + "auxiliary_loss_mlp": 0.01036817, + "balance_loss_clip": 1.03794646, + "balance_loss_mlp": 1.02297068, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 1.9268248034762696, + "language_loss": 0.66886318, + "learning_rate": 7.657193426846871e-07, + "loss": 0.69022793, + "num_input_tokens_seen": 258546690, + "step": 11981, + "time_per_iteration": 2.5467936992645264 + }, + { + "auxiliary_loss_clip": 0.01083847, + "auxiliary_loss_mlp": 0.01033136, + "balance_loss_clip": 1.03635216, + "balance_loss_mlp": 1.02019036, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.7268608056602643, + "language_loss": 0.74096441, + "learning_rate": 7.65412916953843e-07, + "loss": 0.76213431, + "num_input_tokens_seen": 258566340, + "step": 11982, + "time_per_iteration": 2.5383734703063965 + }, + { + "auxiliary_loss_clip": 0.01078343, + "auxiliary_loss_mlp": 0.00784917, + "balance_loss_clip": 1.03620768, + "balance_loss_mlp": 1.00889111, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 1.7903193345995545, + "language_loss": 0.6587494, + "learning_rate": 7.65106538038665e-07, + "loss": 0.67738199, + "num_input_tokens_seen": 258584455, + "step": 11983, + "time_per_iteration": 2.521113157272339 + }, + { + "auxiliary_loss_clip": 0.01077321, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.03817928, + "balance_loss_mlp": 1.01897907, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.6679414728822142, + "language_loss": 0.66225243, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68333983, + "num_input_tokens_seen": 258604725, + "step": 11984, + "time_per_iteration": 2.5684456825256348 + }, + { + "auxiliary_loss_clip": 0.01098886, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.03855991, + "balance_loss_mlp": 1.02065301, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 1.5776603370450002, + "language_loss": 0.7374866, + "learning_rate": 7.644939207017771e-07, + "loss": 0.75881755, + "num_input_tokens_seen": 258622885, + "step": 11985, + "time_per_iteration": 2.4892077445983887 + }, + { + "auxiliary_loss_clip": 0.01095526, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.03703034, + "balance_loss_mlp": 1.02022505, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 1.7072336282473266, + "language_loss": 0.62416381, + "learning_rate": 7.641876823032977e-07, + "loss": 0.64543957, + "num_input_tokens_seen": 258644305, + "step": 11986, + "time_per_iteration": 2.554241895675659 + }, + { + "auxiliary_loss_clip": 0.01089283, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.03890109, + "balance_loss_mlp": 1.02101159, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 1.7891264296187577, + "language_loss": 0.72496486, + "learning_rate": 7.638814907669455e-07, + "loss": 0.74620312, + "num_input_tokens_seen": 258661775, + "step": 11987, + "time_per_iteration": 2.5046520233154297 + }, + { + "auxiliary_loss_clip": 0.01086952, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.03658283, + "balance_loss_mlp": 1.02068269, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 1.7345410516408515, + "language_loss": 0.78755867, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80876255, + "num_input_tokens_seen": 258679830, + "step": 11988, + "time_per_iteration": 2.5454559326171875 + }, + { + "auxiliary_loss_clip": 0.0110594, + "auxiliary_loss_mlp": 0.01028152, + "balance_loss_clip": 1.03609252, + "balance_loss_mlp": 1.01613617, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 1.7864472258961135, + "language_loss": 0.78671622, + "learning_rate": 7.632692483270618e-07, + "loss": 0.80805719, + "num_input_tokens_seen": 258697415, + "step": 11989, + "time_per_iteration": 2.4553842544555664 + }, + { + "auxiliary_loss_clip": 0.01103535, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.03552556, + "balance_loss_mlp": 1.02014863, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 1.827959631832912, + "language_loss": 0.82320738, + "learning_rate": 7.629631974467481e-07, + "loss": 0.8445642, + "num_input_tokens_seen": 258716755, + "step": 11990, + "time_per_iteration": 2.4371187686920166 + }, + { + "auxiliary_loss_clip": 0.01083967, + "auxiliary_loss_mlp": 0.01036502, + "balance_loss_clip": 1.03551769, + "balance_loss_mlp": 1.02472973, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 2.052091838304321, + "language_loss": 0.76172066, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78292537, + "num_input_tokens_seen": 258733270, + "step": 11991, + "time_per_iteration": 2.4821038246154785 + }, + { + "auxiliary_loss_clip": 0.01063648, + "auxiliary_loss_mlp": 0.0102788, + "balance_loss_clip": 1.03486955, + "balance_loss_mlp": 1.01530933, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 1.542962209759498, + "language_loss": 0.7266674, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74758267, + "num_input_tokens_seen": 258755270, + "step": 11992, + "time_per_iteration": 2.6267826557159424 + }, + { + "auxiliary_loss_clip": 0.01097742, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.0365088, + "balance_loss_mlp": 1.01822579, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 1.459461514357293, + "language_loss": 0.66498494, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68626499, + "num_input_tokens_seen": 258775340, + "step": 11993, + "time_per_iteration": 2.513005256652832 + }, + { + "auxiliary_loss_clip": 0.01096224, + "auxiliary_loss_mlp": 0.01029127, + "balance_loss_clip": 1.03503299, + "balance_loss_mlp": 1.01721835, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 1.8025434650429608, + "language_loss": 0.6585182, + "learning_rate": 7.61739463127115e-07, + "loss": 0.67977166, + "num_input_tokens_seen": 258794580, + "step": 11994, + "time_per_iteration": 2.524806261062622 + }, + { + "auxiliary_loss_clip": 0.01097571, + "auxiliary_loss_mlp": 0.01035138, + "balance_loss_clip": 1.03643489, + "balance_loss_mlp": 1.02159548, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 2.0592358953569536, + "language_loss": 0.67078084, + "learning_rate": 7.614336469056172e-07, + "loss": 0.69210792, + "num_input_tokens_seen": 258812330, + "step": 11995, + "time_per_iteration": 2.4566006660461426 + }, + { + "auxiliary_loss_clip": 0.01080525, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.03520775, + "balance_loss_mlp": 1.01771784, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 1.7696630963596875, + "language_loss": 0.79476064, + "learning_rate": 7.6112787765068e-07, + "loss": 0.81587464, + "num_input_tokens_seen": 258831770, + "step": 11996, + "time_per_iteration": 2.566114664077759 + }, + { + "auxiliary_loss_clip": 0.01107747, + "auxiliary_loss_mlp": 0.01032484, + "balance_loss_clip": 1.03763366, + "balance_loss_mlp": 1.02042043, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 3.147617406938141, + "language_loss": 0.81704319, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83844554, + "num_input_tokens_seen": 258849090, + "step": 11997, + "time_per_iteration": 2.534407615661621 + }, + { + "auxiliary_loss_clip": 0.01108767, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.0365963, + "balance_loss_mlp": 1.0195415, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 2.310638742655128, + "language_loss": 0.6672585, + "learning_rate": 7.605164800868646e-07, + "loss": 0.68866962, + "num_input_tokens_seen": 258868230, + "step": 11998, + "time_per_iteration": 2.464881420135498 + }, + { + "auxiliary_loss_clip": 0.01107303, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.03766561, + "balance_loss_mlp": 1.02005041, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 2.011737402008611, + "language_loss": 0.72461486, + "learning_rate": 7.602108518011696e-07, + "loss": 0.74600148, + "num_input_tokens_seen": 258885525, + "step": 11999, + "time_per_iteration": 2.4400832653045654 + }, + { + "auxiliary_loss_clip": 0.01087117, + "auxiliary_loss_mlp": 0.01025985, + "balance_loss_clip": 1.03618598, + "balance_loss_mlp": 1.01310432, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.4655986123336113, + "language_loss": 0.83106911, + "learning_rate": 7.599052705284039e-07, + "loss": 0.85220015, + "num_input_tokens_seen": 258903245, + "step": 12000, + "time_per_iteration": 2.5015060901641846 + }, + { + "auxiliary_loss_clip": 0.01097878, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.0381391, + "balance_loss_mlp": 1.02260637, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 1.8239456493197628, + "language_loss": 0.77221334, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79354429, + "num_input_tokens_seen": 258921245, + "step": 12001, + "time_per_iteration": 2.4782397747039795 + }, + { + "auxiliary_loss_clip": 0.01091122, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.03736472, + "balance_loss_mlp": 1.025581, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.8001923668220672, + "language_loss": 0.81426156, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83555293, + "num_input_tokens_seen": 258939425, + "step": 12002, + "time_per_iteration": 2.511728286743164 + }, + { + "auxiliary_loss_clip": 0.01099182, + "auxiliary_loss_mlp": 0.01029968, + "balance_loss_clip": 1.03849268, + "balance_loss_mlp": 1.01708198, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 1.980066303326202, + "language_loss": 0.62417209, + "learning_rate": 7.589888089035462e-07, + "loss": 0.64546359, + "num_input_tokens_seen": 258960710, + "step": 12003, + "time_per_iteration": 2.6388163566589355 + }, + { + "auxiliary_loss_clip": 0.01107954, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.03727698, + "balance_loss_mlp": 1.01825905, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 2.239024754848087, + "language_loss": 0.68578589, + "learning_rate": 7.586834157983544e-07, + "loss": 0.70717853, + "num_input_tokens_seen": 258978475, + "step": 12004, + "time_per_iteration": 2.4650685787200928 + }, + { + "auxiliary_loss_clip": 0.01019369, + "auxiliary_loss_mlp": 0.01004285, + "balance_loss_clip": 1.02252102, + "balance_loss_mlp": 1.00286663, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.8582544965383146, + "language_loss": 0.54174662, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56198323, + "num_input_tokens_seen": 259037520, + "step": 12005, + "time_per_iteration": 3.0646817684173584 + }, + { + "auxiliary_loss_clip": 0.01075798, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.0373733, + "balance_loss_mlp": 1.02150953, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.4733300486680136, + "language_loss": 0.63298815, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65409112, + "num_input_tokens_seen": 259061325, + "step": 12006, + "time_per_iteration": 2.6908769607543945 + }, + { + "auxiliary_loss_clip": 0.01080681, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.03509653, + "balance_loss_mlp": 1.02143431, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 1.6369783112978062, + "language_loss": 0.919523, + "learning_rate": 7.577675189541865e-07, + "loss": 0.94066441, + "num_input_tokens_seen": 259078135, + "step": 12007, + "time_per_iteration": 2.5146210193634033 + }, + { + "auxiliary_loss_clip": 0.0107023, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.03320217, + "balance_loss_mlp": 1.01913309, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 1.701079361968365, + "language_loss": 0.6390093, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66004807, + "num_input_tokens_seen": 259095910, + "step": 12008, + "time_per_iteration": 3.9234368801116943 + }, + { + "auxiliary_loss_clip": 0.01098823, + "auxiliary_loss_mlp": 0.01036323, + "balance_loss_clip": 1.03745031, + "balance_loss_mlp": 1.02314425, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 2.2332447735548877, + "language_loss": 0.78396165, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80531311, + "num_input_tokens_seen": 259114225, + "step": 12009, + "time_per_iteration": 2.511363983154297 + }, + { + "auxiliary_loss_clip": 0.01097408, + "auxiliary_loss_mlp": 0.01037882, + "balance_loss_clip": 1.03786349, + "balance_loss_mlp": 1.02431035, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 1.700950145887698, + "language_loss": 0.63799286, + "learning_rate": 7.568520460602297e-07, + "loss": 0.65934575, + "num_input_tokens_seen": 259134660, + "step": 12010, + "time_per_iteration": 2.5277822017669678 + }, + { + "auxiliary_loss_clip": 0.01106897, + "auxiliary_loss_mlp": 0.0102896, + "balance_loss_clip": 1.03661275, + "balance_loss_mlp": 1.01633024, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 1.8626682606796008, + "language_loss": 0.77600878, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79736733, + "num_input_tokens_seen": 259153300, + "step": 12011, + "time_per_iteration": 2.4944875240325928 + }, + { + "auxiliary_loss_clip": 0.01090813, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.03575528, + "balance_loss_mlp": 1.02413368, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 1.622613441040394, + "language_loss": 0.78984195, + "learning_rate": 7.56241966479781e-07, + "loss": 0.8111093, + "num_input_tokens_seen": 259172115, + "step": 12012, + "time_per_iteration": 2.496013879776001 + }, + { + "auxiliary_loss_clip": 0.01085982, + "auxiliary_loss_mlp": 0.01027085, + "balance_loss_clip": 1.0372982, + "balance_loss_mlp": 1.01553392, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 1.753571793027519, + "language_loss": 0.75383413, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77496481, + "num_input_tokens_seen": 259191345, + "step": 12013, + "time_per_iteration": 3.9288089275360107 + }, + { + "auxiliary_loss_clip": 0.01105736, + "auxiliary_loss_mlp": 0.01026423, + "balance_loss_clip": 1.03695226, + "balance_loss_mlp": 1.01448441, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 1.5153039809778643, + "language_loss": 0.75723654, + "learning_rate": 7.556320755530484e-07, + "loss": 0.77855814, + "num_input_tokens_seen": 259211700, + "step": 12014, + "time_per_iteration": 3.8815391063690186 + }, + { + "auxiliary_loss_clip": 0.01096531, + "auxiliary_loss_mlp": 0.01032295, + "balance_loss_clip": 1.03440797, + "balance_loss_mlp": 1.01985586, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.5699895809204507, + "language_loss": 0.86429709, + "learning_rate": 7.553272008637346e-07, + "loss": 0.88558531, + "num_input_tokens_seen": 259233825, + "step": 12015, + "time_per_iteration": 2.5353620052337646 + }, + { + "auxiliary_loss_clip": 0.0109457, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.03699851, + "balance_loss_mlp": 1.02272129, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.884004524586934, + "language_loss": 0.78138977, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80268139, + "num_input_tokens_seen": 259253055, + "step": 12016, + "time_per_iteration": 2.4846980571746826 + }, + { + "auxiliary_loss_clip": 0.01063125, + "auxiliary_loss_mlp": 0.01043581, + "balance_loss_clip": 1.03347993, + "balance_loss_mlp": 1.0292697, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.5678902462553646, + "language_loss": 0.77870649, + "learning_rate": 7.547175930910186e-07, + "loss": 0.79977351, + "num_input_tokens_seen": 259273420, + "step": 12017, + "time_per_iteration": 2.5885603427886963 + }, + { + "auxiliary_loss_clip": 0.01103695, + "auxiliary_loss_mlp": 0.01027548, + "balance_loss_clip": 1.03679359, + "balance_loss_mlp": 1.0162586, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 2.333669782076671, + "language_loss": 0.7376796, + "learning_rate": 7.54412860030732e-07, + "loss": 0.75899208, + "num_input_tokens_seen": 259291000, + "step": 12018, + "time_per_iteration": 2.4856722354888916 + }, + { + "auxiliary_loss_clip": 0.01074458, + "auxiliary_loss_mlp": 0.01027646, + "balance_loss_clip": 1.03902197, + "balance_loss_mlp": 1.01694131, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 1.7235592131281006, + "language_loss": 0.77772021, + "learning_rate": 7.541081742032347e-07, + "loss": 0.79874122, + "num_input_tokens_seen": 259312390, + "step": 12019, + "time_per_iteration": 4.036248445510864 + }, + { + "auxiliary_loss_clip": 0.01079685, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.03650761, + "balance_loss_mlp": 1.01461768, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 1.6867685183578407, + "language_loss": 0.74288005, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76394463, + "num_input_tokens_seen": 259332645, + "step": 12020, + "time_per_iteration": 2.6293344497680664 + }, + { + "auxiliary_loss_clip": 0.01098923, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.0364778, + "balance_loss_mlp": 1.0194875, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 1.937676675698137, + "language_loss": 0.77237892, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79367614, + "num_input_tokens_seen": 259353810, + "step": 12021, + "time_per_iteration": 2.4995036125183105 + }, + { + "auxiliary_loss_clip": 0.01070137, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.03585839, + "balance_loss_mlp": 1.02160704, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 2.5279874129766986, + "language_loss": 0.68424916, + "learning_rate": 7.531944002330073e-07, + "loss": 0.7052871, + "num_input_tokens_seen": 259372460, + "step": 12022, + "time_per_iteration": 2.608020544052124 + }, + { + "auxiliary_loss_clip": 0.01096684, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.03570461, + "balance_loss_mlp": 1.01686454, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.8817701882617144, + "language_loss": 0.6948241, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71608603, + "num_input_tokens_seen": 259393275, + "step": 12023, + "time_per_iteration": 2.5548338890075684 + }, + { + "auxiliary_loss_clip": 0.01077351, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.03201461, + "balance_loss_mlp": 1.01730752, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 1.6617959609199873, + "language_loss": 0.71049076, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73156488, + "num_input_tokens_seen": 259416205, + "step": 12024, + "time_per_iteration": 2.5706751346588135 + }, + { + "auxiliary_loss_clip": 0.0107279, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.0356729, + "balance_loss_mlp": 1.01975107, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 1.761326845041343, + "language_loss": 0.75981498, + "learning_rate": 7.522810517737089e-07, + "loss": 0.78085673, + "num_input_tokens_seen": 259433115, + "step": 12025, + "time_per_iteration": 2.525991439819336 + }, + { + "auxiliary_loss_clip": 0.01093724, + "auxiliary_loss_mlp": 0.01028903, + "balance_loss_clip": 1.03583622, + "balance_loss_mlp": 1.01711369, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 1.9006716741424727, + "language_loss": 0.76709855, + "learning_rate": 7.519766968991395e-07, + "loss": 0.78832483, + "num_input_tokens_seen": 259450475, + "step": 12026, + "time_per_iteration": 2.5133049488067627 + }, + { + "auxiliary_loss_clip": 0.01095282, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.03621578, + "balance_loss_mlp": 1.02425265, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 2.0467044266848164, + "language_loss": 0.67610765, + "learning_rate": 7.516723893497388e-07, + "loss": 0.69742262, + "num_input_tokens_seen": 259469355, + "step": 12027, + "time_per_iteration": 2.5275237560272217 + }, + { + "auxiliary_loss_clip": 0.01056664, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.03832984, + "balance_loss_mlp": 1.01818049, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 1.8912338525285748, + "language_loss": 0.78774494, + "learning_rate": 7.513681291370469e-07, + "loss": 0.80862057, + "num_input_tokens_seen": 259486565, + "step": 12028, + "time_per_iteration": 2.6056907176971436 + }, + { + "auxiliary_loss_clip": 0.01072549, + "auxiliary_loss_mlp": 0.01026304, + "balance_loss_clip": 1.0337292, + "balance_loss_mlp": 1.013394, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 1.7703722098966417, + "language_loss": 0.82011998, + "learning_rate": 7.510639162726e-07, + "loss": 0.8411085, + "num_input_tokens_seen": 259505070, + "step": 12029, + "time_per_iteration": 2.564087152481079 + }, + { + "auxiliary_loss_clip": 0.0101507, + "auxiliary_loss_mlp": 0.01003054, + "balance_loss_clip": 1.01466393, + "balance_loss_mlp": 1.00179601, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8130069444961493, + "language_loss": 0.61730456, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63748574, + "num_input_tokens_seen": 259569135, + "step": 12030, + "time_per_iteration": 3.185398578643799 + }, + { + "auxiliary_loss_clip": 0.01087985, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.03377557, + "balance_loss_mlp": 1.01743603, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.6767182833603806, + "language_loss": 0.77793306, + "learning_rate": 7.504556326345859e-07, + "loss": 0.79912305, + "num_input_tokens_seen": 259587035, + "step": 12031, + "time_per_iteration": 2.4862210750579834 + }, + { + "auxiliary_loss_clip": 0.01096666, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.03638232, + "balance_loss_mlp": 1.01638007, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 2.015650162426516, + "language_loss": 0.81679213, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83804286, + "num_input_tokens_seen": 259606140, + "step": 12032, + "time_per_iteration": 2.5163557529449463 + }, + { + "auxiliary_loss_clip": 0.01069283, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.03423166, + "balance_loss_mlp": 1.02126265, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 1.7396139573261402, + "language_loss": 0.75269216, + "learning_rate": 7.498475385279592e-07, + "loss": 0.77372944, + "num_input_tokens_seen": 259624275, + "step": 12033, + "time_per_iteration": 2.559782028198242 + }, + { + "auxiliary_loss_clip": 0.01068344, + "auxiliary_loss_mlp": 0.01026201, + "balance_loss_clip": 1.03266847, + "balance_loss_mlp": 1.01488256, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 1.563194924121279, + "language_loss": 0.74913895, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77008438, + "num_input_tokens_seen": 259643465, + "step": 12034, + "time_per_iteration": 2.53656268119812 + }, + { + "auxiliary_loss_clip": 0.01080924, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.03433323, + "balance_loss_mlp": 1.01664472, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 1.7359936945536365, + "language_loss": 0.8062402, + "learning_rate": 7.492396340449578e-07, + "loss": 0.82732391, + "num_input_tokens_seen": 259662500, + "step": 12035, + "time_per_iteration": 2.591151714324951 + }, + { + "auxiliary_loss_clip": 0.01055741, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.03806329, + "balance_loss_mlp": 1.01862192, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 1.8392335778011457, + "language_loss": 0.61007047, + "learning_rate": 7.489357529411326e-07, + "loss": 0.63093376, + "num_input_tokens_seen": 259680140, + "step": 12036, + "time_per_iteration": 2.585636615753174 + }, + { + "auxiliary_loss_clip": 0.01091633, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.03542757, + "balance_loss_mlp": 1.01883769, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 5.773644310912796, + "language_loss": 0.67612338, + "learning_rate": 7.486319192777883e-07, + "loss": 0.69733536, + "num_input_tokens_seen": 259700160, + "step": 12037, + "time_per_iteration": 2.513305187225342 + }, + { + "auxiliary_loss_clip": 0.01105141, + "auxiliary_loss_mlp": 0.01033339, + "balance_loss_clip": 1.0364213, + "balance_loss_mlp": 1.02138281, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 1.7881570557848763, + "language_loss": 0.72066736, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74205214, + "num_input_tokens_seen": 259720525, + "step": 12038, + "time_per_iteration": 2.48521089553833 + }, + { + "auxiliary_loss_clip": 0.01106585, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.03717208, + "balance_loss_mlp": 1.01985562, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 1.8347899964493977, + "language_loss": 0.72325647, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74465334, + "num_input_tokens_seen": 259738680, + "step": 12039, + "time_per_iteration": 2.4861931800842285 + }, + { + "auxiliary_loss_clip": 0.01107249, + "auxiliary_loss_mlp": 0.01031444, + "balance_loss_clip": 1.03725815, + "balance_loss_mlp": 1.02025652, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 1.8494753458420639, + "language_loss": 0.76147449, + "learning_rate": 7.477207030458513e-07, + "loss": 0.78286141, + "num_input_tokens_seen": 259758790, + "step": 12040, + "time_per_iteration": 2.489377975463867 + }, + { + "auxiliary_loss_clip": 0.01073174, + "auxiliary_loss_mlp": 0.01033173, + "balance_loss_clip": 1.03523135, + "balance_loss_mlp": 1.02029252, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 1.561388328065423, + "language_loss": 0.76634896, + "learning_rate": 7.474170592596301e-07, + "loss": 0.78741241, + "num_input_tokens_seen": 259777370, + "step": 12041, + "time_per_iteration": 2.5473968982696533 + }, + { + "auxiliary_loss_clip": 0.01095947, + "auxiliary_loss_mlp": 0.01025372, + "balance_loss_clip": 1.03461051, + "balance_loss_mlp": 1.01388657, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 2.138919121835004, + "language_loss": 0.63423288, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65544605, + "num_input_tokens_seen": 259794665, + "step": 12042, + "time_per_iteration": 2.532137632369995 + }, + { + "auxiliary_loss_clip": 0.01071893, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.03962529, + "balance_loss_mlp": 1.0170033, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 3.5352282732388236, + "language_loss": 0.83733058, + "learning_rate": 7.468099141929116e-07, + "loss": 0.85834491, + "num_input_tokens_seen": 259811110, + "step": 12043, + "time_per_iteration": 2.593433141708374 + }, + { + "auxiliary_loss_clip": 0.01074017, + "auxiliary_loss_mlp": 0.01029312, + "balance_loss_clip": 1.03610802, + "balance_loss_mlp": 1.01698542, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 1.616934480297615, + "language_loss": 0.64475393, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66578722, + "num_input_tokens_seen": 259831080, + "step": 12044, + "time_per_iteration": 2.5667264461517334 + }, + { + "auxiliary_loss_clip": 0.01108824, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.03960264, + "balance_loss_mlp": 1.02086782, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.5034712126132521, + "language_loss": 0.81702274, + "learning_rate": 7.462029592105658e-07, + "loss": 0.83844423, + "num_input_tokens_seen": 259850135, + "step": 12045, + "time_per_iteration": 2.4614691734313965 + }, + { + "auxiliary_loss_clip": 0.01104987, + "auxiliary_loss_mlp": 0.01031726, + "balance_loss_clip": 1.03690577, + "balance_loss_mlp": 1.01990628, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.5437592165485716, + "language_loss": 0.71926022, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74062729, + "num_input_tokens_seen": 259868185, + "step": 12046, + "time_per_iteration": 2.428363561630249 + }, + { + "auxiliary_loss_clip": 0.01070242, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.03301752, + "balance_loss_mlp": 1.01935828, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 1.7175394474273817, + "language_loss": 0.71314514, + "learning_rate": 7.455961944046553e-07, + "loss": 0.73417151, + "num_input_tokens_seen": 259887055, + "step": 12047, + "time_per_iteration": 3.9650988578796387 + }, + { + "auxiliary_loss_clip": 0.01089607, + "auxiliary_loss_mlp": 0.01032789, + "balance_loss_clip": 1.03833389, + "balance_loss_mlp": 1.01980138, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 1.6455949899176083, + "language_loss": 0.6974414, + "learning_rate": 7.45292883346627e-07, + "loss": 0.71866536, + "num_input_tokens_seen": 259908295, + "step": 12048, + "time_per_iteration": 2.5880589485168457 + }, + { + "auxiliary_loss_clip": 0.01018369, + "auxiliary_loss_mlp": 0.01017048, + "balance_loss_clip": 1.01493549, + "balance_loss_mlp": 1.0157311, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.8301141688313411, + "language_loss": 0.53733033, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55768454, + "num_input_tokens_seen": 259968475, + "step": 12049, + "time_per_iteration": 3.1318917274475098 + }, + { + "auxiliary_loss_clip": 0.01091656, + "auxiliary_loss_mlp": 0.01030192, + "balance_loss_clip": 1.03678107, + "balance_loss_mlp": 1.01595831, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 2.54345288859689, + "language_loss": 0.5977515, + "learning_rate": 7.446864039779258e-07, + "loss": 0.61896992, + "num_input_tokens_seen": 259984865, + "step": 12050, + "time_per_iteration": 2.4973537921905518 + }, + { + "auxiliary_loss_clip": 0.00998821, + "auxiliary_loss_mlp": 0.01003926, + "balance_loss_clip": 1.01465797, + "balance_loss_mlp": 1.00290704, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7248307499298045, + "language_loss": 0.53253198, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55255949, + "num_input_tokens_seen": 260046735, + "step": 12051, + "time_per_iteration": 3.188436269760132 + }, + { + "auxiliary_loss_clip": 0.01092954, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.03552222, + "balance_loss_mlp": 1.0192802, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.5463501637460988, + "language_loss": 0.72028095, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74151498, + "num_input_tokens_seen": 260067950, + "step": 12052, + "time_per_iteration": 2.5369439125061035 + }, + { + "auxiliary_loss_clip": 0.01092402, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.0350529, + "balance_loss_mlp": 1.01771235, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 1.8208137207200468, + "language_loss": 0.74431336, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76554596, + "num_input_tokens_seen": 260087730, + "step": 12053, + "time_per_iteration": 5.335455417633057 + }, + { + "auxiliary_loss_clip": 0.01065874, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.03450203, + "balance_loss_mlp": 1.01701605, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 1.862456790147135, + "language_loss": 0.78572923, + "learning_rate": 7.434740165518898e-07, + "loss": 0.80668569, + "num_input_tokens_seen": 260107760, + "step": 12054, + "time_per_iteration": 2.5593061447143555 + }, + { + "auxiliary_loss_clip": 0.01072642, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.03686976, + "balance_loss_mlp": 1.02242374, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.567759240883679, + "language_loss": 0.68252182, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70359492, + "num_input_tokens_seen": 260123660, + "step": 12055, + "time_per_iteration": 2.544769287109375 + }, + { + "auxiliary_loss_clip": 0.01072342, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.03556049, + "balance_loss_mlp": 1.02230453, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.6318691601811386, + "language_loss": 0.7387743, + "learning_rate": 7.428681086784496e-07, + "loss": 0.75983399, + "num_input_tokens_seen": 260142690, + "step": 12056, + "time_per_iteration": 2.54860782623291 + }, + { + "auxiliary_loss_clip": 0.01102147, + "auxiliary_loss_mlp": 0.01024554, + "balance_loss_clip": 1.03538585, + "balance_loss_mlp": 1.01273489, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.5826393719871827, + "language_loss": 0.70736146, + "learning_rate": 7.425652262418368e-07, + "loss": 0.7286284, + "num_input_tokens_seen": 260162590, + "step": 12057, + "time_per_iteration": 2.54713773727417 + }, + { + "auxiliary_loss_clip": 0.01062142, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.03575027, + "balance_loss_mlp": 1.02809572, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 1.7124489422546483, + "language_loss": 0.6259014, + "learning_rate": 7.42262391487277e-07, + "loss": 0.6469444, + "num_input_tokens_seen": 260181065, + "step": 12058, + "time_per_iteration": 3.9628541469573975 + }, + { + "auxiliary_loss_clip": 0.01065656, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.03413725, + "balance_loss_mlp": 1.01705289, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 1.870493903940984, + "language_loss": 0.74835473, + "learning_rate": 7.419596044262535e-07, + "loss": 0.76931196, + "num_input_tokens_seen": 260200330, + "step": 12059, + "time_per_iteration": 2.603731632232666 + }, + { + "auxiliary_loss_clip": 0.01092774, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.03543174, + "balance_loss_mlp": 1.02312481, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.7309711390955966, + "language_loss": 0.79603863, + "learning_rate": 7.416568650702472e-07, + "loss": 0.8173086, + "num_input_tokens_seen": 260219975, + "step": 12060, + "time_per_iteration": 2.5388925075531006 + }, + { + "auxiliary_loss_clip": 0.01095629, + "auxiliary_loss_mlp": 0.0102721, + "balance_loss_clip": 1.03683019, + "balance_loss_mlp": 1.01454389, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 1.7643136579579441, + "language_loss": 0.76437026, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78559864, + "num_input_tokens_seen": 260242025, + "step": 12061, + "time_per_iteration": 2.5268454551696777 + }, + { + "auxiliary_loss_clip": 0.01102949, + "auxiliary_loss_mlp": 0.00784429, + "balance_loss_clip": 1.03619266, + "balance_loss_mlp": 1.01295125, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 1.8093876542213756, + "language_loss": 0.8109585, + "learning_rate": 7.410515295192068e-07, + "loss": 0.82983232, + "num_input_tokens_seen": 260260015, + "step": 12062, + "time_per_iteration": 2.4611501693725586 + }, + { + "auxiliary_loss_clip": 0.01059375, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.03579783, + "balance_loss_mlp": 1.01683199, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 2.031485697906808, + "language_loss": 0.69039035, + "learning_rate": 7.407489333471262e-07, + "loss": 0.7112962, + "num_input_tokens_seen": 260278635, + "step": 12063, + "time_per_iteration": 2.625467538833618 + }, + { + "auxiliary_loss_clip": 0.01069853, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.03425527, + "balance_loss_mlp": 1.01947927, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.4214818276206647, + "language_loss": 0.70028841, + "learning_rate": 7.40446384925973e-07, + "loss": 0.72130102, + "num_input_tokens_seen": 260298510, + "step": 12064, + "time_per_iteration": 2.5612504482269287 + }, + { + "auxiliary_loss_clip": 0.0108436, + "auxiliary_loss_mlp": 0.0102763, + "balance_loss_clip": 1.0359565, + "balance_loss_mlp": 1.0154531, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 1.7934062175647467, + "language_loss": 0.90316552, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92428547, + "num_input_tokens_seen": 260317405, + "step": 12065, + "time_per_iteration": 2.5122909545898438 + }, + { + "auxiliary_loss_clip": 0.01026617, + "auxiliary_loss_mlp": 0.01002208, + "balance_loss_clip": 1.01271915, + "balance_loss_mlp": 1.00094438, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 0.6514073900311427, + "language_loss": 0.56107795, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58136624, + "num_input_tokens_seen": 260388085, + "step": 12066, + "time_per_iteration": 3.2351391315460205 + }, + { + "auxiliary_loss_clip": 0.01057319, + "auxiliary_loss_mlp": 0.0102951, + "balance_loss_clip": 1.03594136, + "balance_loss_mlp": 1.01796508, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 1.9448730747002732, + "language_loss": 0.76818871, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78905696, + "num_input_tokens_seen": 260406165, + "step": 12067, + "time_per_iteration": 2.6431632041931152 + }, + { + "auxiliary_loss_clip": 0.01018336, + "auxiliary_loss_mlp": 0.0100215, + "balance_loss_clip": 1.01543272, + "balance_loss_mlp": 1.00086808, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.7294498130827991, + "language_loss": 0.57062531, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59083021, + "num_input_tokens_seen": 260461365, + "step": 12068, + "time_per_iteration": 3.0213775634765625 + }, + { + "auxiliary_loss_clip": 0.009979, + "auxiliary_loss_mlp": 0.0100484, + "balance_loss_clip": 1.01375771, + "balance_loss_mlp": 1.00337946, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6613310731478995, + "language_loss": 0.55451965, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57454705, + "num_input_tokens_seen": 260523795, + "step": 12069, + "time_per_iteration": 3.221919059753418 + }, + { + "auxiliary_loss_clip": 0.01070812, + "auxiliary_loss_mlp": 0.01026827, + "balance_loss_clip": 1.03617215, + "balance_loss_mlp": 1.01608014, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 2.078612941503024, + "language_loss": 0.79943287, + "learning_rate": 7.38632097810854e-07, + "loss": 0.8204093, + "num_input_tokens_seen": 260544765, + "step": 12070, + "time_per_iteration": 2.588759183883667 + }, + { + "auxiliary_loss_clip": 0.01078647, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.03508735, + "balance_loss_mlp": 1.0222342, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 1.7428715643978419, + "language_loss": 0.72020352, + "learning_rate": 7.383298839673197e-07, + "loss": 0.74132967, + "num_input_tokens_seen": 260564340, + "step": 12071, + "time_per_iteration": 2.5696303844451904 + }, + { + "auxiliary_loss_clip": 0.01104781, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.03749824, + "balance_loss_mlp": 1.02328527, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 1.9036849757727847, + "language_loss": 0.70424938, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72564352, + "num_input_tokens_seen": 260582565, + "step": 12072, + "time_per_iteration": 2.4668962955474854 + }, + { + "auxiliary_loss_clip": 0.01070394, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.03344893, + "balance_loss_mlp": 1.01630473, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 1.9044703883633056, + "language_loss": 0.78595936, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80695903, + "num_input_tokens_seen": 260601700, + "step": 12073, + "time_per_iteration": 2.5717856884002686 + }, + { + "auxiliary_loss_clip": 0.01083957, + "auxiliary_loss_mlp": 0.01028966, + "balance_loss_clip": 1.03698766, + "balance_loss_mlp": 1.01711631, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 1.473466774468194, + "language_loss": 0.70346648, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72459567, + "num_input_tokens_seen": 260623040, + "step": 12074, + "time_per_iteration": 2.677253484725952 + }, + { + "auxiliary_loss_clip": 0.0108701, + "auxiliary_loss_mlp": 0.01027795, + "balance_loss_clip": 1.0359273, + "balance_loss_mlp": 1.01485467, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 1.7400756138624678, + "language_loss": 0.74372625, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76487434, + "num_input_tokens_seen": 260642735, + "step": 12075, + "time_per_iteration": 2.5911202430725098 + }, + { + "auxiliary_loss_clip": 0.01094969, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.03616822, + "balance_loss_mlp": 1.01811635, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 1.4471211753394115, + "language_loss": 0.63434672, + "learning_rate": 7.368195326186458e-07, + "loss": 0.65560061, + "num_input_tokens_seen": 260669935, + "step": 12076, + "time_per_iteration": 2.896605968475342 + }, + { + "auxiliary_loss_clip": 0.01068898, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.03445029, + "balance_loss_mlp": 1.01648235, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 1.779625291235308, + "language_loss": 0.78761232, + "learning_rate": 7.365176060028912e-07, + "loss": 0.80858892, + "num_input_tokens_seen": 260689605, + "step": 12077, + "time_per_iteration": 2.656214475631714 + }, + { + "auxiliary_loss_clip": 0.01034142, + "auxiliary_loss_mlp": 0.00761894, + "balance_loss_clip": 1.01068807, + "balance_loss_mlp": 0.99981654, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.8839711671155598, + "language_loss": 0.65010571, + "learning_rate": 7.362157272985163e-07, + "loss": 0.66806608, + "num_input_tokens_seen": 260748265, + "step": 12078, + "time_per_iteration": 3.087881565093994 + }, + { + "auxiliary_loss_clip": 0.01025205, + "auxiliary_loss_mlp": 0.01000259, + "balance_loss_clip": 1.01211166, + "balance_loss_mlp": 0.99908525, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.7254744502548687, + "language_loss": 0.59272003, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61297476, + "num_input_tokens_seen": 260816715, + "step": 12079, + "time_per_iteration": 3.243366241455078 + }, + { + "auxiliary_loss_clip": 0.01060475, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.03429389, + "balance_loss_mlp": 1.01864231, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 1.9454027931627436, + "language_loss": 0.64561987, + "learning_rate": 7.356121136696895e-07, + "loss": 0.66653711, + "num_input_tokens_seen": 260836765, + "step": 12080, + "time_per_iteration": 2.6117634773254395 + }, + { + "auxiliary_loss_clip": 0.01062258, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.03513932, + "balance_loss_mlp": 1.01346993, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 2.4216944007441272, + "language_loss": 0.69773746, + "learning_rate": 7.35310378768128e-07, + "loss": 0.71862704, + "num_input_tokens_seen": 260854610, + "step": 12081, + "time_per_iteration": 2.568429470062256 + }, + { + "auxiliary_loss_clip": 0.0111064, + "auxiliary_loss_mlp": 0.01028066, + "balance_loss_clip": 1.03892088, + "balance_loss_mlp": 1.01583529, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 1.8282647643934549, + "language_loss": 0.81226361, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83365071, + "num_input_tokens_seen": 260871620, + "step": 12082, + "time_per_iteration": 2.4535439014434814 + }, + { + "auxiliary_loss_clip": 0.0109942, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.03548002, + "balance_loss_mlp": 1.02175188, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.8091799150093841, + "language_loss": 0.77245378, + "learning_rate": 7.347070528479158e-07, + "loss": 0.7938019, + "num_input_tokens_seen": 260890490, + "step": 12083, + "time_per_iteration": 2.505014181137085 + }, + { + "auxiliary_loss_clip": 0.01109663, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.03907013, + "balance_loss_mlp": 1.01762843, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.739594900242687, + "language_loss": 0.73149818, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75289547, + "num_input_tokens_seen": 260909700, + "step": 12084, + "time_per_iteration": 2.5074477195739746 + }, + { + "auxiliary_loss_clip": 0.01110321, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.0383575, + "balance_loss_mlp": 1.01876807, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 1.6987177193557668, + "language_loss": 0.77904421, + "learning_rate": 7.34103918847843e-07, + "loss": 0.80045992, + "num_input_tokens_seen": 260929090, + "step": 12085, + "time_per_iteration": 2.4601125717163086 + }, + { + "auxiliary_loss_clip": 0.01096509, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.03654861, + "balance_loss_mlp": 1.02434349, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.7056139911658998, + "language_loss": 0.72291797, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74424648, + "num_input_tokens_seen": 260946615, + "step": 12086, + "time_per_iteration": 3.8972010612487793 + }, + { + "auxiliary_loss_clip": 0.01070111, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.03504872, + "balance_loss_mlp": 1.02384639, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 1.786568369976785, + "language_loss": 0.69613171, + "learning_rate": 7.335009768593938e-07, + "loss": 0.71721005, + "num_input_tokens_seen": 260968515, + "step": 12087, + "time_per_iteration": 2.6122074127197266 + }, + { + "auxiliary_loss_clip": 0.01111247, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.03990328, + "balance_loss_mlp": 1.02170849, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 1.6591495146513835, + "language_loss": 0.7868706, + "learning_rate": 7.331995778981088e-07, + "loss": 0.80832863, + "num_input_tokens_seen": 260986790, + "step": 12088, + "time_per_iteration": 2.4600605964660645 + }, + { + "auxiliary_loss_clip": 0.0109193, + "auxiliary_loss_mlp": 0.01037103, + "balance_loss_clip": 1.03561759, + "balance_loss_mlp": 1.02497375, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 1.750284055885843, + "language_loss": 0.73755956, + "learning_rate": 7.328982269740221e-07, + "loss": 0.75884986, + "num_input_tokens_seen": 261004925, + "step": 12089, + "time_per_iteration": 2.4675662517547607 + }, + { + "auxiliary_loss_clip": 0.01085884, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.03726053, + "balance_loss_mlp": 1.0243063, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.6839338805212147, + "language_loss": 0.71141034, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73263991, + "num_input_tokens_seen": 261023895, + "step": 12090, + "time_per_iteration": 2.5432631969451904 + }, + { + "auxiliary_loss_clip": 0.01053722, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.03311217, + "balance_loss_mlp": 1.01719272, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 2.023584258104744, + "language_loss": 0.77505851, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79589939, + "num_input_tokens_seen": 261045445, + "step": 12091, + "time_per_iteration": 2.6982319355010986 + }, + { + "auxiliary_loss_clip": 0.01089219, + "auxiliary_loss_mlp": 0.0078541, + "balance_loss_clip": 1.03398526, + "balance_loss_mlp": 1.01048934, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 1.985390735686807, + "language_loss": 0.71077895, + "learning_rate": 7.319944625392205e-07, + "loss": 0.72952521, + "num_input_tokens_seen": 261064275, + "step": 12092, + "time_per_iteration": 5.292281150817871 + }, + { + "auxiliary_loss_clip": 0.01096932, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.03749704, + "balance_loss_mlp": 1.01812005, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 2.4519523245384667, + "language_loss": 0.61069274, + "learning_rate": 7.31693303878184e-07, + "loss": 0.63197041, + "num_input_tokens_seen": 261083310, + "step": 12093, + "time_per_iteration": 2.599919080734253 + }, + { + "auxiliary_loss_clip": 0.01083774, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.03720212, + "balance_loss_mlp": 1.02082646, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.5421839824071106, + "language_loss": 0.75120366, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77236998, + "num_input_tokens_seen": 261103460, + "step": 12094, + "time_per_iteration": 2.5601837635040283 + }, + { + "auxiliary_loss_clip": 0.01071814, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.03479648, + "balance_loss_mlp": 1.01774156, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 1.9504807486963558, + "language_loss": 0.8508302, + "learning_rate": 7.310911308504808e-07, + "loss": 0.87184405, + "num_input_tokens_seen": 261121375, + "step": 12095, + "time_per_iteration": 2.568286418914795 + }, + { + "auxiliary_loss_clip": 0.01093457, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.03504193, + "balance_loss_mlp": 1.02281046, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 1.6534090229212621, + "language_loss": 0.77680135, + "learning_rate": 7.307901165066479e-07, + "loss": 0.79809701, + "num_input_tokens_seen": 261141105, + "step": 12096, + "time_per_iteration": 2.546279191970825 + }, + { + "auxiliary_loss_clip": 0.01107624, + "auxiliary_loss_mlp": 0.01035108, + "balance_loss_clip": 1.03816819, + "balance_loss_mlp": 1.02298999, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 2.183348696327486, + "language_loss": 0.71996117, + "learning_rate": 7.30489150291381e-07, + "loss": 0.7413885, + "num_input_tokens_seen": 261159255, + "step": 12097, + "time_per_iteration": 3.9706411361694336 + }, + { + "auxiliary_loss_clip": 0.01097316, + "auxiliary_loss_mlp": 0.00785194, + "balance_loss_clip": 1.03751874, + "balance_loss_mlp": 1.00964057, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 1.7909135868610941, + "language_loss": 0.765881, + "learning_rate": 7.301882322160935e-07, + "loss": 0.78470612, + "num_input_tokens_seen": 261177960, + "step": 12098, + "time_per_iteration": 2.5226917266845703 + }, + { + "auxiliary_loss_clip": 0.0108499, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.03374183, + "balance_loss_mlp": 1.01561451, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 1.5921925280659814, + "language_loss": 0.67428392, + "learning_rate": 7.298873622921952e-07, + "loss": 0.6954174, + "num_input_tokens_seen": 261205660, + "step": 12099, + "time_per_iteration": 2.9600038528442383 + }, + { + "auxiliary_loss_clip": 0.01094425, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.03392446, + "balance_loss_mlp": 1.02126312, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 1.8426247410635586, + "language_loss": 0.72587132, + "learning_rate": 7.29586540531095e-07, + "loss": 0.74718589, + "num_input_tokens_seen": 261225185, + "step": 12100, + "time_per_iteration": 2.4981589317321777 + }, + { + "auxiliary_loss_clip": 0.0109872, + "auxiliary_loss_mlp": 0.01034599, + "balance_loss_clip": 1.03840923, + "balance_loss_mlp": 1.02257693, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 1.325732827677211, + "language_loss": 0.74673867, + "learning_rate": 7.292857669442005e-07, + "loss": 0.76807189, + "num_input_tokens_seen": 261247965, + "step": 12101, + "time_per_iteration": 2.5331127643585205 + }, + { + "auxiliary_loss_clip": 0.01065417, + "auxiliary_loss_mlp": 0.01028608, + "balance_loss_clip": 1.03697205, + "balance_loss_mlp": 1.01682448, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 1.7615616465733526, + "language_loss": 0.82319009, + "learning_rate": 7.289850415429177e-07, + "loss": 0.8441304, + "num_input_tokens_seen": 261267585, + "step": 12102, + "time_per_iteration": 2.5522501468658447 + }, + { + "auxiliary_loss_clip": 0.01095292, + "auxiliary_loss_mlp": 0.01031517, + "balance_loss_clip": 1.0367589, + "balance_loss_mlp": 1.01952469, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 2.356109842589698, + "language_loss": 0.81421649, + "learning_rate": 7.286843643386495e-07, + "loss": 0.8354845, + "num_input_tokens_seen": 261285200, + "step": 12103, + "time_per_iteration": 2.4886813163757324 + }, + { + "auxiliary_loss_clip": 0.01085406, + "auxiliary_loss_mlp": 0.01024279, + "balance_loss_clip": 1.03722668, + "balance_loss_mlp": 1.01192331, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 1.5948474048661339, + "language_loss": 0.66325122, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68434805, + "num_input_tokens_seen": 261303645, + "step": 12104, + "time_per_iteration": 2.4910850524902344 + }, + { + "auxiliary_loss_clip": 0.01074002, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.03629148, + "balance_loss_mlp": 1.01879406, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 1.6939034048727415, + "language_loss": 0.65817595, + "learning_rate": 7.280831545667611e-07, + "loss": 0.6792239, + "num_input_tokens_seen": 261323265, + "step": 12105, + "time_per_iteration": 2.6548991203308105 + }, + { + "auxiliary_loss_clip": 0.01107874, + "auxiliary_loss_mlp": 0.01031937, + "balance_loss_clip": 1.03852963, + "balance_loss_mlp": 1.01953304, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 2.40070127107402, + "language_loss": 0.75826406, + "learning_rate": 7.27782622021939e-07, + "loss": 0.77966213, + "num_input_tokens_seen": 261339745, + "step": 12106, + "time_per_iteration": 2.4392592906951904 + }, + { + "auxiliary_loss_clip": 0.0109763, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.03637004, + "balance_loss_mlp": 1.01919925, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 2.210961204880018, + "language_loss": 0.70262212, + "learning_rate": 7.274821377197273e-07, + "loss": 0.72392011, + "num_input_tokens_seen": 261359310, + "step": 12107, + "time_per_iteration": 2.5871362686157227 + }, + { + "auxiliary_loss_clip": 0.01087768, + "auxiliary_loss_mlp": 0.01031879, + "balance_loss_clip": 1.03368664, + "balance_loss_mlp": 1.01961279, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 1.6505647132033816, + "language_loss": 0.75248361, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77368009, + "num_input_tokens_seen": 261384640, + "step": 12108, + "time_per_iteration": 2.8128466606140137 + }, + { + "auxiliary_loss_clip": 0.01106494, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.03681302, + "balance_loss_mlp": 1.01952529, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 1.5067594216334428, + "language_loss": 0.66851044, + "learning_rate": 7.268813138887124e-07, + "loss": 0.68989551, + "num_input_tokens_seen": 261405290, + "step": 12109, + "time_per_iteration": 2.6085293292999268 + }, + { + "auxiliary_loss_clip": 0.01068849, + "auxiliary_loss_mlp": 0.01029309, + "balance_loss_clip": 1.03734112, + "balance_loss_mlp": 1.01682806, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 2.152472722496652, + "language_loss": 0.6315912, + "learning_rate": 7.265809743826912e-07, + "loss": 0.65257275, + "num_input_tokens_seen": 261419710, + "step": 12110, + "time_per_iteration": 2.5594074726104736 + }, + { + "auxiliary_loss_clip": 0.01073521, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.0340215, + "balance_loss_mlp": 1.01623178, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 2.133492304068536, + "language_loss": 0.58019507, + "learning_rate": 7.26280683164847e-07, + "loss": 0.60122186, + "num_input_tokens_seen": 261442385, + "step": 12111, + "time_per_iteration": 2.69405198097229 + }, + { + "auxiliary_loss_clip": 0.01058253, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.03869963, + "balance_loss_mlp": 1.02107465, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 2.130579804929628, + "language_loss": 0.74094379, + "learning_rate": 7.259804402465677e-07, + "loss": 0.76186043, + "num_input_tokens_seen": 261459805, + "step": 12112, + "time_per_iteration": 2.6144602298736572 + }, + { + "auxiliary_loss_clip": 0.01095518, + "auxiliary_loss_mlp": 0.01032037, + "balance_loss_clip": 1.03666496, + "balance_loss_mlp": 1.02026498, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 1.7875389044051893, + "language_loss": 0.66821688, + "learning_rate": 7.25680245639237e-07, + "loss": 0.6894924, + "num_input_tokens_seen": 261477175, + "step": 12113, + "time_per_iteration": 2.542703151702881 + }, + { + "auxiliary_loss_clip": 0.01066408, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.03428376, + "balance_loss_mlp": 1.0180198, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 1.7569735550944434, + "language_loss": 0.73304653, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75401491, + "num_input_tokens_seen": 261494990, + "step": 12114, + "time_per_iteration": 2.5143110752105713 + }, + { + "auxiliary_loss_clip": 0.010846, + "auxiliary_loss_mlp": 0.01029392, + "balance_loss_clip": 1.03680158, + "balance_loss_mlp": 1.01733398, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 3.2340551869252363, + "language_loss": 0.68026924, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70140916, + "num_input_tokens_seen": 261514445, + "step": 12115, + "time_per_iteration": 2.562067747116089 + }, + { + "auxiliary_loss_clip": 0.01108366, + "auxiliary_loss_mlp": 0.0102848, + "balance_loss_clip": 1.03693914, + "balance_loss_mlp": 1.01602888, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 1.5589159961447048, + "language_loss": 0.59752607, + "learning_rate": 7.247799517967674e-07, + "loss": 0.61889458, + "num_input_tokens_seen": 261533565, + "step": 12116, + "time_per_iteration": 2.465832471847534 + }, + { + "auxiliary_loss_clip": 0.01093857, + "auxiliary_loss_mlp": 0.01029133, + "balance_loss_clip": 1.03625727, + "balance_loss_mlp": 1.01720572, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 2.1464667573619987, + "language_loss": 0.72947896, + "learning_rate": 7.2447995054705e-07, + "loss": 0.75070882, + "num_input_tokens_seen": 261553795, + "step": 12117, + "time_per_iteration": 2.5057084560394287 + }, + { + "auxiliary_loss_clip": 0.01090971, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.03504086, + "balance_loss_mlp": 1.01926303, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 1.9156027623374166, + "language_loss": 0.69684726, + "learning_rate": 7.241799976651807e-07, + "loss": 0.71807683, + "num_input_tokens_seen": 261572565, + "step": 12118, + "time_per_iteration": 2.499309778213501 + }, + { + "auxiliary_loss_clip": 0.01049055, + "auxiliary_loss_mlp": 0.01033897, + "balance_loss_clip": 1.03242564, + "balance_loss_mlp": 1.02233982, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 1.5895184201469303, + "language_loss": 0.84561068, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86644018, + "num_input_tokens_seen": 261590910, + "step": 12119, + "time_per_iteration": 2.579619884490967 + }, + { + "auxiliary_loss_clip": 0.01105595, + "auxiliary_loss_mlp": 0.01029089, + "balance_loss_clip": 1.03642988, + "balance_loss_mlp": 1.01673877, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 2.06661196787075, + "language_loss": 0.81058902, + "learning_rate": 7.235802370504831e-07, + "loss": 0.83193588, + "num_input_tokens_seen": 261606005, + "step": 12120, + "time_per_iteration": 2.4286489486694336 + }, + { + "auxiliary_loss_clip": 0.01067924, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.03572428, + "balance_loss_mlp": 1.02384186, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 2.1353127034616324, + "language_loss": 0.78884685, + "learning_rate": 7.232804293403963e-07, + "loss": 0.80989313, + "num_input_tokens_seen": 261622305, + "step": 12121, + "time_per_iteration": 2.5597317218780518 + }, + { + "auxiliary_loss_clip": 0.0110532, + "auxiliary_loss_mlp": 0.01029836, + "balance_loss_clip": 1.03410387, + "balance_loss_mlp": 1.0170331, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 1.7137812353684079, + "language_loss": 0.69354594, + "learning_rate": 7.229806700436441e-07, + "loss": 0.71489751, + "num_input_tokens_seen": 261642465, + "step": 12122, + "time_per_iteration": 2.4917633533477783 + }, + { + "auxiliary_loss_clip": 0.01057641, + "auxiliary_loss_mlp": 0.01029208, + "balance_loss_clip": 1.0318979, + "balance_loss_mlp": 1.01764441, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 2.434902994526673, + "language_loss": 0.87207139, + "learning_rate": 7.226809591715923e-07, + "loss": 0.89293993, + "num_input_tokens_seen": 261661420, + "step": 12123, + "time_per_iteration": 2.6010611057281494 + }, + { + "auxiliary_loss_clip": 0.01071349, + "auxiliary_loss_mlp": 0.01026116, + "balance_loss_clip": 1.03383207, + "balance_loss_mlp": 1.0141356, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 1.872124618955195, + "language_loss": 0.82915974, + "learning_rate": 7.223812967356065e-07, + "loss": 0.85013437, + "num_input_tokens_seen": 261680865, + "step": 12124, + "time_per_iteration": 3.999394655227661 + }, + { + "auxiliary_loss_clip": 0.01084653, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.03655601, + "balance_loss_mlp": 1.01744568, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 1.8367033457742714, + "language_loss": 0.67231667, + "learning_rate": 7.220816827470499e-07, + "loss": 0.69345462, + "num_input_tokens_seen": 261701455, + "step": 12125, + "time_per_iteration": 2.5594820976257324 + }, + { + "auxiliary_loss_clip": 0.01097662, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.03495657, + "balance_loss_mlp": 1.0222199, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 1.7685340277093167, + "language_loss": 0.75128347, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77261609, + "num_input_tokens_seen": 261721260, + "step": 12126, + "time_per_iteration": 2.503328323364258 + }, + { + "auxiliary_loss_clip": 0.01012956, + "auxiliary_loss_mlp": 0.01003667, + "balance_loss_clip": 1.0101037, + "balance_loss_mlp": 1.00257003, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.8287363126752945, + "language_loss": 0.58662975, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60679591, + "num_input_tokens_seen": 261779370, + "step": 12127, + "time_per_iteration": 3.0622060298919678 + }, + { + "auxiliary_loss_clip": 0.01070926, + "auxiliary_loss_mlp": 0.01025979, + "balance_loss_clip": 1.03445911, + "balance_loss_mlp": 1.01458263, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 9.868314490620364, + "language_loss": 0.68703234, + "learning_rate": 7.21183131579562e-07, + "loss": 0.70800138, + "num_input_tokens_seen": 261798050, + "step": 12128, + "time_per_iteration": 2.5697333812713623 + }, + { + "auxiliary_loss_clip": 0.01079772, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.03502595, + "balance_loss_mlp": 1.02125049, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 1.9859632205981075, + "language_loss": 0.65702462, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67816615, + "num_input_tokens_seen": 261817660, + "step": 12129, + "time_per_iteration": 2.572272300720215 + }, + { + "auxiliary_loss_clip": 0.01102996, + "auxiliary_loss_mlp": 0.01024404, + "balance_loss_clip": 1.03634334, + "balance_loss_mlp": 1.01248872, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 2.495590145825102, + "language_loss": 0.74374104, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76501507, + "num_input_tokens_seen": 261837935, + "step": 12130, + "time_per_iteration": 3.8999977111816406 + }, + { + "auxiliary_loss_clip": 0.0108059, + "auxiliary_loss_mlp": 0.01030048, + "balance_loss_clip": 1.03467011, + "balance_loss_mlp": 1.01753092, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.6413930632407545, + "language_loss": 0.69903195, + "learning_rate": 7.202850168478374e-07, + "loss": 0.72013831, + "num_input_tokens_seen": 261857575, + "step": 12131, + "time_per_iteration": 3.940873384475708 + }, + { + "auxiliary_loss_clip": 0.01070646, + "auxiliary_loss_mlp": 0.0102917, + "balance_loss_clip": 1.03668511, + "balance_loss_mlp": 1.01775026, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 1.6469513284956319, + "language_loss": 0.77522135, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79621947, + "num_input_tokens_seen": 261877265, + "step": 12132, + "time_per_iteration": 2.5600850582122803 + }, + { + "auxiliary_loss_clip": 0.01094654, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.03567886, + "balance_loss_mlp": 1.0223105, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 2.2503498914172875, + "language_loss": 0.79077697, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81206286, + "num_input_tokens_seen": 261893695, + "step": 12133, + "time_per_iteration": 2.4739465713500977 + }, + { + "auxiliary_loss_clip": 0.01057137, + "auxiliary_loss_mlp": 0.01028639, + "balance_loss_clip": 1.03379488, + "balance_loss_mlp": 1.01603889, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 1.8054816923496675, + "language_loss": 0.72126245, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74212027, + "num_input_tokens_seen": 261911825, + "step": 12134, + "time_per_iteration": 2.590374708175659 + }, + { + "auxiliary_loss_clip": 0.01086924, + "auxiliary_loss_mlp": 0.01035337, + "balance_loss_clip": 1.03775597, + "balance_loss_mlp": 1.02274847, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 1.6303757249806112, + "language_loss": 0.71407837, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73530102, + "num_input_tokens_seen": 261931190, + "step": 12135, + "time_per_iteration": 2.5872268676757812 + }, + { + "auxiliary_loss_clip": 0.01073048, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.03635979, + "balance_loss_mlp": 1.02118886, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 2.080626534780749, + "language_loss": 0.62233913, + "learning_rate": 7.187891296513075e-07, + "loss": 0.64340639, + "num_input_tokens_seen": 261951240, + "step": 12136, + "time_per_iteration": 4.03627347946167 + }, + { + "auxiliary_loss_clip": 0.01087992, + "auxiliary_loss_mlp": 0.00784563, + "balance_loss_clip": 1.03473139, + "balance_loss_mlp": 1.01282203, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 1.8292242131094316, + "language_loss": 0.74551702, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76424247, + "num_input_tokens_seen": 261971605, + "step": 12137, + "time_per_iteration": 2.564791202545166 + }, + { + "auxiliary_loss_clip": 0.01095748, + "auxiliary_loss_mlp": 0.00785852, + "balance_loss_clip": 1.03766274, + "balance_loss_mlp": 1.01235557, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 1.5217106503866549, + "language_loss": 0.74206787, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76088387, + "num_input_tokens_seen": 261990830, + "step": 12138, + "time_per_iteration": 2.5630712509155273 + }, + { + "auxiliary_loss_clip": 0.0106901, + "auxiliary_loss_mlp": 0.01027764, + "balance_loss_clip": 1.03547955, + "balance_loss_mlp": 1.01639771, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 2.058195805372477, + "language_loss": 0.7172122, + "learning_rate": 7.178921802463702e-07, + "loss": 0.73817992, + "num_input_tokens_seen": 262008190, + "step": 12139, + "time_per_iteration": 2.5305986404418945 + }, + { + "auxiliary_loss_clip": 0.01090863, + "auxiliary_loss_mlp": 0.01025113, + "balance_loss_clip": 1.03631091, + "balance_loss_mlp": 1.01409197, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.4676610942274326, + "language_loss": 0.73236519, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75352502, + "num_input_tokens_seen": 262030460, + "step": 12140, + "time_per_iteration": 2.574385643005371 + }, + { + "auxiliary_loss_clip": 0.0108027, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.03666496, + "balance_loss_mlp": 1.01657605, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 1.771165640291079, + "language_loss": 0.55632985, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57742226, + "num_input_tokens_seen": 262050830, + "step": 12141, + "time_per_iteration": 2.601027250289917 + }, + { + "auxiliary_loss_clip": 0.01067363, + "auxiliary_loss_mlp": 0.01028747, + "balance_loss_clip": 1.03368926, + "balance_loss_mlp": 1.01741612, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 1.4405938316127724, + "language_loss": 0.72589171, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74685282, + "num_input_tokens_seen": 262071245, + "step": 12142, + "time_per_iteration": 2.6035315990448 + }, + { + "auxiliary_loss_clip": 0.0110383, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.03628922, + "balance_loss_mlp": 1.01973701, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.798745397451092, + "language_loss": 0.73460209, + "learning_rate": 7.16696928406521e-07, + "loss": 0.75594795, + "num_input_tokens_seen": 262087525, + "step": 12143, + "time_per_iteration": 2.4336977005004883 + }, + { + "auxiliary_loss_clip": 0.01066358, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.03430152, + "balance_loss_mlp": 1.01863468, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 1.9831028861358369, + "language_loss": 0.66752279, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68849152, + "num_input_tokens_seen": 262107355, + "step": 12144, + "time_per_iteration": 2.578795909881592 + }, + { + "auxiliary_loss_clip": 0.01079888, + "auxiliary_loss_mlp": 0.01029324, + "balance_loss_clip": 1.03616571, + "balance_loss_mlp": 1.01779604, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 1.6753059808641482, + "language_loss": 0.79220361, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81329572, + "num_input_tokens_seen": 262125645, + "step": 12145, + "time_per_iteration": 2.49168062210083 + }, + { + "auxiliary_loss_clip": 0.01064341, + "auxiliary_loss_mlp": 0.01029634, + "balance_loss_clip": 1.03362203, + "balance_loss_mlp": 1.01758254, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 1.9013261310962988, + "language_loss": 0.91275251, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93369222, + "num_input_tokens_seen": 262144075, + "step": 12146, + "time_per_iteration": 2.566948175430298 + }, + { + "auxiliary_loss_clip": 0.01101664, + "auxiliary_loss_mlp": 0.01026799, + "balance_loss_clip": 1.03589916, + "balance_loss_mlp": 1.01581371, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 1.7362189369761787, + "language_loss": 0.62057298, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64185762, + "num_input_tokens_seen": 262165940, + "step": 12147, + "time_per_iteration": 2.48987078666687 + }, + { + "auxiliary_loss_clip": 0.0110726, + "auxiliary_loss_mlp": 0.0103415, + "balance_loss_clip": 1.03776598, + "balance_loss_mlp": 1.02162671, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 1.8189118680423935, + "language_loss": 0.75371218, + "learning_rate": 7.152039586086693e-07, + "loss": 0.77512622, + "num_input_tokens_seen": 262184520, + "step": 12148, + "time_per_iteration": 2.448821783065796 + }, + { + "auxiliary_loss_clip": 0.01014714, + "auxiliary_loss_mlp": 0.00763043, + "balance_loss_clip": 1.01300943, + "balance_loss_mlp": 1.00126159, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 2.0807285473069195, + "language_loss": 0.56692851, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58470607, + "num_input_tokens_seen": 262247070, + "step": 12149, + "time_per_iteration": 3.121549367904663 + }, + { + "auxiliary_loss_clip": 0.01084145, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.03458107, + "balance_loss_mlp": 1.02012825, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 1.864375819260993, + "language_loss": 0.7362631, + "learning_rate": 7.146071116474451e-07, + "loss": 0.75742865, + "num_input_tokens_seen": 262266605, + "step": 12150, + "time_per_iteration": 2.5382537841796875 + }, + { + "auxiliary_loss_clip": 0.01106947, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.0364027, + "balance_loss_mlp": 1.01770771, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 2.127109901952234, + "language_loss": 0.83812892, + "learning_rate": 7.143087612745158e-07, + "loss": 0.8594985, + "num_input_tokens_seen": 262283880, + "step": 12151, + "time_per_iteration": 2.461709976196289 + }, + { + "auxiliary_loss_clip": 0.01073895, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.0343411, + "balance_loss_mlp": 1.0199995, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 1.9053741579471506, + "language_loss": 0.78414309, + "learning_rate": 7.14010459655127e-07, + "loss": 0.80521053, + "num_input_tokens_seen": 262304155, + "step": 12152, + "time_per_iteration": 2.5980355739593506 + }, + { + "auxiliary_loss_clip": 0.01071062, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.03856063, + "balance_loss_mlp": 1.01914096, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 1.5602149627885848, + "language_loss": 0.79708254, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81810296, + "num_input_tokens_seen": 262325660, + "step": 12153, + "time_per_iteration": 2.6157665252685547 + }, + { + "auxiliary_loss_clip": 0.01096761, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.03674555, + "balance_loss_mlp": 1.01972139, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.617713780939764, + "language_loss": 0.67402518, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69531274, + "num_input_tokens_seen": 262344075, + "step": 12154, + "time_per_iteration": 2.52443790435791 + }, + { + "auxiliary_loss_clip": 0.0106592, + "auxiliary_loss_mlp": 0.01030977, + "balance_loss_clip": 1.0369277, + "balance_loss_mlp": 1.01906204, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 1.7662641630879095, + "language_loss": 0.65752101, + "learning_rate": 7.131158474313128e-07, + "loss": 0.67848998, + "num_input_tokens_seen": 262363305, + "step": 12155, + "time_per_iteration": 2.637516975402832 + }, + { + "auxiliary_loss_clip": 0.01080885, + "auxiliary_loss_mlp": 0.01025961, + "balance_loss_clip": 1.03422606, + "balance_loss_mlp": 1.0144279, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 1.7213057498786966, + "language_loss": 0.81634414, + "learning_rate": 7.128177409391851e-07, + "loss": 0.8374126, + "num_input_tokens_seen": 262380730, + "step": 12156, + "time_per_iteration": 2.500887155532837 + }, + { + "auxiliary_loss_clip": 0.01064914, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.03322053, + "balance_loss_mlp": 1.02193069, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 2.1564948654742357, + "language_loss": 0.75436789, + "learning_rate": 7.125196832571367e-07, + "loss": 0.7753523, + "num_input_tokens_seen": 262395480, + "step": 12157, + "time_per_iteration": 2.5256032943725586 + }, + { + "auxiliary_loss_clip": 0.01088919, + "auxiliary_loss_mlp": 0.01026278, + "balance_loss_clip": 1.03489876, + "balance_loss_mlp": 1.01565599, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 2.1149091433231426, + "language_loss": 0.72738671, + "learning_rate": 7.122216743964713e-07, + "loss": 0.74853867, + "num_input_tokens_seen": 262413340, + "step": 12158, + "time_per_iteration": 2.472226858139038 + }, + { + "auxiliary_loss_clip": 0.01083329, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_clip": 1.03660393, + "balance_loss_mlp": 1.02095342, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 1.5339658510692826, + "language_loss": 0.85443014, + "learning_rate": 7.119237143684896e-07, + "loss": 0.87558991, + "num_input_tokens_seen": 262433455, + "step": 12159, + "time_per_iteration": 2.5926854610443115 + }, + { + "auxiliary_loss_clip": 0.01087547, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.03634381, + "balance_loss_mlp": 1.01879716, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 2.1719780067199097, + "language_loss": 0.73492551, + "learning_rate": 7.116258031844895e-07, + "loss": 0.75611883, + "num_input_tokens_seen": 262450335, + "step": 12160, + "time_per_iteration": 2.505530595779419 + }, + { + "auxiliary_loss_clip": 0.01095841, + "auxiliary_loss_mlp": 0.01029986, + "balance_loss_clip": 1.03621459, + "balance_loss_mlp": 1.01768947, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 4.041954196852807, + "language_loss": 0.72731662, + "learning_rate": 7.113279408557675e-07, + "loss": 0.74857485, + "num_input_tokens_seen": 262468240, + "step": 12161, + "time_per_iteration": 2.4886934757232666 + }, + { + "auxiliary_loss_clip": 0.0107576, + "auxiliary_loss_mlp": 0.007856, + "balance_loss_clip": 1.03442049, + "balance_loss_mlp": 1.01074433, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 1.658365968196064, + "language_loss": 0.69311839, + "learning_rate": 7.110301273936192e-07, + "loss": 0.71173197, + "num_input_tokens_seen": 262487045, + "step": 12162, + "time_per_iteration": 2.610635280609131 + }, + { + "auxiliary_loss_clip": 0.01096674, + "auxiliary_loss_mlp": 0.0102893, + "balance_loss_clip": 1.03742051, + "balance_loss_mlp": 1.0165143, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.7200663810451444, + "language_loss": 0.66680902, + "learning_rate": 7.107323628093382e-07, + "loss": 0.68806505, + "num_input_tokens_seen": 262504855, + "step": 12163, + "time_per_iteration": 3.9545681476593018 + }, + { + "auxiliary_loss_clip": 0.01083334, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.0348618, + "balance_loss_mlp": 1.01665318, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 1.4195676560214838, + "language_loss": 0.68352103, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70464408, + "num_input_tokens_seen": 262524920, + "step": 12164, + "time_per_iteration": 2.535433053970337 + }, + { + "auxiliary_loss_clip": 0.01056443, + "auxiliary_loss_mlp": 0.01032429, + "balance_loss_clip": 1.03634596, + "balance_loss_mlp": 1.02106237, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 2.000782192096785, + "language_loss": 0.73065889, + "learning_rate": 7.101369803195391e-07, + "loss": 0.75154757, + "num_input_tokens_seen": 262545725, + "step": 12165, + "time_per_iteration": 2.611187696456909 + }, + { + "auxiliary_loss_clip": 0.01096552, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.03569484, + "balance_loss_mlp": 1.02050447, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 1.8724166401356932, + "language_loss": 0.76712817, + "learning_rate": 7.098393624365988e-07, + "loss": 0.78842211, + "num_input_tokens_seen": 262565480, + "step": 12166, + "time_per_iteration": 2.5444936752319336 + }, + { + "auxiliary_loss_clip": 0.010849, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.03746367, + "balance_loss_mlp": 1.01689386, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 1.8078748632185937, + "language_loss": 0.79643393, + "learning_rate": 7.095417934766781e-07, + "loss": 0.81756878, + "num_input_tokens_seen": 262584145, + "step": 12167, + "time_per_iteration": 2.556309461593628 + }, + { + "auxiliary_loss_clip": 0.01091775, + "auxiliary_loss_mlp": 0.0103838, + "balance_loss_clip": 1.0349052, + "balance_loss_mlp": 1.0269593, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 1.598248579297874, + "language_loss": 0.76937073, + "learning_rate": 7.092442734510622e-07, + "loss": 0.7906723, + "num_input_tokens_seen": 262604045, + "step": 12168, + "time_per_iteration": 2.5405566692352295 + }, + { + "auxiliary_loss_clip": 0.0109053, + "auxiliary_loss_mlp": 0.01041971, + "balance_loss_clip": 1.03346539, + "balance_loss_mlp": 1.02709937, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 1.5925881837266163, + "language_loss": 0.81629133, + "learning_rate": 7.089468023710326e-07, + "loss": 0.83761632, + "num_input_tokens_seen": 262624540, + "step": 12169, + "time_per_iteration": 3.923776626586914 + }, + { + "auxiliary_loss_clip": 0.01092199, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.03636134, + "balance_loss_mlp": 1.02246046, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.7275232719825073, + "language_loss": 0.6984548, + "learning_rate": 7.08649380247871e-07, + "loss": 0.71972841, + "num_input_tokens_seen": 262644545, + "step": 12170, + "time_per_iteration": 4.0130650997161865 + }, + { + "auxiliary_loss_clip": 0.01103708, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.03580594, + "balance_loss_mlp": 1.01844919, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 2.2563000803045616, + "language_loss": 0.70006883, + "learning_rate": 7.083520070928533e-07, + "loss": 0.72141588, + "num_input_tokens_seen": 262662570, + "step": 12171, + "time_per_iteration": 2.4790258407592773 + }, + { + "auxiliary_loss_clip": 0.01105116, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.036991, + "balance_loss_mlp": 1.02204442, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 1.4798358863123173, + "language_loss": 0.65642416, + "learning_rate": 7.080546829172564e-07, + "loss": 0.67781544, + "num_input_tokens_seen": 262683245, + "step": 12172, + "time_per_iteration": 2.545663356781006 + }, + { + "auxiliary_loss_clip": 0.01107223, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.03812122, + "balance_loss_mlp": 1.01650012, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 2.2924174950720673, + "language_loss": 0.6126498, + "learning_rate": 7.077574077323564e-07, + "loss": 0.63400477, + "num_input_tokens_seen": 262701585, + "step": 12173, + "time_per_iteration": 2.4946556091308594 + }, + { + "auxiliary_loss_clip": 0.01051495, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.03314364, + "balance_loss_mlp": 1.01648128, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 2.003536136149973, + "language_loss": 0.74096155, + "learning_rate": 7.074601815494243e-07, + "loss": 0.7617538, + "num_input_tokens_seen": 262719295, + "step": 12174, + "time_per_iteration": 4.071456670761108 + }, + { + "auxiliary_loss_clip": 0.011015, + "auxiliary_loss_mlp": 0.0102565, + "balance_loss_clip": 1.03538501, + "balance_loss_mlp": 1.01435494, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.5822214535534571, + "language_loss": 0.80860305, + "learning_rate": 7.071630043797317e-07, + "loss": 0.82987452, + "num_input_tokens_seen": 262739995, + "step": 12175, + "time_per_iteration": 2.5340375900268555 + }, + { + "auxiliary_loss_clip": 0.01082489, + "auxiliary_loss_mlp": 0.01026196, + "balance_loss_clip": 1.03391075, + "balance_loss_mlp": 1.01486468, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 2.1877810998487446, + "language_loss": 0.77081192, + "learning_rate": 7.068658762345488e-07, + "loss": 0.79189873, + "num_input_tokens_seen": 262757680, + "step": 12176, + "time_per_iteration": 2.5229880809783936 + }, + { + "auxiliary_loss_clip": 0.01093376, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.03711486, + "balance_loss_mlp": 1.01982582, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 1.785670685780079, + "language_loss": 0.76603872, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78728253, + "num_input_tokens_seen": 262776990, + "step": 12177, + "time_per_iteration": 2.5359249114990234 + }, + { + "auxiliary_loss_clip": 0.01069404, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.03280807, + "balance_loss_mlp": 1.02498209, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 3.2277009465400472, + "language_loss": 0.74902439, + "learning_rate": 7.06271767062772e-07, + "loss": 0.7700808, + "num_input_tokens_seen": 262795440, + "step": 12178, + "time_per_iteration": 2.5238635540008545 + }, + { + "auxiliary_loss_clip": 0.0108552, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.03340244, + "balance_loss_mlp": 1.02009642, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 1.826059033437472, + "language_loss": 0.82664537, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84782118, + "num_input_tokens_seen": 262816385, + "step": 12179, + "time_per_iteration": 2.561383008956909 + }, + { + "auxiliary_loss_clip": 0.01076566, + "auxiliary_loss_mlp": 0.01038721, + "balance_loss_clip": 1.03493857, + "balance_loss_mlp": 1.02650774, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 1.645567605811073, + "language_loss": 0.74722755, + "learning_rate": 7.056778541242115e-07, + "loss": 0.7683804, + "num_input_tokens_seen": 262834955, + "step": 12180, + "time_per_iteration": 2.479093551635742 + }, + { + "auxiliary_loss_clip": 0.01094636, + "auxiliary_loss_mlp": 0.00784351, + "balance_loss_clip": 1.03318477, + "balance_loss_mlp": 1.00963068, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 1.9447181447400566, + "language_loss": 0.79317242, + "learning_rate": 7.053809712705396e-07, + "loss": 0.81196231, + "num_input_tokens_seen": 262853555, + "step": 12181, + "time_per_iteration": 2.5624895095825195 + }, + { + "auxiliary_loss_clip": 0.01096512, + "auxiliary_loss_mlp": 0.00783416, + "balance_loss_clip": 1.03621399, + "balance_loss_mlp": 1.00844932, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 1.7281617960017766, + "language_loss": 0.71418881, + "learning_rate": 7.050841375089506e-07, + "loss": 0.73298812, + "num_input_tokens_seen": 262870975, + "step": 12182, + "time_per_iteration": 2.4794068336486816 + }, + { + "auxiliary_loss_clip": 0.01107688, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.03815758, + "balance_loss_mlp": 1.02242732, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.477633362737075, + "language_loss": 0.7107693, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73218757, + "num_input_tokens_seen": 262892635, + "step": 12183, + "time_per_iteration": 2.540525436401367 + }, + { + "auxiliary_loss_clip": 0.01093975, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.03820705, + "balance_loss_mlp": 1.01931262, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 1.9627729767132935, + "language_loss": 0.72779489, + "learning_rate": 7.04490617307045e-07, + "loss": 0.74905533, + "num_input_tokens_seen": 262910725, + "step": 12184, + "time_per_iteration": 2.523193836212158 + }, + { + "auxiliary_loss_clip": 0.0101514, + "auxiliary_loss_mlp": 0.01003336, + "balance_loss_clip": 1.01120138, + "balance_loss_mlp": 1.00225759, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 1.339435748316049, + "language_loss": 0.65098244, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67116719, + "num_input_tokens_seen": 262974150, + "step": 12185, + "time_per_iteration": 3.1242823600769043 + }, + { + "auxiliary_loss_clip": 0.01104429, + "auxiliary_loss_mlp": 0.01029341, + "balance_loss_clip": 1.03400588, + "balance_loss_mlp": 1.01682985, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 2.0846602095860245, + "language_loss": 0.8041386, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82547629, + "num_input_tokens_seen": 262993370, + "step": 12186, + "time_per_iteration": 2.490663766860962 + }, + { + "auxiliary_loss_clip": 0.01095516, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.03551579, + "balance_loss_mlp": 1.01712859, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 1.6573777505686993, + "language_loss": 0.73288512, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75414258, + "num_input_tokens_seen": 263012665, + "step": 12187, + "time_per_iteration": 2.5017013549804688 + }, + { + "auxiliary_loss_clip": 0.01107925, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.03768647, + "balance_loss_mlp": 1.02054274, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 1.7425932039670355, + "language_loss": 0.8897692, + "learning_rate": 7.033041665033716e-07, + "loss": 0.91117227, + "num_input_tokens_seen": 263031475, + "step": 12188, + "time_per_iteration": 2.503183603286743 + }, + { + "auxiliary_loss_clip": 0.01063339, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.0333935, + "balance_loss_mlp": 1.0200913, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 1.8639320661878513, + "language_loss": 0.74465799, + "learning_rate": 7.030076767014284e-07, + "loss": 0.76562327, + "num_input_tokens_seen": 263051445, + "step": 12189, + "time_per_iteration": 2.592329263687134 + }, + { + "auxiliary_loss_clip": 0.01072035, + "auxiliary_loss_mlp": 0.0102765, + "balance_loss_clip": 1.03475428, + "balance_loss_mlp": 1.0155859, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 1.5422968370155854, + "language_loss": 0.82072794, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84172475, + "num_input_tokens_seen": 263070835, + "step": 12190, + "time_per_iteration": 2.5835111141204834 + }, + { + "auxiliary_loss_clip": 0.0106656, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.03545904, + "balance_loss_mlp": 1.02045035, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 1.9226998516865073, + "language_loss": 0.71865541, + "learning_rate": 7.024148446550204e-07, + "loss": 0.73966062, + "num_input_tokens_seen": 263090070, + "step": 12191, + "time_per_iteration": 2.5783531665802 + }, + { + "auxiliary_loss_clip": 0.01106655, + "auxiliary_loss_mlp": 0.01033272, + "balance_loss_clip": 1.0367496, + "balance_loss_mlp": 1.02069545, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.5300601332581425, + "language_loss": 0.69187987, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71327913, + "num_input_tokens_seen": 263110030, + "step": 12192, + "time_per_iteration": 2.5320897102355957 + }, + { + "auxiliary_loss_clip": 0.01091668, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.03517592, + "balance_loss_mlp": 1.01862121, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.5066739581154027, + "language_loss": 0.73452491, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75574797, + "num_input_tokens_seen": 263129735, + "step": 12193, + "time_per_iteration": 2.5188703536987305 + }, + { + "auxiliary_loss_clip": 0.01096999, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.03587246, + "balance_loss_mlp": 1.01737523, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 1.7770110976381506, + "language_loss": 0.77013612, + "learning_rate": 7.015259656476911e-07, + "loss": 0.79140425, + "num_input_tokens_seen": 263149100, + "step": 12194, + "time_per_iteration": 2.4990944862365723 + }, + { + "auxiliary_loss_clip": 0.01094517, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.03738928, + "balance_loss_mlp": 1.01813054, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 1.8465820776319488, + "language_loss": 0.7045927, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72584414, + "num_input_tokens_seen": 263166620, + "step": 12195, + "time_per_iteration": 2.447614908218384 + }, + { + "auxiliary_loss_clip": 0.01105374, + "auxiliary_loss_mlp": 0.01039184, + "balance_loss_clip": 1.03610265, + "balance_loss_mlp": 1.02716208, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 1.8229861344175058, + "language_loss": 0.72038531, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74183095, + "num_input_tokens_seen": 263184780, + "step": 12196, + "time_per_iteration": 2.4437386989593506 + }, + { + "auxiliary_loss_clip": 0.01103608, + "auxiliary_loss_mlp": 0.01028137, + "balance_loss_clip": 1.03617895, + "balance_loss_mlp": 1.01547742, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.700197375118563, + "language_loss": 0.71750987, + "learning_rate": 7.006375297847394e-07, + "loss": 0.73882729, + "num_input_tokens_seen": 263204625, + "step": 12197, + "time_per_iteration": 2.5084803104400635 + }, + { + "auxiliary_loss_clip": 0.01055234, + "auxiliary_loss_mlp": 0.00784031, + "balance_loss_clip": 1.03443265, + "balance_loss_mlp": 1.008021, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 2.042651736808952, + "language_loss": 0.78240132, + "learning_rate": 7.003414830260282e-07, + "loss": 0.80079395, + "num_input_tokens_seen": 263221565, + "step": 12198, + "time_per_iteration": 2.613983631134033 + }, + { + "auxiliary_loss_clip": 0.01050999, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.03531742, + "balance_loss_mlp": 1.02206969, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 2.028796718795781, + "language_loss": 0.74023187, + "learning_rate": 7.000454855504974e-07, + "loss": 0.7610814, + "num_input_tokens_seen": 263240620, + "step": 12199, + "time_per_iteration": 2.6060001850128174 + }, + { + "auxiliary_loss_clip": 0.01087986, + "auxiliary_loss_mlp": 0.01034948, + "balance_loss_clip": 1.03723025, + "balance_loss_mlp": 1.0224247, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 2.7626850160261682, + "language_loss": 0.77309489, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79432422, + "num_input_tokens_seen": 263254365, + "step": 12200, + "time_per_iteration": 2.5130815505981445 + }, + { + "auxiliary_loss_clip": 0.01067706, + "auxiliary_loss_mlp": 0.01030035, + "balance_loss_clip": 1.03616965, + "balance_loss_mlp": 1.0187881, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 2.467902441565637, + "language_loss": 0.61292195, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63389933, + "num_input_tokens_seen": 263275880, + "step": 12201, + "time_per_iteration": 2.5849809646606445 + }, + { + "auxiliary_loss_clip": 0.01067545, + "auxiliary_loss_mlp": 0.00783156, + "balance_loss_clip": 1.03429484, + "balance_loss_mlp": 1.00961471, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 2.364406813646872, + "language_loss": 0.51949883, + "learning_rate": 6.991577889352264e-07, + "loss": 0.53800583, + "num_input_tokens_seen": 263298315, + "step": 12202, + "time_per_iteration": 4.084860563278198 + }, + { + "auxiliary_loss_clip": 0.01081135, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.03392434, + "balance_loss_mlp": 1.01831484, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.6665454277277358, + "language_loss": 0.69044393, + "learning_rate": 6.98861988704645e-07, + "loss": 0.71155632, + "num_input_tokens_seen": 263318615, + "step": 12203, + "time_per_iteration": 2.5386900901794434 + }, + { + "auxiliary_loss_clip": 0.01081233, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.03659964, + "balance_loss_mlp": 1.02181184, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 3.7118217333262264, + "language_loss": 0.66202009, + "learning_rate": 6.985662378133474e-07, + "loss": 0.68317711, + "num_input_tokens_seen": 263336705, + "step": 12204, + "time_per_iteration": 2.5484304428100586 + }, + { + "auxiliary_loss_clip": 0.01082386, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.03767836, + "balance_loss_mlp": 1.02204669, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 1.8848198565473389, + "language_loss": 0.77421916, + "learning_rate": 6.982705362725479e-07, + "loss": 0.7953766, + "num_input_tokens_seen": 263355065, + "step": 12205, + "time_per_iteration": 2.5387749671936035 + }, + { + "auxiliary_loss_clip": 0.01060258, + "auxiliary_loss_mlp": 0.01026933, + "balance_loss_clip": 1.03569722, + "balance_loss_mlp": 1.01580477, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 1.627393583970014, + "language_loss": 0.79693246, + "learning_rate": 6.979748840934601e-07, + "loss": 0.81780446, + "num_input_tokens_seen": 263374460, + "step": 12206, + "time_per_iteration": 2.6074490547180176 + }, + { + "auxiliary_loss_clip": 0.01065905, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.03248692, + "balance_loss_mlp": 1.01597476, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 1.9012446703026653, + "language_loss": 0.71481061, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73575419, + "num_input_tokens_seen": 263393610, + "step": 12207, + "time_per_iteration": 2.6919336318969727 + }, + { + "auxiliary_loss_clip": 0.01015276, + "auxiliary_loss_mlp": 0.01002401, + "balance_loss_clip": 1.01121271, + "balance_loss_mlp": 1.0011847, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.8817549127639255, + "language_loss": 0.54792237, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56809914, + "num_input_tokens_seen": 263450340, + "step": 12208, + "time_per_iteration": 4.6046624183654785 + }, + { + "auxiliary_loss_clip": 0.01105232, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.03628337, + "balance_loss_mlp": 1.02085292, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.7937534731137583, + "language_loss": 0.80357802, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82494795, + "num_input_tokens_seen": 263471735, + "step": 12209, + "time_per_iteration": 3.8980114459991455 + }, + { + "auxiliary_loss_clip": 0.01101058, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_clip": 1.03381717, + "balance_loss_mlp": 1.01648402, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 1.53708551415469, + "language_loss": 0.78849733, + "learning_rate": 6.96792769218423e-07, + "loss": 0.80978948, + "num_input_tokens_seen": 263493245, + "step": 12210, + "time_per_iteration": 2.51960825920105 + }, + { + "auxiliary_loss_clip": 0.01103194, + "auxiliary_loss_mlp": 0.01029659, + "balance_loss_clip": 1.03597283, + "balance_loss_mlp": 1.01757729, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 1.775541343299107, + "language_loss": 0.76423192, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78556049, + "num_input_tokens_seen": 263511660, + "step": 12211, + "time_per_iteration": 2.4460318088531494 + }, + { + "auxiliary_loss_clip": 0.01081861, + "auxiliary_loss_mlp": 0.01024682, + "balance_loss_clip": 1.03742695, + "balance_loss_mlp": 1.01287997, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 1.8598422557350829, + "language_loss": 0.72286868, + "learning_rate": 6.962020082425748e-07, + "loss": 0.74393415, + "num_input_tokens_seen": 263530875, + "step": 12212, + "time_per_iteration": 2.5685925483703613 + }, + { + "auxiliary_loss_clip": 0.01106175, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.0377115, + "balance_loss_mlp": 1.02211261, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.712077440152238, + "language_loss": 0.69073743, + "learning_rate": 6.959067019092766e-07, + "loss": 0.71213943, + "num_input_tokens_seen": 263551585, + "step": 12213, + "time_per_iteration": 3.878291606903076 + }, + { + "auxiliary_loss_clip": 0.01032499, + "auxiliary_loss_mlp": 0.01002571, + "balance_loss_clip": 1.00922632, + "balance_loss_mlp": 1.00140846, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7311735466843946, + "language_loss": 0.54368991, + "learning_rate": 6.956114450273276e-07, + "loss": 0.5640406, + "num_input_tokens_seen": 263609545, + "step": 12214, + "time_per_iteration": 2.9769935607910156 + }, + { + "auxiliary_loss_clip": 0.01107015, + "auxiliary_loss_mlp": 0.01025552, + "balance_loss_clip": 1.03621316, + "balance_loss_mlp": 1.01377451, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 2.0651299500865417, + "language_loss": 0.70465046, + "learning_rate": 6.953162376079233e-07, + "loss": 0.72597611, + "num_input_tokens_seen": 263627880, + "step": 12215, + "time_per_iteration": 2.4376790523529053 + }, + { + "auxiliary_loss_clip": 0.01076394, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.03471756, + "balance_loss_mlp": 1.0167774, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 1.5755018294224734, + "language_loss": 0.72845769, + "learning_rate": 6.950210796622573e-07, + "loss": 0.74950337, + "num_input_tokens_seen": 263645665, + "step": 12216, + "time_per_iteration": 2.488072156906128 + }, + { + "auxiliary_loss_clip": 0.01113153, + "auxiliary_loss_mlp": 0.01038336, + "balance_loss_clip": 1.03816915, + "balance_loss_mlp": 1.02357793, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 1.7350093420516886, + "language_loss": 0.78005409, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80156904, + "num_input_tokens_seen": 263668170, + "step": 12217, + "time_per_iteration": 2.5204434394836426 + }, + { + "auxiliary_loss_clip": 0.01065425, + "auxiliary_loss_mlp": 0.01027452, + "balance_loss_clip": 1.03477049, + "balance_loss_mlp": 1.01670504, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 2.12153979932035, + "language_loss": 0.78145021, + "learning_rate": 6.94430912236911e-07, + "loss": 0.80237895, + "num_input_tokens_seen": 263684190, + "step": 12218, + "time_per_iteration": 2.5613982677459717 + }, + { + "auxiliary_loss_clip": 0.01056501, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.03197646, + "balance_loss_mlp": 1.02416527, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 2.0578586359050384, + "language_loss": 0.72402221, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74496645, + "num_input_tokens_seen": 263702095, + "step": 12219, + "time_per_iteration": 2.5851259231567383 + }, + { + "auxiliary_loss_clip": 0.01079254, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.03237462, + "balance_loss_mlp": 1.0189054, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 1.7588658218794593, + "language_loss": 0.74802715, + "learning_rate": 6.938409428408061e-07, + "loss": 0.76912618, + "num_input_tokens_seen": 263721385, + "step": 12220, + "time_per_iteration": 2.5264031887054443 + }, + { + "auxiliary_loss_clip": 0.01096564, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.0360992, + "balance_loss_mlp": 1.01869631, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 1.6600898594294404, + "language_loss": 0.66201377, + "learning_rate": 6.93546032431684e-07, + "loss": 0.68328822, + "num_input_tokens_seen": 263737835, + "step": 12221, + "time_per_iteration": 2.4600467681884766 + }, + { + "auxiliary_loss_clip": 0.01079308, + "auxiliary_loss_mlp": 0.01034285, + "balance_loss_clip": 1.03333628, + "balance_loss_mlp": 1.02145219, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 2.115563016556717, + "language_loss": 0.6899718, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71110767, + "num_input_tokens_seen": 263756480, + "step": 12222, + "time_per_iteration": 2.535013198852539 + }, + { + "auxiliary_loss_clip": 0.01060002, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.03457642, + "balance_loss_mlp": 1.02005041, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 1.6767165001044861, + "language_loss": 0.65550077, + "learning_rate": 6.92956360247217e-07, + "loss": 0.67641211, + "num_input_tokens_seen": 263776440, + "step": 12223, + "time_per_iteration": 2.625537157058716 + }, + { + "auxiliary_loss_clip": 0.0108961, + "auxiliary_loss_mlp": 0.01030071, + "balance_loss_clip": 1.03562772, + "balance_loss_mlp": 1.01768529, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 1.849969257871437, + "language_loss": 0.72001213, + "learning_rate": 6.926615984942332e-07, + "loss": 0.74120891, + "num_input_tokens_seen": 263793700, + "step": 12224, + "time_per_iteration": 2.5144588947296143 + }, + { + "auxiliary_loss_clip": 0.01075618, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.036098, + "balance_loss_mlp": 1.01657856, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.6646096660211382, + "language_loss": 0.72225451, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74329317, + "num_input_tokens_seen": 263814620, + "step": 12225, + "time_per_iteration": 2.6347968578338623 + }, + { + "auxiliary_loss_clip": 0.01108627, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.03639245, + "balance_loss_mlp": 1.01850569, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 1.7135112745157375, + "language_loss": 0.76180738, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78320873, + "num_input_tokens_seen": 263832725, + "step": 12226, + "time_per_iteration": 2.4667840003967285 + }, + { + "auxiliary_loss_clip": 0.01078156, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.03343952, + "balance_loss_mlp": 1.01797211, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 3.452584062342471, + "language_loss": 0.66756749, + "learning_rate": 6.917776107264008e-07, + "loss": 0.68866313, + "num_input_tokens_seen": 263853850, + "step": 12227, + "time_per_iteration": 2.5390729904174805 + }, + { + "auxiliary_loss_clip": 0.01096117, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.03560305, + "balance_loss_mlp": 1.02129698, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 1.4870053125623355, + "language_loss": 0.63405085, + "learning_rate": 6.914830473380749e-07, + "loss": 0.655339, + "num_input_tokens_seen": 263874760, + "step": 12228, + "time_per_iteration": 2.54447340965271 + }, + { + "auxiliary_loss_clip": 0.01084562, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.03640485, + "balance_loss_mlp": 1.02297544, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 1.6764424586960494, + "language_loss": 0.63391674, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65510225, + "num_input_tokens_seen": 263893390, + "step": 12229, + "time_per_iteration": 2.5210559368133545 + }, + { + "auxiliary_loss_clip": 0.01080849, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.03706431, + "balance_loss_mlp": 1.02267456, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 1.6462811326811897, + "language_loss": 0.73378217, + "learning_rate": 6.908940694298726e-07, + "loss": 0.75494283, + "num_input_tokens_seen": 263911180, + "step": 12230, + "time_per_iteration": 2.56764554977417 + }, + { + "auxiliary_loss_clip": 0.01051782, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.03359604, + "balance_loss_mlp": 1.01602566, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 2.282001459608554, + "language_loss": 0.71901137, + "learning_rate": 6.90599654932332e-07, + "loss": 0.73981309, + "num_input_tokens_seen": 263928975, + "step": 12231, + "time_per_iteration": 2.6062746047973633 + }, + { + "auxiliary_loss_clip": 0.01097895, + "auxiliary_loss_mlp": 0.01036533, + "balance_loss_clip": 1.03812039, + "balance_loss_mlp": 1.02279401, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 2.110570868040485, + "language_loss": 0.63696784, + "learning_rate": 6.903052900873823e-07, + "loss": 0.65831208, + "num_input_tokens_seen": 263944495, + "step": 12232, + "time_per_iteration": 2.486546754837036 + }, + { + "auxiliary_loss_clip": 0.01085913, + "auxiliary_loss_mlp": 0.01028669, + "balance_loss_clip": 1.03530657, + "balance_loss_mlp": 1.01647377, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.962107713726942, + "language_loss": 0.75356728, + "learning_rate": 6.900109749061874e-07, + "loss": 0.7747131, + "num_input_tokens_seen": 263961325, + "step": 12233, + "time_per_iteration": 2.507244110107422 + }, + { + "auxiliary_loss_clip": 0.01106017, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.03645921, + "balance_loss_mlp": 1.01395512, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 1.571969347270664, + "language_loss": 0.73178595, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75311106, + "num_input_tokens_seen": 263980445, + "step": 12234, + "time_per_iteration": 2.446500062942505 + }, + { + "auxiliary_loss_clip": 0.01094377, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.03648901, + "balance_loss_mlp": 1.01697004, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 2.6270870041554035, + "language_loss": 0.60499299, + "learning_rate": 6.894224935797017e-07, + "loss": 0.62623477, + "num_input_tokens_seen": 263999330, + "step": 12235, + "time_per_iteration": 2.547255516052246 + }, + { + "auxiliary_loss_clip": 0.01082717, + "auxiliary_loss_mlp": 0.01022857, + "balance_loss_clip": 1.03621554, + "balance_loss_mlp": 1.01113844, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 2.2443556436114975, + "language_loss": 0.86113381, + "learning_rate": 6.891283274567259e-07, + "loss": 0.88218951, + "num_input_tokens_seen": 264014150, + "step": 12236, + "time_per_iteration": 2.4753215312957764 + }, + { + "auxiliary_loss_clip": 0.01096238, + "auxiliary_loss_mlp": 0.00784168, + "balance_loss_clip": 1.03739667, + "balance_loss_mlp": 1.01160359, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 1.803286416183646, + "language_loss": 0.69483215, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71363616, + "num_input_tokens_seen": 264033140, + "step": 12237, + "time_per_iteration": 2.505621910095215 + }, + { + "auxiliary_loss_clip": 0.01022046, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.03254652, + "balance_loss_mlp": 1.01680148, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 1.5371272079682532, + "language_loss": 0.71948314, + "learning_rate": 6.885401443470839e-07, + "loss": 0.73999476, + "num_input_tokens_seen": 264052105, + "step": 12238, + "time_per_iteration": 2.931239128112793 + }, + { + "auxiliary_loss_clip": 0.01076327, + "auxiliary_loss_mlp": 0.01028364, + "balance_loss_clip": 1.03377461, + "balance_loss_mlp": 1.01544809, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 1.8368415184220992, + "language_loss": 0.72204721, + "learning_rate": 6.882461273827205e-07, + "loss": 0.74309409, + "num_input_tokens_seen": 264070690, + "step": 12239, + "time_per_iteration": 2.8123621940612793 + }, + { + "auxiliary_loss_clip": 0.01082607, + "auxiliary_loss_mlp": 0.01029716, + "balance_loss_clip": 1.03724265, + "balance_loss_mlp": 1.01773548, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.4644767062299358, + "language_loss": 0.78919506, + "learning_rate": 6.879521601601954e-07, + "loss": 0.81031829, + "num_input_tokens_seen": 264094225, + "step": 12240, + "time_per_iteration": 2.5937659740448 + }, + { + "auxiliary_loss_clip": 0.01095373, + "auxiliary_loss_mlp": 0.01033866, + "balance_loss_clip": 1.03681374, + "balance_loss_mlp": 1.02128983, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 1.8557452940590293, + "language_loss": 0.830567, + "learning_rate": 6.876582426906565e-07, + "loss": 0.85185939, + "num_input_tokens_seen": 264113190, + "step": 12241, + "time_per_iteration": 4.27785062789917 + }, + { + "auxiliary_loss_clip": 0.01092837, + "auxiliary_loss_mlp": 0.01024156, + "balance_loss_clip": 1.03455997, + "balance_loss_mlp": 1.01252699, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 4.414466012716064, + "language_loss": 0.78595948, + "learning_rate": 6.873643749852484e-07, + "loss": 0.80712938, + "num_input_tokens_seen": 264132050, + "step": 12242, + "time_per_iteration": 2.534701108932495 + }, + { + "auxiliary_loss_clip": 0.01057191, + "auxiliary_loss_mlp": 0.01029853, + "balance_loss_clip": 1.03385282, + "balance_loss_mlp": 1.01757479, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 1.6923396884775845, + "language_loss": 0.79179448, + "learning_rate": 6.870705570551145e-07, + "loss": 0.81266493, + "num_input_tokens_seen": 264152800, + "step": 12243, + "time_per_iteration": 2.6337709426879883 + }, + { + "auxiliary_loss_clip": 0.01095615, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.03552127, + "balance_loss_mlp": 1.01640737, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 2.261697794486923, + "language_loss": 0.74545825, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76670742, + "num_input_tokens_seen": 264169650, + "step": 12244, + "time_per_iteration": 2.4749157428741455 + }, + { + "auxiliary_loss_clip": 0.01089297, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.03390479, + "balance_loss_mlp": 1.01786196, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 1.7252549826813348, + "language_loss": 0.69470078, + "learning_rate": 6.864830705652347e-07, + "loss": 0.71589941, + "num_input_tokens_seen": 264190530, + "step": 12245, + "time_per_iteration": 2.5161588191986084 + }, + { + "auxiliary_loss_clip": 0.01067915, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.03686202, + "balance_loss_mlp": 1.02002048, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 1.4275159038208167, + "language_loss": 0.73296982, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75397182, + "num_input_tokens_seen": 264210820, + "step": 12246, + "time_per_iteration": 4.397065162658691 + }, + { + "auxiliary_loss_clip": 0.01080047, + "auxiliary_loss_mlp": 0.01022964, + "balance_loss_clip": 1.03430128, + "balance_loss_mlp": 1.01182961, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 2.0499275787373454, + "language_loss": 0.72952509, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75055528, + "num_input_tokens_seen": 264227430, + "step": 12247, + "time_per_iteration": 3.905771255493164 + }, + { + "auxiliary_loss_clip": 0.01094424, + "auxiliary_loss_mlp": 0.01028126, + "balance_loss_clip": 1.03880322, + "balance_loss_mlp": 1.01672959, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 2.848233783290102, + "language_loss": 0.74280155, + "learning_rate": 6.856022144234526e-07, + "loss": 0.764027, + "num_input_tokens_seen": 264245230, + "step": 12248, + "time_per_iteration": 2.473585605621338 + }, + { + "auxiliary_loss_clip": 0.01084834, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.03551793, + "balance_loss_mlp": 1.021052, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 2.0804214440191555, + "language_loss": 0.72649658, + "learning_rate": 6.853086953788727e-07, + "loss": 0.74767995, + "num_input_tokens_seen": 264263945, + "step": 12249, + "time_per_iteration": 2.5261425971984863 + }, + { + "auxiliary_loss_clip": 0.01083441, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.0356549, + "balance_loss_mlp": 1.01842618, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 1.8359377653879636, + "language_loss": 0.76977873, + "learning_rate": 6.850152261875189e-07, + "loss": 0.79092169, + "num_input_tokens_seen": 264281500, + "step": 12250, + "time_per_iteration": 2.5651333332061768 + }, + { + "auxiliary_loss_clip": 0.01063446, + "auxiliary_loss_mlp": 0.010277, + "balance_loss_clip": 1.03628933, + "balance_loss_mlp": 1.01519465, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 1.5159530573107787, + "language_loss": 0.71277678, + "learning_rate": 6.8472180686052e-07, + "loss": 0.73368829, + "num_input_tokens_seen": 264301625, + "step": 12251, + "time_per_iteration": 4.098764896392822 + }, + { + "auxiliary_loss_clip": 0.01088823, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.0356282, + "balance_loss_mlp": 1.0192951, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.54942445521682, + "language_loss": 0.65834486, + "learning_rate": 6.844284374090015e-07, + "loss": 0.67955101, + "num_input_tokens_seen": 264323975, + "step": 12252, + "time_per_iteration": 2.857226848602295 + }, + { + "auxiliary_loss_clip": 0.0105745, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.03448367, + "balance_loss_mlp": 1.02121139, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 1.9511667791874612, + "language_loss": 0.79415089, + "learning_rate": 6.841351178440884e-07, + "loss": 0.81506449, + "num_input_tokens_seen": 264343785, + "step": 12253, + "time_per_iteration": 2.606703281402588 + }, + { + "auxiliary_loss_clip": 0.01101848, + "auxiliary_loss_mlp": 0.00783242, + "balance_loss_clip": 1.03584242, + "balance_loss_mlp": 1.0096736, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 2.2140684375271307, + "language_loss": 0.76428986, + "learning_rate": 6.83841848176905e-07, + "loss": 0.78314078, + "num_input_tokens_seen": 264361130, + "step": 12254, + "time_per_iteration": 2.4646682739257812 + }, + { + "auxiliary_loss_clip": 0.01079758, + "auxiliary_loss_mlp": 0.01039489, + "balance_loss_clip": 1.03461409, + "balance_loss_mlp": 1.02540469, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 2.5344924309389207, + "language_loss": 0.69225442, + "learning_rate": 6.835486284185692e-07, + "loss": 0.71344686, + "num_input_tokens_seen": 264376965, + "step": 12255, + "time_per_iteration": 2.549440383911133 + }, + { + "auxiliary_loss_clip": 0.01095776, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.03708506, + "balance_loss_mlp": 1.02045751, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 1.6391626278783977, + "language_loss": 0.75323492, + "learning_rate": 6.832554585802012e-07, + "loss": 0.77452517, + "num_input_tokens_seen": 264396310, + "step": 12256, + "time_per_iteration": 2.5389461517333984 + }, + { + "auxiliary_loss_clip": 0.01095182, + "auxiliary_loss_mlp": 0.01026812, + "balance_loss_clip": 1.03571129, + "balance_loss_mlp": 1.01428962, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.5867398258313095, + "language_loss": 0.73907888, + "learning_rate": 6.829623386729182e-07, + "loss": 0.76029879, + "num_input_tokens_seen": 264418085, + "step": 12257, + "time_per_iteration": 2.6123385429382324 + }, + { + "auxiliary_loss_clip": 0.01088575, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.03389502, + "balance_loss_mlp": 1.02344441, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.5406634029883848, + "language_loss": 0.78340244, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80464518, + "num_input_tokens_seen": 264437595, + "step": 12258, + "time_per_iteration": 2.5139074325561523 + }, + { + "auxiliary_loss_clip": 0.01097596, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.03728247, + "balance_loss_mlp": 1.02013099, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.3971096100920117, + "language_loss": 0.66290808, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68420589, + "num_input_tokens_seen": 264457385, + "step": 12259, + "time_per_iteration": 2.4997944831848145 + }, + { + "auxiliary_loss_clip": 0.01095797, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.0373981, + "balance_loss_mlp": 1.02126622, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 1.649248196334524, + "language_loss": 0.73059011, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75189233, + "num_input_tokens_seen": 264477205, + "step": 12260, + "time_per_iteration": 2.524329900741577 + }, + { + "auxiliary_loss_clip": 0.01092731, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.0362463, + "balance_loss_mlp": 1.01873493, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 1.6681404996888443, + "language_loss": 0.73715812, + "learning_rate": 6.817903585769125e-07, + "loss": 0.7584008, + "num_input_tokens_seen": 264497195, + "step": 12261, + "time_per_iteration": 2.501664638519287 + }, + { + "auxiliary_loss_clip": 0.0108695, + "auxiliary_loss_mlp": 0.01035216, + "balance_loss_clip": 1.03537273, + "balance_loss_mlp": 1.02181649, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 1.9024526313002705, + "language_loss": 0.66646618, + "learning_rate": 6.814974884917438e-07, + "loss": 0.68768781, + "num_input_tokens_seen": 264516950, + "step": 12262, + "time_per_iteration": 2.56613826751709 + }, + { + "auxiliary_loss_clip": 0.01109316, + "auxiliary_loss_mlp": 0.01029665, + "balance_loss_clip": 1.03809679, + "balance_loss_mlp": 1.01686168, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 1.6952044984541146, + "language_loss": 0.88651508, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90790492, + "num_input_tokens_seen": 264532675, + "step": 12263, + "time_per_iteration": 2.4330577850341797 + }, + { + "auxiliary_loss_clip": 0.01101626, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.03661752, + "balance_loss_mlp": 1.01885581, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 1.4975919100971744, + "language_loss": 0.67358983, + "learning_rate": 6.809118983257522e-07, + "loss": 0.69489795, + "num_input_tokens_seen": 264555635, + "step": 12264, + "time_per_iteration": 2.5394296646118164 + }, + { + "auxiliary_loss_clip": 0.01102409, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.03564763, + "balance_loss_mlp": 1.01797998, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 2.3049755868806123, + "language_loss": 0.8009696, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82228673, + "num_input_tokens_seen": 264573140, + "step": 12265, + "time_per_iteration": 2.549098491668701 + }, + { + "auxiliary_loss_clip": 0.01099647, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.03602409, + "balance_loss_mlp": 1.01912344, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 1.755031647850366, + "language_loss": 0.74719024, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76850581, + "num_input_tokens_seen": 264591610, + "step": 12266, + "time_per_iteration": 2.5248162746429443 + }, + { + "auxiliary_loss_clip": 0.01096795, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.03730857, + "balance_loss_mlp": 1.02242827, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.6436926747938332, + "language_loss": 0.73596507, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75728303, + "num_input_tokens_seen": 264611170, + "step": 12267, + "time_per_iteration": 2.5502076148986816 + }, + { + "auxiliary_loss_clip": 0.0107077, + "auxiliary_loss_mlp": 0.01032983, + "balance_loss_clip": 1.03500128, + "balance_loss_mlp": 1.02118766, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 1.9677095060780894, + "language_loss": 0.82768911, + "learning_rate": 6.797413183219923e-07, + "loss": 0.84872663, + "num_input_tokens_seen": 264629365, + "step": 12268, + "time_per_iteration": 2.5520355701446533 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.01039711, + "balance_loss_clip": 1.03674626, + "balance_loss_mlp": 1.02727795, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.8374970787500802, + "language_loss": 0.73165661, + "learning_rate": 6.794487984541677e-07, + "loss": 0.75310272, + "num_input_tokens_seen": 264647915, + "step": 12269, + "time_per_iteration": 2.4777140617370605 + }, + { + "auxiliary_loss_clip": 0.01082507, + "auxiliary_loss_mlp": 0.01038705, + "balance_loss_clip": 1.03474736, + "balance_loss_mlp": 1.02518058, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 1.8391508250332866, + "language_loss": 0.70199084, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72320306, + "num_input_tokens_seen": 264669620, + "step": 12270, + "time_per_iteration": 2.649378538131714 + }, + { + "auxiliary_loss_clip": 0.01091606, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.03429186, + "balance_loss_mlp": 1.01709723, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 1.889266105271503, + "language_loss": 0.6950115, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71621197, + "num_input_tokens_seen": 264689345, + "step": 12271, + "time_per_iteration": 2.5344760417938232 + }, + { + "auxiliary_loss_clip": 0.01082271, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.03763664, + "balance_loss_mlp": 1.01420712, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 1.908794399890235, + "language_loss": 0.67322421, + "learning_rate": 6.785715393476586e-07, + "loss": 0.69431716, + "num_input_tokens_seen": 264707625, + "step": 12272, + "time_per_iteration": 2.5376393795013428 + }, + { + "auxiliary_loss_clip": 0.01081993, + "auxiliary_loss_mlp": 0.01029634, + "balance_loss_clip": 1.03626585, + "balance_loss_mlp": 1.01773095, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 1.8552444717244574, + "language_loss": 0.78061867, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80173498, + "num_input_tokens_seen": 264725575, + "step": 12273, + "time_per_iteration": 2.5057506561279297 + }, + { + "auxiliary_loss_clip": 0.0110413, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.03556728, + "balance_loss_mlp": 1.01920021, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 1.8137083227894086, + "language_loss": 0.83818686, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85954034, + "num_input_tokens_seen": 264742855, + "step": 12274, + "time_per_iteration": 2.4401180744171143 + }, + { + "auxiliary_loss_clip": 0.01085144, + "auxiliary_loss_mlp": 0.00784518, + "balance_loss_clip": 1.03696907, + "balance_loss_mlp": 1.00899816, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 2.118018273399142, + "language_loss": 0.7386927, + "learning_rate": 6.776947312194341e-07, + "loss": 0.75738931, + "num_input_tokens_seen": 264761155, + "step": 12275, + "time_per_iteration": 2.517084836959839 + }, + { + "auxiliary_loss_clip": 0.01071827, + "auxiliary_loss_mlp": 0.01048611, + "balance_loss_clip": 1.03464818, + "balance_loss_mlp": 1.03452075, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 3.7814404448269476, + "language_loss": 0.73097539, + "learning_rate": 6.774025621124813e-07, + "loss": 0.7521798, + "num_input_tokens_seen": 264780660, + "step": 12276, + "time_per_iteration": 2.5628018379211426 + }, + { + "auxiliary_loss_clip": 0.01106768, + "auxiliary_loss_mlp": 0.01030419, + "balance_loss_clip": 1.03559589, + "balance_loss_mlp": 1.01810443, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 1.8701447207174482, + "language_loss": 0.77353621, + "learning_rate": 6.771104431585551e-07, + "loss": 0.79490805, + "num_input_tokens_seen": 264798850, + "step": 12277, + "time_per_iteration": 2.464379072189331 + }, + { + "auxiliary_loss_clip": 0.01104707, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.03740954, + "balance_loss_mlp": 1.02572298, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 1.6640177124439044, + "language_loss": 0.78713512, + "learning_rate": 6.768183743687338e-07, + "loss": 0.80856156, + "num_input_tokens_seen": 264816795, + "step": 12278, + "time_per_iteration": 2.440934419631958 + }, + { + "auxiliary_loss_clip": 0.01096238, + "auxiliary_loss_mlp": 0.00784471, + "balance_loss_clip": 1.03572643, + "balance_loss_mlp": 1.00876486, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 2.1219333399385603, + "language_loss": 0.72243154, + "learning_rate": 6.765263557540921e-07, + "loss": 0.74123865, + "num_input_tokens_seen": 264834105, + "step": 12279, + "time_per_iteration": 2.4752302169799805 + }, + { + "auxiliary_loss_clip": 0.01096934, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.03484464, + "balance_loss_mlp": 1.0216198, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 2.117303878868669, + "language_loss": 0.85729134, + "learning_rate": 6.762343873257034e-07, + "loss": 0.87860805, + "num_input_tokens_seen": 264850895, + "step": 12280, + "time_per_iteration": 3.8616223335266113 + }, + { + "auxiliary_loss_clip": 0.01068203, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.03401518, + "balance_loss_mlp": 1.01855826, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 1.72940291397195, + "language_loss": 0.72087502, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74186897, + "num_input_tokens_seen": 264869505, + "step": 12281, + "time_per_iteration": 2.5571165084838867 + }, + { + "auxiliary_loss_clip": 0.01063207, + "auxiliary_loss_mlp": 0.0102964, + "balance_loss_clip": 1.03339911, + "balance_loss_mlp": 1.01723623, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.5816832766456486, + "language_loss": 0.6029408, + "learning_rate": 6.756506010719711e-07, + "loss": 0.62386918, + "num_input_tokens_seen": 264886915, + "step": 12282, + "time_per_iteration": 2.5720293521881104 + }, + { + "auxiliary_loss_clip": 0.01074013, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.03636336, + "balance_loss_mlp": 1.01888466, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 1.8467512464283036, + "language_loss": 0.67846388, + "learning_rate": 6.753587832687632e-07, + "loss": 0.6995157, + "num_input_tokens_seen": 264910350, + "step": 12283, + "time_per_iteration": 2.624000072479248 + }, + { + "auxiliary_loss_clip": 0.01106367, + "auxiliary_loss_mlp": 0.00782278, + "balance_loss_clip": 1.03740656, + "balance_loss_mlp": 1.0073235, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.6976568284020885, + "language_loss": 0.75800401, + "learning_rate": 6.750670156960832e-07, + "loss": 0.7768904, + "num_input_tokens_seen": 264930705, + "step": 12284, + "time_per_iteration": 2.587664842605591 + }, + { + "auxiliary_loss_clip": 0.01089458, + "auxiliary_loss_mlp": 0.01033368, + "balance_loss_clip": 1.03313136, + "balance_loss_mlp": 1.02004623, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 1.834889170138869, + "language_loss": 0.69750881, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71873713, + "num_input_tokens_seen": 264946975, + "step": 12285, + "time_per_iteration": 3.947385787963867 + }, + { + "auxiliary_loss_clip": 0.01089034, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.03535342, + "balance_loss_mlp": 1.02032685, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 1.801741176361625, + "language_loss": 0.79848969, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81971633, + "num_input_tokens_seen": 264967665, + "step": 12286, + "time_per_iteration": 4.046239852905273 + }, + { + "auxiliary_loss_clip": 0.01057757, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.03469312, + "balance_loss_mlp": 1.01778603, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 4.586252862763562, + "language_loss": 0.65795374, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67883658, + "num_input_tokens_seen": 264985480, + "step": 12287, + "time_per_iteration": 2.6108880043029785 + }, + { + "auxiliary_loss_clip": 0.01079346, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.03249693, + "balance_loss_mlp": 1.01565433, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 1.9892578809231651, + "language_loss": 0.76945537, + "learning_rate": 6.739004479318903e-07, + "loss": 0.79052347, + "num_input_tokens_seen": 265004790, + "step": 12288, + "time_per_iteration": 2.572993040084839 + }, + { + "auxiliary_loss_clip": 0.01097648, + "auxiliary_loss_mlp": 0.00786052, + "balance_loss_clip": 1.03791738, + "balance_loss_mlp": 1.00946975, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 1.5768673451927742, + "language_loss": 0.58356863, + "learning_rate": 6.736089316777684e-07, + "loss": 0.60240567, + "num_input_tokens_seen": 265028790, + "step": 12289, + "time_per_iteration": 2.7142226696014404 + }, + { + "auxiliary_loss_clip": 0.01033508, + "auxiliary_loss_mlp": 0.00761544, + "balance_loss_clip": 1.01017082, + "balance_loss_mlp": 0.9996047, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.6381458542438141, + "language_loss": 0.49293876, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51088929, + "num_input_tokens_seen": 265096660, + "step": 12290, + "time_per_iteration": 4.5651984214782715 + }, + { + "auxiliary_loss_clip": 0.01099319, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.03799224, + "balance_loss_mlp": 1.01567376, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 1.8703093746417219, + "language_loss": 0.67650044, + "learning_rate": 6.730260500712237e-07, + "loss": 0.69778121, + "num_input_tokens_seen": 265116375, + "step": 12291, + "time_per_iteration": 2.532646894454956 + }, + { + "auxiliary_loss_clip": 0.00994627, + "auxiliary_loss_mlp": 0.01003747, + "balance_loss_clip": 1.01327705, + "balance_loss_mlp": 1.00258446, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9939788614391721, + "language_loss": 0.60947764, + "learning_rate": 6.727346847409052e-07, + "loss": 0.62946141, + "num_input_tokens_seen": 265161230, + "step": 12292, + "time_per_iteration": 2.8518669605255127 + }, + { + "auxiliary_loss_clip": 0.01058922, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.03556681, + "balance_loss_mlp": 1.0235858, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 1.7828189491854698, + "language_loss": 0.67029953, + "learning_rate": 6.724433697406191e-07, + "loss": 0.69124258, + "num_input_tokens_seen": 265182515, + "step": 12293, + "time_per_iteration": 2.6730523109436035 + }, + { + "auxiliary_loss_clip": 0.01094813, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.03514123, + "balance_loss_mlp": 1.01978362, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 1.8296831979601011, + "language_loss": 0.83616722, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85743701, + "num_input_tokens_seen": 265198160, + "step": 12294, + "time_per_iteration": 2.498258352279663 + }, + { + "auxiliary_loss_clip": 0.01069534, + "auxiliary_loss_mlp": 0.0103202, + "balance_loss_clip": 1.03427815, + "balance_loss_mlp": 1.01982522, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 1.7127353828117067, + "language_loss": 0.73141944, + "learning_rate": 6.718608907743337e-07, + "loss": 0.75243503, + "num_input_tokens_seen": 265218480, + "step": 12295, + "time_per_iteration": 2.6299564838409424 + }, + { + "auxiliary_loss_clip": 0.01093521, + "auxiliary_loss_mlp": 0.01035698, + "balance_loss_clip": 1.03714108, + "balance_loss_mlp": 1.02391422, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 1.8087195099266489, + "language_loss": 0.78948843, + "learning_rate": 6.715697268304215e-07, + "loss": 0.81078064, + "num_input_tokens_seen": 265240165, + "step": 12296, + "time_per_iteration": 2.5760204792022705 + }, + { + "auxiliary_loss_clip": 0.01105426, + "auxiliary_loss_mlp": 0.01033259, + "balance_loss_clip": 1.03646839, + "balance_loss_mlp": 1.02032471, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 1.9427972700579963, + "language_loss": 0.66239613, + "learning_rate": 6.712786132607182e-07, + "loss": 0.68378294, + "num_input_tokens_seen": 265263295, + "step": 12297, + "time_per_iteration": 2.597039222717285 + }, + { + "auxiliary_loss_clip": 0.01080839, + "auxiliary_loss_mlp": 0.01033425, + "balance_loss_clip": 1.03562379, + "balance_loss_mlp": 1.02058578, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 1.6854586310371844, + "language_loss": 0.68881655, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70995915, + "num_input_tokens_seen": 265282740, + "step": 12298, + "time_per_iteration": 2.512849807739258 + }, + { + "auxiliary_loss_clip": 0.01085526, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03578091, + "balance_loss_mlp": 1.01883316, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 1.861003250412372, + "language_loss": 0.74677885, + "learning_rate": 6.706965372880946e-07, + "loss": 0.76794446, + "num_input_tokens_seen": 265300175, + "step": 12299, + "time_per_iteration": 2.497136116027832 + }, + { + "auxiliary_loss_clip": 0.01018794, + "auxiliary_loss_mlp": 0.01003528, + "balance_loss_clip": 1.02172089, + "balance_loss_mlp": 1.00219274, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7275575326507032, + "language_loss": 0.6084851, + "learning_rate": 6.704055749072455e-07, + "loss": 0.62870824, + "num_input_tokens_seen": 265363275, + "step": 12300, + "time_per_iteration": 3.2091503143310547 + }, + { + "auxiliary_loss_clip": 0.01085488, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.03776503, + "balance_loss_mlp": 1.0198034, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 1.7993055451662745, + "language_loss": 0.80176955, + "learning_rate": 6.7011466294475e-07, + "loss": 0.82295227, + "num_input_tokens_seen": 265382935, + "step": 12301, + "time_per_iteration": 2.5261266231536865 + }, + { + "auxiliary_loss_clip": 0.01103943, + "auxiliary_loss_mlp": 0.01029259, + "balance_loss_clip": 1.03638506, + "balance_loss_mlp": 1.01818466, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.4947391976411573, + "language_loss": 0.73079121, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75212324, + "num_input_tokens_seen": 265403245, + "step": 12302, + "time_per_iteration": 2.502429962158203 + }, + { + "auxiliary_loss_clip": 0.01105628, + "auxiliary_loss_mlp": 0.01037552, + "balance_loss_clip": 1.03580451, + "balance_loss_mlp": 1.02546453, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 1.8794527538342534, + "language_loss": 0.73970133, + "learning_rate": 6.695329903189451e-07, + "loss": 0.76113307, + "num_input_tokens_seen": 265423105, + "step": 12303, + "time_per_iteration": 2.502476453781128 + }, + { + "auxiliary_loss_clip": 0.01102643, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.03536367, + "balance_loss_mlp": 1.01846731, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.7548378874660553, + "language_loss": 0.54318881, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56451398, + "num_input_tokens_seen": 265443445, + "step": 12304, + "time_per_iteration": 2.5142760276794434 + }, + { + "auxiliary_loss_clip": 0.01083184, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.03501725, + "balance_loss_mlp": 1.01979184, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 2.1053804559735085, + "language_loss": 0.84618068, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86733174, + "num_input_tokens_seen": 265462085, + "step": 12305, + "time_per_iteration": 2.5424485206604004 + }, + { + "auxiliary_loss_clip": 0.01010901, + "auxiliary_loss_mlp": 0.01001353, + "balance_loss_clip": 1.01084185, + "balance_loss_mlp": 1.00019693, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.8663867832607923, + "language_loss": 0.57716739, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59728992, + "num_input_tokens_seen": 265521190, + "step": 12306, + "time_per_iteration": 3.1225342750549316 + }, + { + "auxiliary_loss_clip": 0.01086679, + "auxiliary_loss_mlp": 0.01029753, + "balance_loss_clip": 1.03750813, + "balance_loss_mlp": 1.01712871, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 1.835490745343567, + "language_loss": 0.81518805, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83635235, + "num_input_tokens_seen": 265539705, + "step": 12307, + "time_per_iteration": 2.5099899768829346 + }, + { + "auxiliary_loss_clip": 0.01087925, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.03679669, + "balance_loss_mlp": 1.01672864, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 1.673617090278493, + "language_loss": 0.69768775, + "learning_rate": 6.680796918475893e-07, + "loss": 0.71884882, + "num_input_tokens_seen": 265555855, + "step": 12308, + "time_per_iteration": 2.47898530960083 + }, + { + "auxiliary_loss_clip": 0.01080927, + "auxiliary_loss_mlp": 0.01027976, + "balance_loss_clip": 1.03531528, + "balance_loss_mlp": 1.0166986, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 1.7721586879990878, + "language_loss": 0.81161582, + "learning_rate": 6.67789183628896e-07, + "loss": 0.8327049, + "num_input_tokens_seen": 265575455, + "step": 12309, + "time_per_iteration": 2.567465305328369 + }, + { + "auxiliary_loss_clip": 0.01093161, + "auxiliary_loss_mlp": 0.01029939, + "balance_loss_clip": 1.03703606, + "balance_loss_mlp": 1.01773763, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 1.753373351604273, + "language_loss": 0.72658122, + "learning_rate": 6.674987259277692e-07, + "loss": 0.74781227, + "num_input_tokens_seen": 265595250, + "step": 12310, + "time_per_iteration": 2.517313241958618 + }, + { + "auxiliary_loss_clip": 0.01072296, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.03922391, + "balance_loss_mlp": 1.02424526, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.4601276452080842, + "language_loss": 0.88534355, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90643835, + "num_input_tokens_seen": 265606945, + "step": 12311, + "time_per_iteration": 2.497124195098877 + }, + { + "auxiliary_loss_clip": 0.01042454, + "auxiliary_loss_mlp": 0.01028872, + "balance_loss_clip": 1.03218579, + "balance_loss_mlp": 1.01718962, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 1.4616140586283417, + "language_loss": 0.80172503, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82243836, + "num_input_tokens_seen": 265626115, + "step": 12312, + "time_per_iteration": 2.6167969703674316 + }, + { + "auxiliary_loss_clip": 0.01058117, + "auxiliary_loss_mlp": 0.01034236, + "balance_loss_clip": 1.03404331, + "balance_loss_mlp": 1.02185607, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 1.7097445182311732, + "language_loss": 0.78284639, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80376995, + "num_input_tokens_seen": 265646520, + "step": 12313, + "time_per_iteration": 2.6160778999328613 + }, + { + "auxiliary_loss_clip": 0.01064311, + "auxiliary_loss_mlp": 0.01032763, + "balance_loss_clip": 1.03557706, + "balance_loss_mlp": 1.0201093, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 2.4162185873192685, + "language_loss": 0.78359282, + "learning_rate": 6.663374005191937e-07, + "loss": 0.80456352, + "num_input_tokens_seen": 265661875, + "step": 12314, + "time_per_iteration": 2.560800790786743 + }, + { + "auxiliary_loss_clip": 0.01022736, + "auxiliary_loss_mlp": 0.0100622, + "balance_loss_clip": 1.01002645, + "balance_loss_mlp": 1.00514078, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.8441679286417137, + "language_loss": 0.55193168, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57222128, + "num_input_tokens_seen": 265721255, + "step": 12315, + "time_per_iteration": 3.0791289806365967 + }, + { + "auxiliary_loss_clip": 0.0108899, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.03799176, + "balance_loss_mlp": 1.01783085, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 1.652530890145404, + "language_loss": 0.80134356, + "learning_rate": 6.65757041206591e-07, + "loss": 0.82252944, + "num_input_tokens_seen": 265743970, + "step": 12316, + "time_per_iteration": 2.594599723815918 + }, + { + "auxiliary_loss_clip": 0.01092083, + "auxiliary_loss_mlp": 0.01030317, + "balance_loss_clip": 1.03412104, + "balance_loss_mlp": 1.01852727, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 1.8418763347490485, + "language_loss": 0.74720907, + "learning_rate": 6.654669374367275e-07, + "loss": 0.76843309, + "num_input_tokens_seen": 265760890, + "step": 12317, + "time_per_iteration": 2.488734483718872 + }, + { + "auxiliary_loss_clip": 0.01080639, + "auxiliary_loss_mlp": 0.01030233, + "balance_loss_clip": 1.03589463, + "balance_loss_mlp": 1.01861584, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 1.5810904611640646, + "language_loss": 0.81393635, + "learning_rate": 6.651768842724917e-07, + "loss": 0.8350451, + "num_input_tokens_seen": 265779600, + "step": 12318, + "time_per_iteration": 3.8961055278778076 + }, + { + "auxiliary_loss_clip": 0.01081175, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.03478515, + "balance_loss_mlp": 1.0169301, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 2.3730958292156035, + "language_loss": 0.76744026, + "learning_rate": 6.648868817248827e-07, + "loss": 0.78854775, + "num_input_tokens_seen": 265797030, + "step": 12319, + "time_per_iteration": 2.5030930042266846 + }, + { + "auxiliary_loss_clip": 0.01083132, + "auxiliary_loss_mlp": 0.01033147, + "balance_loss_clip": 1.03560364, + "balance_loss_mlp": 1.02204227, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 1.8393706158431276, + "language_loss": 0.63628805, + "learning_rate": 6.64596929804897e-07, + "loss": 0.6574508, + "num_input_tokens_seen": 265815055, + "step": 12320, + "time_per_iteration": 2.483637809753418 + }, + { + "auxiliary_loss_clip": 0.01097276, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.03716993, + "balance_loss_mlp": 1.02654481, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 4.151621269767466, + "language_loss": 0.82553804, + "learning_rate": 6.643070285235288e-07, + "loss": 0.84689832, + "num_input_tokens_seen": 265828480, + "step": 12321, + "time_per_iteration": 2.4755165576934814 + }, + { + "auxiliary_loss_clip": 0.0108683, + "auxiliary_loss_mlp": 0.01048453, + "balance_loss_clip": 1.03511024, + "balance_loss_mlp": 1.03479135, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 2.4670302828978987, + "language_loss": 0.71984625, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74119914, + "num_input_tokens_seen": 265845825, + "step": 12322, + "time_per_iteration": 2.5338587760925293 + }, + { + "auxiliary_loss_clip": 0.01094989, + "auxiliary_loss_mlp": 0.00784048, + "balance_loss_clip": 1.03618085, + "balance_loss_mlp": 1.00924671, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 1.7864011410045353, + "language_loss": 0.64344293, + "learning_rate": 6.637273779206183e-07, + "loss": 0.66223329, + "num_input_tokens_seen": 265866335, + "step": 12323, + "time_per_iteration": 2.5422515869140625 + }, + { + "auxiliary_loss_clip": 0.01070597, + "auxiliary_loss_mlp": 0.01027696, + "balance_loss_clip": 1.03606427, + "balance_loss_mlp": 1.01570964, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.3748630709286518, + "language_loss": 0.75853634, + "learning_rate": 6.634376286210559e-07, + "loss": 0.77951926, + "num_input_tokens_seen": 265888945, + "step": 12324, + "time_per_iteration": 5.414827108383179 + }, + { + "auxiliary_loss_clip": 0.01074209, + "auxiliary_loss_mlp": 0.01023961, + "balance_loss_clip": 1.03523731, + "balance_loss_mlp": 1.01224291, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 1.6829619577824186, + "language_loss": 0.75183785, + "learning_rate": 6.63147930004073e-07, + "loss": 0.77281952, + "num_input_tokens_seen": 265908030, + "step": 12325, + "time_per_iteration": 2.5139832496643066 + }, + { + "auxiliary_loss_clip": 0.01065299, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.03511739, + "balance_loss_mlp": 1.02403164, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 3.0018495681992885, + "language_loss": 0.68063211, + "learning_rate": 6.628582820806545e-07, + "loss": 0.7016542, + "num_input_tokens_seen": 265927030, + "step": 12326, + "time_per_iteration": 2.597381591796875 + }, + { + "auxiliary_loss_clip": 0.0106756, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.03638649, + "balance_loss_mlp": 1.01557088, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 1.5910253416268154, + "language_loss": 0.89418948, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91513896, + "num_input_tokens_seen": 265945490, + "step": 12327, + "time_per_iteration": 2.599994421005249 + }, + { + "auxiliary_loss_clip": 0.01105752, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.03754878, + "balance_loss_mlp": 1.02093208, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 1.6548942386272454, + "language_loss": 0.85585946, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87725317, + "num_input_tokens_seen": 265963265, + "step": 12328, + "time_per_iteration": 2.45192551612854 + }, + { + "auxiliary_loss_clip": 0.0109154, + "auxiliary_loss_mlp": 0.01029362, + "balance_loss_clip": 1.03506887, + "balance_loss_mlp": 1.01601684, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.7119229990889684, + "language_loss": 0.66537726, + "learning_rate": 6.619896425816103e-07, + "loss": 0.68658626, + "num_input_tokens_seen": 265982270, + "step": 12329, + "time_per_iteration": 3.982621669769287 + }, + { + "auxiliary_loss_clip": 0.01073697, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.03544497, + "balance_loss_mlp": 1.02766478, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.6500569253780875, + "language_loss": 0.66855556, + "learning_rate": 6.617001975422647e-07, + "loss": 0.6896975, + "num_input_tokens_seen": 266003835, + "step": 12330, + "time_per_iteration": 2.648768186569214 + }, + { + "auxiliary_loss_clip": 0.01072762, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.03922224, + "balance_loss_mlp": 1.01750636, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 1.8182825956932047, + "language_loss": 0.85612541, + "learning_rate": 6.614108032513823e-07, + "loss": 0.87717366, + "num_input_tokens_seen": 266021595, + "step": 12331, + "time_per_iteration": 2.578341007232666 + }, + { + "auxiliary_loss_clip": 0.01045755, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.03305197, + "balance_loss_mlp": 1.01625109, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 2.2446176296108122, + "language_loss": 0.69676352, + "learning_rate": 6.611214597199364e-07, + "loss": 0.71750963, + "num_input_tokens_seen": 266039860, + "step": 12332, + "time_per_iteration": 2.6799416542053223 + }, + { + "auxiliary_loss_clip": 0.01107126, + "auxiliary_loss_mlp": 0.01040081, + "balance_loss_clip": 1.03760672, + "balance_loss_mlp": 1.02712309, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 10.903818210350364, + "language_loss": 0.63917887, + "learning_rate": 6.608321669588984e-07, + "loss": 0.66065097, + "num_input_tokens_seen": 266058050, + "step": 12333, + "time_per_iteration": 2.6797683238983154 + }, + { + "auxiliary_loss_clip": 0.01081228, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.03698194, + "balance_loss_mlp": 1.02255869, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 1.71502229609218, + "language_loss": 0.71416521, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73531657, + "num_input_tokens_seen": 266078060, + "step": 12334, + "time_per_iteration": 2.564955234527588 + }, + { + "auxiliary_loss_clip": 0.01067013, + "auxiliary_loss_mlp": 0.01024246, + "balance_loss_clip": 1.03531671, + "balance_loss_mlp": 1.01239693, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.8416226582450987, + "language_loss": 0.82206595, + "learning_rate": 6.602537337919257e-07, + "loss": 0.8429786, + "num_input_tokens_seen": 266097110, + "step": 12335, + "time_per_iteration": 2.5671515464782715 + }, + { + "auxiliary_loss_clip": 0.01105511, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.03610218, + "balance_loss_mlp": 1.01612234, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 2.337866585890216, + "language_loss": 0.74862885, + "learning_rate": 6.599645934079259e-07, + "loss": 0.76997232, + "num_input_tokens_seen": 266110870, + "step": 12336, + "time_per_iteration": 2.404961109161377 + }, + { + "auxiliary_loss_clip": 0.01061937, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.03492153, + "balance_loss_mlp": 1.01883698, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 1.9498925137446024, + "language_loss": 0.73258483, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75351465, + "num_input_tokens_seen": 266127845, + "step": 12337, + "time_per_iteration": 2.5827932357788086 + }, + { + "auxiliary_loss_clip": 0.01078071, + "auxiliary_loss_mlp": 0.01034637, + "balance_loss_clip": 1.03589034, + "balance_loss_mlp": 1.02296686, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 1.7010772018577223, + "language_loss": 0.76565099, + "learning_rate": 6.593864650937186e-07, + "loss": 0.78677809, + "num_input_tokens_seen": 266145400, + "step": 12338, + "time_per_iteration": 2.5041651725769043 + }, + { + "auxiliary_loss_clip": 0.01093373, + "auxiliary_loss_mlp": 0.01026362, + "balance_loss_clip": 1.03492701, + "balance_loss_mlp": 1.01569307, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.7085085487834184, + "language_loss": 0.73356211, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75475949, + "num_input_tokens_seen": 266164430, + "step": 12339, + "time_per_iteration": 2.5216386318206787 + }, + { + "auxiliary_loss_clip": 0.01085525, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.03519428, + "balance_loss_mlp": 1.01814032, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 3.120066154988947, + "language_loss": 0.79338944, + "learning_rate": 6.588085401243077e-07, + "loss": 0.81455296, + "num_input_tokens_seen": 266183855, + "step": 12340, + "time_per_iteration": 2.547351598739624 + }, + { + "auxiliary_loss_clip": 0.01072017, + "auxiliary_loss_mlp": 0.01035189, + "balance_loss_clip": 1.03487277, + "balance_loss_mlp": 1.0226835, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 1.4415086769101915, + "language_loss": 0.75432253, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77539456, + "num_input_tokens_seen": 266202085, + "step": 12341, + "time_per_iteration": 2.5167527198791504 + }, + { + "auxiliary_loss_clip": 0.01073224, + "auxiliary_loss_mlp": 0.01038377, + "balance_loss_clip": 1.03403294, + "balance_loss_mlp": 1.02513885, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.3923877826395394, + "language_loss": 0.80291617, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82403213, + "num_input_tokens_seen": 266223445, + "step": 12342, + "time_per_iteration": 2.595031261444092 + }, + { + "auxiliary_loss_clip": 0.01069508, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.03325105, + "balance_loss_mlp": 1.01841235, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 1.724114013215581, + "language_loss": 0.77445161, + "learning_rate": 6.57942034133433e-07, + "loss": 0.79545426, + "num_input_tokens_seen": 266246575, + "step": 12343, + "time_per_iteration": 2.954693078994751 + }, + { + "auxiliary_loss_clip": 0.01079004, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.03232431, + "balance_loss_mlp": 1.02053857, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 8.138685087762191, + "language_loss": 0.67979014, + "learning_rate": 6.576533005704843e-07, + "loss": 0.70090538, + "num_input_tokens_seen": 266266055, + "step": 12344, + "time_per_iteration": 2.5676352977752686 + }, + { + "auxiliary_loss_clip": 0.01060988, + "auxiliary_loss_mlp": 0.01033631, + "balance_loss_clip": 1.03635979, + "balance_loss_mlp": 1.02028537, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.2511616893958717, + "language_loss": 0.80972463, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83067077, + "num_input_tokens_seen": 266282240, + "step": 12345, + "time_per_iteration": 2.5678532123565674 + }, + { + "auxiliary_loss_clip": 0.01074601, + "auxiliary_loss_mlp": 0.01035692, + "balance_loss_clip": 1.03472865, + "balance_loss_mlp": 1.02302647, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 1.8900096443139274, + "language_loss": 0.70636898, + "learning_rate": 6.570759861612988e-07, + "loss": 0.72747195, + "num_input_tokens_seen": 266300980, + "step": 12346, + "time_per_iteration": 2.6218819618225098 + }, + { + "auxiliary_loss_clip": 0.01095759, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.03721273, + "balance_loss_mlp": 1.01966548, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 1.6397783502379655, + "language_loss": 0.73199624, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75327224, + "num_input_tokens_seen": 266322215, + "step": 12347, + "time_per_iteration": 2.576030731201172 + }, + { + "auxiliary_loss_clip": 0.01084082, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.03433132, + "balance_loss_mlp": 1.02061951, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 1.7095756526717656, + "language_loss": 0.80894411, + "learning_rate": 6.564988754473642e-07, + "loss": 0.83011502, + "num_input_tokens_seen": 266341600, + "step": 12348, + "time_per_iteration": 2.5361597537994385 + }, + { + "auxiliary_loss_clip": 0.01102773, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.03477228, + "balance_loss_mlp": 1.01919937, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 1.7268640218387012, + "language_loss": 0.7239306, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74526978, + "num_input_tokens_seen": 266362895, + "step": 12349, + "time_per_iteration": 2.5692129135131836 + }, + { + "auxiliary_loss_clip": 0.01085507, + "auxiliary_loss_mlp": 0.01032177, + "balance_loss_clip": 1.03352368, + "balance_loss_mlp": 1.01831913, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 2.013973407818162, + "language_loss": 0.78762138, + "learning_rate": 6.559219685162165e-07, + "loss": 0.80879819, + "num_input_tokens_seen": 266384015, + "step": 12350, + "time_per_iteration": 2.5644924640655518 + }, + { + "auxiliary_loss_clip": 0.01060287, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.03541899, + "balance_loss_mlp": 1.0181222, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 2.136406554035744, + "language_loss": 0.75356221, + "learning_rate": 6.556335914965343e-07, + "loss": 0.77446342, + "num_input_tokens_seen": 266405990, + "step": 12351, + "time_per_iteration": 2.6857399940490723 + }, + { + "auxiliary_loss_clip": 0.01046752, + "auxiliary_loss_mlp": 0.01023686, + "balance_loss_clip": 1.03315115, + "balance_loss_mlp": 1.01181281, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 2.1949301367231757, + "language_loss": 0.8160581, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83676243, + "num_input_tokens_seen": 266424260, + "step": 12352, + "time_per_iteration": 2.6549971103668213 + }, + { + "auxiliary_loss_clip": 0.0109629, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.03723967, + "balance_loss_mlp": 1.02616394, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 1.7431938699053586, + "language_loss": 0.71706545, + "learning_rate": 6.550569904036307e-07, + "loss": 0.73840821, + "num_input_tokens_seen": 266444580, + "step": 12353, + "time_per_iteration": 2.509568929672241 + }, + { + "auxiliary_loss_clip": 0.01097072, + "auxiliary_loss_mlp": 0.01033797, + "balance_loss_clip": 1.03870034, + "balance_loss_mlp": 1.02247167, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 2.9185929903929386, + "language_loss": 0.72465849, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74596721, + "num_input_tokens_seen": 266465640, + "step": 12354, + "time_per_iteration": 2.532309055328369 + }, + { + "auxiliary_loss_clip": 0.01022298, + "auxiliary_loss_mlp": 0.01003226, + "balance_loss_clip": 1.00884223, + "balance_loss_mlp": 1.00211143, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.6943867093696149, + "language_loss": 0.59576714, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61602235, + "num_input_tokens_seen": 266531950, + "step": 12355, + "time_per_iteration": 3.2189323902130127 + }, + { + "auxiliary_loss_clip": 0.01106296, + "auxiliary_loss_mlp": 0.01026711, + "balance_loss_clip": 1.03712058, + "balance_loss_mlp": 1.01461148, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 1.70541606006716, + "language_loss": 0.67694616, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69827622, + "num_input_tokens_seen": 266550665, + "step": 12356, + "time_per_iteration": 2.443842887878418 + }, + { + "auxiliary_loss_clip": 0.01095071, + "auxiliary_loss_mlp": 0.00783531, + "balance_loss_clip": 1.03349912, + "balance_loss_mlp": 1.00864649, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 1.9338474509350345, + "language_loss": 0.72090971, + "learning_rate": 6.539044003097301e-07, + "loss": 0.73969573, + "num_input_tokens_seen": 266572455, + "step": 12357, + "time_per_iteration": 4.112387657165527 + }, + { + "auxiliary_loss_clip": 0.01084097, + "auxiliary_loss_mlp": 0.0102925, + "balance_loss_clip": 1.03950191, + "balance_loss_mlp": 1.01800227, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 1.7276765347156942, + "language_loss": 0.64835256, + "learning_rate": 6.53616380369143e-07, + "loss": 0.66948605, + "num_input_tokens_seen": 266590895, + "step": 12358, + "time_per_iteration": 2.5044400691986084 + }, + { + "auxiliary_loss_clip": 0.01066261, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.03662503, + "balance_loss_mlp": 1.02224755, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 1.7846772596610843, + "language_loss": 0.80505908, + "learning_rate": 6.533284114835591e-07, + "loss": 0.82607692, + "num_input_tokens_seen": 266607660, + "step": 12359, + "time_per_iteration": 2.603227376937866 + }, + { + "auxiliary_loss_clip": 0.01093581, + "auxiliary_loss_mlp": 0.0102616, + "balance_loss_clip": 1.03496408, + "balance_loss_mlp": 1.01480496, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 1.939653931044939, + "language_loss": 0.68305266, + "learning_rate": 6.530404936638956e-07, + "loss": 0.7042501, + "num_input_tokens_seen": 266624260, + "step": 12360, + "time_per_iteration": 2.4554591178894043 + }, + { + "auxiliary_loss_clip": 0.01091665, + "auxiliary_loss_mlp": 0.00782153, + "balance_loss_clip": 1.03448796, + "balance_loss_mlp": 1.00752115, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 1.6272758711568318, + "language_loss": 0.72789836, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74663651, + "num_input_tokens_seen": 266644210, + "step": 12361, + "time_per_iteration": 2.5588886737823486 + }, + { + "auxiliary_loss_clip": 0.01056127, + "auxiliary_loss_mlp": 0.01040713, + "balance_loss_clip": 1.03247416, + "balance_loss_mlp": 1.02715278, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 1.8020576247952715, + "language_loss": 0.56229144, + "learning_rate": 6.524648112660027e-07, + "loss": 0.58325982, + "num_input_tokens_seen": 266664230, + "step": 12362, + "time_per_iteration": 2.584265947341919 + }, + { + "auxiliary_loss_clip": 0.01068421, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.03451073, + "balance_loss_mlp": 1.01933289, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.6320370110785174, + "language_loss": 0.77455056, + "learning_rate": 6.521770467096039e-07, + "loss": 0.7955538, + "num_input_tokens_seen": 266683270, + "step": 12363, + "time_per_iteration": 5.753237962722778 + }, + { + "auxiliary_loss_clip": 0.01076774, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.03360009, + "balance_loss_mlp": 1.02062082, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 1.7037097129540621, + "language_loss": 0.77834237, + "learning_rate": 6.518893332627862e-07, + "loss": 0.79943097, + "num_input_tokens_seen": 266701235, + "step": 12364, + "time_per_iteration": 2.541304349899292 + }, + { + "auxiliary_loss_clip": 0.01093519, + "auxiliary_loss_mlp": 0.01029548, + "balance_loss_clip": 1.03546011, + "balance_loss_mlp": 1.01789498, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 1.9560591689170765, + "language_loss": 0.78389537, + "learning_rate": 6.516016709364604e-07, + "loss": 0.80512607, + "num_input_tokens_seen": 266721495, + "step": 12365, + "time_per_iteration": 2.5341920852661133 + }, + { + "auxiliary_loss_clip": 0.01081957, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.03560722, + "balance_loss_mlp": 1.01874685, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 1.6890411966437404, + "language_loss": 0.76614076, + "learning_rate": 6.513140597415346e-07, + "loss": 0.78727031, + "num_input_tokens_seen": 266747400, + "step": 12366, + "time_per_iteration": 2.818577289581299 + }, + { + "auxiliary_loss_clip": 0.01093982, + "auxiliary_loss_mlp": 0.01028011, + "balance_loss_clip": 1.03683269, + "balance_loss_mlp": 1.01762748, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.6653859548796892, + "language_loss": 0.71385932, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73507923, + "num_input_tokens_seen": 266767630, + "step": 12367, + "time_per_iteration": 2.4971561431884766 + }, + { + "auxiliary_loss_clip": 0.01076806, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.03513336, + "balance_loss_mlp": 1.01959705, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 1.5001740842134808, + "language_loss": 0.7435993, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76468384, + "num_input_tokens_seen": 266788015, + "step": 12368, + "time_per_iteration": 3.97795033454895 + }, + { + "auxiliary_loss_clip": 0.01089552, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.03580618, + "balance_loss_mlp": 1.02266657, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 1.5744963604909499, + "language_loss": 0.6956799, + "learning_rate": 6.50451533054207e-07, + "loss": 0.71690643, + "num_input_tokens_seen": 266809010, + "step": 12369, + "time_per_iteration": 2.6392264366149902 + }, + { + "auxiliary_loss_clip": 0.01083224, + "auxiliary_loss_mlp": 0.00784156, + "balance_loss_clip": 1.03649831, + "balance_loss_mlp": 1.01199293, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 1.6692159850427977, + "language_loss": 0.75667584, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77534962, + "num_input_tokens_seen": 266825390, + "step": 12370, + "time_per_iteration": 2.490330219268799 + }, + { + "auxiliary_loss_clip": 0.01106166, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.03786302, + "balance_loss_mlp": 1.02240181, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 1.5423763933580898, + "language_loss": 0.78402841, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80543602, + "num_input_tokens_seen": 266844675, + "step": 12371, + "time_per_iteration": 2.460832118988037 + }, + { + "auxiliary_loss_clip": 0.01084634, + "auxiliary_loss_mlp": 0.01023797, + "balance_loss_clip": 1.03567469, + "balance_loss_mlp": 1.01183438, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 1.637399046615536, + "language_loss": 0.6951288, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71621311, + "num_input_tokens_seen": 266865160, + "step": 12372, + "time_per_iteration": 2.568676471710205 + }, + { + "auxiliary_loss_clip": 0.01073167, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.0343678, + "balance_loss_mlp": 1.01882756, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 1.7961501093872643, + "language_loss": 0.74712527, + "learning_rate": 6.493022139721245e-07, + "loss": 0.76816708, + "num_input_tokens_seen": 266883285, + "step": 12373, + "time_per_iteration": 2.5030784606933594 + }, + { + "auxiliary_loss_clip": 0.01051958, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.03185654, + "balance_loss_mlp": 1.01883125, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 1.7475309172826103, + "language_loss": 0.77259886, + "learning_rate": 6.49015012220858e-07, + "loss": 0.79344416, + "num_input_tokens_seen": 266900960, + "step": 12374, + "time_per_iteration": 2.5844027996063232 + }, + { + "auxiliary_loss_clip": 0.01051283, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.03596759, + "balance_loss_mlp": 1.02014244, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.329495456202981, + "language_loss": 0.7633515, + "learning_rate": 6.487278616990774e-07, + "loss": 0.78418946, + "num_input_tokens_seen": 266917710, + "step": 12375, + "time_per_iteration": 2.6013636589050293 + }, + { + "auxiliary_loss_clip": 0.01090345, + "auxiliary_loss_mlp": 0.01025308, + "balance_loss_clip": 1.03409648, + "balance_loss_mlp": 1.0140965, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 1.9240422528425, + "language_loss": 0.77238595, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79354256, + "num_input_tokens_seen": 266934220, + "step": 12376, + "time_per_iteration": 2.486158847808838 + }, + { + "auxiliary_loss_clip": 0.01067018, + "auxiliary_loss_mlp": 0.01037458, + "balance_loss_clip": 1.03321755, + "balance_loss_mlp": 1.02337909, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 1.5981571861867867, + "language_loss": 0.79265428, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81369901, + "num_input_tokens_seen": 266955210, + "step": 12377, + "time_per_iteration": 2.5915939807891846 + }, + { + "auxiliary_loss_clip": 0.01093134, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.03697896, + "balance_loss_mlp": 1.0165391, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 1.8691545002945873, + "language_loss": 0.66722727, + "learning_rate": 6.478667176195322e-07, + "loss": 0.68844807, + "num_input_tokens_seen": 266976555, + "step": 12378, + "time_per_iteration": 2.861246109008789 + }, + { + "auxiliary_loss_clip": 0.01071779, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.0362227, + "balance_loss_mlp": 1.02218449, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 1.778539145163772, + "language_loss": 0.7202723, + "learning_rate": 6.475797721245648e-07, + "loss": 0.74134183, + "num_input_tokens_seen": 266997640, + "step": 12379, + "time_per_iteration": 2.6407508850097656 + }, + { + "auxiliary_loss_clip": 0.01067882, + "auxiliary_loss_mlp": 0.00786623, + "balance_loss_clip": 1.03406501, + "balance_loss_mlp": 1.01042354, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 1.725268945413912, + "language_loss": 0.65343481, + "learning_rate": 6.472928779135085e-07, + "loss": 0.6719799, + "num_input_tokens_seen": 267016165, + "step": 12380, + "time_per_iteration": 2.6069605350494385 + }, + { + "auxiliary_loss_clip": 0.0109505, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.03596568, + "balance_loss_mlp": 1.01728165, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 2.315025992082962, + "language_loss": 0.78591037, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80715275, + "num_input_tokens_seen": 267034075, + "step": 12381, + "time_per_iteration": 2.4874672889709473 + }, + { + "auxiliary_loss_clip": 0.01067122, + "auxiliary_loss_mlp": 0.01039492, + "balance_loss_clip": 1.03703356, + "balance_loss_mlp": 1.02474594, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 2.0538053762110113, + "language_loss": 0.7226764, + "learning_rate": 6.467192433866411e-07, + "loss": 0.74374253, + "num_input_tokens_seen": 267053645, + "step": 12382, + "time_per_iteration": 2.5615084171295166 + }, + { + "auxiliary_loss_clip": 0.01004166, + "auxiliary_loss_mlp": 0.01000023, + "balance_loss_clip": 1.01239932, + "balance_loss_mlp": 0.99884856, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.6500777099565485, + "language_loss": 0.54624617, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56628805, + "num_input_tokens_seen": 267121830, + "step": 12383, + "time_per_iteration": 3.3125407695770264 + }, + { + "auxiliary_loss_clip": 0.01081417, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.03586936, + "balance_loss_mlp": 1.01667547, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 2.1594034218612608, + "language_loss": 0.76310909, + "learning_rate": 6.461458141259395e-07, + "loss": 0.78421259, + "num_input_tokens_seen": 267141145, + "step": 12384, + "time_per_iteration": 2.5615503787994385 + }, + { + "auxiliary_loss_clip": 0.01092791, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.03476858, + "balance_loss_mlp": 1.01759446, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 1.9088672554956119, + "language_loss": 0.79587984, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81709957, + "num_input_tokens_seen": 267159280, + "step": 12385, + "time_per_iteration": 2.532036542892456 + }, + { + "auxiliary_loss_clip": 0.01073859, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.03846562, + "balance_loss_mlp": 1.01887012, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 1.6521084208640353, + "language_loss": 0.81345713, + "learning_rate": 6.455725902183813e-07, + "loss": 0.83451599, + "num_input_tokens_seen": 267179390, + "step": 12386, + "time_per_iteration": 2.5866873264312744 + }, + { + "auxiliary_loss_clip": 0.01089859, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.03620601, + "balance_loss_mlp": 1.01942277, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.6122995564365425, + "language_loss": 0.70803332, + "learning_rate": 6.452860552992037e-07, + "loss": 0.72924191, + "num_input_tokens_seen": 267198165, + "step": 12387, + "time_per_iteration": 2.4957101345062256 + }, + { + "auxiliary_loss_clip": 0.01073316, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.0358845, + "balance_loss_mlp": 1.02004147, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 2.023178111803868, + "language_loss": 0.70597714, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72702539, + "num_input_tokens_seen": 267214520, + "step": 12388, + "time_per_iteration": 2.551757574081421 + }, + { + "auxiliary_loss_clip": 0.0109343, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.03460765, + "balance_loss_mlp": 1.01769376, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 1.5977948183582167, + "language_loss": 0.84676385, + "learning_rate": 6.447131395843761e-07, + "loss": 0.8679924, + "num_input_tokens_seen": 267236555, + "step": 12389, + "time_per_iteration": 2.5446979999542236 + }, + { + "auxiliary_loss_clip": 0.01065121, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.03659713, + "balance_loss_mlp": 1.01970506, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 1.8767072819941821, + "language_loss": 0.79346383, + "learning_rate": 6.444267588104526e-07, + "loss": 0.8144331, + "num_input_tokens_seen": 267254800, + "step": 12390, + "time_per_iteration": 2.6248862743377686 + }, + { + "auxiliary_loss_clip": 0.01085863, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.03658593, + "balance_loss_mlp": 1.01717639, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 1.6217239665237047, + "language_loss": 0.84840083, + "learning_rate": 6.441404294400014e-07, + "loss": 0.86955798, + "num_input_tokens_seen": 267274610, + "step": 12391, + "time_per_iteration": 2.5180253982543945 + }, + { + "auxiliary_loss_clip": 0.01104027, + "auxiliary_loss_mlp": 0.01026877, + "balance_loss_clip": 1.03569233, + "balance_loss_mlp": 1.0157485, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 2.1250168343804456, + "language_loss": 0.73667687, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75798595, + "num_input_tokens_seen": 267292600, + "step": 12392, + "time_per_iteration": 2.4667131900787354 + }, + { + "auxiliary_loss_clip": 0.01091324, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.03617072, + "balance_loss_mlp": 1.02147293, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 1.6255926631750188, + "language_loss": 0.76883394, + "learning_rate": 6.435679249529487e-07, + "loss": 0.7900793, + "num_input_tokens_seen": 267311295, + "step": 12393, + "time_per_iteration": 2.480015993118286 + }, + { + "auxiliary_loss_clip": 0.01094961, + "auxiliary_loss_mlp": 0.01032064, + "balance_loss_clip": 1.03727579, + "balance_loss_mlp": 1.01919568, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 1.8572618435726926, + "language_loss": 0.730129, + "learning_rate": 6.432817498580552e-07, + "loss": 0.75139928, + "num_input_tokens_seen": 267328390, + "step": 12394, + "time_per_iteration": 2.502887010574341 + }, + { + "auxiliary_loss_clip": 0.01048749, + "auxiliary_loss_mlp": 0.00782923, + "balance_loss_clip": 1.03658748, + "balance_loss_mlp": 1.00961971, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 1.5428169771739302, + "language_loss": 0.81325608, + "learning_rate": 6.429956262100535e-07, + "loss": 0.83157283, + "num_input_tokens_seen": 267348185, + "step": 12395, + "time_per_iteration": 4.051112413406372 + }, + { + "auxiliary_loss_clip": 0.0109814, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.03646302, + "balance_loss_mlp": 1.02252841, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 3.536633369747469, + "language_loss": 0.71512079, + "learning_rate": 6.427095540197937e-07, + "loss": 0.7364524, + "num_input_tokens_seen": 267367010, + "step": 12396, + "time_per_iteration": 2.4796435832977295 + }, + { + "auxiliary_loss_clip": 0.01065593, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.03580785, + "balance_loss_mlp": 1.01740015, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 2.0367224714382868, + "language_loss": 0.68158466, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70253706, + "num_input_tokens_seen": 267386605, + "step": 12397, + "time_per_iteration": 2.6405365467071533 + }, + { + "auxiliary_loss_clip": 0.01104122, + "auxiliary_loss_mlp": 0.01040304, + "balance_loss_clip": 1.03556001, + "balance_loss_mlp": 1.02801943, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 1.7539178697213411, + "language_loss": 0.76382834, + "learning_rate": 6.421375640558908e-07, + "loss": 0.7852726, + "num_input_tokens_seen": 267404135, + "step": 12398, + "time_per_iteration": 2.4339704513549805 + }, + { + "auxiliary_loss_clip": 0.01091969, + "auxiliary_loss_mlp": 0.01025089, + "balance_loss_clip": 1.03521514, + "balance_loss_mlp": 1.01326358, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 2.120052450037395, + "language_loss": 0.77946609, + "learning_rate": 6.418516463039363e-07, + "loss": 0.80063671, + "num_input_tokens_seen": 267423120, + "step": 12399, + "time_per_iteration": 2.4947004318237305 + }, + { + "auxiliary_loss_clip": 0.01079233, + "auxiliary_loss_mlp": 0.01033017, + "balance_loss_clip": 1.033355, + "balance_loss_mlp": 1.02170444, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 1.860746586333181, + "language_loss": 0.73501873, + "learning_rate": 6.415657800531038e-07, + "loss": 0.75614119, + "num_input_tokens_seen": 267441250, + "step": 12400, + "time_per_iteration": 2.500904083251953 + }, + { + "auxiliary_loss_clip": 0.01091335, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.03482556, + "balance_loss_mlp": 1.02113485, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 1.766492035128556, + "language_loss": 0.8214227, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84265894, + "num_input_tokens_seen": 267462820, + "step": 12401, + "time_per_iteration": 3.941330909729004 + }, + { + "auxiliary_loss_clip": 0.01071564, + "auxiliary_loss_mlp": 0.01031387, + "balance_loss_clip": 1.0353992, + "balance_loss_mlp": 1.01975846, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 1.9466936697628179, + "language_loss": 0.652969, + "learning_rate": 6.409942020981611e-07, + "loss": 0.67399848, + "num_input_tokens_seen": 267483065, + "step": 12402, + "time_per_iteration": 3.9990975856781006 + }, + { + "auxiliary_loss_clip": 0.01072266, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.0338726, + "balance_loss_mlp": 1.02146912, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 1.5580717337306913, + "language_loss": 0.73367298, + "learning_rate": 6.407084904157265e-07, + "loss": 0.7547245, + "num_input_tokens_seen": 267504825, + "step": 12403, + "time_per_iteration": 2.7135751247406006 + }, + { + "auxiliary_loss_clip": 0.01001921, + "auxiliary_loss_mlp": 0.01002808, + "balance_loss_clip": 1.0113852, + "balance_loss_mlp": 1.00159764, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.8256386603819094, + "language_loss": 0.58750796, + "learning_rate": 6.404228302777621e-07, + "loss": 0.60755527, + "num_input_tokens_seen": 267559260, + "step": 12404, + "time_per_iteration": 3.0052683353424072 + }, + { + "auxiliary_loss_clip": 0.01103339, + "auxiliary_loss_mlp": 0.01031334, + "balance_loss_clip": 1.03452325, + "balance_loss_mlp": 1.01993132, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 2.4351817039950974, + "language_loss": 0.77622783, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79757452, + "num_input_tokens_seen": 267578720, + "step": 12405, + "time_per_iteration": 2.461237907409668 + }, + { + "auxiliary_loss_clip": 0.01078406, + "auxiliary_loss_mlp": 0.01038069, + "balance_loss_clip": 1.03384829, + "balance_loss_mlp": 1.02576089, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 1.5406022317438208, + "language_loss": 0.69120997, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71237475, + "num_input_tokens_seen": 267598250, + "step": 12406, + "time_per_iteration": 2.5138683319091797 + }, + { + "auxiliary_loss_clip": 0.01046941, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.03546214, + "balance_loss_mlp": 1.02212024, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 1.7497696791019406, + "language_loss": 0.65075558, + "learning_rate": 6.39566159239002e-07, + "loss": 0.67158365, + "num_input_tokens_seen": 267615430, + "step": 12407, + "time_per_iteration": 4.109792470932007 + }, + { + "auxiliary_loss_clip": 0.01064132, + "auxiliary_loss_mlp": 0.01033954, + "balance_loss_clip": 1.0344516, + "balance_loss_mlp": 1.02079284, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 1.720993934943781, + "language_loss": 0.7200129, + "learning_rate": 6.392807053872212e-07, + "loss": 0.74099374, + "num_input_tokens_seen": 267635075, + "step": 12408, + "time_per_iteration": 2.8240721225738525 + }, + { + "auxiliary_loss_clip": 0.01100194, + "auxiliary_loss_mlp": 0.01033706, + "balance_loss_clip": 1.03724384, + "balance_loss_mlp": 1.01985967, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 1.8452217382321383, + "language_loss": 0.72992671, + "learning_rate": 6.38995303134053e-07, + "loss": 0.75126576, + "num_input_tokens_seen": 267654105, + "step": 12409, + "time_per_iteration": 2.550278425216675 + }, + { + "auxiliary_loss_clip": 0.01090938, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.03478551, + "balance_loss_mlp": 1.01861048, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 1.7073036449418648, + "language_loss": 0.65929961, + "learning_rate": 6.38709952490319e-07, + "loss": 0.68050712, + "num_input_tokens_seen": 267673090, + "step": 12410, + "time_per_iteration": 2.4936752319335938 + }, + { + "auxiliary_loss_clip": 0.01087737, + "auxiliary_loss_mlp": 0.00782981, + "balance_loss_clip": 1.03485405, + "balance_loss_mlp": 1.01002574, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 1.8982360030683214, + "language_loss": 0.84597725, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86468446, + "num_input_tokens_seen": 267690605, + "step": 12411, + "time_per_iteration": 2.505321741104126 + }, + { + "auxiliary_loss_clip": 0.01070281, + "auxiliary_loss_mlp": 0.01028284, + "balance_loss_clip": 1.03437328, + "balance_loss_mlp": 1.01594567, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 1.572197790781628, + "language_loss": 0.77791572, + "learning_rate": 6.381394060744339e-07, + "loss": 0.79890138, + "num_input_tokens_seen": 267710540, + "step": 12412, + "time_per_iteration": 2.5694098472595215 + }, + { + "auxiliary_loss_clip": 0.01065352, + "auxiliary_loss_mlp": 0.01037015, + "balance_loss_clip": 1.03409457, + "balance_loss_mlp": 1.0245223, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 1.8897931548494664, + "language_loss": 0.62467396, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64569771, + "num_input_tokens_seen": 267730780, + "step": 12413, + "time_per_iteration": 2.697270631790161 + }, + { + "auxiliary_loss_clip": 0.0102444, + "auxiliary_loss_mlp": 0.00762593, + "balance_loss_clip": 1.00989246, + "balance_loss_mlp": 1.00180292, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7268810703358441, + "language_loss": 0.54919755, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56706786, + "num_input_tokens_seen": 267794240, + "step": 12414, + "time_per_iteration": 3.1534206867218018 + }, + { + "auxiliary_loss_clip": 0.01077667, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.03545654, + "balance_loss_mlp": 1.01735735, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 1.5880463360297183, + "language_loss": 0.54558623, + "learning_rate": 6.372839737918154e-07, + "loss": 0.56666023, + "num_input_tokens_seen": 267817190, + "step": 12415, + "time_per_iteration": 2.6484315395355225 + }, + { + "auxiliary_loss_clip": 0.01049329, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.03477812, + "balance_loss_mlp": 1.02133012, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 1.608871418961689, + "language_loss": 0.75192833, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77277488, + "num_input_tokens_seen": 267836245, + "step": 12416, + "time_per_iteration": 2.6346628665924072 + }, + { + "auxiliary_loss_clip": 0.0105569, + "auxiliary_loss_mlp": 0.01037665, + "balance_loss_clip": 1.03170812, + "balance_loss_mlp": 1.02483857, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 1.6389189809097362, + "language_loss": 0.69190031, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71283388, + "num_input_tokens_seen": 267858310, + "step": 12417, + "time_per_iteration": 2.7954540252685547 + }, + { + "auxiliary_loss_clip": 0.01077469, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.03752697, + "balance_loss_mlp": 1.0195272, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 2.5094616068259548, + "language_loss": 0.73859775, + "learning_rate": 6.364290065781392e-07, + "loss": 0.75969577, + "num_input_tokens_seen": 267876345, + "step": 12418, + "time_per_iteration": 2.548858642578125 + }, + { + "auxiliary_loss_clip": 0.01094606, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.03652477, + "balance_loss_mlp": 1.02323675, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 1.8629605609829527, + "language_loss": 0.69168425, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71297956, + "num_input_tokens_seen": 267896740, + "step": 12419, + "time_per_iteration": 2.5440244674682617 + }, + { + "auxiliary_loss_clip": 0.01099196, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.03452468, + "balance_loss_mlp": 1.01975608, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 1.8581267420314864, + "language_loss": 0.75032157, + "learning_rate": 6.358592869514216e-07, + "loss": 0.77162468, + "num_input_tokens_seen": 267914765, + "step": 12420, + "time_per_iteration": 2.4803249835968018 + }, + { + "auxiliary_loss_clip": 0.01099485, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.03826952, + "balance_loss_mlp": 1.01818371, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 1.8293994625569678, + "language_loss": 0.67677462, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69807643, + "num_input_tokens_seen": 267934085, + "step": 12421, + "time_per_iteration": 2.502230167388916 + }, + { + "auxiliary_loss_clip": 0.01081668, + "auxiliary_loss_mlp": 0.01033915, + "balance_loss_clip": 1.03578043, + "balance_loss_mlp": 1.0208621, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 1.6103472251205657, + "language_loss": 0.7243799, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74553567, + "num_input_tokens_seen": 267955170, + "step": 12422, + "time_per_iteration": 2.564944267272949 + }, + { + "auxiliary_loss_clip": 0.01067413, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.0354979, + "balance_loss_mlp": 1.02094126, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 1.7830951979353022, + "language_loss": 0.74818158, + "learning_rate": 6.350050955009796e-07, + "loss": 0.76919425, + "num_input_tokens_seen": 267974980, + "step": 12423, + "time_per_iteration": 2.6318697929382324 + }, + { + "auxiliary_loss_clip": 0.01090645, + "auxiliary_loss_mlp": 0.01024671, + "balance_loss_clip": 1.03428459, + "balance_loss_mlp": 1.01361394, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.302575394146366, + "language_loss": 0.67558765, + "learning_rate": 6.347204685245929e-07, + "loss": 0.69674081, + "num_input_tokens_seen": 267994985, + "step": 12424, + "time_per_iteration": 2.5149121284484863 + }, + { + "auxiliary_loss_clip": 0.0110027, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.03860736, + "balance_loss_mlp": 1.02290988, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 1.9662124985774336, + "language_loss": 0.74729902, + "learning_rate": 6.344358933197418e-07, + "loss": 0.76865429, + "num_input_tokens_seen": 268014985, + "step": 12425, + "time_per_iteration": 2.635683298110962 + }, + { + "auxiliary_loss_clip": 0.01066407, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.03403187, + "balance_loss_mlp": 1.02167857, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 2.8282388979012363, + "language_loss": 0.69383752, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71485019, + "num_input_tokens_seen": 268034395, + "step": 12426, + "time_per_iteration": 2.5815205574035645 + }, + { + "auxiliary_loss_clip": 0.01065373, + "auxiliary_loss_mlp": 0.01034693, + "balance_loss_clip": 1.03642392, + "balance_loss_mlp": 1.02302885, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 1.5758304755642805, + "language_loss": 0.65460479, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67560542, + "num_input_tokens_seen": 268054485, + "step": 12427, + "time_per_iteration": 2.5713250637054443 + }, + { + "auxiliary_loss_clip": 0.01106475, + "auxiliary_loss_mlp": 0.01029511, + "balance_loss_clip": 1.03739572, + "balance_loss_mlp": 1.01683879, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.5791368985642704, + "language_loss": 0.74561083, + "learning_rate": 6.335824784423118e-07, + "loss": 0.76697063, + "num_input_tokens_seen": 268072250, + "step": 12428, + "time_per_iteration": 2.4341745376586914 + }, + { + "auxiliary_loss_clip": 0.01100511, + "auxiliary_loss_mlp": 0.01033425, + "balance_loss_clip": 1.03712916, + "balance_loss_mlp": 1.02031231, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 1.9086889270479022, + "language_loss": 0.58178741, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60312676, + "num_input_tokens_seen": 268089840, + "step": 12429, + "time_per_iteration": 2.4761509895324707 + }, + { + "auxiliary_loss_clip": 0.01094534, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.03719127, + "balance_loss_mlp": 1.01782751, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 2.044743133589867, + "language_loss": 0.605986, + "learning_rate": 6.330137942461595e-07, + "loss": 0.62723458, + "num_input_tokens_seen": 268109360, + "step": 12430, + "time_per_iteration": 2.5812690258026123 + }, + { + "auxiliary_loss_clip": 0.01084736, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.03627658, + "balance_loss_mlp": 1.01667726, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 1.479504046573755, + "language_loss": 0.75575566, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77688867, + "num_input_tokens_seen": 268131840, + "step": 12431, + "time_per_iteration": 2.5966789722442627 + }, + { + "auxiliary_loss_clip": 0.01095479, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.03631783, + "balance_loss_mlp": 1.01689363, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 1.9919547552345311, + "language_loss": 0.75544977, + "learning_rate": 6.32445317395021e-07, + "loss": 0.77669168, + "num_input_tokens_seen": 268148300, + "step": 12432, + "time_per_iteration": 2.478036403656006 + }, + { + "auxiliary_loss_clip": 0.01084649, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.0370667, + "balance_loss_mlp": 1.01924181, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 2.414570790059739, + "language_loss": 0.69943875, + "learning_rate": 6.321611567507787e-07, + "loss": 0.7206133, + "num_input_tokens_seen": 268166450, + "step": 12433, + "time_per_iteration": 2.5433192253112793 + }, + { + "auxiliary_loss_clip": 0.01059251, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.0360868, + "balance_loss_mlp": 1.02018857, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 1.6620386462184908, + "language_loss": 0.66803324, + "learning_rate": 6.318770479751232e-07, + "loss": 0.68895423, + "num_input_tokens_seen": 268186165, + "step": 12434, + "time_per_iteration": 4.218998432159424 + }, + { + "auxiliary_loss_clip": 0.01099636, + "auxiliary_loss_mlp": 0.01032653, + "balance_loss_clip": 1.03534472, + "balance_loss_mlp": 1.02177536, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 1.434380456712659, + "language_loss": 0.79551721, + "learning_rate": 6.315929910788263e-07, + "loss": 0.81684005, + "num_input_tokens_seen": 268208145, + "step": 12435, + "time_per_iteration": 2.518249034881592 + }, + { + "auxiliary_loss_clip": 0.01075397, + "auxiliary_loss_mlp": 0.01030022, + "balance_loss_clip": 1.03368044, + "balance_loss_mlp": 1.01780868, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 32.027791172425424, + "language_loss": 0.67792928, + "learning_rate": 6.313089860726604e-07, + "loss": 0.69898343, + "num_input_tokens_seen": 268228345, + "step": 12436, + "time_per_iteration": 2.635502338409424 + }, + { + "auxiliary_loss_clip": 0.01079236, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.03418994, + "balance_loss_mlp": 1.02155542, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 1.655573065822874, + "language_loss": 0.70599282, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72712457, + "num_input_tokens_seen": 268250260, + "step": 12437, + "time_per_iteration": 2.630236864089966 + }, + { + "auxiliary_loss_clip": 0.01066568, + "auxiliary_loss_mlp": 0.0102678, + "balance_loss_clip": 1.0330534, + "balance_loss_mlp": 1.01550913, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 1.8328517645202864, + "language_loss": 0.67336512, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69429862, + "num_input_tokens_seen": 268268440, + "step": 12438, + "time_per_iteration": 2.575533151626587 + }, + { + "auxiliary_loss_clip": 0.01082883, + "auxiliary_loss_mlp": 0.01034648, + "balance_loss_clip": 1.03486514, + "balance_loss_mlp": 1.02267957, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.6858175316956214, + "language_loss": 0.80595887, + "learning_rate": 6.304572825026344e-07, + "loss": 0.82713413, + "num_input_tokens_seen": 268285765, + "step": 12439, + "time_per_iteration": 2.5026497840881348 + }, + { + "auxiliary_loss_clip": 0.01069517, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.03429341, + "balance_loss_mlp": 1.02615678, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 1.9812843426398712, + "language_loss": 0.71043396, + "learning_rate": 6.301734851646674e-07, + "loss": 0.73150986, + "num_input_tokens_seen": 268304015, + "step": 12440, + "time_per_iteration": 3.9029359817504883 + }, + { + "auxiliary_loss_clip": 0.0108216, + "auxiliary_loss_mlp": 0.01026581, + "balance_loss_clip": 1.03732789, + "balance_loss_mlp": 1.01485646, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.726529693038754, + "language_loss": 0.74414635, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76523376, + "num_input_tokens_seen": 268323290, + "step": 12441, + "time_per_iteration": 3.974076509475708 + }, + { + "auxiliary_loss_clip": 0.01097548, + "auxiliary_loss_mlp": 0.00783987, + "balance_loss_clip": 1.03742123, + "balance_loss_mlp": 1.01083052, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.166831080165801, + "language_loss": 0.82586074, + "learning_rate": 6.296060463313698e-07, + "loss": 0.84467608, + "num_input_tokens_seen": 268339490, + "step": 12442, + "time_per_iteration": 2.477626323699951 + }, + { + "auxiliary_loss_clip": 0.01050288, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.03520107, + "balance_loss_mlp": 1.02030301, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 1.8598648867797591, + "language_loss": 0.62883502, + "learning_rate": 6.293224048575565e-07, + "loss": 0.64967471, + "num_input_tokens_seen": 268359865, + "step": 12443, + "time_per_iteration": 2.6732797622680664 + }, + { + "auxiliary_loss_clip": 0.01063522, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.03391886, + "balance_loss_mlp": 1.01665151, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 1.67046864570416, + "language_loss": 0.7110272, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73194593, + "num_input_tokens_seen": 268377065, + "step": 12444, + "time_per_iteration": 2.548701763153076 + }, + { + "auxiliary_loss_clip": 0.01054697, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.03509355, + "balance_loss_mlp": 1.02075005, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.4093311387286585, + "language_loss": 0.68831599, + "learning_rate": 6.287552778493786e-07, + "loss": 0.70919818, + "num_input_tokens_seen": 268396935, + "step": 12445, + "time_per_iteration": 4.071219444274902 + }, + { + "auxiliary_loss_clip": 0.01091384, + "auxiliary_loss_mlp": 0.01024757, + "balance_loss_clip": 1.03504121, + "balance_loss_mlp": 1.01325965, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 1.722012330933655, + "language_loss": 0.74150407, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76266551, + "num_input_tokens_seen": 268414460, + "step": 12446, + "time_per_iteration": 2.4746510982513428 + }, + { + "auxiliary_loss_clip": 0.01088461, + "auxiliary_loss_mlp": 0.00785153, + "balance_loss_clip": 1.03661585, + "balance_loss_mlp": 1.01037312, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 2.2092870502886854, + "language_loss": 0.73504388, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75377995, + "num_input_tokens_seen": 268432225, + "step": 12447, + "time_per_iteration": 2.5000741481781006 + }, + { + "auxiliary_loss_clip": 0.01061349, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.03255415, + "balance_loss_mlp": 1.01981282, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 4.867366876804958, + "language_loss": 0.7201705, + "learning_rate": 6.279049773470109e-07, + "loss": 0.74109727, + "num_input_tokens_seen": 268449270, + "step": 12448, + "time_per_iteration": 2.616562843322754 + }, + { + "auxiliary_loss_clip": 0.01107663, + "auxiliary_loss_mlp": 0.01035201, + "balance_loss_clip": 1.03737342, + "balance_loss_mlp": 1.02313721, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 1.9151443353946096, + "language_loss": 0.73624802, + "learning_rate": 6.276216478918543e-07, + "loss": 0.75767672, + "num_input_tokens_seen": 268467250, + "step": 12449, + "time_per_iteration": 2.479686975479126 + }, + { + "auxiliary_loss_clip": 0.01080436, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.03797388, + "balance_loss_mlp": 1.02139759, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 1.8881210961400068, + "language_loss": 0.60763562, + "learning_rate": 6.273383704774225e-07, + "loss": 0.62878323, + "num_input_tokens_seen": 268487270, + "step": 12450, + "time_per_iteration": 2.578500270843506 + }, + { + "auxiliary_loss_clip": 0.01099914, + "auxiliary_loss_mlp": 0.01025249, + "balance_loss_clip": 1.03453016, + "balance_loss_mlp": 1.01414514, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 3.0501001132599326, + "language_loss": 0.70339847, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72465014, + "num_input_tokens_seen": 268508020, + "step": 12451, + "time_per_iteration": 2.5189316272735596 + }, + { + "auxiliary_loss_clip": 0.01098567, + "auxiliary_loss_mlp": 0.01027996, + "balance_loss_clip": 1.03562903, + "balance_loss_mlp": 1.01577067, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 3.161847754308371, + "language_loss": 0.80074203, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82200766, + "num_input_tokens_seen": 268527375, + "step": 12452, + "time_per_iteration": 2.5053582191467285 + }, + { + "auxiliary_loss_clip": 0.01113413, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.04023433, + "balance_loss_mlp": 1.01972818, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 2.2011923882668527, + "language_loss": 0.71658796, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73804677, + "num_input_tokens_seen": 268544870, + "step": 12453, + "time_per_iteration": 2.4709975719451904 + }, + { + "auxiliary_loss_clip": 0.01085476, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.03777647, + "balance_loss_mlp": 1.01995873, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 1.520752990944129, + "language_loss": 0.74211639, + "learning_rate": 6.262057814417517e-07, + "loss": 0.76329207, + "num_input_tokens_seen": 268564580, + "step": 12454, + "time_per_iteration": 2.5405733585357666 + }, + { + "auxiliary_loss_clip": 0.01013916, + "auxiliary_loss_mlp": 0.01017857, + "balance_loss_clip": 1.01334596, + "balance_loss_mlp": 1.01631916, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7342577747056931, + "language_loss": 0.59430796, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61462569, + "num_input_tokens_seen": 268629550, + "step": 12455, + "time_per_iteration": 3.320308208465576 + }, + { + "auxiliary_loss_clip": 0.01069064, + "auxiliary_loss_mlp": 0.01027563, + "balance_loss_clip": 1.03719401, + "balance_loss_mlp": 1.01523089, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 2.032098726270048, + "language_loss": 0.79506797, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81603426, + "num_input_tokens_seen": 268646645, + "step": 12456, + "time_per_iteration": 2.550816059112549 + }, + { + "auxiliary_loss_clip": 0.01025797, + "auxiliary_loss_mlp": 0.01002083, + "balance_loss_clip": 1.01129258, + "balance_loss_mlp": 1.00102806, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.8613344432054116, + "language_loss": 0.6140939, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63437271, + "num_input_tokens_seen": 268702275, + "step": 12457, + "time_per_iteration": 3.045292615890503 + }, + { + "auxiliary_loss_clip": 0.01090652, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.0386076, + "balance_loss_mlp": 1.01901734, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 2.016159158164655, + "language_loss": 0.67491674, + "learning_rate": 6.250740259166711e-07, + "loss": 0.6961354, + "num_input_tokens_seen": 268716265, + "step": 12458, + "time_per_iteration": 2.501405954360962 + }, + { + "auxiliary_loss_clip": 0.01052039, + "auxiliary_loss_mlp": 0.01029212, + "balance_loss_clip": 1.03448761, + "balance_loss_mlp": 1.01765501, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 1.6746090213527325, + "language_loss": 0.79762006, + "learning_rate": 6.247912173519106e-07, + "loss": 0.81843257, + "num_input_tokens_seen": 268734330, + "step": 12459, + "time_per_iteration": 2.6022870540618896 + }, + { + "auxiliary_loss_clip": 0.01066764, + "auxiliary_loss_mlp": 0.0103338, + "balance_loss_clip": 1.0335784, + "balance_loss_mlp": 1.02085125, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.8603524967098515, + "language_loss": 0.80517602, + "learning_rate": 6.245084609352043e-07, + "loss": 0.82617736, + "num_input_tokens_seen": 268753500, + "step": 12460, + "time_per_iteration": 2.566892385482788 + }, + { + "auxiliary_loss_clip": 0.01078156, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.03513861, + "balance_loss_mlp": 1.02080345, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.7506406240741834, + "language_loss": 0.86306703, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88418722, + "num_input_tokens_seen": 268772055, + "step": 12461, + "time_per_iteration": 2.563171863555908 + }, + { + "auxiliary_loss_clip": 0.01087427, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.03709018, + "balance_loss_mlp": 1.02112687, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 1.7608249579379034, + "language_loss": 0.68932426, + "learning_rate": 6.239431045888435e-07, + "loss": 0.71052694, + "num_input_tokens_seen": 268792265, + "step": 12462, + "time_per_iteration": 2.530583620071411 + }, + { + "auxiliary_loss_clip": 0.01104257, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.03525817, + "balance_loss_mlp": 1.01821339, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 2.468987515788426, + "language_loss": 0.70595902, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72730863, + "num_input_tokens_seen": 268812735, + "step": 12463, + "time_per_iteration": 2.5265681743621826 + }, + { + "auxiliary_loss_clip": 0.01069538, + "auxiliary_loss_mlp": 0.01027795, + "balance_loss_clip": 1.03509438, + "balance_loss_mlp": 1.01623762, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 1.5948281881498696, + "language_loss": 0.7773369, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79831022, + "num_input_tokens_seen": 268833090, + "step": 12464, + "time_per_iteration": 2.622408151626587 + }, + { + "auxiliary_loss_clip": 0.01079622, + "auxiliary_loss_mlp": 0.01025579, + "balance_loss_clip": 1.03282642, + "balance_loss_mlp": 1.01421201, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.6152166143014708, + "language_loss": 0.78504294, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80609494, + "num_input_tokens_seen": 268851880, + "step": 12465, + "time_per_iteration": 2.538890838623047 + }, + { + "auxiliary_loss_clip": 0.01077263, + "auxiliary_loss_mlp": 0.01035251, + "balance_loss_clip": 1.03739274, + "balance_loss_mlp": 1.02167296, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.3752099787136225, + "language_loss": 0.74618471, + "learning_rate": 6.22813018144422e-07, + "loss": 0.7673099, + "num_input_tokens_seen": 268867910, + "step": 12466, + "time_per_iteration": 2.533190965652466 + }, + { + "auxiliary_loss_clip": 0.01095695, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.0356679, + "balance_loss_mlp": 1.01965129, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 2.0377570435141865, + "language_loss": 0.66279203, + "learning_rate": 6.22530627064209e-07, + "loss": 0.6840679, + "num_input_tokens_seen": 268887260, + "step": 12467, + "time_per_iteration": 2.4945473670959473 + }, + { + "auxiliary_loss_clip": 0.01058624, + "auxiliary_loss_mlp": 0.00786945, + "balance_loss_clip": 1.03659654, + "balance_loss_mlp": 1.01122284, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 2.0651796327331913, + "language_loss": 0.76126587, + "learning_rate": 6.222482882177735e-07, + "loss": 0.77972156, + "num_input_tokens_seen": 268902520, + "step": 12468, + "time_per_iteration": 2.5627007484436035 + }, + { + "auxiliary_loss_clip": 0.01066476, + "auxiliary_loss_mlp": 0.01030823, + "balance_loss_clip": 1.03748989, + "balance_loss_mlp": 1.0180918, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 1.8753619878756969, + "language_loss": 0.69013447, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71110749, + "num_input_tokens_seen": 268920970, + "step": 12469, + "time_per_iteration": 2.570352077484131 + }, + { + "auxiliary_loss_clip": 0.01086475, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.03749919, + "balance_loss_mlp": 1.01999784, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 1.983508551044083, + "language_loss": 0.69340867, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71459877, + "num_input_tokens_seen": 268936600, + "step": 12470, + "time_per_iteration": 2.512080430984497 + }, + { + "auxiliary_loss_clip": 0.01088825, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.04165363, + "balance_loss_mlp": 1.01796484, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 2.0300373116112937, + "language_loss": 0.75586671, + "learning_rate": 6.214015851881793e-07, + "loss": 0.77706981, + "num_input_tokens_seen": 268956560, + "step": 12471, + "time_per_iteration": 2.5457096099853516 + }, + { + "auxiliary_loss_clip": 0.01080013, + "auxiliary_loss_mlp": 0.01029752, + "balance_loss_clip": 1.03437686, + "balance_loss_mlp": 1.01577497, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 2.4843794320911643, + "language_loss": 0.76614523, + "learning_rate": 6.211194553838929e-07, + "loss": 0.78724289, + "num_input_tokens_seen": 268973945, + "step": 12472, + "time_per_iteration": 3.8707194328308105 + }, + { + "auxiliary_loss_clip": 0.01093498, + "auxiliary_loss_mlp": 0.00782876, + "balance_loss_clip": 1.03602827, + "balance_loss_mlp": 1.00894523, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 1.5025750254311407, + "language_loss": 0.84220791, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86097163, + "num_input_tokens_seen": 268993245, + "step": 12473, + "time_per_iteration": 2.5149974822998047 + }, + { + "auxiliary_loss_clip": 0.01076569, + "auxiliary_loss_mlp": 0.01036294, + "balance_loss_clip": 1.03507638, + "balance_loss_mlp": 1.0226028, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 2.2100310436147983, + "language_loss": 0.7370646, + "learning_rate": 6.205553526478829e-07, + "loss": 0.75819314, + "num_input_tokens_seen": 269012125, + "step": 12474, + "time_per_iteration": 2.5586583614349365 + }, + { + "auxiliary_loss_clip": 0.01083043, + "auxiliary_loss_mlp": 0.01034206, + "balance_loss_clip": 1.03558993, + "balance_loss_mlp": 1.0214746, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 1.797763285121591, + "language_loss": 0.74605834, + "learning_rate": 6.202733797375492e-07, + "loss": 0.76723075, + "num_input_tokens_seen": 269030545, + "step": 12475, + "time_per_iteration": 2.497840642929077 + }, + { + "auxiliary_loss_clip": 0.0110244, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.03876638, + "balance_loss_mlp": 1.02069867, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 1.7779836276187349, + "language_loss": 0.79786074, + "learning_rate": 6.199914591465878e-07, + "loss": 0.81922615, + "num_input_tokens_seen": 269048180, + "step": 12476, + "time_per_iteration": 2.4936680793762207 + }, + { + "auxiliary_loss_clip": 0.01070962, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.03539705, + "balance_loss_mlp": 1.02170229, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.7544852382350766, + "language_loss": 0.77934766, + "learning_rate": 6.19709590885688e-07, + "loss": 0.8003909, + "num_input_tokens_seen": 269068600, + "step": 12477, + "time_per_iteration": 2.577725648880005 + }, + { + "auxiliary_loss_clip": 0.01012965, + "auxiliary_loss_mlp": 0.0099957, + "balance_loss_clip": 1.01122522, + "balance_loss_mlp": 0.9983902, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.805701740048689, + "language_loss": 0.54472798, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56485331, + "num_input_tokens_seen": 269119045, + "step": 12478, + "time_per_iteration": 4.483763217926025 + }, + { + "auxiliary_loss_clip": 0.01077599, + "auxiliary_loss_mlp": 0.01033383, + "balance_loss_clip": 1.03477907, + "balance_loss_mlp": 1.02137256, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.6477925934653805, + "language_loss": 0.79980731, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82091713, + "num_input_tokens_seen": 269136755, + "step": 12479, + "time_per_iteration": 2.594290018081665 + }, + { + "auxiliary_loss_clip": 0.01098884, + "auxiliary_loss_mlp": 0.0103784, + "balance_loss_clip": 1.03707492, + "balance_loss_mlp": 1.0246439, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 1.9395856420488617, + "language_loss": 0.62359059, + "learning_rate": 6.188643001902369e-07, + "loss": 0.64495784, + "num_input_tokens_seen": 269156120, + "step": 12480, + "time_per_iteration": 3.9500463008880615 + }, + { + "auxiliary_loss_clip": 0.01078388, + "auxiliary_loss_mlp": 0.01036415, + "balance_loss_clip": 1.03468585, + "balance_loss_mlp": 1.02489924, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 1.5634155071932831, + "language_loss": 0.7767731, + "learning_rate": 6.185826413564512e-07, + "loss": 0.79792106, + "num_input_tokens_seen": 269175650, + "step": 12481, + "time_per_iteration": 2.535130500793457 + }, + { + "auxiliary_loss_clip": 0.01065351, + "auxiliary_loss_mlp": 0.01032879, + "balance_loss_clip": 1.03530777, + "balance_loss_mlp": 1.01961124, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 1.6055308566556097, + "language_loss": 0.71363711, + "learning_rate": 6.183010349061501e-07, + "loss": 0.7346195, + "num_input_tokens_seen": 269197080, + "step": 12482, + "time_per_iteration": 2.599626302719116 + }, + { + "auxiliary_loss_clip": 0.01107663, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.03783417, + "balance_loss_mlp": 1.02360666, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 1.6924906764911605, + "language_loss": 0.70293677, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72437203, + "num_input_tokens_seen": 269218600, + "step": 12483, + "time_per_iteration": 2.4972894191741943 + }, + { + "auxiliary_loss_clip": 0.01105686, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.03642642, + "balance_loss_mlp": 1.01761651, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 2.0273934120648063, + "language_loss": 0.74658406, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76792991, + "num_input_tokens_seen": 269239245, + "step": 12484, + "time_per_iteration": 3.8632073402404785 + }, + { + "auxiliary_loss_clip": 0.01083175, + "auxiliary_loss_mlp": 0.01029149, + "balance_loss_clip": 1.03525305, + "balance_loss_mlp": 1.01643491, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 2.602467666306162, + "language_loss": 0.84431762, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86544085, + "num_input_tokens_seen": 269258520, + "step": 12485, + "time_per_iteration": 2.5065524578094482 + }, + { + "auxiliary_loss_clip": 0.01069283, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.03610015, + "balance_loss_mlp": 1.01759017, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 1.5288288394289904, + "language_loss": 0.78119469, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80218601, + "num_input_tokens_seen": 269278320, + "step": 12486, + "time_per_iteration": 2.5666418075561523 + }, + { + "auxiliary_loss_clip": 0.01095877, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.03617835, + "balance_loss_mlp": 1.01789725, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 2.077392676872717, + "language_loss": 0.72770011, + "learning_rate": 6.168937887805932e-07, + "loss": 0.74896801, + "num_input_tokens_seen": 269298025, + "step": 12487, + "time_per_iteration": 2.5141847133636475 + }, + { + "auxiliary_loss_clip": 0.01080616, + "auxiliary_loss_mlp": 0.01027643, + "balance_loss_clip": 1.03378534, + "balance_loss_mlp": 1.01570427, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 1.9813379377058418, + "language_loss": 0.67140996, + "learning_rate": 6.166124968553801e-07, + "loss": 0.6924926, + "num_input_tokens_seen": 269316770, + "step": 12488, + "time_per_iteration": 2.553664445877075 + }, + { + "auxiliary_loss_clip": 0.01040443, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.03276873, + "balance_loss_mlp": 1.02143979, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 1.7317282443134243, + "language_loss": 0.77263552, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79338002, + "num_input_tokens_seen": 269334755, + "step": 12489, + "time_per_iteration": 2.6048166751861572 + }, + { + "auxiliary_loss_clip": 0.0109321, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.03730869, + "balance_loss_mlp": 1.019472, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 2.0422525380887424, + "language_loss": 0.75238216, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77361763, + "num_input_tokens_seen": 269353810, + "step": 12490, + "time_per_iteration": 2.542771339416504 + }, + { + "auxiliary_loss_clip": 0.0110468, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.0369184, + "balance_loss_mlp": 1.01622677, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 1.558997955361212, + "language_loss": 0.7814759, + "learning_rate": 6.157689358715527e-07, + "loss": 0.80280328, + "num_input_tokens_seen": 269372910, + "step": 12491, + "time_per_iteration": 2.449381113052368 + }, + { + "auxiliary_loss_clip": 0.01092419, + "auxiliary_loss_mlp": 0.01029256, + "balance_loss_clip": 1.03472555, + "balance_loss_mlp": 1.0186944, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 1.9051801324386155, + "language_loss": 0.76483464, + "learning_rate": 6.154878538430899e-07, + "loss": 0.78605139, + "num_input_tokens_seen": 269391545, + "step": 12492, + "time_per_iteration": 2.518357753753662 + }, + { + "auxiliary_loss_clip": 0.01073169, + "auxiliary_loss_mlp": 0.01029318, + "balance_loss_clip": 1.03490686, + "balance_loss_mlp": 1.01789141, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 2.0052366052901744, + "language_loss": 0.7098459, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73087078, + "num_input_tokens_seen": 269408530, + "step": 12493, + "time_per_iteration": 2.529813766479492 + }, + { + "auxiliary_loss_clip": 0.01096058, + "auxiliary_loss_mlp": 0.00783117, + "balance_loss_clip": 1.03725648, + "balance_loss_mlp": 1.01124501, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 1.6436004341278811, + "language_loss": 0.80853724, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82732892, + "num_input_tokens_seen": 269425930, + "step": 12494, + "time_per_iteration": 2.5087790489196777 + }, + { + "auxiliary_loss_clip": 0.01106553, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.0369935, + "balance_loss_mlp": 1.01572108, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 1.9267796723662196, + "language_loss": 0.78950649, + "learning_rate": 6.146449228053634e-07, + "loss": 0.81085277, + "num_input_tokens_seen": 269443945, + "step": 12495, + "time_per_iteration": 2.472052812576294 + }, + { + "auxiliary_loss_clip": 0.01104882, + "auxiliary_loss_mlp": 0.00781977, + "balance_loss_clip": 1.03623748, + "balance_loss_mlp": 1.00880241, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 2.030564241967498, + "language_loss": 0.71490884, + "learning_rate": 6.143640508441898e-07, + "loss": 0.7337774, + "num_input_tokens_seen": 269463625, + "step": 12496, + "time_per_iteration": 2.482271432876587 + }, + { + "auxiliary_loss_clip": 0.01059889, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.03244698, + "balance_loss_mlp": 1.02077127, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 1.5967325430846957, + "language_loss": 0.78375411, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80468035, + "num_input_tokens_seen": 269483415, + "step": 12497, + "time_per_iteration": 2.6006524562835693 + }, + { + "auxiliary_loss_clip": 0.01097504, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.03721046, + "balance_loss_mlp": 1.02287197, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 1.5108013816480954, + "language_loss": 0.76755548, + "learning_rate": 6.13802464562855e-07, + "loss": 0.78888679, + "num_input_tokens_seen": 269504635, + "step": 12498, + "time_per_iteration": 2.549534320831299 + }, + { + "auxiliary_loss_clip": 0.01082991, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.03844893, + "balance_loss_mlp": 1.02326155, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 1.7289059979201555, + "language_loss": 0.74087769, + "learning_rate": 6.135217502639878e-07, + "loss": 0.76205623, + "num_input_tokens_seen": 269523955, + "step": 12499, + "time_per_iteration": 2.5353164672851562 + }, + { + "auxiliary_loss_clip": 0.01091207, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.03311515, + "balance_loss_mlp": 1.01483274, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 1.851485553278697, + "language_loss": 0.79190719, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81307691, + "num_input_tokens_seen": 269544410, + "step": 12500, + "time_per_iteration": 2.5767288208007812 + }, + { + "auxiliary_loss_clip": 0.01103505, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.03817797, + "balance_loss_mlp": 1.0216428, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 1.8585060915413656, + "language_loss": 0.73630524, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75769669, + "num_input_tokens_seen": 269563315, + "step": 12501, + "time_per_iteration": 2.487550735473633 + }, + { + "auxiliary_loss_clip": 0.01082531, + "auxiliary_loss_mlp": 0.01024632, + "balance_loss_clip": 1.03509104, + "balance_loss_mlp": 1.01224041, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 1.5630119983134898, + "language_loss": 0.78517497, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80624664, + "num_input_tokens_seen": 269583950, + "step": 12502, + "time_per_iteration": 2.587296962738037 + }, + { + "auxiliary_loss_clip": 0.01083312, + "auxiliary_loss_mlp": 0.01031257, + "balance_loss_clip": 1.0350852, + "balance_loss_mlp": 1.01960981, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.3861117684436914, + "language_loss": 0.70581466, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72696036, + "num_input_tokens_seen": 269600120, + "step": 12503, + "time_per_iteration": 2.4832592010498047 + }, + { + "auxiliary_loss_clip": 0.01032322, + "auxiliary_loss_mlp": 0.01003837, + "balance_loss_clip": 1.00914514, + "balance_loss_mlp": 1.00271034, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.9838714999072278, + "language_loss": 0.6398136, + "learning_rate": 6.121189676133903e-07, + "loss": 0.66017526, + "num_input_tokens_seen": 269659815, + "step": 12504, + "time_per_iteration": 3.0037269592285156 + }, + { + "auxiliary_loss_clip": 0.01062527, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.03114057, + "balance_loss_mlp": 1.01940703, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 1.4955568079651682, + "language_loss": 0.68525279, + "learning_rate": 6.118385689264896e-07, + "loss": 0.70619053, + "num_input_tokens_seen": 269684565, + "step": 12505, + "time_per_iteration": 2.7339110374450684 + }, + { + "auxiliary_loss_clip": 0.01022613, + "auxiliary_loss_mlp": 0.00762797, + "balance_loss_clip": 1.00993311, + "balance_loss_mlp": 1.00259876, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.6436769394826811, + "language_loss": 0.55091321, + "learning_rate": 6.11558222878809e-07, + "loss": 0.56876731, + "num_input_tokens_seen": 269752325, + "step": 12506, + "time_per_iteration": 3.2125961780548096 + }, + { + "auxiliary_loss_clip": 0.01094191, + "auxiliary_loss_mlp": 0.01034972, + "balance_loss_clip": 1.03673744, + "balance_loss_mlp": 1.02273536, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 1.8599667222002583, + "language_loss": 0.78526068, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80655229, + "num_input_tokens_seen": 269770630, + "step": 12507, + "time_per_iteration": 2.51926326751709 + }, + { + "auxiliary_loss_clip": 0.01078328, + "auxiliary_loss_mlp": 0.01031769, + "balance_loss_clip": 1.03709793, + "balance_loss_mlp": 1.02025294, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 1.5890886033139742, + "language_loss": 0.71113914, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73224008, + "num_input_tokens_seen": 269787280, + "step": 12508, + "time_per_iteration": 2.497087001800537 + }, + { + "auxiliary_loss_clip": 0.01088883, + "auxiliary_loss_mlp": 0.01028601, + "balance_loss_clip": 1.0339818, + "balance_loss_mlp": 1.01647186, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 1.5343955525117614, + "language_loss": 0.7161088, + "learning_rate": 6.107175006773885e-07, + "loss": 0.73728359, + "num_input_tokens_seen": 269805205, + "step": 12509, + "time_per_iteration": 2.490513801574707 + }, + { + "auxiliary_loss_clip": 0.01111069, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.03751802, + "balance_loss_mlp": 1.02503824, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 1.7030303986704753, + "language_loss": 0.61797899, + "learning_rate": 6.104373652928785e-07, + "loss": 0.63947481, + "num_input_tokens_seen": 269824820, + "step": 12510, + "time_per_iteration": 2.4850845336914062 + }, + { + "auxiliary_loss_clip": 0.01090995, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.03548336, + "balance_loss_mlp": 1.01893318, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 1.7668134657850443, + "language_loss": 0.81390828, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83512253, + "num_input_tokens_seen": 269842825, + "step": 12511, + "time_per_iteration": 3.950269937515259 + }, + { + "auxiliary_loss_clip": 0.01081035, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.0358156, + "balance_loss_mlp": 1.02067399, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 2.04263083550264, + "language_loss": 0.75599831, + "learning_rate": 6.098772526115412e-07, + "loss": 0.77714431, + "num_input_tokens_seen": 269859000, + "step": 12512, + "time_per_iteration": 2.5187625885009766 + }, + { + "auxiliary_loss_clip": 0.01087366, + "auxiliary_loss_mlp": 0.01024929, + "balance_loss_clip": 1.0332253, + "balance_loss_mlp": 1.014153, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.6884203900080537, + "language_loss": 0.82305825, + "learning_rate": 6.095972753359537e-07, + "loss": 0.84418118, + "num_input_tokens_seen": 269878895, + "step": 12513, + "time_per_iteration": 2.562002420425415 + }, + { + "auxiliary_loss_clip": 0.01093579, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.03697681, + "balance_loss_mlp": 1.02238655, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 3.515412038649654, + "language_loss": 0.74769056, + "learning_rate": 6.093173507845771e-07, + "loss": 0.76897675, + "num_input_tokens_seen": 269897280, + "step": 12514, + "time_per_iteration": 2.52724027633667 + }, + { + "auxiliary_loss_clip": 0.01083775, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.03489125, + "balance_loss_mlp": 1.01727104, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 1.9306911917256857, + "language_loss": 0.69149435, + "learning_rate": 6.090374789680271e-07, + "loss": 0.71261251, + "num_input_tokens_seen": 269914640, + "step": 12515, + "time_per_iteration": 2.526887893676758 + }, + { + "auxiliary_loss_clip": 0.01094265, + "auxiliary_loss_mlp": 0.01030656, + "balance_loss_clip": 1.03618252, + "balance_loss_mlp": 1.01940227, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 1.9783723164715672, + "language_loss": 0.70058334, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72183251, + "num_input_tokens_seen": 269934960, + "step": 12516, + "time_per_iteration": 2.5820538997650146 + }, + { + "auxiliary_loss_clip": 0.01057175, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.03764689, + "balance_loss_mlp": 1.0199616, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 1.5046818591365116, + "language_loss": 0.89729691, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91817963, + "num_input_tokens_seen": 269956655, + "step": 12517, + "time_per_iteration": 4.047922372817993 + }, + { + "auxiliary_loss_clip": 0.01083932, + "auxiliary_loss_mlp": 0.01033807, + "balance_loss_clip": 1.03618479, + "balance_loss_mlp": 1.02187979, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.6272729010569138, + "language_loss": 0.74256539, + "learning_rate": 6.081981800334437e-07, + "loss": 0.7637428, + "num_input_tokens_seen": 269976835, + "step": 12518, + "time_per_iteration": 3.9690399169921875 + }, + { + "auxiliary_loss_clip": 0.01004286, + "auxiliary_loss_mlp": 0.01004005, + "balance_loss_clip": 1.02611184, + "balance_loss_mlp": 1.00254488, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.7262731237667166, + "language_loss": 0.55693161, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57701451, + "num_input_tokens_seen": 270040630, + "step": 12519, + "time_per_iteration": 3.279413938522339 + }, + { + "auxiliary_loss_clip": 0.01087375, + "auxiliary_loss_mlp": 0.01033509, + "balance_loss_clip": 1.03565264, + "balance_loss_mlp": 1.02261877, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.448688204103979, + "language_loss": 0.77677774, + "learning_rate": 6.07638911279029e-07, + "loss": 0.79798663, + "num_input_tokens_seen": 270059695, + "step": 12520, + "time_per_iteration": 2.5076589584350586 + }, + { + "auxiliary_loss_clip": 0.01087091, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.03314614, + "balance_loss_mlp": 1.02458107, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 2.134228430347393, + "language_loss": 0.74020475, + "learning_rate": 6.07359356094229e-07, + "loss": 0.76143676, + "num_input_tokens_seen": 270078420, + "step": 12521, + "time_per_iteration": 2.5245373249053955 + }, + { + "auxiliary_loss_clip": 0.01089561, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.03866649, + "balance_loss_mlp": 1.02168345, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 2.1155507287833553, + "language_loss": 0.66979623, + "learning_rate": 6.070798537185016e-07, + "loss": 0.6910398, + "num_input_tokens_seen": 270097040, + "step": 12522, + "time_per_iteration": 3.9793951511383057 + }, + { + "auxiliary_loss_clip": 0.01097386, + "auxiliary_loss_mlp": 0.01038021, + "balance_loss_clip": 1.03719044, + "balance_loss_mlp": 1.02598691, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 1.5635370742962917, + "language_loss": 0.7860471, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80740118, + "num_input_tokens_seen": 270116365, + "step": 12523, + "time_per_iteration": 2.5221335887908936 + }, + { + "auxiliary_loss_clip": 0.01103662, + "auxiliary_loss_mlp": 0.01026722, + "balance_loss_clip": 1.03685021, + "balance_loss_mlp": 1.0150094, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 1.8670551756808684, + "language_loss": 0.80935454, + "learning_rate": 6.065210074366571e-07, + "loss": 0.83065838, + "num_input_tokens_seen": 270135395, + "step": 12524, + "time_per_iteration": 2.483976125717163 + }, + { + "auxiliary_loss_clip": 0.01094903, + "auxiliary_loss_mlp": 0.00781325, + "balance_loss_clip": 1.03669262, + "balance_loss_mlp": 1.00840378, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.5718628618359405, + "language_loss": 0.74069113, + "learning_rate": 6.062416635517326e-07, + "loss": 0.75945342, + "num_input_tokens_seen": 270156425, + "step": 12525, + "time_per_iteration": 2.519648790359497 + }, + { + "auxiliary_loss_clip": 0.0107157, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.03531837, + "balance_loss_mlp": 1.01717138, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 1.8726252235513057, + "language_loss": 0.71784425, + "learning_rate": 6.059623725182641e-07, + "loss": 0.73884594, + "num_input_tokens_seen": 270176905, + "step": 12526, + "time_per_iteration": 2.5989396572113037 + }, + { + "auxiliary_loss_clip": 0.01080934, + "auxiliary_loss_mlp": 0.01024202, + "balance_loss_clip": 1.03348494, + "balance_loss_mlp": 1.01316905, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 1.6636812045092084, + "language_loss": 0.72094327, + "learning_rate": 6.056831343468414e-07, + "loss": 0.74199468, + "num_input_tokens_seen": 270196640, + "step": 12527, + "time_per_iteration": 2.5888571739196777 + }, + { + "auxiliary_loss_clip": 0.01069078, + "auxiliary_loss_mlp": 0.01023696, + "balance_loss_clip": 1.03412223, + "balance_loss_mlp": 1.01316416, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 1.7982010468987335, + "language_loss": 0.81197083, + "learning_rate": 6.054039490480539e-07, + "loss": 0.83289862, + "num_input_tokens_seen": 270213905, + "step": 12528, + "time_per_iteration": 2.531092643737793 + }, + { + "auxiliary_loss_clip": 0.01049364, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.03358698, + "balance_loss_mlp": 1.02053392, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 1.7930087991184898, + "language_loss": 0.8524074, + "learning_rate": 6.051248166324892e-07, + "loss": 0.87323081, + "num_input_tokens_seen": 270231995, + "step": 12529, + "time_per_iteration": 2.596324920654297 + }, + { + "auxiliary_loss_clip": 0.01077518, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.03750825, + "balance_loss_mlp": 1.0209322, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 1.6657629802249088, + "language_loss": 0.73556739, + "learning_rate": 6.048457371107303e-07, + "loss": 0.75667632, + "num_input_tokens_seen": 270251480, + "step": 12530, + "time_per_iteration": 2.576198101043701 + }, + { + "auxiliary_loss_clip": 0.00998683, + "auxiliary_loss_mlp": 0.010039, + "balance_loss_clip": 1.02384782, + "balance_loss_mlp": 1.00237453, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8250759001478801, + "language_loss": 0.63679278, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65681863, + "num_input_tokens_seen": 270306480, + "step": 12531, + "time_per_iteration": 3.0545570850372314 + }, + { + "auxiliary_loss_clip": 0.0108358, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.03597999, + "balance_loss_mlp": 1.01665306, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 1.8824616198606245, + "language_loss": 0.69817126, + "learning_rate": 6.042877367909633e-07, + "loss": 0.71929622, + "num_input_tokens_seen": 270324595, + "step": 12532, + "time_per_iteration": 2.591109037399292 + }, + { + "auxiliary_loss_clip": 0.01083798, + "auxiliary_loss_mlp": 0.01027551, + "balance_loss_clip": 1.03670406, + "balance_loss_mlp": 1.01685739, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.599884934324779, + "language_loss": 0.77381158, + "learning_rate": 6.040088160141132e-07, + "loss": 0.79492503, + "num_input_tokens_seen": 270344375, + "step": 12533, + "time_per_iteration": 2.568091630935669 + }, + { + "auxiliary_loss_clip": 0.01023034, + "auxiliary_loss_mlp": 0.01007915, + "balance_loss_clip": 1.00892282, + "balance_loss_mlp": 1.00671053, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.7820394763904313, + "language_loss": 0.57308382, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59339333, + "num_input_tokens_seen": 270405235, + "step": 12534, + "time_per_iteration": 3.141066551208496 + }, + { + "auxiliary_loss_clip": 0.01082286, + "auxiliary_loss_mlp": 0.01027243, + "balance_loss_clip": 1.03443754, + "balance_loss_mlp": 1.01547098, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.4859180462079278, + "language_loss": 0.71509033, + "learning_rate": 6.03451133279365e-07, + "loss": 0.73618567, + "num_input_tokens_seen": 270425820, + "step": 12535, + "time_per_iteration": 2.559091806411743 + }, + { + "auxiliary_loss_clip": 0.01080593, + "auxiliary_loss_mlp": 0.01028027, + "balance_loss_clip": 1.03242683, + "balance_loss_mlp": 1.01538539, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 1.5631944878000839, + "language_loss": 0.80820239, + "learning_rate": 6.031723713426135e-07, + "loss": 0.8292886, + "num_input_tokens_seen": 270447120, + "step": 12536, + "time_per_iteration": 2.571333885192871 + }, + { + "auxiliary_loss_clip": 0.01078185, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.03323221, + "balance_loss_mlp": 1.01847839, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 1.806411410896196, + "language_loss": 0.74504244, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76612341, + "num_input_tokens_seen": 270468680, + "step": 12537, + "time_per_iteration": 2.6060614585876465 + }, + { + "auxiliary_loss_clip": 0.0110462, + "auxiliary_loss_mlp": 0.01032114, + "balance_loss_clip": 1.03515291, + "balance_loss_mlp": 1.01984143, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 1.6871873309217127, + "language_loss": 0.74026674, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76163411, + "num_input_tokens_seen": 270486310, + "step": 12538, + "time_per_iteration": 2.466470241546631 + }, + { + "auxiliary_loss_clip": 0.01068221, + "auxiliary_loss_mlp": 0.01029948, + "balance_loss_clip": 1.03620541, + "balance_loss_mlp": 1.01786637, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 1.469429660159667, + "language_loss": 0.67270184, + "learning_rate": 6.023364033816956e-07, + "loss": 0.69368351, + "num_input_tokens_seen": 270507210, + "step": 12539, + "time_per_iteration": 2.5996882915496826 + }, + { + "auxiliary_loss_clip": 0.01103253, + "auxiliary_loss_mlp": 0.01027513, + "balance_loss_clip": 1.03591561, + "balance_loss_mlp": 1.01554477, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 1.5785818908289488, + "language_loss": 0.74547398, + "learning_rate": 6.020578533797229e-07, + "loss": 0.76678169, + "num_input_tokens_seen": 270525250, + "step": 12540, + "time_per_iteration": 2.4579107761383057 + }, + { + "auxiliary_loss_clip": 0.01106308, + "auxiliary_loss_mlp": 0.01029159, + "balance_loss_clip": 1.03557646, + "balance_loss_mlp": 1.01708865, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 2.895484024278151, + "language_loss": 0.72901762, + "learning_rate": 6.017793563878566e-07, + "loss": 0.75037229, + "num_input_tokens_seen": 270539295, + "step": 12541, + "time_per_iteration": 2.3987300395965576 + }, + { + "auxiliary_loss_clip": 0.01103803, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.03627622, + "balance_loss_mlp": 1.01649237, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 1.5354839012803267, + "language_loss": 0.71453643, + "learning_rate": 6.015009124166576e-07, + "loss": 0.73585886, + "num_input_tokens_seen": 270562815, + "step": 12542, + "time_per_iteration": 2.6646933555603027 + }, + { + "auxiliary_loss_clip": 0.01078531, + "auxiliary_loss_mlp": 0.01025069, + "balance_loss_clip": 1.03324211, + "balance_loss_mlp": 1.01319528, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 1.8421909229108038, + "language_loss": 0.84473825, + "learning_rate": 6.012225214766844e-07, + "loss": 0.86577427, + "num_input_tokens_seen": 270579055, + "step": 12543, + "time_per_iteration": 2.509392738342285 + }, + { + "auxiliary_loss_clip": 0.01073306, + "auxiliary_loss_mlp": 0.01031309, + "balance_loss_clip": 1.03972697, + "balance_loss_mlp": 1.01934648, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.800183314765061, + "language_loss": 0.73245549, + "learning_rate": 6.009441835784927e-07, + "loss": 0.75350165, + "num_input_tokens_seen": 270599080, + "step": 12544, + "time_per_iteration": 2.593473434448242 + }, + { + "auxiliary_loss_clip": 0.0108918, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.03586507, + "balance_loss_mlp": 1.02119708, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 2.0867028638219898, + "language_loss": 0.68037403, + "learning_rate": 6.006658987326383e-07, + "loss": 0.70159018, + "num_input_tokens_seen": 270618715, + "step": 12545, + "time_per_iteration": 2.5062661170959473 + }, + { + "auxiliary_loss_clip": 0.01081374, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.03434205, + "balance_loss_mlp": 1.01944232, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 1.6465688607053828, + "language_loss": 0.68626565, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70739347, + "num_input_tokens_seen": 270635695, + "step": 12546, + "time_per_iteration": 2.4988033771514893 + }, + { + "auxiliary_loss_clip": 0.01092461, + "auxiliary_loss_mlp": 0.01033896, + "balance_loss_clip": 1.0380646, + "balance_loss_mlp": 1.02130127, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 6.308178284225634, + "language_loss": 0.73575258, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75701618, + "num_input_tokens_seen": 270654325, + "step": 12547, + "time_per_iteration": 2.5134336948394775 + }, + { + "auxiliary_loss_clip": 0.01104547, + "auxiliary_loss_mlp": 0.01025581, + "balance_loss_clip": 1.03601146, + "balance_loss_mlp": 1.01328421, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 2.7950113755223205, + "language_loss": 0.6750294, + "learning_rate": 5.998313626146099e-07, + "loss": 0.69633067, + "num_input_tokens_seen": 270674260, + "step": 12548, + "time_per_iteration": 2.467850923538208 + }, + { + "auxiliary_loss_clip": 0.01083624, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.03499579, + "balance_loss_mlp": 1.01855087, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 1.8227839133353618, + "language_loss": 0.87292594, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89406812, + "num_input_tokens_seen": 270692200, + "step": 12549, + "time_per_iteration": 2.5147600173950195 + }, + { + "auxiliary_loss_clip": 0.01056661, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.03579426, + "balance_loss_mlp": 1.02328074, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 1.7319896565436874, + "language_loss": 0.77084649, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79175889, + "num_input_tokens_seen": 270709675, + "step": 12550, + "time_per_iteration": 4.002254486083984 + }, + { + "auxiliary_loss_clip": 0.01104924, + "auxiliary_loss_mlp": 0.01025555, + "balance_loss_clip": 1.03534603, + "balance_loss_mlp": 1.01381302, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.426147114766673, + "language_loss": 0.69329214, + "learning_rate": 5.98997304347386e-07, + "loss": 0.71459699, + "num_input_tokens_seen": 270733055, + "step": 12551, + "time_per_iteration": 2.522958278656006 + }, + { + "auxiliary_loss_clip": 0.01082743, + "auxiliary_loss_mlp": 0.01027337, + "balance_loss_clip": 1.03661084, + "balance_loss_mlp": 1.01498079, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 2.604516278348822, + "language_loss": 0.86308199, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88418275, + "num_input_tokens_seen": 270749275, + "step": 12552, + "time_per_iteration": 2.49202036857605 + }, + { + "auxiliary_loss_clip": 0.01095472, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.03558123, + "balance_loss_mlp": 1.0205338, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 1.8137954048237637, + "language_loss": 0.77973974, + "learning_rate": 5.98441531115812e-07, + "loss": 0.80101752, + "num_input_tokens_seen": 270768230, + "step": 12553, + "time_per_iteration": 2.511488437652588 + }, + { + "auxiliary_loss_clip": 0.01095194, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.03808558, + "balance_loss_mlp": 1.02274966, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 1.919775563073745, + "language_loss": 0.62590015, + "learning_rate": 5.981637242156135e-07, + "loss": 0.64720035, + "num_input_tokens_seen": 270786285, + "step": 12554, + "time_per_iteration": 2.5777547359466553 + }, + { + "auxiliary_loss_clip": 0.01082712, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.03402054, + "balance_loss_mlp": 1.02493989, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.5416249450200699, + "language_loss": 0.7346406, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75583446, + "num_input_tokens_seen": 270805505, + "step": 12555, + "time_per_iteration": 3.9438347816467285 + }, + { + "auxiliary_loss_clip": 0.0108072, + "auxiliary_loss_mlp": 0.01026518, + "balance_loss_clip": 1.03783202, + "balance_loss_mlp": 1.01444221, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 1.916613473330727, + "language_loss": 0.78306162, + "learning_rate": 5.976082698990645e-07, + "loss": 0.80413401, + "num_input_tokens_seen": 270824610, + "step": 12556, + "time_per_iteration": 2.562962055206299 + }, + { + "auxiliary_loss_clip": 0.0102206, + "auxiliary_loss_mlp": 0.01003406, + "balance_loss_clip": 1.00925016, + "balance_loss_mlp": 1.0023272, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.7081316104057256, + "language_loss": 0.50447935, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52473402, + "num_input_tokens_seen": 270886155, + "step": 12557, + "time_per_iteration": 4.5392725467681885 + }, + { + "auxiliary_loss_clip": 0.0109477, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.03975236, + "balance_loss_mlp": 1.01753044, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 1.717825602062205, + "language_loss": 0.7165103, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73775291, + "num_input_tokens_seen": 270905325, + "step": 12558, + "time_per_iteration": 2.5575428009033203 + }, + { + "auxiliary_loss_clip": 0.01080176, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.03380513, + "balance_loss_mlp": 1.02319479, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.7613666083843844, + "language_loss": 0.80310762, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82427019, + "num_input_tokens_seen": 270927535, + "step": 12559, + "time_per_iteration": 2.625265121459961 + }, + { + "auxiliary_loss_clip": 0.01058017, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.03590155, + "balance_loss_mlp": 1.01622212, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 1.608551282811912, + "language_loss": 0.78935665, + "learning_rate": 5.96497999496199e-07, + "loss": 0.8102206, + "num_input_tokens_seen": 270946920, + "step": 12560, + "time_per_iteration": 2.6259067058563232 + }, + { + "auxiliary_loss_clip": 0.01054143, + "auxiliary_loss_mlp": 0.01040505, + "balance_loss_clip": 1.03449297, + "balance_loss_mlp": 1.02758253, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 1.6787559628144801, + "language_loss": 0.70545614, + "learning_rate": 5.96220564921515e-07, + "loss": 0.72640258, + "num_input_tokens_seen": 270965705, + "step": 12561, + "time_per_iteration": 2.5699119567871094 + }, + { + "auxiliary_loss_clip": 0.01077531, + "auxiliary_loss_mlp": 0.00786037, + "balance_loss_clip": 1.03228879, + "balance_loss_mlp": 1.00926745, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.5519131804653503, + "language_loss": 0.75704336, + "learning_rate": 5.959431835782889e-07, + "loss": 0.77567905, + "num_input_tokens_seen": 270986550, + "step": 12562, + "time_per_iteration": 4.006548643112183 + }, + { + "auxiliary_loss_clip": 0.01078907, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.03495061, + "balance_loss_mlp": 1.01657152, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 2.0846556178086386, + "language_loss": 0.75936139, + "learning_rate": 5.956658554770371e-07, + "loss": 0.78044015, + "num_input_tokens_seen": 271006250, + "step": 12563, + "time_per_iteration": 2.531672239303589 + }, + { + "auxiliary_loss_clip": 0.0107612, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.03459466, + "balance_loss_mlp": 1.01937306, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 2.212510371592076, + "language_loss": 0.67313796, + "learning_rate": 5.953885806282768e-07, + "loss": 0.69423985, + "num_input_tokens_seen": 271025575, + "step": 12564, + "time_per_iteration": 2.6571788787841797 + }, + { + "auxiliary_loss_clip": 0.01078216, + "auxiliary_loss_mlp": 0.01035662, + "balance_loss_clip": 1.03646719, + "balance_loss_mlp": 1.02270389, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 1.9247829908605825, + "language_loss": 0.68673909, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70787787, + "num_input_tokens_seen": 271045805, + "step": 12565, + "time_per_iteration": 2.5617005825042725 + }, + { + "auxiliary_loss_clip": 0.01085141, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.03433943, + "balance_loss_mlp": 1.01629639, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 2.5598997788851743, + "language_loss": 0.74914777, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77029127, + "num_input_tokens_seen": 271066065, + "step": 12566, + "time_per_iteration": 2.585289716720581 + }, + { + "auxiliary_loss_clip": 0.01099934, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.03795004, + "balance_loss_mlp": 1.02060783, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 1.9553463244571208, + "language_loss": 0.73576194, + "learning_rate": 5.945570757020789e-07, + "loss": 0.75710535, + "num_input_tokens_seen": 271085870, + "step": 12567, + "time_per_iteration": 2.5140371322631836 + }, + { + "auxiliary_loss_clip": 0.01103674, + "auxiliary_loss_mlp": 0.01028033, + "balance_loss_clip": 1.03526151, + "balance_loss_mlp": 1.01642847, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 1.9650130063807358, + "language_loss": 0.62676853, + "learning_rate": 5.942800139684073e-07, + "loss": 0.64808559, + "num_input_tokens_seen": 271104260, + "step": 12568, + "time_per_iteration": 2.493835210800171 + }, + { + "auxiliary_loss_clip": 0.01019588, + "auxiliary_loss_mlp": 0.01037245, + "balance_loss_clip": 1.03341794, + "balance_loss_mlp": 1.02446628, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 1.8441000260839275, + "language_loss": 0.66555429, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68612266, + "num_input_tokens_seen": 271125745, + "step": 12569, + "time_per_iteration": 2.93245792388916 + }, + { + "auxiliary_loss_clip": 0.01099563, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.03866887, + "balance_loss_mlp": 1.02230644, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.6759615480978152, + "language_loss": 0.67455512, + "learning_rate": 5.93726050426697e-07, + "loss": 0.69591075, + "num_input_tokens_seen": 271147145, + "step": 12570, + "time_per_iteration": 2.66404390335083 + }, + { + "auxiliary_loss_clip": 0.01105573, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.03657269, + "balance_loss_mlp": 1.01815557, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 2.059497901712733, + "language_loss": 0.71991998, + "learning_rate": 5.934491486396647e-07, + "loss": 0.74127972, + "num_input_tokens_seen": 271170865, + "step": 12571, + "time_per_iteration": 2.7414238452911377 + }, + { + "auxiliary_loss_clip": 0.01061355, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.03534317, + "balance_loss_mlp": 1.02158689, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.707328548729647, + "language_loss": 0.73390609, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75487173, + "num_input_tokens_seen": 271191450, + "step": 12572, + "time_per_iteration": 2.656403064727783 + }, + { + "auxiliary_loss_clip": 0.01086869, + "auxiliary_loss_mlp": 0.01029915, + "balance_loss_clip": 1.03828573, + "balance_loss_mlp": 1.0181849, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 2.00441253244529, + "language_loss": 0.76875353, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78992134, + "num_input_tokens_seen": 271207335, + "step": 12573, + "time_per_iteration": 2.507458209991455 + }, + { + "auxiliary_loss_clip": 0.01080076, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.03691816, + "balance_loss_mlp": 1.01781869, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 1.6240277338987166, + "language_loss": 0.69233268, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71343338, + "num_input_tokens_seen": 271226895, + "step": 12574, + "time_per_iteration": 2.5767080783843994 + }, + { + "auxiliary_loss_clip": 0.01066926, + "auxiliary_loss_mlp": 0.01037441, + "balance_loss_clip": 1.03067338, + "balance_loss_mlp": 1.02452469, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 2.1120611812689547, + "language_loss": 0.71469426, + "learning_rate": 5.923420749619974e-07, + "loss": 0.73573792, + "num_input_tokens_seen": 271244375, + "step": 12575, + "time_per_iteration": 2.5312037467956543 + }, + { + "auxiliary_loss_clip": 0.01102521, + "auxiliary_loss_mlp": 0.00783355, + "balance_loss_clip": 1.03454792, + "balance_loss_mlp": 1.00970984, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 2.1288585560103024, + "language_loss": 0.72136497, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74022377, + "num_input_tokens_seen": 271259530, + "step": 12576, + "time_per_iteration": 2.4883334636688232 + }, + { + "auxiliary_loss_clip": 0.01069856, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.03532112, + "balance_loss_mlp": 1.01868391, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 1.7928768681560392, + "language_loss": 0.67228949, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69329476, + "num_input_tokens_seen": 271276835, + "step": 12577, + "time_per_iteration": 2.542865753173828 + }, + { + "auxiliary_loss_clip": 0.01082363, + "auxiliary_loss_mlp": 0.01034448, + "balance_loss_clip": 1.03500628, + "balance_loss_mlp": 1.02246106, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 1.7313226915970445, + "language_loss": 0.78307045, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80423856, + "num_input_tokens_seen": 271296275, + "step": 12578, + "time_per_iteration": 2.518136739730835 + }, + { + "auxiliary_loss_clip": 0.01094503, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.03544104, + "balance_loss_mlp": 1.02135301, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 1.5107610521350894, + "language_loss": 0.75935221, + "learning_rate": 5.912358553407641e-07, + "loss": 0.78063071, + "num_input_tokens_seen": 271315685, + "step": 12579, + "time_per_iteration": 2.527510643005371 + }, + { + "auxiliary_loss_clip": 0.01064318, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.03724074, + "balance_loss_mlp": 1.01888227, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 2.0819873667994235, + "language_loss": 0.62743121, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64839649, + "num_input_tokens_seen": 271336790, + "step": 12580, + "time_per_iteration": 2.731679677963257 + }, + { + "auxiliary_loss_clip": 0.01060936, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.03377247, + "balance_loss_mlp": 1.01682436, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 1.783254124912878, + "language_loss": 0.74953419, + "learning_rate": 5.906830660110691e-07, + "loss": 0.77043247, + "num_input_tokens_seen": 271355470, + "step": 12581, + "time_per_iteration": 2.6006104946136475 + }, + { + "auxiliary_loss_clip": 0.01065173, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.03660464, + "balance_loss_mlp": 1.01802754, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 1.8200394118983614, + "language_loss": 0.62609512, + "learning_rate": 5.904067515031412e-07, + "loss": 0.64705038, + "num_input_tokens_seen": 271375810, + "step": 12582, + "time_per_iteration": 2.6009130477905273 + }, + { + "auxiliary_loss_clip": 0.0103191, + "auxiliary_loss_mlp": 0.01005695, + "balance_loss_clip": 1.00866318, + "balance_loss_mlp": 1.00454509, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9515085202372157, + "language_loss": 0.60717702, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62755311, + "num_input_tokens_seen": 271424775, + "step": 12583, + "time_per_iteration": 2.8540520668029785 + }, + { + "auxiliary_loss_clip": 0.01078866, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.03818893, + "balance_loss_mlp": 1.01901364, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 1.8821615709627735, + "language_loss": 0.79006904, + "learning_rate": 5.898542828535125e-07, + "loss": 0.81117284, + "num_input_tokens_seen": 271440500, + "step": 12584, + "time_per_iteration": 2.5076582431793213 + }, + { + "auxiliary_loss_clip": 0.01075, + "auxiliary_loss_mlp": 0.01041526, + "balance_loss_clip": 1.03339314, + "balance_loss_mlp": 1.02708995, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 1.9247729836367704, + "language_loss": 0.77520859, + "learning_rate": 5.895781287327612e-07, + "loss": 0.79637384, + "num_input_tokens_seen": 271458180, + "step": 12585, + "time_per_iteration": 2.5434367656707764 + }, + { + "auxiliary_loss_clip": 0.01110213, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.0390811, + "balance_loss_mlp": 1.02105761, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.921569916424512, + "language_loss": 0.8295145, + "learning_rate": 5.893020280953493e-07, + "loss": 0.85095489, + "num_input_tokens_seen": 271475730, + "step": 12586, + "time_per_iteration": 2.4690253734588623 + }, + { + "auxiliary_loss_clip": 0.01108811, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.03712201, + "balance_loss_mlp": 1.0193243, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 2.235983633684025, + "language_loss": 0.83533657, + "learning_rate": 5.890259809517459e-07, + "loss": 0.85673457, + "num_input_tokens_seen": 271495030, + "step": 12587, + "time_per_iteration": 2.4896111488342285 + }, + { + "auxiliary_loss_clip": 0.01071148, + "auxiliary_loss_mlp": 0.01027418, + "balance_loss_clip": 1.03472567, + "balance_loss_mlp": 1.01528859, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 1.5910895723602476, + "language_loss": 0.71358538, + "learning_rate": 5.88749987312418e-07, + "loss": 0.7345711, + "num_input_tokens_seen": 271515355, + "step": 12588, + "time_per_iteration": 2.571631908416748 + }, + { + "auxiliary_loss_clip": 0.01108338, + "auxiliary_loss_mlp": 0.00782705, + "balance_loss_clip": 1.0375483, + "balance_loss_mlp": 1.00722373, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 1.9239347573246157, + "language_loss": 0.69060731, + "learning_rate": 5.884740471878327e-07, + "loss": 0.70951772, + "num_input_tokens_seen": 271535090, + "step": 12589, + "time_per_iteration": 3.940742254257202 + }, + { + "auxiliary_loss_clip": 0.0109379, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.03518534, + "balance_loss_mlp": 1.01538062, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 1.6486989084072432, + "language_loss": 0.92053962, + "learning_rate": 5.881981605884522e-07, + "loss": 0.9417522, + "num_input_tokens_seen": 271551075, + "step": 12590, + "time_per_iteration": 2.4786465167999268 + }, + { + "auxiliary_loss_clip": 0.0107731, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.03429914, + "balance_loss_mlp": 1.01674247, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 1.77528228200888, + "language_loss": 0.65549266, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67655677, + "num_input_tokens_seen": 271571035, + "step": 12591, + "time_per_iteration": 2.6804704666137695 + }, + { + "auxiliary_loss_clip": 0.01094027, + "auxiliary_loss_mlp": 0.01022957, + "balance_loss_clip": 1.03689861, + "balance_loss_mlp": 1.01192403, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 2.121121914421505, + "language_loss": 0.73631394, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75748378, + "num_input_tokens_seen": 271592950, + "step": 12592, + "time_per_iteration": 2.5370850563049316 + }, + { + "auxiliary_loss_clip": 0.01095899, + "auxiliary_loss_mlp": 0.01034458, + "balance_loss_clip": 1.03613615, + "balance_loss_mlp": 1.0215652, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.3114374577424996, + "language_loss": 0.71476865, + "learning_rate": 5.873708220461522e-07, + "loss": 0.73607218, + "num_input_tokens_seen": 271608835, + "step": 12593, + "time_per_iteration": 2.4616951942443848 + }, + { + "auxiliary_loss_clip": 0.01105912, + "auxiliary_loss_mlp": 0.01029827, + "balance_loss_clip": 1.03548431, + "balance_loss_mlp": 1.01778698, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 2.584148050491652, + "language_loss": 0.66250694, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68386436, + "num_input_tokens_seen": 271627730, + "step": 12594, + "time_per_iteration": 3.82405948638916 + }, + { + "auxiliary_loss_clip": 0.01075886, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.03693271, + "balance_loss_mlp": 1.01895595, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 1.7144933778907412, + "language_loss": 0.80477899, + "learning_rate": 5.86819530835722e-07, + "loss": 0.82584572, + "num_input_tokens_seen": 271646415, + "step": 12595, + "time_per_iteration": 3.924114227294922 + }, + { + "auxiliary_loss_clip": 0.01072284, + "auxiliary_loss_mlp": 0.01031603, + "balance_loss_clip": 1.03576863, + "balance_loss_mlp": 1.01985455, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 1.9461362085533571, + "language_loss": 0.71524084, + "learning_rate": 5.865439656071993e-07, + "loss": 0.73627973, + "num_input_tokens_seen": 271666240, + "step": 12596, + "time_per_iteration": 2.5494489669799805 + }, + { + "auxiliary_loss_clip": 0.01016757, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.03622484, + "balance_loss_mlp": 1.01790035, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.4871680137227337, + "language_loss": 0.80514908, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82561171, + "num_input_tokens_seen": 271686370, + "step": 12597, + "time_per_iteration": 2.881378650665283 + }, + { + "auxiliary_loss_clip": 0.01078068, + "auxiliary_loss_mlp": 0.01030665, + "balance_loss_clip": 1.03724849, + "balance_loss_mlp": 1.01749206, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 2.2285457079882334, + "language_loss": 0.83117652, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85226387, + "num_input_tokens_seen": 271705050, + "step": 12598, + "time_per_iteration": 2.8816792964935303 + }, + { + "auxiliary_loss_clip": 0.01077399, + "auxiliary_loss_mlp": 0.01024905, + "balance_loss_clip": 1.035725, + "balance_loss_mlp": 1.01395583, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.6962601443753038, + "language_loss": 0.62943125, + "learning_rate": 5.857175915537845e-07, + "loss": 0.65045428, + "num_input_tokens_seen": 271724915, + "step": 12599, + "time_per_iteration": 2.547057867050171 + }, + { + "auxiliary_loss_clip": 0.01088986, + "auxiliary_loss_mlp": 0.00786492, + "balance_loss_clip": 1.03682041, + "balance_loss_mlp": 1.01319206, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.7796289012513493, + "language_loss": 0.63313484, + "learning_rate": 5.854422407815161e-07, + "loss": 0.65188968, + "num_input_tokens_seen": 271742410, + "step": 12600, + "time_per_iteration": 4.071700572967529 + }, + { + "auxiliary_loss_clip": 0.01077064, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.03379154, + "balance_loss_mlp": 1.01876462, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 1.68781142331712, + "language_loss": 0.66405272, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68514222, + "num_input_tokens_seen": 271761425, + "step": 12601, + "time_per_iteration": 2.6218433380126953 + }, + { + "auxiliary_loss_clip": 0.01083296, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.03692317, + "balance_loss_mlp": 1.01666141, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.6599385668428057, + "language_loss": 0.68031013, + "learning_rate": 5.848917001679335e-07, + "loss": 0.70142096, + "num_input_tokens_seen": 271780875, + "step": 12602, + "time_per_iteration": 2.5062079429626465 + }, + { + "auxiliary_loss_clip": 0.01097119, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.03715038, + "balance_loss_mlp": 1.01835775, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 1.8516900415586521, + "language_loss": 0.67109895, + "learning_rate": 5.846165103474967e-07, + "loss": 0.69238478, + "num_input_tokens_seen": 271799490, + "step": 12603, + "time_per_iteration": 2.4768238067626953 + }, + { + "auxiliary_loss_clip": 0.01080362, + "auxiliary_loss_mlp": 0.01028668, + "balance_loss_clip": 1.03312159, + "balance_loss_mlp": 1.01745057, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 1.8641923021586533, + "language_loss": 0.61696744, + "learning_rate": 5.843413741985439e-07, + "loss": 0.63805771, + "num_input_tokens_seen": 271817040, + "step": 12604, + "time_per_iteration": 2.4764058589935303 + }, + { + "auxiliary_loss_clip": 0.01106686, + "auxiliary_loss_mlp": 0.0103426, + "balance_loss_clip": 1.03893471, + "balance_loss_mlp": 1.02168298, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 1.7453052938972013, + "language_loss": 0.79959059, + "learning_rate": 5.840662917315076e-07, + "loss": 0.82100004, + "num_input_tokens_seen": 271835480, + "step": 12605, + "time_per_iteration": 2.4607107639312744 + }, + { + "auxiliary_loss_clip": 0.01108513, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.03658283, + "balance_loss_mlp": 1.01736474, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 2.6829601673299703, + "language_loss": 0.79759097, + "learning_rate": 5.837912629568198e-07, + "loss": 0.81897342, + "num_input_tokens_seen": 271849835, + "step": 12606, + "time_per_iteration": 2.4144935607910156 + }, + { + "auxiliary_loss_clip": 0.01088517, + "auxiliary_loss_mlp": 0.01026193, + "balance_loss_clip": 1.03510714, + "balance_loss_mlp": 1.01567864, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.530363428783069, + "language_loss": 0.72812986, + "learning_rate": 5.835162878849087e-07, + "loss": 0.74927694, + "num_input_tokens_seen": 271869560, + "step": 12607, + "time_per_iteration": 2.5196456909179688 + }, + { + "auxiliary_loss_clip": 0.01078999, + "auxiliary_loss_mlp": 0.01028471, + "balance_loss_clip": 1.03577757, + "balance_loss_mlp": 1.01623976, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 2.011897558219424, + "language_loss": 0.74909025, + "learning_rate": 5.83241366526202e-07, + "loss": 0.77016497, + "num_input_tokens_seen": 271887950, + "step": 12608, + "time_per_iteration": 2.490443706512451 + }, + { + "auxiliary_loss_clip": 0.01070775, + "auxiliary_loss_mlp": 0.00783847, + "balance_loss_clip": 1.0359292, + "balance_loss_mlp": 1.01072693, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.5202469383561665, + "language_loss": 0.71251744, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73106366, + "num_input_tokens_seen": 271907700, + "step": 12609, + "time_per_iteration": 2.6089119911193848 + }, + { + "auxiliary_loss_clip": 0.01106803, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.0359683, + "balance_loss_mlp": 1.01733649, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 1.693848724515262, + "language_loss": 0.81674874, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83812261, + "num_input_tokens_seen": 271926840, + "step": 12610, + "time_per_iteration": 2.457559823989868 + }, + { + "auxiliary_loss_clip": 0.01089723, + "auxiliary_loss_mlp": 0.01029443, + "balance_loss_clip": 1.0369916, + "balance_loss_mlp": 1.01683116, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 1.7556565398519286, + "language_loss": 0.70132506, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72251678, + "num_input_tokens_seen": 271946465, + "step": 12611, + "time_per_iteration": 2.544071674346924 + }, + { + "auxiliary_loss_clip": 0.0110591, + "auxiliary_loss_mlp": 0.01028494, + "balance_loss_clip": 1.03743434, + "balance_loss_mlp": 1.01623964, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 1.4905088915954563, + "language_loss": 0.71204299, + "learning_rate": 5.821422184318893e-07, + "loss": 0.73338699, + "num_input_tokens_seen": 271967295, + "step": 12612, + "time_per_iteration": 2.4544012546539307 + }, + { + "auxiliary_loss_clip": 0.0104695, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.0346837, + "balance_loss_mlp": 1.0268358, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 2.7726650874562218, + "language_loss": 0.59545356, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61631107, + "num_input_tokens_seen": 271987960, + "step": 12613, + "time_per_iteration": 2.660717487335205 + }, + { + "auxiliary_loss_clip": 0.01080379, + "auxiliary_loss_mlp": 0.01039381, + "balance_loss_clip": 1.03421724, + "balance_loss_mlp": 1.02601147, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 1.5328929722229196, + "language_loss": 0.59859818, + "learning_rate": 5.815929669349135e-07, + "loss": 0.6197958, + "num_input_tokens_seen": 272011780, + "step": 12614, + "time_per_iteration": 2.6304147243499756 + }, + { + "auxiliary_loss_clip": 0.01070829, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.03317046, + "balance_loss_mlp": 1.01450503, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 1.854123343796536, + "language_loss": 0.73284352, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75382483, + "num_input_tokens_seen": 272030825, + "step": 12615, + "time_per_iteration": 2.555500030517578 + }, + { + "auxiliary_loss_clip": 0.0101538, + "auxiliary_loss_mlp": 0.01000042, + "balance_loss_clip": 1.01524079, + "balance_loss_mlp": 0.99887389, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.8063946325322698, + "language_loss": 0.6776129, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69776702, + "num_input_tokens_seen": 272095825, + "step": 12616, + "time_per_iteration": 3.1808230876922607 + }, + { + "auxiliary_loss_clip": 0.01074695, + "auxiliary_loss_mlp": 0.01034551, + "balance_loss_clip": 1.03608155, + "balance_loss_mlp": 1.0217005, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.8243241536540327, + "language_loss": 0.84895813, + "learning_rate": 5.807694931114979e-07, + "loss": 0.87005067, + "num_input_tokens_seen": 272113950, + "step": 12617, + "time_per_iteration": 2.549408435821533 + }, + { + "auxiliary_loss_clip": 0.01071265, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.03545165, + "balance_loss_mlp": 1.0194689, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 4.08440386960874, + "language_loss": 0.73995215, + "learning_rate": 5.804951094578757e-07, + "loss": 0.76097488, + "num_input_tokens_seen": 272130315, + "step": 12618, + "time_per_iteration": 2.514948606491089 + }, + { + "auxiliary_loss_clip": 0.01085187, + "auxiliary_loss_mlp": 0.01031913, + "balance_loss_clip": 1.03813004, + "balance_loss_mlp": 1.01942623, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 2.0077368300169973, + "language_loss": 0.77479726, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79596823, + "num_input_tokens_seen": 272149080, + "step": 12619, + "time_per_iteration": 2.540688991546631 + }, + { + "auxiliary_loss_clip": 0.01064775, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.03250968, + "balance_loss_mlp": 1.01937318, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 1.8083885897955416, + "language_loss": 0.82405144, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84501982, + "num_input_tokens_seen": 272168285, + "step": 12620, + "time_per_iteration": 2.6320033073425293 + }, + { + "auxiliary_loss_clip": 0.01081645, + "auxiliary_loss_mlp": 0.01035369, + "balance_loss_clip": 1.03611541, + "balance_loss_mlp": 1.02191067, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 2.2649493735939425, + "language_loss": 0.81884193, + "learning_rate": 5.796722815052242e-07, + "loss": 0.84001207, + "num_input_tokens_seen": 272184585, + "step": 12621, + "time_per_iteration": 2.5399515628814697 + }, + { + "auxiliary_loss_clip": 0.01084787, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.03654623, + "balance_loss_mlp": 1.0217998, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 1.903456257653464, + "language_loss": 0.7345432, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75573707, + "num_input_tokens_seen": 272200205, + "step": 12622, + "time_per_iteration": 2.5053815841674805 + }, + { + "auxiliary_loss_clip": 0.01020989, + "auxiliary_loss_mlp": 0.01004529, + "balance_loss_clip": 1.00829422, + "balance_loss_mlp": 1.00332499, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.8126635712276263, + "language_loss": 0.60850495, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62876016, + "num_input_tokens_seen": 272259670, + "step": 12623, + "time_per_iteration": 3.1831858158111572 + }, + { + "auxiliary_loss_clip": 0.0110369, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.03729129, + "balance_loss_mlp": 1.02448678, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 1.9586233528729622, + "language_loss": 0.67420292, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69560009, + "num_input_tokens_seen": 272277925, + "step": 12624, + "time_per_iteration": 2.4545326232910156 + }, + { + "auxiliary_loss_clip": 0.01102841, + "auxiliary_loss_mlp": 0.01024177, + "balance_loss_clip": 1.03599167, + "balance_loss_mlp": 1.01179171, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 1.7488200894997303, + "language_loss": 0.7617681, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78303826, + "num_input_tokens_seen": 272296010, + "step": 12625, + "time_per_iteration": 2.4783124923706055 + }, + { + "auxiliary_loss_clip": 0.01078995, + "auxiliary_loss_mlp": 0.01044152, + "balance_loss_clip": 1.03497446, + "balance_loss_mlp": 1.03069925, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 1.775847563571098, + "language_loss": 0.63042182, + "learning_rate": 5.783019789020977e-07, + "loss": 0.65165329, + "num_input_tokens_seen": 272318330, + "step": 12626, + "time_per_iteration": 2.589083433151245 + }, + { + "auxiliary_loss_clip": 0.01069752, + "auxiliary_loss_mlp": 0.00786767, + "balance_loss_clip": 1.03814793, + "balance_loss_mlp": 1.01197481, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 1.9359127257216533, + "language_loss": 0.73818183, + "learning_rate": 5.780280800727084e-07, + "loss": 0.75674701, + "num_input_tokens_seen": 272335265, + "step": 12627, + "time_per_iteration": 2.6098482608795166 + }, + { + "auxiliary_loss_clip": 0.01096321, + "auxiliary_loss_mlp": 0.01029, + "balance_loss_clip": 1.03741527, + "balance_loss_mlp": 1.01680481, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 2.6868463709306245, + "language_loss": 0.68989974, + "learning_rate": 5.777542351646356e-07, + "loss": 0.71115297, + "num_input_tokens_seen": 272354795, + "step": 12628, + "time_per_iteration": 4.16873836517334 + }, + { + "auxiliary_loss_clip": 0.01098605, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.03926229, + "balance_loss_mlp": 1.02113783, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 2.019192101201329, + "language_loss": 0.62568164, + "learning_rate": 5.774804441882648e-07, + "loss": 0.64701188, + "num_input_tokens_seen": 272372875, + "step": 12629, + "time_per_iteration": 2.544950008392334 + }, + { + "auxiliary_loss_clip": 0.01080122, + "auxiliary_loss_mlp": 0.01030015, + "balance_loss_clip": 1.03461123, + "balance_loss_mlp": 1.01789165, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.4350798949220849, + "language_loss": 0.77448779, + "learning_rate": 5.772067071539786e-07, + "loss": 0.79558921, + "num_input_tokens_seen": 272394715, + "step": 12630, + "time_per_iteration": 2.568291664123535 + }, + { + "auxiliary_loss_clip": 0.01031524, + "auxiliary_loss_mlp": 0.01002081, + "balance_loss_clip": 1.0085001, + "balance_loss_mlp": 1.00096655, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8106193984301918, + "language_loss": 0.61534786, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63568389, + "num_input_tokens_seen": 272458775, + "step": 12631, + "time_per_iteration": 3.167537212371826 + }, + { + "auxiliary_loss_clip": 0.0107902, + "auxiliary_loss_mlp": 0.00788442, + "balance_loss_clip": 1.03745031, + "balance_loss_mlp": 1.01498079, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 1.8997286267485045, + "language_loss": 0.739474, + "learning_rate": 5.766593949531767e-07, + "loss": 0.75814867, + "num_input_tokens_seen": 272479355, + "step": 12632, + "time_per_iteration": 3.9765303134918213 + }, + { + "auxiliary_loss_clip": 0.01085124, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.03719687, + "balance_loss_mlp": 1.0172503, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 1.9101910984548796, + "language_loss": 0.7500608, + "learning_rate": 5.763858198074154e-07, + "loss": 0.77120215, + "num_input_tokens_seen": 272493555, + "step": 12633, + "time_per_iteration": 2.52139949798584 + }, + { + "auxiliary_loss_clip": 0.01081262, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.036165, + "balance_loss_mlp": 1.01657104, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 1.9569991223371972, + "language_loss": 0.73648143, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75757074, + "num_input_tokens_seen": 272508925, + "step": 12634, + "time_per_iteration": 3.8411355018615723 + }, + { + "auxiliary_loss_clip": 0.01107481, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.03772259, + "balance_loss_mlp": 1.02233672, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 1.6656391678588975, + "language_loss": 0.64730805, + "learning_rate": 5.758388314770408e-07, + "loss": 0.66872823, + "num_input_tokens_seen": 272528805, + "step": 12635, + "time_per_iteration": 2.5021257400512695 + }, + { + "auxiliary_loss_clip": 0.01049032, + "auxiliary_loss_mlp": 0.01039762, + "balance_loss_clip": 1.03365219, + "balance_loss_mlp": 1.02453923, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 1.68526363091464, + "language_loss": 0.68572658, + "learning_rate": 5.7556541831317e-07, + "loss": 0.70661455, + "num_input_tokens_seen": 272546655, + "step": 12636, + "time_per_iteration": 2.5675365924835205 + }, + { + "auxiliary_loss_clip": 0.01089422, + "auxiliary_loss_mlp": 0.01036823, + "balance_loss_clip": 1.03919232, + "balance_loss_mlp": 1.02490187, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 1.7701478691444847, + "language_loss": 0.80990887, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83117127, + "num_input_tokens_seen": 272564010, + "step": 12637, + "time_per_iteration": 2.5088589191436768 + }, + { + "auxiliary_loss_clip": 0.01093926, + "auxiliary_loss_mlp": 0.01035723, + "balance_loss_clip": 1.03572106, + "balance_loss_mlp": 1.02388, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 2.0227002142260897, + "language_loss": 0.66355217, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68484867, + "num_input_tokens_seen": 272585840, + "step": 12638, + "time_per_iteration": 2.6175339221954346 + }, + { + "auxiliary_loss_clip": 0.01106802, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.03679276, + "balance_loss_mlp": 1.02386117, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 2.3459312606424727, + "language_loss": 0.65335482, + "learning_rate": 5.747455029512323e-07, + "loss": 0.67479998, + "num_input_tokens_seen": 272602300, + "step": 12639, + "time_per_iteration": 3.839275598526001 + }, + { + "auxiliary_loss_clip": 0.01093411, + "auxiliary_loss_mlp": 0.01029015, + "balance_loss_clip": 1.03469777, + "balance_loss_mlp": 1.01667118, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 1.917374208932494, + "language_loss": 0.69718921, + "learning_rate": 5.744723059083572e-07, + "loss": 0.71841347, + "num_input_tokens_seen": 272619595, + "step": 12640, + "time_per_iteration": 2.465106725692749 + }, + { + "auxiliary_loss_clip": 0.01085077, + "auxiliary_loss_mlp": 0.01031132, + "balance_loss_clip": 1.03728056, + "balance_loss_mlp": 1.0179714, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 1.7460526424857297, + "language_loss": 0.67235816, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69352019, + "num_input_tokens_seen": 272638825, + "step": 12641, + "time_per_iteration": 2.532058000564575 + }, + { + "auxiliary_loss_clip": 0.01093043, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.03489494, + "balance_loss_mlp": 1.0189451, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 2.2235172760308153, + "language_loss": 0.66631711, + "learning_rate": 5.73926074001422e-07, + "loss": 0.68756628, + "num_input_tokens_seen": 272657240, + "step": 12642, + "time_per_iteration": 2.4637317657470703 + }, + { + "auxiliary_loss_clip": 0.01086106, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.03823185, + "balance_loss_mlp": 1.01816189, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 1.793887868643889, + "language_loss": 0.75215214, + "learning_rate": 5.736530391580765e-07, + "loss": 0.77331752, + "num_input_tokens_seen": 272677520, + "step": 12643, + "time_per_iteration": 2.5543930530548096 + }, + { + "auxiliary_loss_clip": 0.01070055, + "auxiliary_loss_mlp": 0.01036117, + "balance_loss_clip": 1.03625584, + "balance_loss_mlp": 1.02273548, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 1.9296953518833748, + "language_loss": 0.78826934, + "learning_rate": 5.733800584019508e-07, + "loss": 0.80933106, + "num_input_tokens_seen": 272696770, + "step": 12644, + "time_per_iteration": 2.5304133892059326 + }, + { + "auxiliary_loss_clip": 0.01079793, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.03393292, + "balance_loss_mlp": 1.01877713, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.626303169780059, + "language_loss": 0.80164802, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82275784, + "num_input_tokens_seen": 272718340, + "step": 12645, + "time_per_iteration": 2.5612692832946777 + }, + { + "auxiliary_loss_clip": 0.01090466, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.03879917, + "balance_loss_mlp": 1.0190208, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.7073777372649244, + "language_loss": 0.73047274, + "learning_rate": 5.728342591927611e-07, + "loss": 0.75169307, + "num_input_tokens_seen": 272739575, + "step": 12646, + "time_per_iteration": 2.5349104404449463 + }, + { + "auxiliary_loss_clip": 0.01093884, + "auxiliary_loss_mlp": 0.01035402, + "balance_loss_clip": 1.03645074, + "balance_loss_mlp": 1.02363062, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 1.9078607738330249, + "language_loss": 0.67156005, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69285285, + "num_input_tokens_seen": 272758710, + "step": 12647, + "time_per_iteration": 2.5044877529144287 + }, + { + "auxiliary_loss_clip": 0.01026456, + "auxiliary_loss_mlp": 0.01019864, + "balance_loss_clip": 1.01330328, + "balance_loss_mlp": 1.01837397, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6895456355460703, + "language_loss": 0.49006724, + "learning_rate": 5.722886764566415e-07, + "loss": 0.51053047, + "num_input_tokens_seen": 272814855, + "step": 12648, + "time_per_iteration": 3.07966947555542 + }, + { + "auxiliary_loss_clip": 0.01091437, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.03512287, + "balance_loss_mlp": 1.0211066, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 2.337665925509676, + "language_loss": 0.76490283, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78614581, + "num_input_tokens_seen": 272834400, + "step": 12649, + "time_per_iteration": 2.583862781524658 + }, + { + "auxiliary_loss_clip": 0.0106258, + "auxiliary_loss_mlp": 0.01031246, + "balance_loss_clip": 1.03412223, + "balance_loss_mlp": 1.01951027, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 1.5043419811876233, + "language_loss": 0.6879248, + "learning_rate": 5.717433102763462e-07, + "loss": 0.70886302, + "num_input_tokens_seen": 272854760, + "step": 12650, + "time_per_iteration": 2.572755813598633 + }, + { + "auxiliary_loss_clip": 0.01021187, + "auxiliary_loss_mlp": 0.01003987, + "balance_loss_clip": 1.00846362, + "balance_loss_mlp": 1.00284886, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.7544929821757882, + "language_loss": 0.62698829, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64724004, + "num_input_tokens_seen": 272919030, + "step": 12651, + "time_per_iteration": 3.109058141708374 + }, + { + "auxiliary_loss_clip": 0.01071034, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.03505373, + "balance_loss_mlp": 1.02149725, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.412892542170891, + "language_loss": 0.71393794, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73498034, + "num_input_tokens_seen": 272938925, + "step": 12652, + "time_per_iteration": 2.6114797592163086 + }, + { + "auxiliary_loss_clip": 0.01054204, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.03515172, + "balance_loss_mlp": 1.02395272, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 1.790837276620901, + "language_loss": 0.79994977, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82085598, + "num_input_tokens_seen": 272954945, + "step": 12653, + "time_per_iteration": 2.596724510192871 + }, + { + "auxiliary_loss_clip": 0.01109941, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.03820384, + "balance_loss_mlp": 1.01793051, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 1.522474760179085, + "language_loss": 0.80318362, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82458442, + "num_input_tokens_seen": 272972855, + "step": 12654, + "time_per_iteration": 2.4586777687072754 + }, + { + "auxiliary_loss_clip": 0.01069662, + "auxiliary_loss_mlp": 0.010364, + "balance_loss_clip": 1.0340836, + "balance_loss_mlp": 1.02294719, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 1.9488485380738214, + "language_loss": 0.79734337, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81840396, + "num_input_tokens_seen": 272989895, + "step": 12655, + "time_per_iteration": 2.5570497512817383 + }, + { + "auxiliary_loss_clip": 0.01086731, + "auxiliary_loss_mlp": 0.01024837, + "balance_loss_clip": 1.03572011, + "balance_loss_mlp": 1.01482368, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.829672254998866, + "language_loss": 0.68527585, + "learning_rate": 5.701085118974505e-07, + "loss": 0.70639145, + "num_input_tokens_seen": 273011695, + "step": 12656, + "time_per_iteration": 2.5388708114624023 + }, + { + "auxiliary_loss_clip": 0.01095574, + "auxiliary_loss_mlp": 0.01029683, + "balance_loss_clip": 1.03331339, + "balance_loss_mlp": 1.01667702, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 3.7074335310627107, + "language_loss": 0.73577452, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75702703, + "num_input_tokens_seen": 273028815, + "step": 12657, + "time_per_iteration": 2.4616217613220215 + }, + { + "auxiliary_loss_clip": 0.0101472, + "auxiliary_loss_mlp": 0.01002872, + "balance_loss_clip": 1.00986052, + "balance_loss_mlp": 1.00167942, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8591108360913255, + "language_loss": 0.64944541, + "learning_rate": 5.695640127673347e-07, + "loss": 0.66962135, + "num_input_tokens_seen": 273084080, + "step": 12658, + "time_per_iteration": 3.0704715251922607 + }, + { + "auxiliary_loss_clip": 0.01087869, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.0366621, + "balance_loss_mlp": 1.02090764, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 1.6236491480527129, + "language_loss": 0.7924844, + "learning_rate": 5.692918445605293e-07, + "loss": 0.81369716, + "num_input_tokens_seen": 273102295, + "step": 12659, + "time_per_iteration": 2.4778554439544678 + }, + { + "auxiliary_loss_clip": 0.01091775, + "auxiliary_loss_mlp": 0.01027012, + "balance_loss_clip": 1.0342679, + "balance_loss_mlp": 1.01504922, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.526035034454464, + "language_loss": 0.68629593, + "learning_rate": 5.690197306063209e-07, + "loss": 0.70748377, + "num_input_tokens_seen": 273123400, + "step": 12660, + "time_per_iteration": 2.54925799369812 + }, + { + "auxiliary_loss_clip": 0.0110517, + "auxiliary_loss_mlp": 0.01029143, + "balance_loss_clip": 1.03561664, + "balance_loss_mlp": 1.01741862, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 1.5201468402166578, + "language_loss": 0.70361006, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72495317, + "num_input_tokens_seen": 273145150, + "step": 12661, + "time_per_iteration": 2.4921281337738037 + }, + { + "auxiliary_loss_clip": 0.01092043, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.03391039, + "balance_loss_mlp": 1.01832438, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.44674272403428, + "language_loss": 0.83258855, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85380644, + "num_input_tokens_seen": 273165180, + "step": 12662, + "time_per_iteration": 2.554934024810791 + }, + { + "auxiliary_loss_clip": 0.01081741, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.03474498, + "balance_loss_mlp": 1.03102422, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.7361906176983537, + "language_loss": 0.68539822, + "learning_rate": 5.682037143624505e-07, + "loss": 0.70664549, + "num_input_tokens_seen": 273184005, + "step": 12663, + "time_per_iteration": 2.508544683456421 + }, + { + "auxiliary_loss_clip": 0.01090806, + "auxiliary_loss_mlp": 0.01024805, + "balance_loss_clip": 1.03492165, + "balance_loss_mlp": 1.01323569, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 1.8770742666827729, + "language_loss": 0.70142728, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72258341, + "num_input_tokens_seen": 273203565, + "step": 12664, + "time_per_iteration": 2.5068535804748535 + }, + { + "auxiliary_loss_clip": 0.01098827, + "auxiliary_loss_mlp": 0.01036585, + "balance_loss_clip": 1.03818107, + "balance_loss_mlp": 1.02375841, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 1.629522900379031, + "language_loss": 0.79246885, + "learning_rate": 5.676599749853066e-07, + "loss": 0.81382298, + "num_input_tokens_seen": 273221645, + "step": 12665, + "time_per_iteration": 2.4873647689819336 + }, + { + "auxiliary_loss_clip": 0.01104301, + "auxiliary_loss_mlp": 0.0078218, + "balance_loss_clip": 1.03807521, + "balance_loss_mlp": 1.00856769, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 1.6084873614930912, + "language_loss": 0.87998295, + "learning_rate": 5.673881867632959e-07, + "loss": 0.89884776, + "num_input_tokens_seen": 273242040, + "step": 12666, + "time_per_iteration": 2.546276569366455 + }, + { + "auxiliary_loss_clip": 0.01049757, + "auxiliary_loss_mlp": 0.01032328, + "balance_loss_clip": 1.03375638, + "balance_loss_mlp": 1.0193758, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 2.4053441903649766, + "language_loss": 0.83154428, + "learning_rate": 5.671164528660693e-07, + "loss": 0.8523652, + "num_input_tokens_seen": 273257365, + "step": 12667, + "time_per_iteration": 3.943321704864502 + }, + { + "auxiliary_loss_clip": 0.01081843, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.03559148, + "balance_loss_mlp": 1.02337861, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.5741337388476715, + "language_loss": 0.78520072, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80636609, + "num_input_tokens_seen": 273274710, + "step": 12668, + "time_per_iteration": 2.5610439777374268 + }, + { + "auxiliary_loss_clip": 0.01069229, + "auxiliary_loss_mlp": 0.01027278, + "balance_loss_clip": 1.03296232, + "balance_loss_mlp": 1.01555419, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 1.9133164203322024, + "language_loss": 0.64428639, + "learning_rate": 5.6657314808718e-07, + "loss": 0.66525149, + "num_input_tokens_seen": 273292870, + "step": 12669, + "time_per_iteration": 2.5123379230499268 + }, + { + "auxiliary_loss_clip": 0.01079073, + "auxiliary_loss_mlp": 0.01038598, + "balance_loss_clip": 1.03297484, + "balance_loss_mlp": 1.02496028, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 1.8544854934285684, + "language_loss": 0.66342235, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68459904, + "num_input_tokens_seen": 273312375, + "step": 12670, + "time_per_iteration": 2.587808847427368 + }, + { + "auxiliary_loss_clip": 0.0109621, + "auxiliary_loss_mlp": 0.01034164, + "balance_loss_clip": 1.03640008, + "balance_loss_mlp": 1.02230251, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.7300877599952822, + "language_loss": 0.73121452, + "learning_rate": 5.660300607310493e-07, + "loss": 0.7525183, + "num_input_tokens_seen": 273332590, + "step": 12671, + "time_per_iteration": 3.940094232559204 + }, + { + "auxiliary_loss_clip": 0.01067127, + "auxiliary_loss_mlp": 0.01035555, + "balance_loss_clip": 1.0324285, + "balance_loss_mlp": 1.02276397, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 1.674856908289275, + "language_loss": 0.73448813, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75551498, + "num_input_tokens_seen": 273352885, + "step": 12672, + "time_per_iteration": 3.9829635620117188 + }, + { + "auxiliary_loss_clip": 0.01009578, + "auxiliary_loss_mlp": 0.01001001, + "balance_loss_clip": 1.00864458, + "balance_loss_mlp": 0.99988633, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7549985006659468, + "language_loss": 0.56632227, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58642805, + "num_input_tokens_seen": 273411730, + "step": 12673, + "time_per_iteration": 3.0915274620056152 + }, + { + "auxiliary_loss_clip": 0.0109338, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.03588951, + "balance_loss_mlp": 1.01846075, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 1.8674317137531766, + "language_loss": 0.74939716, + "learning_rate": 5.652158375447102e-07, + "loss": 0.77064264, + "num_input_tokens_seen": 273430020, + "step": 12674, + "time_per_iteration": 2.5041821002960205 + }, + { + "auxiliary_loss_clip": 0.01074143, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.03267705, + "balance_loss_mlp": 1.02294219, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 1.8697032906046793, + "language_loss": 0.7222293, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74333382, + "num_input_tokens_seen": 273448690, + "step": 12675, + "time_per_iteration": 2.548712968826294 + }, + { + "auxiliary_loss_clip": 0.01091885, + "auxiliary_loss_mlp": 0.01028021, + "balance_loss_clip": 1.03577948, + "balance_loss_mlp": 1.01654136, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 2.263562855700515, + "language_loss": 0.73041373, + "learning_rate": 5.646732941057936e-07, + "loss": 0.75161278, + "num_input_tokens_seen": 273465190, + "step": 12676, + "time_per_iteration": 2.4798154830932617 + }, + { + "auxiliary_loss_clip": 0.01075955, + "auxiliary_loss_mlp": 0.00786274, + "balance_loss_clip": 1.03884649, + "balance_loss_mlp": 1.01154947, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 2.666538220917961, + "language_loss": 0.53498459, + "learning_rate": 5.644021040227927e-07, + "loss": 0.55360693, + "num_input_tokens_seen": 273478620, + "step": 12677, + "time_per_iteration": 2.508429765701294 + }, + { + "auxiliary_loss_clip": 0.01056399, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.03478682, + "balance_loss_mlp": 1.01670516, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 1.8915611992771084, + "language_loss": 0.79006183, + "learning_rate": 5.641309683778064e-07, + "loss": 0.81092227, + "num_input_tokens_seen": 273497635, + "step": 12678, + "time_per_iteration": 3.9600744247436523 + }, + { + "auxiliary_loss_clip": 0.01069441, + "auxiliary_loss_mlp": 0.0103636, + "balance_loss_clip": 1.0335927, + "balance_loss_mlp": 1.02260947, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 2.1229357382465626, + "language_loss": 0.77471614, + "learning_rate": 5.638598871811175e-07, + "loss": 0.7957741, + "num_input_tokens_seen": 273513955, + "step": 12679, + "time_per_iteration": 2.5113048553466797 + }, + { + "auxiliary_loss_clip": 0.01092724, + "auxiliary_loss_mlp": 0.01023954, + "balance_loss_clip": 1.03499031, + "balance_loss_mlp": 1.01168156, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.589777156068415, + "language_loss": 0.80097955, + "learning_rate": 5.635888604430059e-07, + "loss": 0.8221463, + "num_input_tokens_seen": 273533970, + "step": 12680, + "time_per_iteration": 2.546685218811035 + }, + { + "auxiliary_loss_clip": 0.01084993, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.0373652, + "balance_loss_mlp": 1.01765656, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 1.7026043582740156, + "language_loss": 0.62799972, + "learning_rate": 5.633178881737493e-07, + "loss": 0.64916307, + "num_input_tokens_seen": 273553090, + "step": 12681, + "time_per_iteration": 2.5129544734954834 + }, + { + "auxiliary_loss_clip": 0.01068778, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.03604901, + "balance_loss_mlp": 1.017465, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 1.9816637790571414, + "language_loss": 0.75933564, + "learning_rate": 5.63046970383622e-07, + "loss": 0.78031385, + "num_input_tokens_seen": 273572460, + "step": 12682, + "time_per_iteration": 2.578129768371582 + }, + { + "auxiliary_loss_clip": 0.01080462, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.03408694, + "balance_loss_mlp": 1.0186677, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.427002460060655, + "language_loss": 0.67946064, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70056152, + "num_input_tokens_seen": 273592815, + "step": 12683, + "time_per_iteration": 2.5489578247070312 + }, + { + "auxiliary_loss_clip": 0.01065731, + "auxiliary_loss_mlp": 0.0078609, + "balance_loss_clip": 1.03382349, + "balance_loss_mlp": 1.01001489, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 2.4977853433011443, + "language_loss": 0.83452785, + "learning_rate": 5.625052982818472e-07, + "loss": 0.85304606, + "num_input_tokens_seen": 273611790, + "step": 12684, + "time_per_iteration": 2.586915969848633 + }, + { + "auxiliary_loss_clip": 0.01084125, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.03573012, + "balance_loss_mlp": 1.01718116, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 3.943461453176747, + "language_loss": 0.82525241, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84640121, + "num_input_tokens_seen": 273628340, + "step": 12685, + "time_per_iteration": 2.4941577911376953 + }, + { + "auxiliary_loss_clip": 0.01075552, + "auxiliary_loss_mlp": 0.0078518, + "balance_loss_clip": 1.03513885, + "balance_loss_mlp": 1.01050735, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 1.9297028170463524, + "language_loss": 0.77376783, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79237521, + "num_input_tokens_seen": 273646585, + "step": 12686, + "time_per_iteration": 2.5836827754974365 + }, + { + "auxiliary_loss_clip": 0.01048715, + "auxiliary_loss_mlp": 0.01045521, + "balance_loss_clip": 1.0328517, + "balance_loss_mlp": 1.03003585, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 1.7209627421678655, + "language_loss": 0.7217195, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74266189, + "num_input_tokens_seen": 273665410, + "step": 12687, + "time_per_iteration": 2.61552095413208 + }, + { + "auxiliary_loss_clip": 0.01081793, + "auxiliary_loss_mlp": 0.01045851, + "balance_loss_clip": 1.03582585, + "balance_loss_mlp": 1.03122413, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 2.0478197250130004, + "language_loss": 0.64738488, + "learning_rate": 5.614226082797369e-07, + "loss": 0.66866136, + "num_input_tokens_seen": 273683035, + "step": 12688, + "time_per_iteration": 2.5080111026763916 + }, + { + "auxiliary_loss_clip": 0.01094198, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.03629017, + "balance_loss_mlp": 1.01645327, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 1.9184383376244654, + "language_loss": 0.70836234, + "learning_rate": 5.611520721310515e-07, + "loss": 0.72958267, + "num_input_tokens_seen": 273700130, + "step": 12689, + "time_per_iteration": 2.482851028442383 + }, + { + "auxiliary_loss_clip": 0.0107458, + "auxiliary_loss_mlp": 0.01037624, + "balance_loss_clip": 1.03469253, + "balance_loss_mlp": 1.02487445, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 1.9524429470624853, + "language_loss": 0.6984669, + "learning_rate": 5.608815905436238e-07, + "loss": 0.71958888, + "num_input_tokens_seen": 273720310, + "step": 12690, + "time_per_iteration": 2.610696315765381 + }, + { + "auxiliary_loss_clip": 0.01079595, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.03440261, + "balance_loss_mlp": 1.02344441, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.4563334060608215, + "language_loss": 0.69403744, + "learning_rate": 5.606111635277109e-07, + "loss": 0.71519655, + "num_input_tokens_seen": 273744475, + "step": 12691, + "time_per_iteration": 2.668941020965576 + }, + { + "auxiliary_loss_clip": 0.01088885, + "auxiliary_loss_mlp": 0.01035721, + "balance_loss_clip": 1.03546226, + "balance_loss_mlp": 1.02449703, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 1.6197845181352042, + "language_loss": 0.81429905, + "learning_rate": 5.603407910935662e-07, + "loss": 0.83554512, + "num_input_tokens_seen": 273764635, + "step": 12692, + "time_per_iteration": 2.495760440826416 + }, + { + "auxiliary_loss_clip": 0.01072051, + "auxiliary_loss_mlp": 0.01028367, + "balance_loss_clip": 1.03776157, + "balance_loss_mlp": 1.01657724, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 2.253722798539994, + "language_loss": 0.76997781, + "learning_rate": 5.600704732514438e-07, + "loss": 0.79098195, + "num_input_tokens_seen": 273780115, + "step": 12693, + "time_per_iteration": 2.5511112213134766 + }, + { + "auxiliary_loss_clip": 0.01066929, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.03592229, + "balance_loss_mlp": 1.01528478, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 2.693808251305855, + "language_loss": 0.72598898, + "learning_rate": 5.598002100115933e-07, + "loss": 0.74693835, + "num_input_tokens_seen": 273796605, + "step": 12694, + "time_per_iteration": 2.511033058166504 + }, + { + "auxiliary_loss_clip": 0.01091232, + "auxiliary_loss_mlp": 0.01025423, + "balance_loss_clip": 1.03426623, + "balance_loss_mlp": 1.0130074, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 1.7411689628442022, + "language_loss": 0.70658147, + "learning_rate": 5.595300013842625e-07, + "loss": 0.72774804, + "num_input_tokens_seen": 273816515, + "step": 12695, + "time_per_iteration": 2.524937629699707 + }, + { + "auxiliary_loss_clip": 0.01104409, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.03597522, + "balance_loss_mlp": 1.01913786, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.4340338975923714, + "language_loss": 0.7227546, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74410689, + "num_input_tokens_seen": 273837060, + "step": 12696, + "time_per_iteration": 2.484015941619873 + }, + { + "auxiliary_loss_clip": 0.01046201, + "auxiliary_loss_mlp": 0.01039197, + "balance_loss_clip": 1.03418827, + "balance_loss_mlp": 1.02575588, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 2.259469705165791, + "language_loss": 0.71179831, + "learning_rate": 5.589897480081453e-07, + "loss": 0.73265231, + "num_input_tokens_seen": 273853365, + "step": 12697, + "time_per_iteration": 2.5924935340881348 + }, + { + "auxiliary_loss_clip": 0.01070493, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.0392586, + "balance_loss_mlp": 1.01512313, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 1.9953388312114715, + "language_loss": 0.66791278, + "learning_rate": 5.587197032798461e-07, + "loss": 0.68888664, + "num_input_tokens_seen": 273870750, + "step": 12698, + "time_per_iteration": 2.6198856830596924 + }, + { + "auxiliary_loss_clip": 0.010923, + "auxiliary_loss_mlp": 0.01026764, + "balance_loss_clip": 1.03349221, + "balance_loss_mlp": 1.01453912, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 1.7365582668133617, + "language_loss": 0.71890408, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74009472, + "num_input_tokens_seen": 273890890, + "step": 12699, + "time_per_iteration": 2.5164833068847656 + }, + { + "auxiliary_loss_clip": 0.01082944, + "auxiliary_loss_mlp": 0.01032957, + "balance_loss_clip": 1.03531766, + "balance_loss_mlp": 1.02138162, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 1.7144279871177495, + "language_loss": 0.73210061, + "learning_rate": 5.581797777939648e-07, + "loss": 0.7532596, + "num_input_tokens_seen": 273914015, + "step": 12700, + "time_per_iteration": 2.6401526927948 + }, + { + "auxiliary_loss_clip": 0.01104428, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.03475833, + "balance_loss_mlp": 1.01954174, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 2.4403303197931048, + "language_loss": 0.69365031, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71500975, + "num_input_tokens_seen": 273927415, + "step": 12701, + "time_per_iteration": 2.4446098804473877 + }, + { + "auxiliary_loss_clip": 0.01077639, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.03890824, + "balance_loss_mlp": 1.01543128, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 1.52687546361391, + "language_loss": 0.64216506, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66321123, + "num_input_tokens_seen": 273946690, + "step": 12702, + "time_per_iteration": 2.5328667163848877 + }, + { + "auxiliary_loss_clip": 0.01073337, + "auxiliary_loss_mlp": 0.01033019, + "balance_loss_clip": 1.03607917, + "balance_loss_mlp": 1.02088976, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 1.9740754643474494, + "language_loss": 0.6529851, + "learning_rate": 5.57370299645477e-07, + "loss": 0.6740486, + "num_input_tokens_seen": 273966870, + "step": 12703, + "time_per_iteration": 2.617490768432617 + }, + { + "auxiliary_loss_clip": 0.01081056, + "auxiliary_loss_mlp": 0.01025523, + "balance_loss_clip": 1.03624392, + "balance_loss_mlp": 1.01369786, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 1.7978709100130605, + "language_loss": 0.83977789, + "learning_rate": 5.571005829916668e-07, + "loss": 0.86084366, + "num_input_tokens_seen": 273986360, + "step": 12704, + "time_per_iteration": 2.5362422466278076 + }, + { + "auxiliary_loss_clip": 0.01083937, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.03655303, + "balance_loss_mlp": 1.01842034, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.4234525255075872, + "language_loss": 0.67991906, + "learning_rate": 5.568309210527469e-07, + "loss": 0.70106554, + "num_input_tokens_seen": 274009745, + "step": 12705, + "time_per_iteration": 2.635594129562378 + }, + { + "auxiliary_loss_clip": 0.01078872, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.03464711, + "balance_loss_mlp": 1.0170902, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.6579830755573166, + "language_loss": 0.74411571, + "learning_rate": 5.565613138389427e-07, + "loss": 0.76519704, + "num_input_tokens_seen": 274028775, + "step": 12706, + "time_per_iteration": 4.063490867614746 + }, + { + "auxiliary_loss_clip": 0.01089691, + "auxiliary_loss_mlp": 0.01034201, + "balance_loss_clip": 1.03552985, + "balance_loss_mlp": 1.02163589, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 1.874713618698413, + "language_loss": 0.78395748, + "learning_rate": 5.562917613604781e-07, + "loss": 0.8051964, + "num_input_tokens_seen": 274047520, + "step": 12707, + "time_per_iteration": 2.503784656524658 + }, + { + "auxiliary_loss_clip": 0.01077469, + "auxiliary_loss_mlp": 0.01026579, + "balance_loss_clip": 1.03400552, + "balance_loss_mlp": 1.01431274, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 1.676659069403074, + "language_loss": 0.79733527, + "learning_rate": 5.560222636275751e-07, + "loss": 0.81837571, + "num_input_tokens_seen": 274065350, + "step": 12708, + "time_per_iteration": 2.499134063720703 + }, + { + "auxiliary_loss_clip": 0.01019374, + "auxiliary_loss_mlp": 0.01002838, + "balance_loss_clip": 1.00931716, + "balance_loss_mlp": 1.00164604, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8180812717830693, + "language_loss": 0.56488198, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58510411, + "num_input_tokens_seen": 274122315, + "step": 12709, + "time_per_iteration": 4.506904602050781 + }, + { + "auxiliary_loss_clip": 0.0109506, + "auxiliary_loss_mlp": 0.01037192, + "balance_loss_clip": 1.03596592, + "balance_loss_mlp": 1.02366757, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 17.013257312367852, + "language_loss": 0.63530135, + "learning_rate": 5.554834324393271e-07, + "loss": 0.6566239, + "num_input_tokens_seen": 274140555, + "step": 12710, + "time_per_iteration": 3.945692539215088 + }, + { + "auxiliary_loss_clip": 0.01060167, + "auxiliary_loss_mlp": 0.0078507, + "balance_loss_clip": 1.03596961, + "balance_loss_mlp": 1.00793052, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 1.910032240154901, + "language_loss": 0.64281499, + "learning_rate": 5.552140990044154e-07, + "loss": 0.6612674, + "num_input_tokens_seen": 274161125, + "step": 12711, + "time_per_iteration": 2.606048583984375 + }, + { + "auxiliary_loss_clip": 0.01082423, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.03513443, + "balance_loss_mlp": 1.01938951, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.7290597510589432, + "language_loss": 0.73101598, + "learning_rate": 5.549448203559293e-07, + "loss": 0.75214982, + "num_input_tokens_seen": 274180835, + "step": 12712, + "time_per_iteration": 2.5225303173065186 + }, + { + "auxiliary_loss_clip": 0.01070185, + "auxiliary_loss_mlp": 0.01028432, + "balance_loss_clip": 1.0356431, + "balance_loss_mlp": 1.01724386, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 1.4822014207304208, + "language_loss": 0.80438852, + "learning_rate": 5.546755965040804e-07, + "loss": 0.82537472, + "num_input_tokens_seen": 274201190, + "step": 12713, + "time_per_iteration": 2.599271535873413 + }, + { + "auxiliary_loss_clip": 0.010964, + "auxiliary_loss_mlp": 0.00784186, + "balance_loss_clip": 1.03498697, + "balance_loss_mlp": 1.01001644, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 2.208101730158308, + "language_loss": 0.83271867, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85152459, + "num_input_tokens_seen": 274217595, + "step": 12714, + "time_per_iteration": 2.4917168617248535 + }, + { + "auxiliary_loss_clip": 0.01099611, + "auxiliary_loss_mlp": 0.01034801, + "balance_loss_clip": 1.03850365, + "balance_loss_mlp": 1.02212274, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.560705328145743, + "language_loss": 0.72881591, + "learning_rate": 5.541373132311287e-07, + "loss": 0.75015998, + "num_input_tokens_seen": 274237885, + "step": 12715, + "time_per_iteration": 2.5132856369018555 + }, + { + "auxiliary_loss_clip": 0.01065237, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.03436804, + "balance_loss_mlp": 1.01609755, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 1.5405521708423653, + "language_loss": 0.6311084, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65204346, + "num_input_tokens_seen": 274258820, + "step": 12716, + "time_per_iteration": 3.9953503608703613 + }, + { + "auxiliary_loss_clip": 0.01109554, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.03685713, + "balance_loss_mlp": 1.01895404, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 1.4798090882089021, + "language_loss": 0.79953456, + "learning_rate": 5.535992492672068e-07, + "loss": 0.8209523, + "num_input_tokens_seen": 274278835, + "step": 12717, + "time_per_iteration": 2.4645755290985107 + }, + { + "auxiliary_loss_clip": 0.01105539, + "auxiliary_loss_mlp": 0.01033144, + "balance_loss_clip": 1.0371151, + "balance_loss_mlp": 1.02106762, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 2.2484592947020223, + "language_loss": 0.66562712, + "learning_rate": 5.53330299551638e-07, + "loss": 0.68701398, + "num_input_tokens_seen": 274297110, + "step": 12718, + "time_per_iteration": 2.4482691287994385 + }, + { + "auxiliary_loss_clip": 0.01066899, + "auxiliary_loss_mlp": 0.01033502, + "balance_loss_clip": 1.0361588, + "balance_loss_mlp": 1.02221847, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 1.9402739293904379, + "language_loss": 0.7709583, + "learning_rate": 5.530614046939286e-07, + "loss": 0.79196233, + "num_input_tokens_seen": 274315610, + "step": 12719, + "time_per_iteration": 2.542832851409912 + }, + { + "auxiliary_loss_clip": 0.01105208, + "auxiliary_loss_mlp": 0.01027923, + "balance_loss_clip": 1.03555107, + "balance_loss_mlp": 1.01519752, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 1.7032412327501656, + "language_loss": 0.69882929, + "learning_rate": 5.527925647042754e-07, + "loss": 0.7201606, + "num_input_tokens_seen": 274333975, + "step": 12720, + "time_per_iteration": 2.4664902687072754 + }, + { + "auxiliary_loss_clip": 0.01072911, + "auxiliary_loss_mlp": 0.01035403, + "balance_loss_clip": 1.03581071, + "balance_loss_mlp": 1.0236429, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 2.3858028766462955, + "language_loss": 0.73863447, + "learning_rate": 5.52523779592875e-07, + "loss": 0.75971758, + "num_input_tokens_seen": 274353695, + "step": 12721, + "time_per_iteration": 2.5325324535369873 + }, + { + "auxiliary_loss_clip": 0.01066484, + "auxiliary_loss_mlp": 0.01030789, + "balance_loss_clip": 1.03545034, + "balance_loss_mlp": 1.01856399, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 1.9260520949013944, + "language_loss": 0.73537505, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75634778, + "num_input_tokens_seen": 274371120, + "step": 12722, + "time_per_iteration": 2.547100782394409 + }, + { + "auxiliary_loss_clip": 0.0109502, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.03529525, + "balance_loss_mlp": 1.02128899, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 1.7951258048533811, + "language_loss": 0.73822641, + "learning_rate": 5.519863740455912e-07, + "loss": 0.75951064, + "num_input_tokens_seen": 274389665, + "step": 12723, + "time_per_iteration": 2.5223472118377686 + }, + { + "auxiliary_loss_clip": 0.01105881, + "auxiliary_loss_mlp": 0.01029962, + "balance_loss_clip": 1.03490245, + "balance_loss_mlp": 1.01742673, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 1.713732460691786, + "language_loss": 0.73062921, + "learning_rate": 5.517177536300881e-07, + "loss": 0.75198764, + "num_input_tokens_seen": 274408750, + "step": 12724, + "time_per_iteration": 2.4960367679595947 + }, + { + "auxiliary_loss_clip": 0.01091411, + "auxiliary_loss_mlp": 0.01026264, + "balance_loss_clip": 1.03589463, + "balance_loss_mlp": 1.01452219, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 1.8584106642280627, + "language_loss": 0.84055841, + "learning_rate": 5.514491881335935e-07, + "loss": 0.86173517, + "num_input_tokens_seen": 274424600, + "step": 12725, + "time_per_iteration": 2.4505133628845215 + }, + { + "auxiliary_loss_clip": 0.0106452, + "auxiliary_loss_mlp": 0.01033449, + "balance_loss_clip": 1.03541112, + "balance_loss_mlp": 1.0204668, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 1.7805843509886905, + "language_loss": 0.77419645, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79517615, + "num_input_tokens_seen": 274443075, + "step": 12726, + "time_per_iteration": 2.6109695434570312 + }, + { + "auxiliary_loss_clip": 0.01095479, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.03606594, + "balance_loss_mlp": 1.02020597, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 2.5005385961415616, + "language_loss": 0.70800877, + "learning_rate": 5.509122219383615e-07, + "loss": 0.72928798, + "num_input_tokens_seen": 274463240, + "step": 12727, + "time_per_iteration": 2.5367915630340576 + }, + { + "auxiliary_loss_clip": 0.01101292, + "auxiliary_loss_mlp": 0.01026805, + "balance_loss_clip": 1.03494048, + "balance_loss_mlp": 1.01530194, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 1.6851033135092783, + "language_loss": 0.79577768, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81705862, + "num_input_tokens_seen": 274482750, + "step": 12728, + "time_per_iteration": 2.50215220451355 + }, + { + "auxiliary_loss_clip": 0.01107681, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.03760886, + "balance_loss_mlp": 1.01517534, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 1.8695902203833261, + "language_loss": 0.55167186, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57302666, + "num_input_tokens_seen": 274503545, + "step": 12729, + "time_per_iteration": 2.5152390003204346 + }, + { + "auxiliary_loss_clip": 0.01081257, + "auxiliary_loss_mlp": 0.00784637, + "balance_loss_clip": 1.03444195, + "balance_loss_mlp": 1.01027954, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 1.568604240183008, + "language_loss": 0.777421, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79607999, + "num_input_tokens_seen": 274523825, + "step": 12730, + "time_per_iteration": 2.570303201675415 + }, + { + "auxiliary_loss_clip": 0.01100351, + "auxiliary_loss_mlp": 0.01036867, + "balance_loss_clip": 1.04006398, + "balance_loss_mlp": 1.02396214, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 1.6128678807957468, + "language_loss": 0.68976015, + "learning_rate": 5.498389490239495e-07, + "loss": 0.71113229, + "num_input_tokens_seen": 274541625, + "step": 12731, + "time_per_iteration": 2.45709490776062 + }, + { + "auxiliary_loss_clip": 0.01108371, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.03767419, + "balance_loss_mlp": 1.01837552, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 2.400353312672418, + "language_loss": 0.70332956, + "learning_rate": 5.495707682455471e-07, + "loss": 0.7247175, + "num_input_tokens_seen": 274557580, + "step": 12732, + "time_per_iteration": 2.4380133152008057 + }, + { + "auxiliary_loss_clip": 0.01086896, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.0361433, + "balance_loss_mlp": 1.01570606, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 1.5764152119765142, + "language_loss": 0.78390157, + "learning_rate": 5.493026424675653e-07, + "loss": 0.80505252, + "num_input_tokens_seen": 274578135, + "step": 12733, + "time_per_iteration": 2.5690832138061523 + }, + { + "auxiliary_loss_clip": 0.01094452, + "auxiliary_loss_mlp": 0.01031725, + "balance_loss_clip": 1.03615165, + "balance_loss_mlp": 1.01942253, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 1.935481264600818, + "language_loss": 0.7720449, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79330671, + "num_input_tokens_seen": 274595655, + "step": 12734, + "time_per_iteration": 2.5017309188842773 + }, + { + "auxiliary_loss_clip": 0.01082584, + "auxiliary_loss_mlp": 0.01029396, + "balance_loss_clip": 1.03818333, + "balance_loss_mlp": 1.01575291, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1.5631254803635872, + "language_loss": 0.73276001, + "learning_rate": 5.48766555953535e-07, + "loss": 0.75387985, + "num_input_tokens_seen": 274616305, + "step": 12735, + "time_per_iteration": 2.5621604919433594 + }, + { + "auxiliary_loss_clip": 0.01084146, + "auxiliary_loss_mlp": 0.01029769, + "balance_loss_clip": 1.03876913, + "balance_loss_mlp": 1.01751447, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.986157288815839, + "language_loss": 0.72643936, + "learning_rate": 5.484985952378145e-07, + "loss": 0.7475785, + "num_input_tokens_seen": 274638110, + "step": 12736, + "time_per_iteration": 2.5855581760406494 + }, + { + "auxiliary_loss_clip": 0.01097447, + "auxiliary_loss_mlp": 0.00786265, + "balance_loss_clip": 1.03972292, + "balance_loss_mlp": 1.01157999, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 2.320909965491637, + "language_loss": 0.77772641, + "learning_rate": 5.482306895631728e-07, + "loss": 0.79656363, + "num_input_tokens_seen": 274656565, + "step": 12737, + "time_per_iteration": 2.4691450595855713 + }, + { + "auxiliary_loss_clip": 0.01079324, + "auxiliary_loss_mlp": 0.01032629, + "balance_loss_clip": 1.03381824, + "balance_loss_mlp": 1.01985013, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 1.6363883816890863, + "language_loss": 0.76772445, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78884399, + "num_input_tokens_seen": 274674215, + "step": 12738, + "time_per_iteration": 2.4983088970184326 + }, + { + "auxiliary_loss_clip": 0.01085627, + "auxiliary_loss_mlp": 0.01029605, + "balance_loss_clip": 1.03536463, + "balance_loss_mlp": 1.01669431, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 1.860170500672601, + "language_loss": 0.62653166, + "learning_rate": 5.476950433777603e-07, + "loss": 0.64768398, + "num_input_tokens_seen": 274693445, + "step": 12739, + "time_per_iteration": 2.5684781074523926 + }, + { + "auxiliary_loss_clip": 0.0110796, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.03784513, + "balance_loss_mlp": 1.0214045, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 1.8685906667604195, + "language_loss": 0.7918427, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81326532, + "num_input_tokens_seen": 274712815, + "step": 12740, + "time_per_iteration": 2.4931511878967285 + }, + { + "auxiliary_loss_clip": 0.01095315, + "auxiliary_loss_mlp": 0.01034415, + "balance_loss_clip": 1.03446615, + "balance_loss_mlp": 1.02142143, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 1.58802686115676, + "language_loss": 0.65662527, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67792255, + "num_input_tokens_seen": 274732690, + "step": 12741, + "time_per_iteration": 2.5480456352233887 + }, + { + "auxiliary_loss_clip": 0.01080165, + "auxiliary_loss_mlp": 0.01029238, + "balance_loss_clip": 1.03564978, + "balance_loss_mlp": 1.01629806, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.4803089009429498, + "language_loss": 0.76006705, + "learning_rate": 5.468919871616386e-07, + "loss": 0.78116113, + "num_input_tokens_seen": 274752460, + "step": 12742, + "time_per_iteration": 2.5261166095733643 + }, + { + "auxiliary_loss_clip": 0.01080521, + "auxiliary_loss_mlp": 0.01030544, + "balance_loss_clip": 1.03672504, + "balance_loss_mlp": 1.01892686, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.3397860004290465, + "language_loss": 0.76493883, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78604949, + "num_input_tokens_seen": 274773070, + "step": 12743, + "time_per_iteration": 2.525453567504883 + }, + { + "auxiliary_loss_clip": 0.01080223, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.03354609, + "balance_loss_mlp": 1.02110863, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 1.938449710555847, + "language_loss": 0.74933881, + "learning_rate": 5.463568918439805e-07, + "loss": 0.77047288, + "num_input_tokens_seen": 274790220, + "step": 12744, + "time_per_iteration": 3.8911166191101074 + }, + { + "auxiliary_loss_clip": 0.01096402, + "auxiliary_loss_mlp": 0.01030414, + "balance_loss_clip": 1.03625226, + "balance_loss_mlp": 1.01739073, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.2451181301662206, + "language_loss": 0.71222448, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73349261, + "num_input_tokens_seen": 274805095, + "step": 12745, + "time_per_iteration": 2.4627761840820312 + }, + { + "auxiliary_loss_clip": 0.01090412, + "auxiliary_loss_mlp": 0.01036736, + "balance_loss_clip": 1.03370416, + "balance_loss_mlp": 1.0224185, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 2.348610615936546, + "language_loss": 0.76789314, + "learning_rate": 5.458220170154896e-07, + "loss": 0.7891646, + "num_input_tokens_seen": 274821800, + "step": 12746, + "time_per_iteration": 2.474148988723755 + }, + { + "auxiliary_loss_clip": 0.01004799, + "auxiliary_loss_mlp": 0.01000908, + "balance_loss_clip": 1.01124573, + "balance_loss_mlp": 0.99981159, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6644517547792831, + "language_loss": 0.56804311, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58810019, + "num_input_tokens_seen": 274886970, + "step": 12747, + "time_per_iteration": 3.2208662033081055 + }, + { + "auxiliary_loss_clip": 0.01103575, + "auxiliary_loss_mlp": 0.01033398, + "balance_loss_clip": 1.0360949, + "balance_loss_mlp": 1.02263284, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.4843315853474126, + "language_loss": 0.72147238, + "learning_rate": 5.452873627572956e-07, + "loss": 0.7428422, + "num_input_tokens_seen": 274907240, + "step": 12748, + "time_per_iteration": 3.8783538341522217 + }, + { + "auxiliary_loss_clip": 0.01070414, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.03423977, + "balance_loss_mlp": 1.01757371, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 2.102250658225341, + "language_loss": 0.69326097, + "learning_rate": 5.450201183674052e-07, + "loss": 0.71427071, + "num_input_tokens_seen": 274924650, + "step": 12749, + "time_per_iteration": 3.987619638442993 + }, + { + "auxiliary_loss_clip": 0.01095417, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.03593063, + "balance_loss_mlp": 1.01835406, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 1.6257675530864106, + "language_loss": 0.73549503, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75676, + "num_input_tokens_seen": 274944550, + "step": 12750, + "time_per_iteration": 2.557943344116211 + }, + { + "auxiliary_loss_clip": 0.01091594, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.03566647, + "balance_loss_mlp": 1.01929784, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 2.011661877971584, + "language_loss": 0.75765753, + "learning_rate": 5.444857951167026e-07, + "loss": 0.77888674, + "num_input_tokens_seen": 274961330, + "step": 12751, + "time_per_iteration": 2.4928882122039795 + }, + { + "auxiliary_loss_clip": 0.01071129, + "auxiliary_loss_mlp": 0.01036981, + "balance_loss_clip": 1.0348177, + "balance_loss_mlp": 1.02401733, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 1.6824234020545232, + "language_loss": 0.61249399, + "learning_rate": 5.442187162761537e-07, + "loss": 0.63357508, + "num_input_tokens_seen": 274981655, + "step": 12752, + "time_per_iteration": 2.5979862213134766 + }, + { + "auxiliary_loss_clip": 0.01096721, + "auxiliary_loss_mlp": 0.01031839, + "balance_loss_clip": 1.03667879, + "balance_loss_mlp": 1.01891661, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 2.0845765916472017, + "language_loss": 0.69597387, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71725947, + "num_input_tokens_seen": 274999970, + "step": 12753, + "time_per_iteration": 2.503129005432129 + }, + { + "auxiliary_loss_clip": 0.01096391, + "auxiliary_loss_mlp": 0.01036559, + "balance_loss_clip": 1.03718638, + "balance_loss_mlp": 1.02427495, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 1.9694118502455005, + "language_loss": 0.6233899, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64471942, + "num_input_tokens_seen": 275015805, + "step": 12754, + "time_per_iteration": 2.4710001945495605 + }, + { + "auxiliary_loss_clip": 0.01106903, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.0387547, + "balance_loss_mlp": 1.01960707, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 2.698192894147036, + "language_loss": 0.79756403, + "learning_rate": 5.434178110152401e-07, + "loss": 0.81894672, + "num_input_tokens_seen": 275031810, + "step": 12755, + "time_per_iteration": 3.812110424041748 + }, + { + "auxiliary_loss_clip": 0.01104165, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.03604913, + "balance_loss_mlp": 1.01864743, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 1.9633767341617367, + "language_loss": 0.70682645, + "learning_rate": 5.431509530489242e-07, + "loss": 0.72817218, + "num_input_tokens_seen": 275049325, + "step": 12756, + "time_per_iteration": 2.470271110534668 + }, + { + "auxiliary_loss_clip": 0.01095235, + "auxiliary_loss_mlp": 0.01037207, + "balance_loss_clip": 1.0366925, + "balance_loss_mlp": 1.02563167, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 1.5202718272173454, + "language_loss": 0.69719505, + "learning_rate": 5.428841503264706e-07, + "loss": 0.71851957, + "num_input_tokens_seen": 275070865, + "step": 12757, + "time_per_iteration": 2.5235588550567627 + }, + { + "auxiliary_loss_clip": 0.01086972, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.03807592, + "balance_loss_mlp": 1.02490711, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 1.9461270919814544, + "language_loss": 0.76210648, + "learning_rate": 5.426174028579955e-07, + "loss": 0.7833553, + "num_input_tokens_seen": 275088015, + "step": 12758, + "time_per_iteration": 2.544605016708374 + }, + { + "auxiliary_loss_clip": 0.01091959, + "auxiliary_loss_mlp": 0.01034529, + "balance_loss_clip": 1.03460205, + "balance_loss_mlp": 1.02249467, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 1.952729486379786, + "language_loss": 0.76411438, + "learning_rate": 5.423507106536156e-07, + "loss": 0.78537929, + "num_input_tokens_seen": 275106975, + "step": 12759, + "time_per_iteration": 2.484149932861328 + }, + { + "auxiliary_loss_clip": 0.01083642, + "auxiliary_loss_mlp": 0.01027516, + "balance_loss_clip": 1.03435481, + "balance_loss_mlp": 1.01574993, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 1.9752245876113859, + "language_loss": 0.68165565, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70276725, + "num_input_tokens_seen": 275129560, + "step": 12760, + "time_per_iteration": 2.643575668334961 + }, + { + "auxiliary_loss_clip": 0.01084445, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.03582025, + "balance_loss_mlp": 1.01819992, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.4710674423242924, + "language_loss": 0.7932936, + "learning_rate": 5.418174920775871e-07, + "loss": 0.8144514, + "num_input_tokens_seen": 275151180, + "step": 12761, + "time_per_iteration": 2.5579490661621094 + }, + { + "auxiliary_loss_clip": 0.01081349, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.03564322, + "balance_loss_mlp": 1.01844871, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 1.8225944135678371, + "language_loss": 0.6625793, + "learning_rate": 5.415509657261589e-07, + "loss": 0.68370211, + "num_input_tokens_seen": 275170605, + "step": 12762, + "time_per_iteration": 2.5468802452087402 + }, + { + "auxiliary_loss_clip": 0.0109645, + "auxiliary_loss_mlp": 0.01030692, + "balance_loss_clip": 1.03634048, + "balance_loss_mlp": 1.01748323, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 1.7772141296572048, + "language_loss": 0.74057066, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76184213, + "num_input_tokens_seen": 275188750, + "step": 12763, + "time_per_iteration": 2.515439748764038 + }, + { + "auxiliary_loss_clip": 0.0108326, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.03791225, + "balance_loss_mlp": 1.01815188, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 1.417648606839019, + "language_loss": 0.70592403, + "learning_rate": 5.410180789470067e-07, + "loss": 0.72705966, + "num_input_tokens_seen": 275211365, + "step": 12764, + "time_per_iteration": 2.6685361862182617 + }, + { + "auxiliary_loss_clip": 0.01095754, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.0371325, + "balance_loss_mlp": 1.01986945, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 1.6056250925169935, + "language_loss": 0.69558728, + "learning_rate": 5.40751718539491e-07, + "loss": 0.71686399, + "num_input_tokens_seen": 275231670, + "step": 12765, + "time_per_iteration": 2.561206102371216 + }, + { + "auxiliary_loss_clip": 0.01082477, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.03390765, + "balance_loss_mlp": 1.02079964, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 1.7213541813042743, + "language_loss": 0.60957682, + "learning_rate": 5.404854134668162e-07, + "loss": 0.63071978, + "num_input_tokens_seen": 275249425, + "step": 12766, + "time_per_iteration": 2.4870307445526123 + }, + { + "auxiliary_loss_clip": 0.01008894, + "auxiliary_loss_mlp": 0.0100161, + "balance_loss_clip": 1.02275181, + "balance_loss_mlp": 1.000144, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7298070534310702, + "language_loss": 0.60776448, + "learning_rate": 5.402191637390803e-07, + "loss": 0.62786949, + "num_input_tokens_seen": 275312485, + "step": 12767, + "time_per_iteration": 3.2905070781707764 + }, + { + "auxiliary_loss_clip": 0.01082129, + "auxiliary_loss_mlp": 0.01024546, + "balance_loss_clip": 1.03700757, + "balance_loss_mlp": 1.01352537, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 1.737800582710968, + "language_loss": 0.69661635, + "learning_rate": 5.399529693663801e-07, + "loss": 0.71768308, + "num_input_tokens_seen": 275331680, + "step": 12768, + "time_per_iteration": 2.5282299518585205 + }, + { + "auxiliary_loss_clip": 0.01101051, + "auxiliary_loss_mlp": 0.01036548, + "balance_loss_clip": 1.03893924, + "balance_loss_mlp": 1.02336383, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 2.630385377830171, + "language_loss": 0.70802271, + "learning_rate": 5.3968683035881e-07, + "loss": 0.72939867, + "num_input_tokens_seen": 275351615, + "step": 12769, + "time_per_iteration": 2.5584893226623535 + }, + { + "auxiliary_loss_clip": 0.01096115, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.03635037, + "balance_loss_mlp": 1.01615679, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 2.493640318957474, + "language_loss": 0.80463886, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82588762, + "num_input_tokens_seen": 275368815, + "step": 12770, + "time_per_iteration": 2.497979164123535 + }, + { + "auxiliary_loss_clip": 0.01067019, + "auxiliary_loss_mlp": 0.0104101, + "balance_loss_clip": 1.0328331, + "balance_loss_mlp": 1.0284096, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.6466426147155542, + "language_loss": 0.78765714, + "learning_rate": 5.391547184794245e-07, + "loss": 0.8087374, + "num_input_tokens_seen": 275389345, + "step": 12771, + "time_per_iteration": 2.6543772220611572 + }, + { + "auxiliary_loss_clip": 0.01103864, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.03500128, + "balance_loss_mlp": 1.0188216, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.3475323037838656, + "language_loss": 0.68409783, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70544291, + "num_input_tokens_seen": 275411240, + "step": 12772, + "time_per_iteration": 2.477107524871826 + }, + { + "auxiliary_loss_clip": 0.01090038, + "auxiliary_loss_mlp": 0.01024566, + "balance_loss_clip": 1.03542042, + "balance_loss_mlp": 1.01293683, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 1.5253605028037867, + "language_loss": 0.73342824, + "learning_rate": 5.386228281816349e-07, + "loss": 0.7545743, + "num_input_tokens_seen": 275432010, + "step": 12773, + "time_per_iteration": 2.5394811630249023 + }, + { + "auxiliary_loss_clip": 0.01065448, + "auxiliary_loss_mlp": 0.01029136, + "balance_loss_clip": 1.03290677, + "balance_loss_mlp": 1.01801968, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.9915132351779838, + "language_loss": 0.80785263, + "learning_rate": 5.383569661510512e-07, + "loss": 0.82879841, + "num_input_tokens_seen": 275453710, + "step": 12774, + "time_per_iteration": 2.5962765216827393 + }, + { + "auxiliary_loss_clip": 0.01097178, + "auxiliary_loss_mlp": 0.0078436, + "balance_loss_clip": 1.03836524, + "balance_loss_mlp": 1.00978673, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.6172900178557381, + "language_loss": 0.69950461, + "learning_rate": 5.380911595461177e-07, + "loss": 0.71832001, + "num_input_tokens_seen": 275472915, + "step": 12775, + "time_per_iteration": 2.5239555835723877 + }, + { + "auxiliary_loss_clip": 0.00995853, + "auxiliary_loss_mlp": 0.01007484, + "balance_loss_clip": 1.01503897, + "balance_loss_mlp": 1.00637567, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.7051962527432869, + "language_loss": 0.56812096, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58815432, + "num_input_tokens_seen": 275534785, + "step": 12776, + "time_per_iteration": 3.2369422912597656 + }, + { + "auxiliary_loss_clip": 0.01091717, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.03489566, + "balance_loss_mlp": 1.023283, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 1.8868067745225896, + "language_loss": 0.73809528, + "learning_rate": 5.375597126535188e-07, + "loss": 0.75936431, + "num_input_tokens_seen": 275553205, + "step": 12777, + "time_per_iteration": 2.5061800479888916 + }, + { + "auxiliary_loss_clip": 0.01075004, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.03539109, + "balance_loss_mlp": 1.02010453, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 2.1312070870735105, + "language_loss": 0.70332146, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72438872, + "num_input_tokens_seen": 275571490, + "step": 12778, + "time_per_iteration": 2.5399656295776367 + }, + { + "auxiliary_loss_clip": 0.01091827, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.03885674, + "balance_loss_mlp": 1.01872945, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 1.7470664705128751, + "language_loss": 0.70290995, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72412932, + "num_input_tokens_seen": 275589665, + "step": 12779, + "time_per_iteration": 2.5186898708343506 + }, + { + "auxiliary_loss_clip": 0.01083231, + "auxiliary_loss_mlp": 0.01028435, + "balance_loss_clip": 1.03877604, + "balance_loss_mlp": 1.01615, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 1.5940936879832504, + "language_loss": 0.59133524, + "learning_rate": 5.367629582589133e-07, + "loss": 0.61245185, + "num_input_tokens_seen": 275615605, + "step": 12780, + "time_per_iteration": 2.940070629119873 + }, + { + "auxiliary_loss_clip": 0.01098765, + "auxiliary_loss_mlp": 0.01041159, + "balance_loss_clip": 1.0367558, + "balance_loss_mlp": 1.02652001, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 1.7423012194162606, + "language_loss": 0.68012071, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70151997, + "num_input_tokens_seen": 275634965, + "step": 12781, + "time_per_iteration": 2.50557017326355 + }, + { + "auxiliary_loss_clip": 0.01062483, + "auxiliary_loss_mlp": 0.01029539, + "balance_loss_clip": 1.03486705, + "balance_loss_mlp": 1.01795805, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.4723753684940493, + "language_loss": 0.79452109, + "learning_rate": 5.362320660762016e-07, + "loss": 0.81544131, + "num_input_tokens_seen": 275655785, + "step": 12782, + "time_per_iteration": 2.6484766006469727 + }, + { + "auxiliary_loss_clip": 0.0108642, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.03751945, + "balance_loss_mlp": 1.0188334, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 1.6984795874312189, + "language_loss": 0.66695386, + "learning_rate": 5.35966703239153e-07, + "loss": 0.68813109, + "num_input_tokens_seen": 275676160, + "step": 12783, + "time_per_iteration": 3.9207050800323486 + }, + { + "auxiliary_loss_clip": 0.01083458, + "auxiliary_loss_mlp": 0.01034819, + "balance_loss_clip": 1.03577709, + "balance_loss_mlp": 1.02173567, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 2.0920688623122965, + "language_loss": 0.69007111, + "learning_rate": 5.357013959183938e-07, + "loss": 0.71125388, + "num_input_tokens_seen": 275695660, + "step": 12784, + "time_per_iteration": 2.521996021270752 + }, + { + "auxiliary_loss_clip": 0.01062042, + "auxiliary_loss_mlp": 0.0102613, + "balance_loss_clip": 1.03625476, + "balance_loss_mlp": 1.01497841, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 1.6937361435637195, + "language_loss": 0.80521554, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82609725, + "num_input_tokens_seen": 275714025, + "step": 12785, + "time_per_iteration": 2.582792043685913 + }, + { + "auxiliary_loss_clip": 0.01091544, + "auxiliary_loss_mlp": 0.01037134, + "balance_loss_clip": 1.035501, + "balance_loss_mlp": 1.02240014, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 1.5644328374093424, + "language_loss": 0.7744211, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79570788, + "num_input_tokens_seen": 275737300, + "step": 12786, + "time_per_iteration": 4.225131988525391 + }, + { + "auxiliary_loss_clip": 0.01104102, + "auxiliary_loss_mlp": 0.01030218, + "balance_loss_clip": 1.03560758, + "balance_loss_mlp": 1.01830935, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 2.0329388969722126, + "language_loss": 0.58764386, + "learning_rate": 5.349058071544468e-07, + "loss": 0.60898703, + "num_input_tokens_seen": 275757895, + "step": 12787, + "time_per_iteration": 2.5639567375183105 + }, + { + "auxiliary_loss_clip": 0.01078998, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.03350496, + "balance_loss_mlp": 1.01707125, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.6256673595017697, + "language_loss": 0.76087809, + "learning_rate": 5.346407219994292e-07, + "loss": 0.78196388, + "num_input_tokens_seen": 275776745, + "step": 12788, + "time_per_iteration": 3.90998911857605 + }, + { + "auxiliary_loss_clip": 0.01064537, + "auxiliary_loss_mlp": 0.00784033, + "balance_loss_clip": 1.03595209, + "balance_loss_mlp": 1.00815022, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 1.5657756541470398, + "language_loss": 0.66660833, + "learning_rate": 5.343756924109821e-07, + "loss": 0.685094, + "num_input_tokens_seen": 275797205, + "step": 12789, + "time_per_iteration": 2.6016578674316406 + }, + { + "auxiliary_loss_clip": 0.01085839, + "auxiliary_loss_mlp": 0.01033509, + "balance_loss_clip": 1.03585863, + "balance_loss_mlp": 1.01990747, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 2.020502403401883, + "language_loss": 0.68399131, + "learning_rate": 5.341107183991553e-07, + "loss": 0.70518476, + "num_input_tokens_seen": 275817935, + "step": 12790, + "time_per_iteration": 2.640934944152832 + }, + { + "auxiliary_loss_clip": 0.01079173, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.03602147, + "balance_loss_mlp": 1.0175879, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 1.5478718430920804, + "language_loss": 0.68759018, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70868236, + "num_input_tokens_seen": 275837145, + "step": 12791, + "time_per_iteration": 2.5094332695007324 + }, + { + "auxiliary_loss_clip": 0.01091947, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.03612328, + "balance_loss_mlp": 1.01939452, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 1.7912425261392715, + "language_loss": 0.79689276, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81812346, + "num_input_tokens_seen": 275855705, + "step": 12792, + "time_per_iteration": 2.4814069271087646 + }, + { + "auxiliary_loss_clip": 0.01079207, + "auxiliary_loss_mlp": 0.00786635, + "balance_loss_clip": 1.03969622, + "balance_loss_mlp": 1.01242733, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.8476481545668038, + "language_loss": 0.72403139, + "learning_rate": 5.333161299238673e-07, + "loss": 0.74268985, + "num_input_tokens_seen": 275873930, + "step": 12793, + "time_per_iteration": 2.5596067905426025 + }, + { + "auxiliary_loss_clip": 0.01068164, + "auxiliary_loss_mlp": 0.01032311, + "balance_loss_clip": 1.03857279, + "balance_loss_mlp": 1.02027082, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.8383572918807012, + "language_loss": 0.63565028, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65665501, + "num_input_tokens_seen": 275895895, + "step": 12794, + "time_per_iteration": 4.091511487960815 + }, + { + "auxiliary_loss_clip": 0.01082856, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.03563094, + "balance_loss_mlp": 1.02448928, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 1.4599955268337872, + "language_loss": 0.764292, + "learning_rate": 5.327866823409319e-07, + "loss": 0.78549111, + "num_input_tokens_seen": 275917825, + "step": 12795, + "time_per_iteration": 2.5398802757263184 + }, + { + "auxiliary_loss_clip": 0.0106983, + "auxiliary_loss_mlp": 0.01028011, + "balance_loss_clip": 1.03577769, + "balance_loss_mlp": 1.01586318, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.4523009428308318, + "language_loss": 0.71834326, + "learning_rate": 5.325220419997601e-07, + "loss": 0.73932165, + "num_input_tokens_seen": 275937890, + "step": 12796, + "time_per_iteration": 2.592381477355957 + }, + { + "auxiliary_loss_clip": 0.01105366, + "auxiliary_loss_mlp": 0.01026803, + "balance_loss_clip": 1.03589821, + "balance_loss_mlp": 1.01441669, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 1.8974975529529157, + "language_loss": 0.64765334, + "learning_rate": 5.32257457305499e-07, + "loss": 0.66897511, + "num_input_tokens_seen": 275954495, + "step": 12797, + "time_per_iteration": 2.440858840942383 + }, + { + "auxiliary_loss_clip": 0.0106958, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.03477359, + "balance_loss_mlp": 1.02030301, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 1.8445992955216743, + "language_loss": 0.91545862, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93649232, + "num_input_tokens_seen": 275972395, + "step": 12798, + "time_per_iteration": 2.5899174213409424 + }, + { + "auxiliary_loss_clip": 0.01063103, + "auxiliary_loss_mlp": 0.01024798, + "balance_loss_clip": 1.03533494, + "balance_loss_mlp": 1.01313889, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 2.20104586473166, + "language_loss": 0.82254064, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84341967, + "num_input_tokens_seen": 275989020, + "step": 12799, + "time_per_iteration": 2.552102565765381 + }, + { + "auxiliary_loss_clip": 0.01052663, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.03598535, + "balance_loss_mlp": 1.01382542, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 1.9254303479503816, + "language_loss": 0.78411353, + "learning_rate": 5.314640372045045e-07, + "loss": 0.80490446, + "num_input_tokens_seen": 276006525, + "step": 12800, + "time_per_iteration": 2.607235908508301 + }, + { + "auxiliary_loss_clip": 0.0108768, + "auxiliary_loss_mlp": 0.01024928, + "balance_loss_clip": 1.03537083, + "balance_loss_mlp": 1.01174903, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 1.5456358546841908, + "language_loss": 0.83807737, + "learning_rate": 5.31199675198198e-07, + "loss": 0.85920346, + "num_input_tokens_seen": 276027130, + "step": 12801, + "time_per_iteration": 2.557515859603882 + }, + { + "auxiliary_loss_clip": 0.01084126, + "auxiliary_loss_mlp": 0.01027013, + "balance_loss_clip": 1.03558373, + "balance_loss_mlp": 1.01469278, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 1.9093711411420022, + "language_loss": 0.72136998, + "learning_rate": 5.30935368888947e-07, + "loss": 0.74248135, + "num_input_tokens_seen": 276045715, + "step": 12802, + "time_per_iteration": 2.572239637374878 + }, + { + "auxiliary_loss_clip": 0.01079966, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.03480852, + "balance_loss_mlp": 1.02014971, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 1.7594288205026527, + "language_loss": 0.76317799, + "learning_rate": 5.306711182867747e-07, + "loss": 0.78430372, + "num_input_tokens_seen": 276065375, + "step": 12803, + "time_per_iteration": 2.6012067794799805 + }, + { + "auxiliary_loss_clip": 0.01017215, + "auxiliary_loss_mlp": 0.01004345, + "balance_loss_clip": 1.0137434, + "balance_loss_mlp": 1.00306976, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.7341073536466035, + "language_loss": 0.55844843, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57866406, + "num_input_tokens_seen": 276131405, + "step": 12804, + "time_per_iteration": 3.174926996231079 + }, + { + "auxiliary_loss_clip": 0.01015219, + "auxiliary_loss_mlp": 0.01004636, + "balance_loss_clip": 1.01170886, + "balance_loss_mlp": 1.00361109, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.7475831035439705, + "language_loss": 0.54013586, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56033444, + "num_input_tokens_seen": 276200755, + "step": 12805, + "time_per_iteration": 3.303776502609253 + }, + { + "auxiliary_loss_clip": 0.01075105, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.03778648, + "balance_loss_mlp": 1.02026641, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 1.977605556067504, + "language_loss": 0.72660744, + "learning_rate": 5.298787008229187e-07, + "loss": 0.74768978, + "num_input_tokens_seen": 276217880, + "step": 12806, + "time_per_iteration": 2.5716021060943604 + }, + { + "auxiliary_loss_clip": 0.01081905, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.03661585, + "balance_loss_mlp": 1.02271605, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 1.8908332029052761, + "language_loss": 0.74887365, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77004361, + "num_input_tokens_seen": 276234810, + "step": 12807, + "time_per_iteration": 2.5444610118865967 + }, + { + "auxiliary_loss_clip": 0.0109978, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.0377326, + "balance_loss_mlp": 1.02279758, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 2.262278742494346, + "language_loss": 0.80169064, + "learning_rate": 5.293507012327218e-07, + "loss": 0.82304251, + "num_input_tokens_seen": 276252850, + "step": 12808, + "time_per_iteration": 2.502634286880493 + }, + { + "auxiliary_loss_clip": 0.01098539, + "auxiliary_loss_mlp": 0.01033812, + "balance_loss_clip": 1.0379343, + "balance_loss_mlp": 1.0218792, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 1.931795885772982, + "language_loss": 0.79230249, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81362605, + "num_input_tokens_seen": 276272525, + "step": 12809, + "time_per_iteration": 2.554884433746338 + }, + { + "auxiliary_loss_clip": 0.01070432, + "auxiliary_loss_mlp": 0.01026467, + "balance_loss_clip": 1.03557098, + "balance_loss_mlp": 1.01510668, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 1.4202725467125998, + "language_loss": 0.70138264, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72235167, + "num_input_tokens_seen": 276294210, + "step": 12810, + "time_per_iteration": 2.599517822265625 + }, + { + "auxiliary_loss_clip": 0.01082259, + "auxiliary_loss_mlp": 0.0103901, + "balance_loss_clip": 1.03460526, + "balance_loss_mlp": 1.02326214, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 2.334487869428652, + "language_loss": 0.78669435, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80790699, + "num_input_tokens_seen": 276310290, + "step": 12811, + "time_per_iteration": 2.5000243186950684 + }, + { + "auxiliary_loss_clip": 0.01014159, + "auxiliary_loss_mlp": 0.01019602, + "balance_loss_clip": 1.01294971, + "balance_loss_mlp": 1.01806402, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.810492115759281, + "language_loss": 0.56637633, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58671403, + "num_input_tokens_seen": 276371715, + "step": 12812, + "time_per_iteration": 3.1848747730255127 + }, + { + "auxiliary_loss_clip": 0.01075142, + "auxiliary_loss_mlp": 0.01034388, + "balance_loss_clip": 1.03562903, + "balance_loss_mlp": 1.02244925, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 1.5828252839654038, + "language_loss": 0.71707237, + "learning_rate": 5.280316783577836e-07, + "loss": 0.7381677, + "num_input_tokens_seen": 276389895, + "step": 12813, + "time_per_iteration": 2.5900189876556396 + }, + { + "auxiliary_loss_clip": 0.01096871, + "auxiliary_loss_mlp": 0.01027977, + "balance_loss_clip": 1.03604746, + "balance_loss_mlp": 1.01477432, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 2.0001819861113272, + "language_loss": 0.66381699, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68506545, + "num_input_tokens_seen": 276408990, + "step": 12814, + "time_per_iteration": 2.5317184925079346 + }, + { + "auxiliary_loss_clip": 0.01082907, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.03482854, + "balance_loss_mlp": 1.02348971, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 1.811959076872331, + "language_loss": 0.65576565, + "learning_rate": 5.275044598581018e-07, + "loss": 0.676952, + "num_input_tokens_seen": 276428190, + "step": 12815, + "time_per_iteration": 2.548947811126709 + }, + { + "auxiliary_loss_clip": 0.01093691, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.03550398, + "balance_loss_mlp": 1.01962972, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 2.0657346600618967, + "language_loss": 0.6466924, + "learning_rate": 5.272409343590322e-07, + "loss": 0.66794991, + "num_input_tokens_seen": 276446855, + "step": 12816, + "time_per_iteration": 2.4844844341278076 + }, + { + "auxiliary_loss_clip": 0.0109652, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.03742254, + "balance_loss_mlp": 1.02119064, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 2.039349826403797, + "language_loss": 0.71837872, + "learning_rate": 5.26977464707133e-07, + "loss": 0.73967576, + "num_input_tokens_seen": 276462000, + "step": 12817, + "time_per_iteration": 2.45175838470459 + }, + { + "auxiliary_loss_clip": 0.0106389, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.03669989, + "balance_loss_mlp": 1.01759768, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 3.857868198964955, + "language_loss": 0.61035085, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63128448, + "num_input_tokens_seen": 276481190, + "step": 12818, + "time_per_iteration": 2.576406240463257 + }, + { + "auxiliary_loss_clip": 0.0109336, + "auxiliary_loss_mlp": 0.0102795, + "balance_loss_clip": 1.03676832, + "balance_loss_mlp": 1.0172981, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 1.6504535971655625, + "language_loss": 0.67155415, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69276726, + "num_input_tokens_seen": 276499520, + "step": 12819, + "time_per_iteration": 2.495257616043091 + }, + { + "auxiliary_loss_clip": 0.01107168, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.03665185, + "balance_loss_mlp": 1.01935554, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 1.7333373051393226, + "language_loss": 0.57426894, + "learning_rate": 5.261873909343608e-07, + "loss": 0.5956558, + "num_input_tokens_seen": 276519110, + "step": 12820, + "time_per_iteration": 2.4853034019470215 + }, + { + "auxiliary_loss_clip": 0.01077188, + "auxiliary_loss_mlp": 0.01029643, + "balance_loss_clip": 1.03572035, + "balance_loss_mlp": 1.01718569, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 1.7856003471020523, + "language_loss": 0.81137687, + "learning_rate": 5.259241447710343e-07, + "loss": 0.83244526, + "num_input_tokens_seen": 276538805, + "step": 12821, + "time_per_iteration": 2.5758302211761475 + }, + { + "auxiliary_loss_clip": 0.0110704, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.03720808, + "balance_loss_mlp": 1.01859903, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 6.69579716423254, + "language_loss": 0.68804806, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70942926, + "num_input_tokens_seen": 276554770, + "step": 12822, + "time_per_iteration": 3.827915668487549 + }, + { + "auxiliary_loss_clip": 0.01082006, + "auxiliary_loss_mlp": 0.0103536, + "balance_loss_clip": 1.03473878, + "balance_loss_mlp": 1.02257514, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.7375377098480962, + "language_loss": 0.72474754, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74592113, + "num_input_tokens_seen": 276574535, + "step": 12823, + "time_per_iteration": 2.5875067710876465 + }, + { + "auxiliary_loss_clip": 0.01100606, + "auxiliary_loss_mlp": 0.01039622, + "balance_loss_clip": 1.03746247, + "balance_loss_mlp": 1.02537012, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 1.6187982955998403, + "language_loss": 0.76372194, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78512424, + "num_input_tokens_seen": 276592925, + "step": 12824, + "time_per_iteration": 2.495988130569458 + }, + { + "auxiliary_loss_clip": 0.01082933, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.03871047, + "balance_loss_mlp": 1.01649153, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 1.8043239632398065, + "language_loss": 0.7240535, + "learning_rate": 5.248717191885592e-07, + "loss": 0.74517417, + "num_input_tokens_seen": 276610540, + "step": 12825, + "time_per_iteration": 3.973560333251953 + }, + { + "auxiliary_loss_clip": 0.01102457, + "auxiliary_loss_mlp": 0.01037269, + "balance_loss_clip": 1.03645158, + "balance_loss_mlp": 1.02626646, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.4105632343658252, + "language_loss": 0.73708212, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75847936, + "num_input_tokens_seen": 276629200, + "step": 12826, + "time_per_iteration": 2.471680164337158 + }, + { + "auxiliary_loss_clip": 0.01107573, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.03560567, + "balance_loss_mlp": 1.02240849, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 1.4921595501791176, + "language_loss": 0.81317747, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83461285, + "num_input_tokens_seen": 276648655, + "step": 12827, + "time_per_iteration": 3.969805955886841 + }, + { + "auxiliary_loss_clip": 0.01032215, + "auxiliary_loss_mlp": 0.01002316, + "balance_loss_clip": 1.00913978, + "balance_loss_mlp": 1.0011301, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.8629111488922488, + "language_loss": 0.55171943, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57206476, + "num_input_tokens_seen": 276716500, + "step": 12828, + "time_per_iteration": 3.2220711708068848 + }, + { + "auxiliary_loss_clip": 0.01064932, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.03336239, + "balance_loss_mlp": 1.021227, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 2.236184037837105, + "language_loss": 0.69423324, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71521103, + "num_input_tokens_seen": 276733535, + "step": 12829, + "time_per_iteration": 2.515791654586792 + }, + { + "auxiliary_loss_clip": 0.01085029, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.03971767, + "balance_loss_mlp": 1.01715636, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 2.8080193338262878, + "language_loss": 0.8026126, + "learning_rate": 5.235574458679579e-07, + "loss": 0.82376564, + "num_input_tokens_seen": 276749575, + "step": 12830, + "time_per_iteration": 2.4987292289733887 + }, + { + "auxiliary_loss_clip": 0.01098103, + "auxiliary_loss_mlp": 0.01038453, + "balance_loss_clip": 1.03745747, + "balance_loss_mlp": 1.02513778, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 1.7036685134966296, + "language_loss": 0.77744859, + "learning_rate": 5.232947591245269e-07, + "loss": 0.79881412, + "num_input_tokens_seen": 276769460, + "step": 12831, + "time_per_iteration": 2.5265276432037354 + }, + { + "auxiliary_loss_clip": 0.01075452, + "auxiliary_loss_mlp": 0.0103404, + "balance_loss_clip": 1.03393602, + "balance_loss_mlp": 1.02057528, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 1.446187031438796, + "language_loss": 0.61033809, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63143301, + "num_input_tokens_seen": 276790820, + "step": 12832, + "time_per_iteration": 4.0561299324035645 + }, + { + "auxiliary_loss_clip": 0.01079263, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.03494036, + "balance_loss_mlp": 1.02373326, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.5677496436205776, + "language_loss": 0.79475713, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81591296, + "num_input_tokens_seen": 276811345, + "step": 12833, + "time_per_iteration": 2.5706682205200195 + }, + { + "auxiliary_loss_clip": 0.00993473, + "auxiliary_loss_mlp": 0.00999943, + "balance_loss_clip": 1.01433945, + "balance_loss_mlp": 0.9986912, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8413287250587307, + "language_loss": 0.55366367, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57359779, + "num_input_tokens_seen": 276870950, + "step": 12834, + "time_per_iteration": 3.2120351791381836 + }, + { + "auxiliary_loss_clip": 0.01058304, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.03175056, + "balance_loss_mlp": 1.01904392, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 2.8875431454830265, + "language_loss": 0.7275089, + "learning_rate": 5.222445722184903e-07, + "loss": 0.74841487, + "num_input_tokens_seen": 276890760, + "step": 12835, + "time_per_iteration": 2.64113450050354 + }, + { + "auxiliary_loss_clip": 0.0107206, + "auxiliary_loss_mlp": 0.00784714, + "balance_loss_clip": 1.03407133, + "balance_loss_mlp": 1.01056337, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 1.6541349596803399, + "language_loss": 0.70040482, + "learning_rate": 5.219821655586814e-07, + "loss": 0.71897256, + "num_input_tokens_seen": 276909625, + "step": 12836, + "time_per_iteration": 2.5338456630706787 + }, + { + "auxiliary_loss_clip": 0.01083471, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.03568006, + "balance_loss_mlp": 1.01816249, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 2.0268943455812347, + "language_loss": 0.59288329, + "learning_rate": 5.217198149454575e-07, + "loss": 0.61402482, + "num_input_tokens_seen": 276930760, + "step": 12837, + "time_per_iteration": 2.658083915710449 + }, + { + "auxiliary_loss_clip": 0.01025113, + "auxiliary_loss_mlp": 0.01002826, + "balance_loss_clip": 1.01911259, + "balance_loss_mlp": 1.00147343, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.8625793663627638, + "language_loss": 0.55769199, + "learning_rate": 5.214575203887666e-07, + "loss": 0.57797134, + "num_input_tokens_seen": 276989580, + "step": 12838, + "time_per_iteration": 3.0817508697509766 + }, + { + "auxiliary_loss_clip": 0.01095124, + "auxiliary_loss_mlp": 0.0102855, + "balance_loss_clip": 1.03618503, + "balance_loss_mlp": 1.01718342, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.525487118789534, + "language_loss": 0.69430709, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71554387, + "num_input_tokens_seen": 277005450, + "step": 12839, + "time_per_iteration": 2.4914534091949463 + }, + { + "auxiliary_loss_clip": 0.01093023, + "auxiliary_loss_mlp": 0.01026155, + "balance_loss_clip": 1.0364368, + "balance_loss_mlp": 1.01431131, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 2.552612767149446, + "language_loss": 0.80033261, + "learning_rate": 5.209330994847647e-07, + "loss": 0.82152438, + "num_input_tokens_seen": 277023055, + "step": 12840, + "time_per_iteration": 2.501300811767578 + }, + { + "auxiliary_loss_clip": 0.01094042, + "auxiliary_loss_mlp": 0.00784336, + "balance_loss_clip": 1.03548217, + "balance_loss_mlp": 1.01049852, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 1.7545138080351081, + "language_loss": 0.80459869, + "learning_rate": 5.206709731573402e-07, + "loss": 0.82338244, + "num_input_tokens_seen": 277041150, + "step": 12841, + "time_per_iteration": 2.5766897201538086 + }, + { + "auxiliary_loss_clip": 0.01068646, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.03618455, + "balance_loss_mlp": 1.01749778, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 1.429064390350467, + "language_loss": 0.76078552, + "learning_rate": 5.204089029262208e-07, + "loss": 0.78176993, + "num_input_tokens_seen": 277063895, + "step": 12842, + "time_per_iteration": 2.5987627506256104 + }, + { + "auxiliary_loss_clip": 0.0106023, + "auxiliary_loss_mlp": 0.00786152, + "balance_loss_clip": 1.03664851, + "balance_loss_mlp": 1.01227021, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 1.8124624665391602, + "language_loss": 0.6872108, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70567465, + "num_input_tokens_seen": 277084045, + "step": 12843, + "time_per_iteration": 2.66969633102417 + }, + { + "auxiliary_loss_clip": 0.01081744, + "auxiliary_loss_mlp": 0.01028079, + "balance_loss_clip": 1.0319314, + "balance_loss_mlp": 1.01619935, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 2.0607072607211427, + "language_loss": 0.7357223, + "learning_rate": 5.198849307926465e-07, + "loss": 0.75682056, + "num_input_tokens_seen": 277102625, + "step": 12844, + "time_per_iteration": 2.548755645751953 + }, + { + "auxiliary_loss_clip": 0.01090724, + "auxiliary_loss_mlp": 0.01041073, + "balance_loss_clip": 1.03527331, + "balance_loss_mlp": 1.02731609, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.3927089722591668, + "language_loss": 0.71669269, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73801064, + "num_input_tokens_seen": 277123210, + "step": 12845, + "time_per_iteration": 2.542170286178589 + }, + { + "auxiliary_loss_clip": 0.01102499, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.03568745, + "balance_loss_mlp": 1.02026498, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 1.9749642509709284, + "language_loss": 0.64235836, + "learning_rate": 5.193611831635159e-07, + "loss": 0.6637007, + "num_input_tokens_seen": 277144895, + "step": 12846, + "time_per_iteration": 2.555647134780884 + }, + { + "auxiliary_loss_clip": 0.01023505, + "auxiliary_loss_mlp": 0.00763444, + "balance_loss_clip": 1.00914526, + "balance_loss_mlp": 1.00374627, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.7925767752665814, + "language_loss": 0.61749732, + "learning_rate": 5.19099393562945e-07, + "loss": 0.6353668, + "num_input_tokens_seen": 277205160, + "step": 12847, + "time_per_iteration": 3.0515732765197754 + }, + { + "auxiliary_loss_clip": 0.01103869, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.03438878, + "balance_loss_mlp": 1.01480889, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 1.7693170715561644, + "language_loss": 0.79242843, + "learning_rate": 5.188376601182732e-07, + "loss": 0.81373584, + "num_input_tokens_seen": 277223005, + "step": 12848, + "time_per_iteration": 2.467751979827881 + }, + { + "auxiliary_loss_clip": 0.01068881, + "auxiliary_loss_mlp": 0.01041512, + "balance_loss_clip": 1.03671217, + "balance_loss_mlp": 1.0279218, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 1.5612725939448235, + "language_loss": 0.72835159, + "learning_rate": 5.185759828394261e-07, + "loss": 0.74945551, + "num_input_tokens_seen": 277241785, + "step": 12849, + "time_per_iteration": 2.5606796741485596 + }, + { + "auxiliary_loss_clip": 0.01103645, + "auxiliary_loss_mlp": 0.01032594, + "balance_loss_clip": 1.03499579, + "balance_loss_mlp": 1.02017212, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 1.9126752188721703, + "language_loss": 0.78369349, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80505592, + "num_input_tokens_seen": 277259050, + "step": 12850, + "time_per_iteration": 2.4162545204162598 + }, + { + "auxiliary_loss_clip": 0.01049013, + "auxiliary_loss_mlp": 0.00784858, + "balance_loss_clip": 1.03347754, + "balance_loss_mlp": 1.00920463, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.5062023186817235, + "language_loss": 0.79853922, + "learning_rate": 5.180527968188935e-07, + "loss": 0.81687796, + "num_input_tokens_seen": 277278235, + "step": 12851, + "time_per_iteration": 2.727858304977417 + }, + { + "auxiliary_loss_clip": 0.01094172, + "auxiliary_loss_mlp": 0.01029736, + "balance_loss_clip": 1.03687274, + "balance_loss_mlp": 1.01605678, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 1.6296018580622826, + "language_loss": 0.73873973, + "learning_rate": 5.177912880970474e-07, + "loss": 0.75997883, + "num_input_tokens_seen": 277298355, + "step": 12852, + "time_per_iteration": 2.6451754570007324 + }, + { + "auxiliary_loss_clip": 0.01102929, + "auxiliary_loss_mlp": 0.01034034, + "balance_loss_clip": 1.0345161, + "balance_loss_mlp": 1.02166569, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 1.674738145271165, + "language_loss": 0.82096112, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84233081, + "num_input_tokens_seen": 277316095, + "step": 12853, + "time_per_iteration": 2.469040870666504 + }, + { + "auxiliary_loss_clip": 0.01032215, + "auxiliary_loss_mlp": 0.01002478, + "balance_loss_clip": 1.00922358, + "balance_loss_mlp": 1.00130939, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.8722879249591857, + "language_loss": 0.54522181, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56556875, + "num_input_tokens_seen": 277380130, + "step": 12854, + "time_per_iteration": 3.1401758193969727 + }, + { + "auxiliary_loss_clip": 0.01097204, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.03656387, + "balance_loss_mlp": 1.01679492, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 1.6658056097746574, + "language_loss": 0.71803451, + "learning_rate": 5.170070992041826e-07, + "loss": 0.73931158, + "num_input_tokens_seen": 277404015, + "step": 12855, + "time_per_iteration": 2.6139421463012695 + }, + { + "auxiliary_loss_clip": 0.01105138, + "auxiliary_loss_mlp": 0.01029948, + "balance_loss_clip": 1.03576529, + "balance_loss_mlp": 1.0168407, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 1.6396582811302933, + "language_loss": 0.6771338, + "learning_rate": 5.167458153638254e-07, + "loss": 0.69848466, + "num_input_tokens_seen": 277421375, + "step": 12856, + "time_per_iteration": 2.4586925506591797 + }, + { + "auxiliary_loss_clip": 0.01075173, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.03489864, + "balance_loss_mlp": 1.01795459, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 1.5090834368929613, + "language_loss": 0.78801906, + "learning_rate": 5.164845877686162e-07, + "loss": 0.80906945, + "num_input_tokens_seen": 277440170, + "step": 12857, + "time_per_iteration": 2.5477378368377686 + }, + { + "auxiliary_loss_clip": 0.01055116, + "auxiliary_loss_mlp": 0.00783478, + "balance_loss_clip": 1.03626943, + "balance_loss_mlp": 1.00894868, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 1.6656244384776142, + "language_loss": 0.7858988, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80428469, + "num_input_tokens_seen": 277456880, + "step": 12858, + "time_per_iteration": 2.628862142562866 + }, + { + "auxiliary_loss_clip": 0.01104806, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.03503466, + "balance_loss_mlp": 1.0192976, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 1.9714962257832367, + "language_loss": 0.76945198, + "learning_rate": 5.159623013532591e-07, + "loss": 0.790815, + "num_input_tokens_seen": 277475365, + "step": 12859, + "time_per_iteration": 2.4592201709747314 + }, + { + "auxiliary_loss_clip": 0.01093267, + "auxiliary_loss_mlp": 0.0102761, + "balance_loss_clip": 1.03840411, + "balance_loss_mlp": 1.01702464, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 1.5259192437773528, + "language_loss": 0.68043607, + "learning_rate": 5.157012425529186e-07, + "loss": 0.70164484, + "num_input_tokens_seen": 277494975, + "step": 12860, + "time_per_iteration": 2.5339112281799316 + }, + { + "auxiliary_loss_clip": 0.01107958, + "auxiliary_loss_mlp": 0.01034981, + "balance_loss_clip": 1.0355531, + "balance_loss_mlp": 1.02216041, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 9.239583151699152, + "language_loss": 0.74312508, + "learning_rate": 5.154402400373343e-07, + "loss": 0.7645545, + "num_input_tokens_seen": 277510520, + "step": 12861, + "time_per_iteration": 3.884416341781616 + }, + { + "auxiliary_loss_clip": 0.01098985, + "auxiliary_loss_mlp": 0.01031664, + "balance_loss_clip": 1.03782606, + "balance_loss_mlp": 1.01859236, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 1.565709952876274, + "language_loss": 0.7450549, + "learning_rate": 5.15179293816405e-07, + "loss": 0.76636142, + "num_input_tokens_seen": 277530505, + "step": 12862, + "time_per_iteration": 2.5455410480499268 + }, + { + "auxiliary_loss_clip": 0.01059312, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.03323889, + "balance_loss_mlp": 1.02003241, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.5762928363583804, + "language_loss": 0.83227026, + "learning_rate": 5.149184039000256e-07, + "loss": 0.8531791, + "num_input_tokens_seen": 277550810, + "step": 12863, + "time_per_iteration": 2.6097006797790527 + }, + { + "auxiliary_loss_clip": 0.01104908, + "auxiliary_loss_mlp": 0.01029399, + "balance_loss_clip": 1.0362376, + "balance_loss_mlp": 1.01764488, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.6347509127155062, + "language_loss": 0.73389584, + "learning_rate": 5.146575702980898e-07, + "loss": 0.75523889, + "num_input_tokens_seen": 277567680, + "step": 12864, + "time_per_iteration": 3.8390305042266846 + }, + { + "auxiliary_loss_clip": 0.01082809, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.03375387, + "balance_loss_mlp": 1.01985335, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 1.7381131139852526, + "language_loss": 0.82632494, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84746718, + "num_input_tokens_seen": 277588970, + "step": 12865, + "time_per_iteration": 3.9596285820007324 + }, + { + "auxiliary_loss_clip": 0.01110946, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.03860354, + "balance_loss_mlp": 1.01857591, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 2.0023546729560238, + "language_loss": 0.71979547, + "learning_rate": 5.141360720771077e-07, + "loss": 0.7412312, + "num_input_tokens_seen": 277605450, + "step": 12866, + "time_per_iteration": 2.4728803634643555 + }, + { + "auxiliary_loss_clip": 0.01061766, + "auxiliary_loss_mlp": 0.00782969, + "balance_loss_clip": 1.03762388, + "balance_loss_mlp": 1.0085429, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 2.874493838284572, + "language_loss": 0.64914107, + "learning_rate": 5.138754074778371e-07, + "loss": 0.66758847, + "num_input_tokens_seen": 277622530, + "step": 12867, + "time_per_iteration": 2.552341938018799 + }, + { + "auxiliary_loss_clip": 0.01092206, + "auxiliary_loss_mlp": 0.01033408, + "balance_loss_clip": 1.03549671, + "balance_loss_mlp": 1.02179718, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.6016792432355687, + "language_loss": 0.71021533, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73147148, + "num_input_tokens_seen": 277642700, + "step": 12868, + "time_per_iteration": 2.5141184329986572 + }, + { + "auxiliary_loss_clip": 0.0109834, + "auxiliary_loss_mlp": 0.01029318, + "balance_loss_clip": 1.03739738, + "balance_loss_mlp": 1.01713467, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 2.0821329852335464, + "language_loss": 0.78156292, + "learning_rate": 5.133542473511578e-07, + "loss": 0.80283952, + "num_input_tokens_seen": 277660005, + "step": 12869, + "time_per_iteration": 2.4598798751831055 + }, + { + "auxiliary_loss_clip": 0.0108998, + "auxiliary_loss_mlp": 0.01026195, + "balance_loss_clip": 1.03434563, + "balance_loss_mlp": 1.01431012, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 1.7952855395044343, + "language_loss": 0.73752189, + "learning_rate": 5.130937518435124e-07, + "loss": 0.75868362, + "num_input_tokens_seen": 277682890, + "step": 12870, + "time_per_iteration": 2.5559346675872803 + }, + { + "auxiliary_loss_clip": 0.01096023, + "auxiliary_loss_mlp": 0.01032437, + "balance_loss_clip": 1.03571701, + "balance_loss_mlp": 1.01983631, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 1.9502380444041627, + "language_loss": 0.75931907, + "learning_rate": 5.12833312719501e-07, + "loss": 0.78060365, + "num_input_tokens_seen": 277699330, + "step": 12871, + "time_per_iteration": 3.8365724086761475 + }, + { + "auxiliary_loss_clip": 0.0108026, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.0335499, + "balance_loss_mlp": 1.01955736, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 1.6470968031695052, + "language_loss": 0.69112808, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71223772, + "num_input_tokens_seen": 277718750, + "step": 12872, + "time_per_iteration": 2.6436235904693604 + }, + { + "auxiliary_loss_clip": 0.01105758, + "auxiliary_loss_mlp": 0.0103282, + "balance_loss_clip": 1.03599501, + "balance_loss_mlp": 1.01971924, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 2.0074334248013432, + "language_loss": 0.85347533, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87486112, + "num_input_tokens_seen": 277734645, + "step": 12873, + "time_per_iteration": 2.4310896396636963 + }, + { + "auxiliary_loss_clip": 0.0110662, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.03674269, + "balance_loss_mlp": 1.02122593, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 2.3220493277297845, + "language_loss": 0.65370262, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67510217, + "num_input_tokens_seen": 277755535, + "step": 12874, + "time_per_iteration": 2.531129837036133 + }, + { + "auxiliary_loss_clip": 0.01063374, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.03622913, + "balance_loss_mlp": 1.01757014, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 1.7701985424627638, + "language_loss": 0.62335628, + "learning_rate": 5.117921202572785e-07, + "loss": 0.6442908, + "num_input_tokens_seen": 277775585, + "step": 12875, + "time_per_iteration": 2.6034257411956787 + }, + { + "auxiliary_loss_clip": 0.01095873, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.03510857, + "balance_loss_mlp": 1.01811051, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 2.8315283171337255, + "language_loss": 0.65323097, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67449415, + "num_input_tokens_seen": 277794795, + "step": 12876, + "time_per_iteration": 2.521559000015259 + }, + { + "auxiliary_loss_clip": 0.01079685, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.0367167, + "balance_loss_mlp": 1.02072287, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 2.329977414569712, + "language_loss": 0.71151114, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73263252, + "num_input_tokens_seen": 277813235, + "step": 12877, + "time_per_iteration": 2.5230207443237305 + }, + { + "auxiliary_loss_clip": 0.01066141, + "auxiliary_loss_mlp": 0.01037904, + "balance_loss_clip": 1.03405213, + "balance_loss_mlp": 1.02376533, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 1.6489017128494265, + "language_loss": 0.82982671, + "learning_rate": 5.110118184224736e-07, + "loss": 0.85086721, + "num_input_tokens_seen": 277832560, + "step": 12878, + "time_per_iteration": 2.5750985145568848 + }, + { + "auxiliary_loss_clip": 0.01084042, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.03593338, + "balance_loss_mlp": 1.01877785, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 1.753877777350819, + "language_loss": 0.73825979, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75941968, + "num_input_tokens_seen": 277850120, + "step": 12879, + "time_per_iteration": 2.4994149208068848 + }, + { + "auxiliary_loss_clip": 0.01079107, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.033687, + "balance_loss_mlp": 1.01877236, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 1.826012095180428, + "language_loss": 0.79511273, + "learning_rate": 5.104918994957364e-07, + "loss": 0.8162154, + "num_input_tokens_seen": 277871020, + "step": 12880, + "time_per_iteration": 2.580899238586426 + }, + { + "auxiliary_loss_clip": 0.01082381, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.03613126, + "balance_loss_mlp": 1.02333856, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 1.975896544591322, + "language_loss": 0.70363802, + "learning_rate": 5.102320247508847e-07, + "loss": 0.7248221, + "num_input_tokens_seen": 277891525, + "step": 12881, + "time_per_iteration": 2.524547815322876 + }, + { + "auxiliary_loss_clip": 0.01086692, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.03522718, + "balance_loss_mlp": 1.02852178, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 2.592829029585313, + "language_loss": 0.84567511, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86696678, + "num_input_tokens_seen": 277910425, + "step": 12882, + "time_per_iteration": 2.5531561374664307 + }, + { + "auxiliary_loss_clip": 0.01004385, + "auxiliary_loss_mlp": 0.01004458, + "balance_loss_clip": 1.01874757, + "balance_loss_mlp": 1.00301576, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.7703362137093294, + "language_loss": 0.60428351, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62437195, + "num_input_tokens_seen": 277972795, + "step": 12883, + "time_per_iteration": 3.12441349029541 + }, + { + "auxiliary_loss_clip": 0.01058643, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.03379369, + "balance_loss_mlp": 1.02231228, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 1.8773506624971381, + "language_loss": 0.72758168, + "learning_rate": 5.094527395086416e-07, + "loss": 0.74853599, + "num_input_tokens_seen": 277990675, + "step": 12884, + "time_per_iteration": 2.53611159324646 + }, + { + "auxiliary_loss_clip": 0.01094114, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.03651059, + "balance_loss_mlp": 1.02263331, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 1.6085790226983008, + "language_loss": 0.80952728, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83080548, + "num_input_tokens_seen": 278010050, + "step": 12885, + "time_per_iteration": 2.493065595626831 + }, + { + "auxiliary_loss_clip": 0.01102695, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.03513694, + "balance_loss_mlp": 1.01949382, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 1.6843929265629287, + "language_loss": 0.63587934, + "learning_rate": 5.089334986059029e-07, + "loss": 0.65720952, + "num_input_tokens_seen": 278030660, + "step": 12886, + "time_per_iteration": 2.4858391284942627 + }, + { + "auxiliary_loss_clip": 0.01076374, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.03596902, + "balance_loss_mlp": 1.02129269, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 1.8596779462921018, + "language_loss": 0.69464242, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71573323, + "num_input_tokens_seen": 278047645, + "step": 12887, + "time_per_iteration": 2.535980224609375 + }, + { + "auxiliary_loss_clip": 0.01094241, + "auxiliary_loss_mlp": 0.01027711, + "balance_loss_clip": 1.0353514, + "balance_loss_mlp": 1.01640451, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 1.6941714927295886, + "language_loss": 0.70730853, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72852802, + "num_input_tokens_seen": 278066170, + "step": 12888, + "time_per_iteration": 2.4779396057128906 + }, + { + "auxiliary_loss_clip": 0.01095742, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.0352267, + "balance_loss_mlp": 1.01969242, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 1.6709601342722726, + "language_loss": 0.8185004, + "learning_rate": 5.081550613368279e-07, + "loss": 0.8397795, + "num_input_tokens_seen": 278085545, + "step": 12889, + "time_per_iteration": 2.513124704360962 + }, + { + "auxiliary_loss_clip": 0.01072397, + "auxiliary_loss_mlp": 0.0102948, + "balance_loss_clip": 1.03545356, + "balance_loss_mlp": 1.01720715, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 1.9040590529086199, + "language_loss": 0.79553878, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81655753, + "num_input_tokens_seen": 278102995, + "step": 12890, + "time_per_iteration": 2.5174496173858643 + }, + { + "auxiliary_loss_clip": 0.01079881, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.03914952, + "balance_loss_mlp": 1.01492977, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 1.778522388927819, + "language_loss": 0.66399539, + "learning_rate": 5.076363859955932e-07, + "loss": 0.68507022, + "num_input_tokens_seen": 278121460, + "step": 12891, + "time_per_iteration": 2.550403118133545 + }, + { + "auxiliary_loss_clip": 0.01095306, + "auxiliary_loss_mlp": 0.01031443, + "balance_loss_clip": 1.03537035, + "balance_loss_mlp": 1.01900363, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.4128962772659446, + "language_loss": 0.7863887, + "learning_rate": 5.073771332059257e-07, + "loss": 0.80765617, + "num_input_tokens_seen": 278143905, + "step": 12892, + "time_per_iteration": 2.5386195182800293 + }, + { + "auxiliary_loss_clip": 0.01097278, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.0372479, + "balance_loss_mlp": 1.01768947, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 2.4586518459598303, + "language_loss": 0.67211932, + "learning_rate": 5.071179370166669e-07, + "loss": 0.6933974, + "num_input_tokens_seen": 278160850, + "step": 12893, + "time_per_iteration": 2.4694736003875732 + }, + { + "auxiliary_loss_clip": 0.0102363, + "auxiliary_loss_mlp": 0.01003171, + "balance_loss_clip": 1.0105325, + "balance_loss_mlp": 1.00200868, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.8123943206604832, + "language_loss": 0.58554661, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60581458, + "num_input_tokens_seen": 278219950, + "step": 12894, + "time_per_iteration": 3.1675500869750977 + }, + { + "auxiliary_loss_clip": 0.01087477, + "auxiliary_loss_mlp": 0.01031255, + "balance_loss_clip": 1.03722823, + "balance_loss_mlp": 1.01804066, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 2.000014549500338, + "language_loss": 0.78188103, + "learning_rate": 5.065997144786895e-07, + "loss": 0.8030684, + "num_input_tokens_seen": 278237805, + "step": 12895, + "time_per_iteration": 2.5408387184143066 + }, + { + "auxiliary_loss_clip": 0.01066107, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.03516161, + "balance_loss_mlp": 1.01815677, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 1.7359405239578654, + "language_loss": 0.6766358, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69762111, + "num_input_tokens_seen": 278257660, + "step": 12896, + "time_per_iteration": 2.5418779850006104 + }, + { + "auxiliary_loss_clip": 0.01083477, + "auxiliary_loss_mlp": 0.01034399, + "balance_loss_clip": 1.03624821, + "balance_loss_mlp": 1.02282357, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.6755887197366992, + "language_loss": 0.68937492, + "learning_rate": 5.060817184602629e-07, + "loss": 0.71055365, + "num_input_tokens_seen": 278275110, + "step": 12897, + "time_per_iteration": 2.5474109649658203 + }, + { + "auxiliary_loss_clip": 0.01108961, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.03895187, + "balance_loss_mlp": 1.0205394, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 1.6138196934878455, + "language_loss": 0.74925184, + "learning_rate": 5.058228054204364e-07, + "loss": 0.77067804, + "num_input_tokens_seen": 278293035, + "step": 12898, + "time_per_iteration": 2.4736642837524414 + }, + { + "auxiliary_loss_clip": 0.01094929, + "auxiliary_loss_mlp": 0.00787603, + "balance_loss_clip": 1.03559113, + "balance_loss_mlp": 1.01348627, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 2.060665734551727, + "language_loss": 0.70018786, + "learning_rate": 5.055639490399588e-07, + "loss": 0.71901315, + "num_input_tokens_seen": 278311010, + "step": 12899, + "time_per_iteration": 3.974762201309204 + }, + { + "auxiliary_loss_clip": 0.01072895, + "auxiliary_loss_mlp": 0.01036156, + "balance_loss_clip": 1.0356282, + "balance_loss_mlp": 1.02291751, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 1.8978528963525316, + "language_loss": 0.75144708, + "learning_rate": 5.053051493286453e-07, + "loss": 0.77253759, + "num_input_tokens_seen": 278329900, + "step": 12900, + "time_per_iteration": 2.5825138092041016 + }, + { + "auxiliary_loss_clip": 0.01086255, + "auxiliary_loss_mlp": 0.01032898, + "balance_loss_clip": 1.03531981, + "balance_loss_mlp": 1.02147818, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 2.133929518407901, + "language_loss": 0.77101606, + "learning_rate": 5.050464062963113e-07, + "loss": 0.7922076, + "num_input_tokens_seen": 278349980, + "step": 12901, + "time_per_iteration": 2.5485048294067383 + }, + { + "auxiliary_loss_clip": 0.01095639, + "auxiliary_loss_mlp": 0.01029926, + "balance_loss_clip": 1.0384109, + "balance_loss_mlp": 1.01743293, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 1.4391343791859106, + "language_loss": 0.77400208, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79525763, + "num_input_tokens_seen": 278372485, + "step": 12902, + "time_per_iteration": 2.560375452041626 + }, + { + "auxiliary_loss_clip": 0.01095245, + "auxiliary_loss_mlp": 0.01030364, + "balance_loss_clip": 1.0360148, + "balance_loss_mlp": 1.01846075, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 3.003738254486541, + "language_loss": 0.73018539, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75144148, + "num_input_tokens_seen": 278391660, + "step": 12903, + "time_per_iteration": 5.283673048019409 + }, + { + "auxiliary_loss_clip": 0.01084367, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.03799486, + "balance_loss_mlp": 1.01376295, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 2.254434906746113, + "language_loss": 0.76045573, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78155422, + "num_input_tokens_seen": 278409125, + "step": 12904, + "time_per_iteration": 2.514991044998169 + }, + { + "auxiliary_loss_clip": 0.01102528, + "auxiliary_loss_mlp": 0.01027726, + "balance_loss_clip": 1.03629446, + "balance_loss_mlp": 1.01615047, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.102372589575635, + "language_loss": 0.68248528, + "learning_rate": 5.040120011529576e-07, + "loss": 0.7037878, + "num_input_tokens_seen": 278429450, + "step": 12905, + "time_per_iteration": 2.475625514984131 + }, + { + "auxiliary_loss_clip": 0.01090951, + "auxiliary_loss_mlp": 0.00785341, + "balance_loss_clip": 1.03968632, + "balance_loss_mlp": 1.01316357, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 1.6393277936426756, + "language_loss": 0.675403, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69416595, + "num_input_tokens_seen": 278449925, + "step": 12906, + "time_per_iteration": 2.5448105335235596 + }, + { + "auxiliary_loss_clip": 0.01068475, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.03376281, + "balance_loss_mlp": 1.01879883, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 1.8986056788164603, + "language_loss": 0.80986983, + "learning_rate": 5.034951389101498e-07, + "loss": 0.8308692, + "num_input_tokens_seen": 278467255, + "step": 12907, + "time_per_iteration": 2.529893159866333 + }, + { + "auxiliary_loss_clip": 0.01091782, + "auxiliary_loss_mlp": 0.01034434, + "balance_loss_clip": 1.03646803, + "balance_loss_mlp": 1.02251351, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.055720138657292, + "language_loss": 0.67554367, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69680583, + "num_input_tokens_seen": 278484250, + "step": 12908, + "time_per_iteration": 2.463670015335083 + }, + { + "auxiliary_loss_clip": 0.01074398, + "auxiliary_loss_mlp": 0.01038094, + "balance_loss_clip": 1.03530741, + "balance_loss_mlp": 1.02572584, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.6513786656273939, + "language_loss": 0.70369214, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72481704, + "num_input_tokens_seen": 278502740, + "step": 12909, + "time_per_iteration": 3.933603525161743 + }, + { + "auxiliary_loss_clip": 0.01091856, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.03556156, + "balance_loss_mlp": 1.02287793, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 1.66737617909252, + "language_loss": 0.67869174, + "learning_rate": 5.027202711775324e-07, + "loss": 0.69995534, + "num_input_tokens_seen": 278523890, + "step": 12910, + "time_per_iteration": 2.5230565071105957 + }, + { + "auxiliary_loss_clip": 0.01060684, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.0361762, + "balance_loss_mlp": 1.02315676, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 1.5998886340337886, + "language_loss": 0.71941864, + "learning_rate": 5.024620954742646e-07, + "loss": 0.74037147, + "num_input_tokens_seen": 278543185, + "step": 12911, + "time_per_iteration": 2.6369431018829346 + }, + { + "auxiliary_loss_clip": 0.01108764, + "auxiliary_loss_mlp": 0.00785307, + "balance_loss_clip": 1.03829813, + "balance_loss_mlp": 1.00792587, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 2.8794336503562725, + "language_loss": 0.63379043, + "learning_rate": 5.022039765577836e-07, + "loss": 0.65273118, + "num_input_tokens_seen": 278559220, + "step": 12912, + "time_per_iteration": 2.4639651775360107 + }, + { + "auxiliary_loss_clip": 0.01011859, + "auxiliary_loss_mlp": 0.01003198, + "balance_loss_clip": 1.01132727, + "balance_loss_mlp": 1.00188696, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 0.7783884063267448, + "language_loss": 0.53276455, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55291522, + "num_input_tokens_seen": 278618185, + "step": 12913, + "time_per_iteration": 3.2307779788970947 + }, + { + "auxiliary_loss_clip": 0.01085818, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.03763425, + "balance_loss_mlp": 1.01878595, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 2.0391695118996567, + "language_loss": 0.61929512, + "learning_rate": 5.016879091243338e-07, + "loss": 0.64046395, + "num_input_tokens_seen": 278636210, + "step": 12914, + "time_per_iteration": 2.580322027206421 + }, + { + "auxiliary_loss_clip": 0.0108508, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.03780627, + "balance_loss_mlp": 1.0201602, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 2.3077914384482345, + "language_loss": 0.82444942, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84562391, + "num_input_tokens_seen": 278653305, + "step": 12915, + "time_per_iteration": 2.5242555141448975 + }, + { + "auxiliary_loss_clip": 0.01094705, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.03775442, + "balance_loss_mlp": 1.02068806, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 1.6455940207899116, + "language_loss": 0.74796617, + "learning_rate": 5.011720689554603e-07, + "loss": 0.76925009, + "num_input_tokens_seen": 278671850, + "step": 12916, + "time_per_iteration": 2.5551278591156006 + }, + { + "auxiliary_loss_clip": 0.01052424, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.03647602, + "balance_loss_mlp": 1.02183461, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.4817817907499973, + "language_loss": 0.65755397, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67843521, + "num_input_tokens_seen": 278697860, + "step": 12917, + "time_per_iteration": 2.9118571281433105 + }, + { + "auxiliary_loss_clip": 0.01094318, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.03384984, + "balance_loss_mlp": 1.018978, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 1.4138640183735087, + "language_loss": 0.64509696, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66634983, + "num_input_tokens_seen": 278720655, + "step": 12918, + "time_per_iteration": 2.5666329860687256 + }, + { + "auxiliary_loss_clip": 0.01104898, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.03653657, + "balance_loss_mlp": 1.01881671, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.0305901212625934, + "language_loss": 0.73539841, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75675428, + "num_input_tokens_seen": 278737375, + "step": 12919, + "time_per_iteration": 2.493507146835327 + }, + { + "auxiliary_loss_clip": 0.01064181, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.03635013, + "balance_loss_mlp": 1.01829338, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 1.7212368572927992, + "language_loss": 0.79109669, + "learning_rate": 5.001410707243792e-07, + "loss": 0.81205332, + "num_input_tokens_seen": 278756510, + "step": 12920, + "time_per_iteration": 2.623487949371338 + }, + { + "auxiliary_loss_clip": 0.01096629, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.03744912, + "balance_loss_mlp": 1.01791596, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 1.5820370882808008, + "language_loss": 0.70773095, + "learning_rate": 4.998834633291829e-07, + "loss": 0.72899717, + "num_input_tokens_seen": 278775410, + "step": 12921, + "time_per_iteration": 2.49204158782959 + }, + { + "auxiliary_loss_clip": 0.01099695, + "auxiliary_loss_mlp": 0.01031532, + "balance_loss_clip": 1.03784323, + "balance_loss_mlp": 1.01837134, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 1.71344842786542, + "language_loss": 0.76095855, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78227085, + "num_input_tokens_seen": 278794260, + "step": 12922, + "time_per_iteration": 2.545332431793213 + }, + { + "auxiliary_loss_clip": 0.01058167, + "auxiliary_loss_mlp": 0.01036864, + "balance_loss_clip": 1.03500843, + "balance_loss_mlp": 1.02432954, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.6932111265116463, + "language_loss": 0.80342996, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82438022, + "num_input_tokens_seen": 278813290, + "step": 12923, + "time_per_iteration": 2.5614752769470215 + }, + { + "auxiliary_loss_clip": 0.01068037, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.03732896, + "balance_loss_mlp": 1.01944304, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 1.9328724881114492, + "language_loss": 0.92231548, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94331026, + "num_input_tokens_seen": 278830610, + "step": 12924, + "time_per_iteration": 2.565948486328125 + }, + { + "auxiliary_loss_clip": 0.01093495, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.03521216, + "balance_loss_mlp": 1.01697922, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 2.048887916207168, + "language_loss": 0.66275996, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68398976, + "num_input_tokens_seen": 278849530, + "step": 12925, + "time_per_iteration": 2.5196340084075928 + }, + { + "auxiliary_loss_clip": 0.01074588, + "auxiliary_loss_mlp": 0.0103152, + "balance_loss_clip": 1.03689957, + "balance_loss_mlp": 1.01944995, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 1.6469959909624128, + "language_loss": 0.71788496, + "learning_rate": 4.985962798170314e-07, + "loss": 0.73894602, + "num_input_tokens_seen": 278869005, + "step": 12926, + "time_per_iteration": 2.5999913215637207 + }, + { + "auxiliary_loss_clip": 0.01096656, + "auxiliary_loss_mlp": 0.01027679, + "balance_loss_clip": 1.03615761, + "balance_loss_mlp": 1.01508999, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 1.7004420593574334, + "language_loss": 0.65462631, + "learning_rate": 4.983390138757027e-07, + "loss": 0.67586958, + "num_input_tokens_seen": 278888790, + "step": 12927, + "time_per_iteration": 2.5170023441314697 + }, + { + "auxiliary_loss_clip": 0.01084328, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.03704381, + "balance_loss_mlp": 1.02313423, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 1.778573033140268, + "language_loss": 0.72477388, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74597692, + "num_input_tokens_seen": 278908150, + "step": 12928, + "time_per_iteration": 2.5482072830200195 + }, + { + "auxiliary_loss_clip": 0.01062697, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.03371322, + "balance_loss_mlp": 1.01871252, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.704993161563091, + "language_loss": 0.74435419, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76529545, + "num_input_tokens_seen": 278927425, + "step": 12929, + "time_per_iteration": 2.587257146835327 + }, + { + "auxiliary_loss_clip": 0.01069701, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.03721619, + "balance_loss_mlp": 1.01828194, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 1.7848517174143814, + "language_loss": 0.7757982, + "learning_rate": 4.975675577495377e-07, + "loss": 0.79680002, + "num_input_tokens_seen": 278946475, + "step": 12930, + "time_per_iteration": 2.559598207473755 + }, + { + "auxiliary_loss_clip": 0.01107516, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.0388751, + "balance_loss_mlp": 1.01999879, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 1.924953217864129, + "language_loss": 0.80043161, + "learning_rate": 4.973105196392613e-07, + "loss": 0.82183093, + "num_input_tokens_seen": 278964345, + "step": 12931, + "time_per_iteration": 2.4600746631622314 + }, + { + "auxiliary_loss_clip": 0.01013554, + "auxiliary_loss_mlp": 0.01002453, + "balance_loss_clip": 1.03431642, + "balance_loss_mlp": 1.00084364, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.8164212293045781, + "language_loss": 0.59782535, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61798543, + "num_input_tokens_seen": 279022380, + "step": 12932, + "time_per_iteration": 3.1208577156066895 + }, + { + "auxiliary_loss_clip": 0.01096798, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.03758717, + "balance_loss_mlp": 1.01965475, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 1.4289576189573685, + "language_loss": 0.76217765, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78345782, + "num_input_tokens_seen": 279044275, + "step": 12933, + "time_per_iteration": 2.5824007987976074 + }, + { + "auxiliary_loss_clip": 0.01087262, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.03740454, + "balance_loss_mlp": 1.02067435, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 2.3520390802465974, + "language_loss": 0.72849488, + "learning_rate": 4.965397472402215e-07, + "loss": 0.74970233, + "num_input_tokens_seen": 279063375, + "step": 12934, + "time_per_iteration": 2.5667359828948975 + }, + { + "auxiliary_loss_clip": 0.01062665, + "auxiliary_loss_mlp": 0.01026732, + "balance_loss_clip": 1.03363681, + "balance_loss_mlp": 1.01379132, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 1.853996513206363, + "language_loss": 0.70207101, + "learning_rate": 4.962829371169475e-07, + "loss": 0.722965, + "num_input_tokens_seen": 279082680, + "step": 12935, + "time_per_iteration": 2.5957260131835938 + }, + { + "auxiliary_loss_clip": 0.01081086, + "auxiliary_loss_mlp": 0.0078621, + "balance_loss_clip": 1.03693163, + "balance_loss_mlp": 1.01139534, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.7640495639235767, + "language_loss": 0.83631837, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85499132, + "num_input_tokens_seen": 279099805, + "step": 12936, + "time_per_iteration": 2.5335693359375 + }, + { + "auxiliary_loss_clip": 0.01099009, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.03612542, + "balance_loss_mlp": 1.01579905, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 1.9114266800232635, + "language_loss": 0.67606688, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69733155, + "num_input_tokens_seen": 279117975, + "step": 12937, + "time_per_iteration": 2.483898878097534 + }, + { + "auxiliary_loss_clip": 0.01106144, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.03572464, + "balance_loss_mlp": 1.01664877, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 1.4442072333579719, + "language_loss": 0.87286329, + "learning_rate": 4.955128489126777e-07, + "loss": 0.89421242, + "num_input_tokens_seen": 279137255, + "step": 12938, + "time_per_iteration": 3.897878646850586 + }, + { + "auxiliary_loss_clip": 0.01094748, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.03606975, + "balance_loss_mlp": 1.01972044, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 1.9771756057164271, + "language_loss": 0.85028434, + "learning_rate": 4.95256266932218e-07, + "loss": 0.87155515, + "num_input_tokens_seen": 279154500, + "step": 12939, + "time_per_iteration": 2.5126852989196777 + }, + { + "auxiliary_loss_clip": 0.01103551, + "auxiliary_loss_mlp": 0.00785608, + "balance_loss_clip": 1.03654099, + "balance_loss_mlp": 1.01334047, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 1.68106654703397, + "language_loss": 0.6903913, + "learning_rate": 4.949997420117915e-07, + "loss": 0.70928288, + "num_input_tokens_seen": 279173635, + "step": 12940, + "time_per_iteration": 2.45277738571167 + }, + { + "auxiliary_loss_clip": 0.0107101, + "auxiliary_loss_mlp": 0.01026806, + "balance_loss_clip": 1.03447509, + "balance_loss_mlp": 1.01541531, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 1.5054133372951808, + "language_loss": 0.77872694, + "learning_rate": 4.947432741611255e-07, + "loss": 0.79970515, + "num_input_tokens_seen": 279194430, + "step": 12941, + "time_per_iteration": 3.989126205444336 + }, + { + "auxiliary_loss_clip": 0.01100105, + "auxiliary_loss_mlp": 0.01033884, + "balance_loss_clip": 1.03739524, + "balance_loss_mlp": 1.02048492, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 2.2106376287375618, + "language_loss": 0.73262519, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75396514, + "num_input_tokens_seen": 279212920, + "step": 12942, + "time_per_iteration": 4.04442286491394 + }, + { + "auxiliary_loss_clip": 0.01051446, + "auxiliary_loss_mlp": 0.01039755, + "balance_loss_clip": 1.03435421, + "balance_loss_mlp": 1.02670765, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 2.841527811606394, + "language_loss": 0.67912912, + "learning_rate": 4.942305097079751e-07, + "loss": 0.70004112, + "num_input_tokens_seen": 279232310, + "step": 12943, + "time_per_iteration": 2.588712692260742 + }, + { + "auxiliary_loss_clip": 0.01014298, + "auxiliary_loss_mlp": 0.01015442, + "balance_loss_clip": 1.01334476, + "balance_loss_mlp": 1.01390445, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7808201226025938, + "language_loss": 0.58488983, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60518724, + "num_input_tokens_seen": 279295375, + "step": 12944, + "time_per_iteration": 3.306283712387085 + }, + { + "auxiliary_loss_clip": 0.01107599, + "auxiliary_loss_mlp": 0.01036376, + "balance_loss_clip": 1.03653681, + "balance_loss_mlp": 1.02300692, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 2.444858844469556, + "language_loss": 0.67997372, + "learning_rate": 4.937179736505428e-07, + "loss": 0.70141351, + "num_input_tokens_seen": 279313660, + "step": 12945, + "time_per_iteration": 2.4550700187683105 + }, + { + "auxiliary_loss_clip": 0.01093082, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.03604198, + "balance_loss_mlp": 1.02049005, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 1.8182410455411544, + "language_loss": 0.69411945, + "learning_rate": 4.93461791294516e-07, + "loss": 0.71538132, + "num_input_tokens_seen": 279334495, + "step": 12946, + "time_per_iteration": 2.5272586345672607 + }, + { + "auxiliary_loss_clip": 0.01106675, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.0377512, + "balance_loss_mlp": 1.0169524, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 1.7815527393535227, + "language_loss": 0.65527618, + "learning_rate": 4.932056660665689e-07, + "loss": 0.6766392, + "num_input_tokens_seen": 279352985, + "step": 12947, + "time_per_iteration": 2.4667985439300537 + }, + { + "auxiliary_loss_clip": 0.01044243, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.03271627, + "balance_loss_mlp": 1.02490819, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 2.5029606321594815, + "language_loss": 0.65083426, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67166501, + "num_input_tokens_seen": 279371360, + "step": 12948, + "time_per_iteration": 4.001905679702759 + }, + { + "auxiliary_loss_clip": 0.01106456, + "auxiliary_loss_mlp": 0.01032234, + "balance_loss_clip": 1.03699863, + "balance_loss_mlp": 1.01980615, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 1.704056838096069, + "language_loss": 0.75091684, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77230376, + "num_input_tokens_seen": 279389400, + "step": 12949, + "time_per_iteration": 2.46199107170105 + }, + { + "auxiliary_loss_clip": 0.01110088, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.03796172, + "balance_loss_mlp": 1.01987922, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 1.6082704531930117, + "language_loss": 0.69083387, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71225929, + "num_input_tokens_seen": 279409715, + "step": 12950, + "time_per_iteration": 2.475525379180908 + }, + { + "auxiliary_loss_clip": 0.01089174, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.03557587, + "balance_loss_mlp": 1.02020586, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 1.706537559210045, + "language_loss": 0.71940571, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74062324, + "num_input_tokens_seen": 279427705, + "step": 12951, + "time_per_iteration": 2.5303313732147217 + }, + { + "auxiliary_loss_clip": 0.01082316, + "auxiliary_loss_mlp": 0.01034274, + "balance_loss_clip": 1.03553927, + "balance_loss_mlp": 1.02209067, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 1.859370861220475, + "language_loss": 0.65541434, + "learning_rate": 4.919258971878877e-07, + "loss": 0.67658025, + "num_input_tokens_seen": 279448215, + "step": 12952, + "time_per_iteration": 2.5791444778442383 + }, + { + "auxiliary_loss_clip": 0.01075958, + "auxiliary_loss_mlp": 0.01027231, + "balance_loss_clip": 1.03353834, + "balance_loss_mlp": 1.01609087, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 1.712883752670185, + "language_loss": 0.81388128, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83491313, + "num_input_tokens_seen": 279466260, + "step": 12953, + "time_per_iteration": 2.5463006496429443 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.04009604, + "balance_loss_mlp": 1.018489, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 2.1618789658045756, + "language_loss": 0.7686497, + "learning_rate": 4.91414389872737e-07, + "loss": 0.79007137, + "num_input_tokens_seen": 279484520, + "step": 12954, + "time_per_iteration": 2.450113296508789 + }, + { + "auxiliary_loss_clip": 0.01095679, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_clip": 1.03582919, + "balance_loss_mlp": 1.01986408, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.6873628181255842, + "language_loss": 0.72919893, + "learning_rate": 4.911587220188905e-07, + "loss": 0.75047338, + "num_input_tokens_seen": 279503130, + "step": 12955, + "time_per_iteration": 2.502089500427246 + }, + { + "auxiliary_loss_clip": 0.01072583, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.03347862, + "balance_loss_mlp": 1.02202129, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.9662635334752019, + "language_loss": 0.68497795, + "learning_rate": 4.909031113804551e-07, + "loss": 0.70605028, + "num_input_tokens_seen": 279521930, + "step": 12956, + "time_per_iteration": 2.551457405090332 + }, + { + "auxiliary_loss_clip": 0.01071113, + "auxiliary_loss_mlp": 0.01032668, + "balance_loss_clip": 1.03547525, + "balance_loss_mlp": 1.02085447, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 1.5888472766298276, + "language_loss": 0.75827169, + "learning_rate": 4.906475579671252e-07, + "loss": 0.77930951, + "num_input_tokens_seen": 279542375, + "step": 12957, + "time_per_iteration": 2.6357386112213135 + }, + { + "auxiliary_loss_clip": 0.01035153, + "auxiliary_loss_mlp": 0.01028722, + "balance_loss_clip": 1.03603792, + "balance_loss_mlp": 1.0165925, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 2.050122423796401, + "language_loss": 0.77743489, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79807359, + "num_input_tokens_seen": 279561885, + "step": 12958, + "time_per_iteration": 2.762251615524292 + }, + { + "auxiliary_loss_clip": 0.01092258, + "auxiliary_loss_mlp": 0.01042206, + "balance_loss_clip": 1.03527546, + "balance_loss_mlp": 1.02774, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 2.278057625011404, + "language_loss": 0.71247244, + "learning_rate": 4.901366228545418e-07, + "loss": 0.7338171, + "num_input_tokens_seen": 279579965, + "step": 12959, + "time_per_iteration": 2.6730854511260986 + }, + { + "auxiliary_loss_clip": 0.01096092, + "auxiliary_loss_mlp": 0.00784726, + "balance_loss_clip": 1.0374707, + "balance_loss_mlp": 1.01091146, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 1.5703243308243429, + "language_loss": 0.77918518, + "learning_rate": 4.898812411746632e-07, + "loss": 0.79799342, + "num_input_tokens_seen": 279599030, + "step": 12960, + "time_per_iteration": 2.5081260204315186 + }, + { + "auxiliary_loss_clip": 0.01098066, + "auxiliary_loss_mlp": 0.01034913, + "balance_loss_clip": 1.03739309, + "balance_loss_mlp": 1.02242017, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 1.782267142108547, + "language_loss": 0.75358009, + "learning_rate": 4.896259167586385e-07, + "loss": 0.77490985, + "num_input_tokens_seen": 279614400, + "step": 12961, + "time_per_iteration": 2.506835460662842 + }, + { + "auxiliary_loss_clip": 0.01082938, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.03910542, + "balance_loss_mlp": 1.02468777, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 1.583035338536796, + "language_loss": 0.73798847, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75918138, + "num_input_tokens_seen": 279633745, + "step": 12962, + "time_per_iteration": 2.5578320026397705 + }, + { + "auxiliary_loss_clip": 0.01094711, + "auxiliary_loss_mlp": 0.01027507, + "balance_loss_clip": 1.03663731, + "balance_loss_mlp": 1.0159024, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 1.9736474787128522, + "language_loss": 0.69942188, + "learning_rate": 4.891154397568795e-07, + "loss": 0.72064412, + "num_input_tokens_seen": 279651165, + "step": 12963, + "time_per_iteration": 2.4803755283355713 + }, + { + "auxiliary_loss_clip": 0.0109519, + "auxiliary_loss_mlp": 0.00784175, + "balance_loss_clip": 1.0374186, + "balance_loss_mlp": 1.01131034, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 1.7729299483010115, + "language_loss": 0.63710546, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65589911, + "num_input_tokens_seen": 279671175, + "step": 12964, + "time_per_iteration": 2.5842459201812744 + }, + { + "auxiliary_loss_clip": 0.01087018, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.03650248, + "balance_loss_mlp": 1.01992297, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 1.623509227811248, + "language_loss": 0.76634622, + "learning_rate": 4.88605191926694e-07, + "loss": 0.78753543, + "num_input_tokens_seen": 279688675, + "step": 12965, + "time_per_iteration": 2.573005199432373 + }, + { + "auxiliary_loss_clip": 0.01085036, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.03390372, + "balance_loss_mlp": 1.02366734, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 1.4252090849960406, + "language_loss": 0.72780955, + "learning_rate": 4.883501539751289e-07, + "loss": 0.74902236, + "num_input_tokens_seen": 279710245, + "step": 12966, + "time_per_iteration": 2.541987180709839 + }, + { + "auxiliary_loss_clip": 0.01083125, + "auxiliary_loss_mlp": 0.00781556, + "balance_loss_clip": 1.03764391, + "balance_loss_mlp": 1.00690246, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 1.4780310990132308, + "language_loss": 0.74451321, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76315999, + "num_input_tokens_seen": 279729045, + "step": 12967, + "time_per_iteration": 2.5361344814300537 + }, + { + "auxiliary_loss_clip": 0.01107239, + "auxiliary_loss_mlp": 0.01031308, + "balance_loss_clip": 1.03791785, + "balance_loss_mlp": 1.01852298, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 2.275931410346895, + "language_loss": 0.71849239, + "learning_rate": 4.878402500474073e-07, + "loss": 0.73987782, + "num_input_tokens_seen": 279748350, + "step": 12968, + "time_per_iteration": 2.47818660736084 + }, + { + "auxiliary_loss_clip": 0.01081279, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.03751874, + "balance_loss_mlp": 1.02350473, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 1.8155442411700582, + "language_loss": 0.607952, + "learning_rate": 4.875853840905874e-07, + "loss": 0.62912267, + "num_input_tokens_seen": 279765620, + "step": 12969, + "time_per_iteration": 2.4996719360351562 + }, + { + "auxiliary_loss_clip": 0.01085669, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.03537416, + "balance_loss_mlp": 1.02180302, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.991097124185547, + "language_loss": 0.70322776, + "learning_rate": 4.873305754846811e-07, + "loss": 0.724419, + "num_input_tokens_seen": 279782485, + "step": 12970, + "time_per_iteration": 2.5082459449768066 + }, + { + "auxiliary_loss_clip": 0.01075948, + "auxiliary_loss_mlp": 0.00784744, + "balance_loss_clip": 1.03868032, + "balance_loss_mlp": 1.01065886, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.6565343189392445, + "language_loss": 0.71988404, + "learning_rate": 4.870758242393507e-07, + "loss": 0.73849094, + "num_input_tokens_seen": 279804170, + "step": 12971, + "time_per_iteration": 2.674285888671875 + }, + { + "auxiliary_loss_clip": 0.01072076, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.03513098, + "balance_loss_mlp": 1.02062833, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 1.6269681296274814, + "language_loss": 0.74558151, + "learning_rate": 4.868211303642578e-07, + "loss": 0.7666384, + "num_input_tokens_seen": 279823730, + "step": 12972, + "time_per_iteration": 2.5682570934295654 + }, + { + "auxiliary_loss_clip": 0.01104148, + "auxiliary_loss_mlp": 0.01025523, + "balance_loss_clip": 1.03504503, + "balance_loss_mlp": 1.01314282, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 1.771861243927943, + "language_loss": 0.71290284, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73419952, + "num_input_tokens_seen": 279843035, + "step": 12973, + "time_per_iteration": 2.44003963470459 + }, + { + "auxiliary_loss_clip": 0.01091771, + "auxiliary_loss_mlp": 0.01032855, + "balance_loss_clip": 1.03592277, + "balance_loss_mlp": 1.02166748, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 2.0215921607350023, + "language_loss": 0.7764262, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79767251, + "num_input_tokens_seen": 279861450, + "step": 12974, + "time_per_iteration": 2.4945425987243652 + }, + { + "auxiliary_loss_clip": 0.01069761, + "auxiliary_loss_mlp": 0.01031, + "balance_loss_clip": 1.03347921, + "balance_loss_mlp": 1.01840568, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.4783044117133326, + "language_loss": 0.6919719, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71297944, + "num_input_tokens_seen": 279878660, + "step": 12975, + "time_per_iteration": 2.521122932434082 + }, + { + "auxiliary_loss_clip": 0.01068899, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.03532207, + "balance_loss_mlp": 1.01769781, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 1.8185517625165502, + "language_loss": 0.81801057, + "learning_rate": 4.858029287593739e-07, + "loss": 0.83899438, + "num_input_tokens_seen": 279895685, + "step": 12976, + "time_per_iteration": 2.561774730682373 + }, + { + "auxiliary_loss_clip": 0.01084957, + "auxiliary_loss_mlp": 0.00784574, + "balance_loss_clip": 1.03467453, + "balance_loss_mlp": 1.01215589, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.4444527302242933, + "language_loss": 0.6576066, + "learning_rate": 4.85548521880289e-07, + "loss": 0.6763019, + "num_input_tokens_seen": 279917240, + "step": 12977, + "time_per_iteration": 3.9706859588623047 + }, + { + "auxiliary_loss_clip": 0.01083148, + "auxiliary_loss_mlp": 0.01028803, + "balance_loss_clip": 1.03658593, + "balance_loss_mlp": 1.01731682, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 2.100351325099898, + "language_loss": 0.74873257, + "learning_rate": 4.852941724293554e-07, + "loss": 0.76985204, + "num_input_tokens_seen": 279938665, + "step": 12978, + "time_per_iteration": 2.6065220832824707 + }, + { + "auxiliary_loss_clip": 0.01080681, + "auxiliary_loss_mlp": 0.01038509, + "balance_loss_clip": 1.03404391, + "balance_loss_mlp": 1.02378631, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 1.8108874370288688, + "language_loss": 0.6227107, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64390266, + "num_input_tokens_seen": 279957965, + "step": 12979, + "time_per_iteration": 2.5532548427581787 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.03745246, + "balance_loss_mlp": 1.01914787, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 1.9458233394351334, + "language_loss": 0.76713705, + "learning_rate": 4.847856458505217e-07, + "loss": 0.78851008, + "num_input_tokens_seen": 279977490, + "step": 12980, + "time_per_iteration": 3.9115207195281982 + }, + { + "auxiliary_loss_clip": 0.01107574, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.03743875, + "balance_loss_mlp": 1.02305007, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 2.024296450085091, + "language_loss": 0.7743414, + "learning_rate": 4.845314687419046e-07, + "loss": 0.79576337, + "num_input_tokens_seen": 279994220, + "step": 12981, + "time_per_iteration": 3.8836231231689453 + }, + { + "auxiliary_loss_clip": 0.01071359, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.03703094, + "balance_loss_mlp": 1.02284837, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 1.6773429407857483, + "language_loss": 0.72400403, + "learning_rate": 4.842773491000067e-07, + "loss": 0.74506795, + "num_input_tokens_seen": 280012590, + "step": 12982, + "time_per_iteration": 2.59193754196167 + }, + { + "auxiliary_loss_clip": 0.01079035, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.0359745, + "balance_loss_mlp": 1.01914978, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.328367214279769, + "language_loss": 0.73397791, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75507432, + "num_input_tokens_seen": 280033700, + "step": 12983, + "time_per_iteration": 2.564222812652588 + }, + { + "auxiliary_loss_clip": 0.0107844, + "auxiliary_loss_mlp": 0.01028686, + "balance_loss_clip": 1.03594995, + "balance_loss_mlp": 1.01716483, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 1.791045093096207, + "language_loss": 0.74619615, + "learning_rate": 4.837692822549086e-07, + "loss": 0.76726735, + "num_input_tokens_seen": 280052215, + "step": 12984, + "time_per_iteration": 2.514319658279419 + }, + { + "auxiliary_loss_clip": 0.01077598, + "auxiliary_loss_mlp": 0.01037195, + "balance_loss_clip": 1.03332996, + "balance_loss_mlp": 1.02554822, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 1.8340869227038836, + "language_loss": 0.81187469, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83302265, + "num_input_tokens_seen": 280070525, + "step": 12985, + "time_per_iteration": 2.507357358932495 + }, + { + "auxiliary_loss_clip": 0.01083814, + "auxiliary_loss_mlp": 0.01031308, + "balance_loss_clip": 1.03588116, + "balance_loss_mlp": 1.01913059, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 1.5624243066085874, + "language_loss": 0.77335191, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79450309, + "num_input_tokens_seen": 280089855, + "step": 12986, + "time_per_iteration": 2.529770612716675 + }, + { + "auxiliary_loss_clip": 0.01094501, + "auxiliary_loss_mlp": 0.01033452, + "balance_loss_clip": 1.03462696, + "balance_loss_mlp": 1.02159691, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 1.701468706056767, + "language_loss": 0.74140358, + "learning_rate": 4.830076132284859e-07, + "loss": 0.76268303, + "num_input_tokens_seen": 280109960, + "step": 12987, + "time_per_iteration": 3.9619791507720947 + }, + { + "auxiliary_loss_clip": 0.01021872, + "auxiliary_loss_mlp": 0.0099874, + "balance_loss_clip": 1.0091846, + "balance_loss_mlp": 0.9976905, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.745797502437112, + "language_loss": 0.55118859, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57139468, + "num_input_tokens_seen": 280169805, + "step": 12988, + "time_per_iteration": 3.128983736038208 + }, + { + "auxiliary_loss_clip": 0.01079788, + "auxiliary_loss_mlp": 0.01034476, + "balance_loss_clip": 1.03476262, + "balance_loss_mlp": 1.02284157, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 3.9449606509766446, + "language_loss": 0.80530286, + "learning_rate": 4.82500121484009e-07, + "loss": 0.82644558, + "num_input_tokens_seen": 280184630, + "step": 12989, + "time_per_iteration": 2.504418134689331 + }, + { + "auxiliary_loss_clip": 0.0107102, + "auxiliary_loss_mlp": 0.0102809, + "balance_loss_clip": 1.03460813, + "balance_loss_mlp": 1.01622236, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 1.5126927566730415, + "language_loss": 0.70363891, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72463, + "num_input_tokens_seen": 280203880, + "step": 12990, + "time_per_iteration": 2.5748088359832764 + }, + { + "auxiliary_loss_clip": 0.01080084, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.03425789, + "balance_loss_mlp": 1.01707268, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 1.9854706215796256, + "language_loss": 0.7772373, + "learning_rate": 4.819928599145184e-07, + "loss": 0.79834378, + "num_input_tokens_seen": 280220460, + "step": 12991, + "time_per_iteration": 2.5005359649658203 + }, + { + "auxiliary_loss_clip": 0.01066697, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.0336895, + "balance_loss_mlp": 1.01991987, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.4971879407099793, + "language_loss": 0.66160643, + "learning_rate": 4.817393154694398e-07, + "loss": 0.68259609, + "num_input_tokens_seen": 280242680, + "step": 12992, + "time_per_iteration": 2.7561628818511963 + }, + { + "auxiliary_loss_clip": 0.01106391, + "auxiliary_loss_mlp": 0.01029384, + "balance_loss_clip": 1.03698492, + "balance_loss_mlp": 1.0173018, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 1.788657001942646, + "language_loss": 0.61757773, + "learning_rate": 4.814858285969578e-07, + "loss": 0.63893545, + "num_input_tokens_seen": 280260655, + "step": 12993, + "time_per_iteration": 2.4816274642944336 + }, + { + "auxiliary_loss_clip": 0.0108075, + "auxiliary_loss_mlp": 0.01028463, + "balance_loss_clip": 1.03463018, + "balance_loss_mlp": 1.0161128, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.67214780946007, + "language_loss": 0.6848135, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70590562, + "num_input_tokens_seen": 280281185, + "step": 12994, + "time_per_iteration": 2.556504487991333 + }, + { + "auxiliary_loss_clip": 0.01102348, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.03480291, + "balance_loss_mlp": 1.01518703, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 2.019807541894518, + "language_loss": 0.68796498, + "learning_rate": 4.809790276082335e-07, + "loss": 0.70925367, + "num_input_tokens_seen": 280298255, + "step": 12995, + "time_per_iteration": 2.4788713455200195 + }, + { + "auxiliary_loss_clip": 0.01069015, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.03596354, + "balance_loss_mlp": 1.01721692, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 1.736260922343832, + "language_loss": 0.74691796, + "learning_rate": 4.807257135112088e-07, + "loss": 0.76789039, + "num_input_tokens_seen": 280319000, + "step": 12996, + "time_per_iteration": 2.5799834728240967 + }, + { + "auxiliary_loss_clip": 0.01110034, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.03784966, + "balance_loss_mlp": 1.02097225, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 5.2576105419936106, + "language_loss": 0.68117839, + "learning_rate": 4.804724570252167e-07, + "loss": 0.70261282, + "num_input_tokens_seen": 280336375, + "step": 12997, + "time_per_iteration": 2.4372987747192383 + }, + { + "auxiliary_loss_clip": 0.01108863, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.03705883, + "balance_loss_mlp": 1.0205822, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.6294597606476013, + "language_loss": 0.81947386, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84089589, + "num_input_tokens_seen": 280358760, + "step": 12998, + "time_per_iteration": 2.4848928451538086 + }, + { + "auxiliary_loss_clip": 0.01077148, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.03133488, + "balance_loss_mlp": 1.02274287, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 2.8903696000746297, + "language_loss": 0.74348688, + "learning_rate": 4.799661169247453e-07, + "loss": 0.76461893, + "num_input_tokens_seen": 280377085, + "step": 12999, + "time_per_iteration": 2.520977020263672 + }, + { + "auxiliary_loss_clip": 0.01092689, + "auxiliary_loss_mlp": 0.01035565, + "balance_loss_clip": 1.03556609, + "balance_loss_mlp": 1.02176094, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 1.4623253388498227, + "language_loss": 0.84600073, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86728334, + "num_input_tokens_seen": 280395465, + "step": 13000, + "time_per_iteration": 2.4857125282287598 + }, + { + "auxiliary_loss_clip": 0.01096175, + "auxiliary_loss_mlp": 0.01031254, + "balance_loss_clip": 1.03685081, + "balance_loss_mlp": 1.01916647, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 1.7203575673763005, + "language_loss": 0.66170025, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68297446, + "num_input_tokens_seen": 280412775, + "step": 13001, + "time_per_iteration": 2.50039005279541 + }, + { + "auxiliary_loss_clip": 0.01067753, + "auxiliary_loss_mlp": 0.01034241, + "balance_loss_clip": 1.03512573, + "balance_loss_mlp": 1.02234387, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 1.4573836362276584, + "language_loss": 0.66849542, + "learning_rate": 4.792070390968027e-07, + "loss": 0.68951535, + "num_input_tokens_seen": 280432905, + "step": 13002, + "time_per_iteration": 2.5946924686431885 + }, + { + "auxiliary_loss_clip": 0.01098044, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.03766751, + "balance_loss_mlp": 1.02054453, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 2.0913026452602153, + "language_loss": 0.73451805, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75583059, + "num_input_tokens_seen": 280450785, + "step": 13003, + "time_per_iteration": 2.502394199371338 + }, + { + "auxiliary_loss_clip": 0.01095045, + "auxiliary_loss_mlp": 0.01033343, + "balance_loss_clip": 1.03659058, + "balance_loss_mlp": 1.02110648, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 1.7069110243573784, + "language_loss": 0.62243938, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64372325, + "num_input_tokens_seen": 280468400, + "step": 13004, + "time_per_iteration": 2.474656581878662 + }, + { + "auxiliary_loss_clip": 0.01099248, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.03434944, + "balance_loss_mlp": 1.01954174, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 1.8318114520549877, + "language_loss": 0.82880878, + "learning_rate": 4.784484802864403e-07, + "loss": 0.85010493, + "num_input_tokens_seen": 280483930, + "step": 13005, + "time_per_iteration": 2.4471733570098877 + }, + { + "auxiliary_loss_clip": 0.01064237, + "auxiliary_loss_mlp": 0.00784212, + "balance_loss_clip": 1.03194928, + "balance_loss_mlp": 1.00868332, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 1.714272265774244, + "language_loss": 0.72664309, + "learning_rate": 4.781957427316432e-07, + "loss": 0.74512762, + "num_input_tokens_seen": 280503465, + "step": 13006, + "time_per_iteration": 2.5689024925231934 + }, + { + "auxiliary_loss_clip": 0.01097436, + "auxiliary_loss_mlp": 0.00785463, + "balance_loss_clip": 1.03759658, + "balance_loss_mlp": 1.01270545, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 1.5638143026775078, + "language_loss": 0.72039449, + "learning_rate": 4.779430628838157e-07, + "loss": 0.73922348, + "num_input_tokens_seen": 280523375, + "step": 13007, + "time_per_iteration": 2.516085386276245 + }, + { + "auxiliary_loss_clip": 0.0110565, + "auxiliary_loss_mlp": 0.01030361, + "balance_loss_clip": 1.03481567, + "balance_loss_mlp": 1.01782572, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 1.9622807444910655, + "language_loss": 0.689448, + "learning_rate": 4.776904407525397e-07, + "loss": 0.71080804, + "num_input_tokens_seen": 280542920, + "step": 13008, + "time_per_iteration": 2.4463868141174316 + }, + { + "auxiliary_loss_clip": 0.01077654, + "auxiliary_loss_mlp": 0.01026732, + "balance_loss_clip": 1.03578496, + "balance_loss_mlp": 1.01400638, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 1.89510750336667, + "language_loss": 0.69680274, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71784663, + "num_input_tokens_seen": 280561700, + "step": 13009, + "time_per_iteration": 2.610992193222046 + }, + { + "auxiliary_loss_clip": 0.01064464, + "auxiliary_loss_mlp": 0.01027414, + "balance_loss_clip": 1.03207827, + "balance_loss_mlp": 1.01505172, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 1.6276754164285818, + "language_loss": 0.81676781, + "learning_rate": 4.771853696779586e-07, + "loss": 0.8376866, + "num_input_tokens_seen": 280580605, + "step": 13010, + "time_per_iteration": 2.551722526550293 + }, + { + "auxiliary_loss_clip": 0.01091634, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.03434801, + "balance_loss_mlp": 1.02124262, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.5557890986180725, + "language_loss": 0.62443864, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64568043, + "num_input_tokens_seen": 280601495, + "step": 13011, + "time_per_iteration": 2.5515785217285156 + }, + { + "auxiliary_loss_clip": 0.01094121, + "auxiliary_loss_mlp": 0.01028701, + "balance_loss_clip": 1.03753996, + "balance_loss_mlp": 1.01829982, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.8733405806923116, + "language_loss": 0.69891208, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72014034, + "num_input_tokens_seen": 280622760, + "step": 13012, + "time_per_iteration": 2.535952091217041 + }, + { + "auxiliary_loss_clip": 0.01031667, + "auxiliary_loss_mlp": 0.0100608, + "balance_loss_clip": 1.00864577, + "balance_loss_mlp": 1.00479221, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.7039131077630228, + "language_loss": 0.55023277, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57061028, + "num_input_tokens_seen": 280687115, + "step": 13013, + "time_per_iteration": 3.1727874279022217 + }, + { + "auxiliary_loss_clip": 0.01081626, + "auxiliary_loss_mlp": 0.01034678, + "balance_loss_clip": 1.03647566, + "balance_loss_mlp": 1.02260244, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 1.9409096026699597, + "language_loss": 0.65432155, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67548466, + "num_input_tokens_seen": 280705000, + "step": 13014, + "time_per_iteration": 2.500490665435791 + }, + { + "auxiliary_loss_clip": 0.01008406, + "auxiliary_loss_mlp": 0.01000189, + "balance_loss_clip": 1.01106358, + "balance_loss_mlp": 0.99872255, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.7263432952175565, + "language_loss": 0.5843733, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60445923, + "num_input_tokens_seen": 280773525, + "step": 13015, + "time_per_iteration": 3.2573177814483643 + }, + { + "auxiliary_loss_clip": 0.01084423, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.03682351, + "balance_loss_mlp": 1.02027893, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 1.6389407790932553, + "language_loss": 0.74602288, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76718211, + "num_input_tokens_seen": 280791915, + "step": 13016, + "time_per_iteration": 3.962952136993408 + }, + { + "auxiliary_loss_clip": 0.01105486, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.03615713, + "balance_loss_mlp": 1.01526117, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 1.6301409706473726, + "language_loss": 0.75054812, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77189016, + "num_input_tokens_seen": 280811460, + "step": 13017, + "time_per_iteration": 2.4528629779815674 + }, + { + "auxiliary_loss_clip": 0.01079864, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.03523207, + "balance_loss_mlp": 1.01925611, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 2.0321698843195586, + "language_loss": 0.75407892, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77519399, + "num_input_tokens_seen": 280825415, + "step": 13018, + "time_per_iteration": 3.8766555786132812 + }, + { + "auxiliary_loss_clip": 0.01103305, + "auxiliary_loss_mlp": 0.01028295, + "balance_loss_clip": 1.03526759, + "balance_loss_mlp": 1.01592731, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 1.4610165715352714, + "language_loss": 0.77224791, + "learning_rate": 4.749154093390708e-07, + "loss": 0.7935639, + "num_input_tokens_seen": 280845335, + "step": 13019, + "time_per_iteration": 2.4723141193389893 + }, + { + "auxiliary_loss_clip": 0.01057315, + "auxiliary_loss_mlp": 0.01021197, + "balance_loss_clip": 1.03474164, + "balance_loss_mlp": 1.0103786, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 1.4058647528959523, + "language_loss": 0.6745857, + "learning_rate": 4.746634805529852e-07, + "loss": 0.69537085, + "num_input_tokens_seen": 280867145, + "step": 13020, + "time_per_iteration": 4.065423488616943 + }, + { + "auxiliary_loss_clip": 0.01092241, + "auxiliary_loss_mlp": 0.0102802, + "balance_loss_clip": 1.03859496, + "balance_loss_mlp": 1.01623619, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 1.7361485863291195, + "language_loss": 0.62358892, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64479148, + "num_input_tokens_seen": 280886185, + "step": 13021, + "time_per_iteration": 2.5113656520843506 + }, + { + "auxiliary_loss_clip": 0.01101525, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.03526938, + "balance_loss_mlp": 1.02017307, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 1.6762412424569924, + "language_loss": 0.69607687, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71740586, + "num_input_tokens_seen": 280907665, + "step": 13022, + "time_per_iteration": 2.5003914833068848 + }, + { + "auxiliary_loss_clip": 0.00992377, + "auxiliary_loss_mlp": 0.00999564, + "balance_loss_clip": 1.01559985, + "balance_loss_mlp": 0.9984318, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.6432050880553647, + "language_loss": 0.56200415, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58192354, + "num_input_tokens_seen": 280971405, + "step": 13023, + "time_per_iteration": 3.3136653900146484 + }, + { + "auxiliary_loss_clip": 0.0107469, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.03266072, + "balance_loss_mlp": 1.02040994, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 1.603806008777355, + "language_loss": 0.66886234, + "learning_rate": 4.736563439132792e-07, + "loss": 0.68993902, + "num_input_tokens_seen": 280989615, + "step": 13024, + "time_per_iteration": 2.529069662094116 + }, + { + "auxiliary_loss_clip": 0.01106649, + "auxiliary_loss_mlp": 0.01027539, + "balance_loss_clip": 1.03682446, + "balance_loss_mlp": 1.01479554, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 2.8350822299388843, + "language_loss": 0.77817899, + "learning_rate": 4.734047044272498e-07, + "loss": 0.79952085, + "num_input_tokens_seen": 281009450, + "step": 13025, + "time_per_iteration": 2.508657217025757 + }, + { + "auxiliary_loss_clip": 0.01079287, + "auxiliary_loss_mlp": 0.01032264, + "balance_loss_clip": 1.03499103, + "balance_loss_mlp": 1.02055812, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 1.6226034178925606, + "language_loss": 0.78553587, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80665135, + "num_input_tokens_seen": 281028120, + "step": 13026, + "time_per_iteration": 4.046311855316162 + }, + { + "auxiliary_loss_clip": 0.01092607, + "auxiliary_loss_mlp": 0.01024486, + "balance_loss_clip": 1.03731847, + "balance_loss_mlp": 1.01300025, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 2.261926749247982, + "language_loss": 0.75479025, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77596116, + "num_input_tokens_seen": 281042130, + "step": 13027, + "time_per_iteration": 2.4544484615325928 + }, + { + "auxiliary_loss_clip": 0.01094957, + "auxiliary_loss_mlp": 0.01027957, + "balance_loss_clip": 1.03699625, + "balance_loss_mlp": 1.01656055, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 1.7050326646645175, + "language_loss": 0.70366257, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72489178, + "num_input_tokens_seen": 281060945, + "step": 13028, + "time_per_iteration": 2.500354051589966 + }, + { + "auxiliary_loss_clip": 0.01058275, + "auxiliary_loss_mlp": 0.01040822, + "balance_loss_clip": 1.03644896, + "balance_loss_mlp": 1.02795959, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 2.006215113839031, + "language_loss": 0.68972403, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.710715, + "num_input_tokens_seen": 281079270, + "step": 13029, + "time_per_iteration": 2.554739475250244 + }, + { + "auxiliary_loss_clip": 0.01074946, + "auxiliary_loss_mlp": 0.01027366, + "balance_loss_clip": 1.03339505, + "balance_loss_mlp": 1.01523054, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 1.6863230610323796, + "language_loss": 0.80827534, + "learning_rate": 4.721473755175698e-07, + "loss": 0.8292985, + "num_input_tokens_seen": 281099500, + "step": 13030, + "time_per_iteration": 2.618567705154419 + }, + { + "auxiliary_loss_clip": 0.01096877, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.03500688, + "balance_loss_mlp": 1.0205493, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 1.6126103299959011, + "language_loss": 0.70747888, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72877336, + "num_input_tokens_seen": 281121250, + "step": 13031, + "time_per_iteration": 2.5619826316833496 + }, + { + "auxiliary_loss_clip": 0.01066871, + "auxiliary_loss_mlp": 0.01029224, + "balance_loss_clip": 1.03436875, + "balance_loss_mlp": 1.01705885, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 1.8779739007097658, + "language_loss": 0.78391206, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80487299, + "num_input_tokens_seen": 281138760, + "step": 13032, + "time_per_iteration": 2.5481278896331787 + }, + { + "auxiliary_loss_clip": 0.01098694, + "auxiliary_loss_mlp": 0.01037867, + "balance_loss_clip": 1.03698277, + "balance_loss_mlp": 1.02532053, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 2.287083433098722, + "language_loss": 0.63298941, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.65435505, + "num_input_tokens_seen": 281157420, + "step": 13033, + "time_per_iteration": 2.456186294555664 + }, + { + "auxiliary_loss_clip": 0.0109225, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.03514242, + "balance_loss_mlp": 1.02130342, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.6689414425545086, + "language_loss": 0.71643627, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.73769188, + "num_input_tokens_seen": 281174620, + "step": 13034, + "time_per_iteration": 2.5267012119293213 + }, + { + "auxiliary_loss_clip": 0.01105471, + "auxiliary_loss_mlp": 0.00785165, + "balance_loss_clip": 1.03590417, + "balance_loss_mlp": 1.01183224, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 1.5907583299067307, + "language_loss": 0.71971524, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.73862159, + "num_input_tokens_seen": 281193865, + "step": 13035, + "time_per_iteration": 2.4351465702056885 + }, + { + "auxiliary_loss_clip": 0.01106774, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.03759432, + "balance_loss_mlp": 1.02400041, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 1.9274046117459727, + "language_loss": 0.66145921, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.68289149, + "num_input_tokens_seen": 281212250, + "step": 13036, + "time_per_iteration": 2.478936195373535 + }, + { + "auxiliary_loss_clip": 0.0109815, + "auxiliary_loss_mlp": 0.01037405, + "balance_loss_clip": 1.03683949, + "balance_loss_mlp": 1.02480483, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 2.8846305398118792, + "language_loss": 0.72620767, + "learning_rate": 4.703895486362031e-07, + "loss": 0.74756324, + "num_input_tokens_seen": 281230850, + "step": 13037, + "time_per_iteration": 2.487478017807007 + }, + { + "auxiliary_loss_clip": 0.01064565, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.03090465, + "balance_loss_mlp": 1.02334118, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 2.8311827451458607, + "language_loss": 0.59689486, + "learning_rate": 4.701386624460717e-07, + "loss": 0.61790287, + "num_input_tokens_seen": 281249810, + "step": 13038, + "time_per_iteration": 2.5455222129821777 + }, + { + "auxiliary_loss_clip": 0.01082551, + "auxiliary_loss_mlp": 0.01027816, + "balance_loss_clip": 1.03564358, + "balance_loss_mlp": 1.01677144, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 1.5607932424089606, + "language_loss": 0.67832506, + "learning_rate": 4.698878342684349e-07, + "loss": 0.6994288, + "num_input_tokens_seen": 281273730, + "step": 13039, + "time_per_iteration": 2.625549793243408 + }, + { + "auxiliary_loss_clip": 0.010662, + "auxiliary_loss_mlp": 0.01023778, + "balance_loss_clip": 1.03259349, + "balance_loss_mlp": 1.01331174, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 2.679270654770113, + "language_loss": 0.69068098, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71158075, + "num_input_tokens_seen": 281293670, + "step": 13040, + "time_per_iteration": 2.622471809387207 + }, + { + "auxiliary_loss_clip": 0.01061949, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.03586173, + "balance_loss_mlp": 1.01931179, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 1.6540891068651573, + "language_loss": 0.67120779, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.69214535, + "num_input_tokens_seen": 281313070, + "step": 13041, + "time_per_iteration": 2.569120168685913 + }, + { + "auxiliary_loss_clip": 0.01022386, + "auxiliary_loss_mlp": 0.00762788, + "balance_loss_clip": 1.00931418, + "balance_loss_mlp": 1.00306213, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6605618267714153, + "language_loss": 0.57473719, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59258902, + "num_input_tokens_seen": 281374880, + "step": 13042, + "time_per_iteration": 3.074313163757324 + }, + { + "auxiliary_loss_clip": 0.01081227, + "auxiliary_loss_mlp": 0.01028077, + "balance_loss_clip": 1.03637004, + "balance_loss_mlp": 1.0155834, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 1.867128931269078, + "language_loss": 0.83674157, + "learning_rate": 4.688851018730369e-07, + "loss": 0.85783458, + "num_input_tokens_seen": 281392620, + "step": 13043, + "time_per_iteration": 2.5671727657318115 + }, + { + "auxiliary_loss_clip": 0.01089805, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.03536165, + "balance_loss_mlp": 1.0184077, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.3778999240365586, + "language_loss": 0.8832081, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.90440363, + "num_input_tokens_seen": 281413140, + "step": 13044, + "time_per_iteration": 2.5377919673919678 + }, + { + "auxiliary_loss_clip": 0.01086918, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.03550768, + "balance_loss_mlp": 1.01792836, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 1.6172235906076722, + "language_loss": 0.79086924, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81203645, + "num_input_tokens_seen": 281430860, + "step": 13045, + "time_per_iteration": 2.538931369781494 + }, + { + "auxiliary_loss_clip": 0.01076271, + "auxiliary_loss_mlp": 0.01027621, + "balance_loss_clip": 1.03443098, + "balance_loss_mlp": 1.01631403, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.4929639509408479, + "language_loss": 0.72375071, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.74478966, + "num_input_tokens_seen": 281451385, + "step": 13046, + "time_per_iteration": 2.55049467086792 + }, + { + "auxiliary_loss_clip": 0.01060347, + "auxiliary_loss_mlp": 0.0103469, + "balance_loss_clip": 1.03447485, + "balance_loss_mlp": 1.02225029, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.5165785232665245, + "language_loss": 0.63055003, + "learning_rate": 4.678832984380809e-07, + "loss": 0.6515004, + "num_input_tokens_seen": 281472255, + "step": 13047, + "time_per_iteration": 2.6557538509368896 + }, + { + "auxiliary_loss_clip": 0.0109185, + "auxiliary_loss_mlp": 0.01028559, + "balance_loss_clip": 1.03458762, + "balance_loss_mlp": 1.01693642, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.4668914840696121, + "language_loss": 0.73015583, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75135994, + "num_input_tokens_seen": 281492860, + "step": 13048, + "time_per_iteration": 2.4971556663513184 + }, + { + "auxiliary_loss_clip": 0.01081893, + "auxiliary_loss_mlp": 0.01028423, + "balance_loss_clip": 1.03743148, + "balance_loss_mlp": 1.01675892, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 1.9293034234921174, + "language_loss": 0.74602568, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.76712888, + "num_input_tokens_seen": 281511815, + "step": 13049, + "time_per_iteration": 2.57753849029541 + }, + { + "auxiliary_loss_clip": 0.01107879, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.0360074, + "balance_loss_mlp": 1.01778531, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 1.8862345783298364, + "language_loss": 0.72608942, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.74747515, + "num_input_tokens_seen": 281530090, + "step": 13050, + "time_per_iteration": 2.4583442211151123 + }, + { + "auxiliary_loss_clip": 0.01091955, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.03504801, + "balance_loss_mlp": 1.01947188, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 1.9531420701643483, + "language_loss": 0.73514378, + "learning_rate": 4.668824245713825e-07, + "loss": 0.75638056, + "num_input_tokens_seen": 281547075, + "step": 13051, + "time_per_iteration": 2.5108530521392822 + }, + { + "auxiliary_loss_clip": 0.01107652, + "auxiliary_loss_mlp": 0.0103368, + "balance_loss_clip": 1.03739297, + "balance_loss_mlp": 1.0210079, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 1.8890484104551903, + "language_loss": 0.72775108, + "learning_rate": 4.666323514209227e-07, + "loss": 0.74916434, + "num_input_tokens_seen": 281568080, + "step": 13052, + "time_per_iteration": 2.5786654949188232 + }, + { + "auxiliary_loss_clip": 0.010784, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.03587198, + "balance_loss_mlp": 1.02132154, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 2.9168964052599478, + "language_loss": 0.68898618, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71009785, + "num_input_tokens_seen": 281586925, + "step": 13053, + "time_per_iteration": 2.54837703704834 + }, + { + "auxiliary_loss_clip": 0.01094891, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.03759289, + "balance_loss_mlp": 1.02056456, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 2.03516021197348, + "language_loss": 0.70019245, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72145802, + "num_input_tokens_seen": 281603915, + "step": 13054, + "time_per_iteration": 3.9188766479492188 + }, + { + "auxiliary_loss_clip": 0.01096444, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.03641319, + "balance_loss_mlp": 1.01628566, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 1.6254964808833352, + "language_loss": 0.75557363, + "learning_rate": 4.658824808801938e-07, + "loss": 0.77682328, + "num_input_tokens_seen": 281624220, + "step": 13055, + "time_per_iteration": 2.5192887783050537 + }, + { + "auxiliary_loss_clip": 0.01109669, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.03737414, + "balance_loss_mlp": 1.01971531, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 2.0034261693037414, + "language_loss": 0.74428737, + "learning_rate": 4.656326403684283e-07, + "loss": 0.76570779, + "num_input_tokens_seen": 281642325, + "step": 13056, + "time_per_iteration": 2.4535651206970215 + }, + { + "auxiliary_loss_clip": 0.01049099, + "auxiliary_loss_mlp": 0.01028511, + "balance_loss_clip": 1.03606915, + "balance_loss_mlp": 1.01651287, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 1.6997047032754706, + "language_loss": 0.70140338, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72217953, + "num_input_tokens_seen": 281663065, + "step": 13057, + "time_per_iteration": 4.3238747119903564 + }, + { + "auxiliary_loss_clip": 0.01066938, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.03786623, + "balance_loss_mlp": 1.01917362, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 1.849544944620275, + "language_loss": 0.76612771, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.7871058, + "num_input_tokens_seen": 281681005, + "step": 13058, + "time_per_iteration": 2.5695457458496094 + }, + { + "auxiliary_loss_clip": 0.01096172, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.03762984, + "balance_loss_mlp": 1.02188468, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 1.768928003588431, + "language_loss": 0.70775157, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.72905636, + "num_input_tokens_seen": 281697965, + "step": 13059, + "time_per_iteration": 3.8780956268310547 + }, + { + "auxiliary_loss_clip": 0.01076102, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.03482485, + "balance_loss_mlp": 1.02635944, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 2.0005750833308413, + "language_loss": 0.7698831, + "learning_rate": 4.646338602497144e-07, + "loss": 0.79103661, + "num_input_tokens_seen": 281716035, + "step": 13060, + "time_per_iteration": 2.5143232345581055 + }, + { + "auxiliary_loss_clip": 0.01085531, + "auxiliary_loss_mlp": 0.01028721, + "balance_loss_clip": 1.03709698, + "balance_loss_mlp": 1.01630569, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 2.148073697765887, + "language_loss": 0.76926279, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79040527, + "num_input_tokens_seen": 281732815, + "step": 13061, + "time_per_iteration": 2.5092647075653076 + }, + { + "auxiliary_loss_clip": 0.01068838, + "auxiliary_loss_mlp": 0.01032616, + "balance_loss_clip": 1.03407431, + "balance_loss_mlp": 1.01986086, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 1.9600387403927144, + "language_loss": 0.73906589, + "learning_rate": 4.641348194799164e-07, + "loss": 0.76008046, + "num_input_tokens_seen": 281751980, + "step": 13062, + "time_per_iteration": 2.5772688388824463 + }, + { + "auxiliary_loss_clip": 0.01092107, + "auxiliary_loss_mlp": 0.01031381, + "balance_loss_clip": 1.03458726, + "balance_loss_mlp": 1.01969218, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 1.403548412349145, + "language_loss": 0.68427092, + "learning_rate": 4.638853864505297e-07, + "loss": 0.70550585, + "num_input_tokens_seen": 281772670, + "step": 13063, + "time_per_iteration": 3.9192802906036377 + }, + { + "auxiliary_loss_clip": 0.01093886, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.03846264, + "balance_loss_mlp": 1.02016675, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 1.8624799989300924, + "language_loss": 0.72707045, + "learning_rate": 4.636360116707625e-07, + "loss": 0.74833238, + "num_input_tokens_seen": 281792930, + "step": 13064, + "time_per_iteration": 2.55252742767334 + }, + { + "auxiliary_loss_clip": 0.01075304, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.03412318, + "balance_loss_mlp": 1.01866031, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 1.7300841396453583, + "language_loss": 0.6782558, + "learning_rate": 4.633866951500718e-07, + "loss": 0.69931358, + "num_input_tokens_seen": 281811805, + "step": 13065, + "time_per_iteration": 2.5524344444274902 + }, + { + "auxiliary_loss_clip": 0.0109332, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.03964853, + "balance_loss_mlp": 1.02211511, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 1.9047258839500067, + "language_loss": 0.76357853, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78485072, + "num_input_tokens_seen": 281831885, + "step": 13066, + "time_per_iteration": 2.4977493286132812 + }, + { + "auxiliary_loss_clip": 0.0103131, + "auxiliary_loss_mlp": 0.01005249, + "balance_loss_clip": 1.00839877, + "balance_loss_mlp": 1.00403905, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.7168198979877674, + "language_loss": 0.53436494, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55473053, + "num_input_tokens_seen": 281900310, + "step": 13067, + "time_per_iteration": 3.1492693424224854 + }, + { + "auxiliary_loss_clip": 0.01060122, + "auxiliary_loss_mlp": 0.01031839, + "balance_loss_clip": 1.03481066, + "balance_loss_mlp": 1.0191617, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 1.6338137100815981, + "language_loss": 0.6781888, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69910836, + "num_input_tokens_seen": 281918870, + "step": 13068, + "time_per_iteration": 2.5973987579345703 + }, + { + "auxiliary_loss_clip": 0.01071862, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.03492546, + "balance_loss_mlp": 1.01836061, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 1.8355498958734748, + "language_loss": 0.68275356, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70377445, + "num_input_tokens_seen": 281936905, + "step": 13069, + "time_per_iteration": 2.5731067657470703 + }, + { + "auxiliary_loss_clip": 0.01096966, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.03761208, + "balance_loss_mlp": 1.02147102, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 1.5234092608260015, + "language_loss": 0.76775885, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.78906536, + "num_input_tokens_seen": 281955625, + "step": 13070, + "time_per_iteration": 2.5345969200134277 + }, + { + "auxiliary_loss_clip": 0.01047378, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.03285515, + "balance_loss_mlp": 1.02238703, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.5871325164000656, + "language_loss": 0.65802586, + "learning_rate": 4.618920199958083e-07, + "loss": 0.67885017, + "num_input_tokens_seen": 281973285, + "step": 13071, + "time_per_iteration": 2.5797383785247803 + }, + { + "auxiliary_loss_clip": 0.01059805, + "auxiliary_loss_mlp": 0.01031308, + "balance_loss_clip": 1.0340519, + "balance_loss_mlp": 1.01945806, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.8708565495733256, + "language_loss": 0.7397691, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76068026, + "num_input_tokens_seen": 281991410, + "step": 13072, + "time_per_iteration": 2.6234352588653564 + }, + { + "auxiliary_loss_clip": 0.0109132, + "auxiliary_loss_mlp": 0.01029694, + "balance_loss_clip": 1.03782225, + "balance_loss_mlp": 1.01698625, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 2.469743787467539, + "language_loss": 0.71712661, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73833674, + "num_input_tokens_seen": 282010845, + "step": 13073, + "time_per_iteration": 2.526869058609009 + }, + { + "auxiliary_loss_clip": 0.01079831, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.03774846, + "balance_loss_mlp": 1.01964676, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 1.5774651732552534, + "language_loss": 0.76554757, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78666496, + "num_input_tokens_seen": 282029635, + "step": 13074, + "time_per_iteration": 2.542867422103882 + }, + { + "auxiliary_loss_clip": 0.01069869, + "auxiliary_loss_mlp": 0.01029825, + "balance_loss_clip": 1.03496373, + "balance_loss_mlp": 1.01813626, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.606608931650668, + "language_loss": 0.75167197, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77266896, + "num_input_tokens_seen": 282050285, + "step": 13075, + "time_per_iteration": 2.5756590366363525 + }, + { + "auxiliary_loss_clip": 0.01074737, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.03638673, + "balance_loss_mlp": 1.01649642, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 1.840073586287196, + "language_loss": 0.68871188, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.70973182, + "num_input_tokens_seen": 282071040, + "step": 13076, + "time_per_iteration": 2.591090202331543 + }, + { + "auxiliary_loss_clip": 0.01090509, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.0349431, + "balance_loss_mlp": 1.01825273, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 1.9081288645049805, + "language_loss": 0.80165279, + "learning_rate": 4.603994445488282e-07, + "loss": 0.82286364, + "num_input_tokens_seen": 282086610, + "step": 13077, + "time_per_iteration": 2.4528255462646484 + }, + { + "auxiliary_loss_clip": 0.01093946, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.03684235, + "balance_loss_mlp": 1.01916122, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.5207188020111702, + "language_loss": 0.70889884, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.73015332, + "num_input_tokens_seen": 282107440, + "step": 13078, + "time_per_iteration": 2.609437942504883 + }, + { + "auxiliary_loss_clip": 0.01091825, + "auxiliary_loss_mlp": 0.01033047, + "balance_loss_clip": 1.03689289, + "balance_loss_mlp": 1.02148366, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.427760843751378, + "language_loss": 0.81339097, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83463967, + "num_input_tokens_seen": 282127290, + "step": 13079, + "time_per_iteration": 2.532857894897461 + }, + { + "auxiliary_loss_clip": 0.01070329, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.035586, + "balance_loss_mlp": 1.01869035, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.4601848117945384, + "language_loss": 0.68267453, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70368528, + "num_input_tokens_seen": 282147505, + "step": 13080, + "time_per_iteration": 2.624238967895508 + }, + { + "auxiliary_loss_clip": 0.01093574, + "auxiliary_loss_mlp": 0.01033164, + "balance_loss_clip": 1.0361402, + "balance_loss_mlp": 1.02115333, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 1.5478637255103993, + "language_loss": 0.69401526, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71528268, + "num_input_tokens_seen": 282166450, + "step": 13081, + "time_per_iteration": 2.4859063625335693 + }, + { + "auxiliary_loss_clip": 0.01080701, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.03482664, + "balance_loss_mlp": 1.0232451, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 1.50379712103308, + "language_loss": 0.68448901, + "learning_rate": 4.591572370894838e-07, + "loss": 0.7056433, + "num_input_tokens_seen": 282186465, + "step": 13082, + "time_per_iteration": 2.543154239654541 + }, + { + "auxiliary_loss_clip": 0.01074524, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.03406572, + "balance_loss_mlp": 1.01990914, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 1.5204480489992114, + "language_loss": 0.66300863, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68407261, + "num_input_tokens_seen": 282207180, + "step": 13083, + "time_per_iteration": 2.53957462310791 + }, + { + "auxiliary_loss_clip": 0.01085445, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.03700066, + "balance_loss_mlp": 1.01919162, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 2.0402210108742804, + "language_loss": 0.74655467, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.76773143, + "num_input_tokens_seen": 282225865, + "step": 13084, + "time_per_iteration": 2.503448963165283 + }, + { + "auxiliary_loss_clip": 0.01079541, + "auxiliary_loss_mlp": 0.01037954, + "balance_loss_clip": 1.0366056, + "balance_loss_mlp": 1.0251807, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 3.0594407408908477, + "language_loss": 0.70680964, + "learning_rate": 4.584126136854591e-07, + "loss": 0.72798455, + "num_input_tokens_seen": 282242895, + "step": 13085, + "time_per_iteration": 2.4677326679229736 + }, + { + "auxiliary_loss_clip": 0.01079962, + "auxiliary_loss_mlp": 0.01029856, + "balance_loss_clip": 1.03393841, + "balance_loss_mlp": 1.01734531, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 1.703855984410092, + "language_loss": 0.72090673, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74200499, + "num_input_tokens_seen": 282260425, + "step": 13086, + "time_per_iteration": 2.50201416015625 + }, + { + "auxiliary_loss_clip": 0.01104704, + "auxiliary_loss_mlp": 0.01028291, + "balance_loss_clip": 1.03597617, + "balance_loss_mlp": 1.01669168, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 2.154269312621411, + "language_loss": 0.74566287, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.76699281, + "num_input_tokens_seen": 282279335, + "step": 13087, + "time_per_iteration": 2.452317953109741 + }, + { + "auxiliary_loss_clip": 0.01082306, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.03550005, + "balance_loss_mlp": 1.02043092, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 1.5588390951709792, + "language_loss": 0.71306276, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73420024, + "num_input_tokens_seen": 282299905, + "step": 13088, + "time_per_iteration": 2.5520641803741455 + }, + { + "auxiliary_loss_clip": 0.01032658, + "auxiliary_loss_mlp": 0.01004346, + "balance_loss_clip": 1.00976396, + "balance_loss_mlp": 1.00321341, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.6732884845195526, + "language_loss": 0.55472881, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57509881, + "num_input_tokens_seen": 282367620, + "step": 13089, + "time_per_iteration": 3.146073818206787 + }, + { + "auxiliary_loss_clip": 0.01021905, + "auxiliary_loss_mlp": 0.01004526, + "balance_loss_clip": 1.01085818, + "balance_loss_mlp": 1.00342321, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.7252255157912649, + "language_loss": 0.49934068, + "learning_rate": 4.571727439470976e-07, + "loss": 0.51960492, + "num_input_tokens_seen": 282435695, + "step": 13090, + "time_per_iteration": 3.2068629264831543 + }, + { + "auxiliary_loss_clip": 0.01092593, + "auxiliary_loss_mlp": 0.0102926, + "balance_loss_clip": 1.036098, + "balance_loss_mlp": 1.01809001, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 1.433581849038391, + "language_loss": 0.83761144, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.85882998, + "num_input_tokens_seen": 282456025, + "step": 13091, + "time_per_iteration": 2.5248043537139893 + }, + { + "auxiliary_loss_clip": 0.01022318, + "auxiliary_loss_mlp": 0.01003057, + "balance_loss_clip": 1.00952053, + "balance_loss_mlp": 1.00196028, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.7111967543549162, + "language_loss": 0.64026529, + "learning_rate": 4.566772055150947e-07, + "loss": 0.66051906, + "num_input_tokens_seen": 282520995, + "step": 13092, + "time_per_iteration": 3.1398584842681885 + }, + { + "auxiliary_loss_clip": 0.01084056, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.03828132, + "balance_loss_mlp": 1.02163339, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 2.6261363255698202, + "language_loss": 0.79096669, + "learning_rate": 4.564295240788285e-07, + "loss": 0.81214827, + "num_input_tokens_seen": 282539355, + "step": 13093, + "time_per_iteration": 2.511087417602539 + }, + { + "auxiliary_loss_clip": 0.01076244, + "auxiliary_loss_mlp": 0.010245, + "balance_loss_clip": 1.03712618, + "balance_loss_mlp": 1.01297295, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 1.7261222886463472, + "language_loss": 0.7532028, + "learning_rate": 4.561819011749106e-07, + "loss": 0.77421027, + "num_input_tokens_seen": 282555735, + "step": 13094, + "time_per_iteration": 5.309385061264038 + }, + { + "auxiliary_loss_clip": 0.01059652, + "auxiliary_loss_mlp": 0.01044535, + "balance_loss_clip": 1.03510666, + "balance_loss_mlp": 1.0307188, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.6172613991369318, + "language_loss": 0.80051059, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.82155246, + "num_input_tokens_seen": 282574550, + "step": 13095, + "time_per_iteration": 2.60383677482605 + }, + { + "auxiliary_loss_clip": 0.01095353, + "auxiliary_loss_mlp": 0.01031162, + "balance_loss_clip": 1.03643203, + "balance_loss_mlp": 1.01908648, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 1.7025478642646834, + "language_loss": 0.6794678, + "learning_rate": 4.556868310016715e-07, + "loss": 0.70073295, + "num_input_tokens_seen": 282596520, + "step": 13096, + "time_per_iteration": 2.5673410892486572 + }, + { + "auxiliary_loss_clip": 0.01076248, + "auxiliary_loss_mlp": 0.01027635, + "balance_loss_clip": 1.0332557, + "balance_loss_mlp": 1.01740062, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.4487673323533563, + "language_loss": 0.70518196, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72622085, + "num_input_tokens_seen": 282620560, + "step": 13097, + "time_per_iteration": 4.162252426147461 + }, + { + "auxiliary_loss_clip": 0.01086456, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.03799474, + "balance_loss_mlp": 1.02109528, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 2.065542326342619, + "language_loss": 0.80700362, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82819939, + "num_input_tokens_seen": 282639830, + "step": 13098, + "time_per_iteration": 2.5486583709716797 + }, + { + "auxiliary_loss_clip": 0.01066292, + "auxiliary_loss_mlp": 0.01030981, + "balance_loss_clip": 1.03537273, + "balance_loss_mlp": 1.01944125, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 1.6617089773634088, + "language_loss": 0.74348581, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76445854, + "num_input_tokens_seen": 282660130, + "step": 13099, + "time_per_iteration": 2.5750772953033447 + }, + { + "auxiliary_loss_clip": 0.01082884, + "auxiliary_loss_mlp": 0.01023505, + "balance_loss_clip": 1.03533471, + "balance_loss_mlp": 1.01165605, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.7609556189803828, + "language_loss": 0.78126663, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80233049, + "num_input_tokens_seen": 282681125, + "step": 13100, + "time_per_iteration": 2.5394973754882812 + }, + { + "auxiliary_loss_clip": 0.01097357, + "auxiliary_loss_mlp": 0.00786932, + "balance_loss_clip": 1.03718638, + "balance_loss_mlp": 1.01265693, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 2.0139717009989613, + "language_loss": 0.65972173, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.67856461, + "num_input_tokens_seen": 282696690, + "step": 13101, + "time_per_iteration": 2.4449574947357178 + }, + { + "auxiliary_loss_clip": 0.01081696, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.03555715, + "balance_loss_mlp": 1.01893973, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.418390588387776, + "language_loss": 0.77796876, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.79909396, + "num_input_tokens_seen": 282721210, + "step": 13102, + "time_per_iteration": 4.0586512088775635 + }, + { + "auxiliary_loss_clip": 0.01092671, + "auxiliary_loss_mlp": 0.01035095, + "balance_loss_clip": 1.03524148, + "balance_loss_mlp": 1.0233891, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 1.8399714163393464, + "language_loss": 0.82297242, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84425008, + "num_input_tokens_seen": 282738505, + "step": 13103, + "time_per_iteration": 2.483966112136841 + }, + { + "auxiliary_loss_clip": 0.01096803, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.03702545, + "balance_loss_mlp": 1.01949167, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 2.0466430645179208, + "language_loss": 0.80789924, + "learning_rate": 4.537088934794913e-07, + "loss": 0.8291868, + "num_input_tokens_seen": 282756895, + "step": 13104, + "time_per_iteration": 2.519477128982544 + }, + { + "auxiliary_loss_clip": 0.01106625, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.03680897, + "balance_loss_mlp": 1.020854, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.648149375016571, + "language_loss": 0.7376399, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.75903404, + "num_input_tokens_seen": 282774955, + "step": 13105, + "time_per_iteration": 2.4851861000061035 + }, + { + "auxiliary_loss_clip": 0.01047081, + "auxiliary_loss_mlp": 0.01037467, + "balance_loss_clip": 1.03395295, + "balance_loss_mlp": 1.02527833, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 1.8479717583421433, + "language_loss": 0.75870419, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.77954966, + "num_input_tokens_seen": 282793165, + "step": 13106, + "time_per_iteration": 2.642749309539795 + }, + { + "auxiliary_loss_clip": 0.01056563, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.03815031, + "balance_loss_mlp": 1.01988482, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 2.1418264226565626, + "language_loss": 0.73345172, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75433278, + "num_input_tokens_seen": 282809820, + "step": 13107, + "time_per_iteration": 2.5739641189575195 + }, + { + "auxiliary_loss_clip": 0.01104477, + "auxiliary_loss_mlp": 0.01032354, + "balance_loss_clip": 1.03667927, + "balance_loss_mlp": 1.02018237, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.6596467975797882, + "language_loss": 0.73405492, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75542319, + "num_input_tokens_seen": 282828600, + "step": 13108, + "time_per_iteration": 2.469514846801758 + }, + { + "auxiliary_loss_clip": 0.0103179, + "auxiliary_loss_mlp": 0.01001777, + "balance_loss_clip": 1.00887656, + "balance_loss_mlp": 1.00062692, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 0.8825754435429964, + "language_loss": 0.60356319, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62389886, + "num_input_tokens_seen": 282882775, + "step": 13109, + "time_per_iteration": 3.029965400695801 + }, + { + "auxiliary_loss_clip": 0.01070722, + "auxiliary_loss_mlp": 0.01030134, + "balance_loss_clip": 1.03677189, + "balance_loss_mlp": 1.0182668, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.5725876567387969, + "language_loss": 0.7217921, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.74280065, + "num_input_tokens_seen": 282902680, + "step": 13110, + "time_per_iteration": 2.619856595993042 + }, + { + "auxiliary_loss_clip": 0.01053989, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.03429985, + "balance_loss_mlp": 1.01577961, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.3576226973574288, + "language_loss": 0.75164557, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77245736, + "num_input_tokens_seen": 282923625, + "step": 13111, + "time_per_iteration": 2.625410795211792 + }, + { + "auxiliary_loss_clip": 0.01088734, + "auxiliary_loss_mlp": 0.010347, + "balance_loss_clip": 1.03456223, + "balance_loss_mlp": 1.02201664, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 2.0199596380797065, + "language_loss": 0.61126029, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63249469, + "num_input_tokens_seen": 282941955, + "step": 13112, + "time_per_iteration": 2.512524366378784 + }, + { + "auxiliary_loss_clip": 0.01081873, + "auxiliary_loss_mlp": 0.01028885, + "balance_loss_clip": 1.03400826, + "balance_loss_mlp": 1.01644576, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 1.786407504260886, + "language_loss": 0.67571372, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69682133, + "num_input_tokens_seen": 282961280, + "step": 13113, + "time_per_iteration": 2.52264404296875 + }, + { + "auxiliary_loss_clip": 0.01067421, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.03552139, + "balance_loss_mlp": 1.0185312, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 2.2866354632409087, + "language_loss": 0.58427382, + "learning_rate": 4.5124174933361e-07, + "loss": 0.60526061, + "num_input_tokens_seen": 282978210, + "step": 13114, + "time_per_iteration": 2.5269784927368164 + }, + { + "auxiliary_loss_clip": 0.01058269, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.03744376, + "balance_loss_mlp": 1.01613533, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 1.7302605455778182, + "language_loss": 0.66649902, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.6873697, + "num_input_tokens_seen": 282998845, + "step": 13115, + "time_per_iteration": 2.6236155033111572 + }, + { + "auxiliary_loss_clip": 0.01081372, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.03555477, + "balance_loss_mlp": 1.01838648, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 1.8340264697151658, + "language_loss": 0.88450432, + "learning_rate": 4.50749024954048e-07, + "loss": 0.90563446, + "num_input_tokens_seen": 283015200, + "step": 13116, + "time_per_iteration": 2.517118215560913 + }, + { + "auxiliary_loss_clip": 0.01089341, + "auxiliary_loss_mlp": 0.01032422, + "balance_loss_clip": 1.03612733, + "balance_loss_mlp": 1.01934445, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 1.6327572938292425, + "language_loss": 0.72720528, + "learning_rate": 4.505027508812245e-07, + "loss": 0.74842286, + "num_input_tokens_seen": 283033680, + "step": 13117, + "time_per_iteration": 2.505702257156372 + }, + { + "auxiliary_loss_clip": 0.01092393, + "auxiliary_loss_mlp": 0.01027021, + "balance_loss_clip": 1.03639269, + "balance_loss_mlp": 1.01582682, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.4172235578552708, + "language_loss": 0.80048156, + "learning_rate": 4.502565355654926e-07, + "loss": 0.82167578, + "num_input_tokens_seen": 283050620, + "step": 13118, + "time_per_iteration": 2.4778311252593994 + }, + { + "auxiliary_loss_clip": 0.01093148, + "auxiliary_loss_mlp": 0.01025489, + "balance_loss_clip": 1.03625679, + "balance_loss_mlp": 1.01387787, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 3.314945247564775, + "language_loss": 0.73212045, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75330681, + "num_input_tokens_seen": 283070215, + "step": 13119, + "time_per_iteration": 2.482278347015381 + }, + { + "auxiliary_loss_clip": 0.01092249, + "auxiliary_loss_mlp": 0.01027303, + "balance_loss_clip": 1.03536367, + "balance_loss_mlp": 1.01490545, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 1.3270408164428418, + "language_loss": 0.71850485, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.73970032, + "num_input_tokens_seen": 283091485, + "step": 13120, + "time_per_iteration": 2.515949249267578 + }, + { + "auxiliary_loss_clip": 0.01079254, + "auxiliary_loss_mlp": 0.00786952, + "balance_loss_clip": 1.03473234, + "balance_loss_mlp": 1.01012182, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.4010531520633145, + "language_loss": 0.78735566, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.80601776, + "num_input_tokens_seen": 283115040, + "step": 13121, + "time_per_iteration": 2.633202075958252 + }, + { + "auxiliary_loss_clip": 0.01091001, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.03459418, + "balance_loss_mlp": 1.01877928, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 1.4813101558450685, + "language_loss": 0.80203754, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.82325935, + "num_input_tokens_seen": 283136925, + "step": 13122, + "time_per_iteration": 2.5559046268463135 + }, + { + "auxiliary_loss_clip": 0.01083592, + "auxiliary_loss_mlp": 0.01025626, + "balance_loss_clip": 1.03569484, + "balance_loss_mlp": 1.01416361, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 5.139496870167601, + "language_loss": 0.78098136, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80207348, + "num_input_tokens_seen": 283155725, + "step": 13123, + "time_per_iteration": 2.550250768661499 + }, + { + "auxiliary_loss_clip": 0.0108235, + "auxiliary_loss_mlp": 0.0103331, + "balance_loss_clip": 1.03766835, + "balance_loss_mlp": 1.0208168, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 1.8562750753026471, + "language_loss": 0.67136782, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69252443, + "num_input_tokens_seen": 283173845, + "step": 13124, + "time_per_iteration": 2.552367687225342 + }, + { + "auxiliary_loss_clip": 0.01083058, + "auxiliary_loss_mlp": 0.01027389, + "balance_loss_clip": 1.03540063, + "balance_loss_mlp": 1.01498568, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 2.3676472898377026, + "language_loss": 0.72788393, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.74898839, + "num_input_tokens_seen": 283191985, + "step": 13125, + "time_per_iteration": 2.5396409034729004 + }, + { + "auxiliary_loss_clip": 0.01086847, + "auxiliary_loss_mlp": 0.01027636, + "balance_loss_clip": 1.03387868, + "balance_loss_mlp": 1.01491618, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 1.9353253072029226, + "language_loss": 0.72315526, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74430007, + "num_input_tokens_seen": 283210855, + "step": 13126, + "time_per_iteration": 2.549556016921997 + }, + { + "auxiliary_loss_clip": 0.01086424, + "auxiliary_loss_mlp": 0.01030437, + "balance_loss_clip": 1.0345099, + "balance_loss_mlp": 1.01744938, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 1.7157654873688692, + "language_loss": 0.76728386, + "learning_rate": 4.480432433327845e-07, + "loss": 0.78845251, + "num_input_tokens_seen": 283229665, + "step": 13127, + "time_per_iteration": 2.4817049503326416 + }, + { + "auxiliary_loss_clip": 0.01088985, + "auxiliary_loss_mlp": 0.01034169, + "balance_loss_clip": 1.0358814, + "balance_loss_mlp": 1.02105594, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 1.6195762634593, + "language_loss": 0.85785109, + "learning_rate": 4.47797616101103e-07, + "loss": 0.87908268, + "num_input_tokens_seen": 283248615, + "step": 13128, + "time_per_iteration": 2.534285068511963 + }, + { + "auxiliary_loss_clip": 0.01093759, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.03633285, + "balance_loss_mlp": 1.02127814, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 2.229045492202683, + "language_loss": 0.69179684, + "learning_rate": 4.475520477290904e-07, + "loss": 0.71305692, + "num_input_tokens_seen": 283267135, + "step": 13129, + "time_per_iteration": 2.484806537628174 + }, + { + "auxiliary_loss_clip": 0.01025263, + "auxiliary_loss_mlp": 0.01015608, + "balance_loss_clip": 1.0127455, + "balance_loss_mlp": 1.01408184, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7157991328470195, + "language_loss": 0.6161499, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63655865, + "num_input_tokens_seen": 283328940, + "step": 13130, + "time_per_iteration": 3.1297969818115234 + }, + { + "auxiliary_loss_clip": 0.0109669, + "auxiliary_loss_mlp": 0.01030883, + "balance_loss_clip": 1.03827095, + "balance_loss_mlp": 1.01931381, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 1.559947820501089, + "language_loss": 0.73948091, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.76075661, + "num_input_tokens_seen": 283350000, + "step": 13131, + "time_per_iteration": 2.5693306922912598 + }, + { + "auxiliary_loss_clip": 0.01093426, + "auxiliary_loss_mlp": 0.01026946, + "balance_loss_clip": 1.03802848, + "balance_loss_mlp": 1.01333833, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 5.0011340212951865, + "language_loss": 0.68973231, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.71093601, + "num_input_tokens_seen": 283368020, + "step": 13132, + "time_per_iteration": 3.8918983936309814 + }, + { + "auxiliary_loss_clip": 0.01098247, + "auxiliary_loss_mlp": 0.01039925, + "balance_loss_clip": 1.03797698, + "balance_loss_mlp": 1.02728915, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 1.9229138751777557, + "language_loss": 0.62084764, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64222944, + "num_input_tokens_seen": 283387030, + "step": 13133, + "time_per_iteration": 3.8475501537323 + }, + { + "auxiliary_loss_clip": 0.01076824, + "auxiliary_loss_mlp": 0.01034981, + "balance_loss_clip": 1.03668118, + "balance_loss_mlp": 1.0217663, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 2.057836366561185, + "language_loss": 0.79681021, + "learning_rate": 4.463250890899195e-07, + "loss": 0.81792825, + "num_input_tokens_seen": 283402090, + "step": 13134, + "time_per_iteration": 2.5017220973968506 + }, + { + "auxiliary_loss_clip": 0.01093158, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.03425729, + "balance_loss_mlp": 1.01987493, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 4.904785344937257, + "language_loss": 0.80519736, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82644904, + "num_input_tokens_seen": 283421035, + "step": 13135, + "time_per_iteration": 3.8616526126861572 + }, + { + "auxiliary_loss_clip": 0.0109389, + "auxiliary_loss_mlp": 0.01029043, + "balance_loss_clip": 1.03639662, + "balance_loss_mlp": 1.01654434, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.5038441853985973, + "language_loss": 0.72485703, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74608636, + "num_input_tokens_seen": 283441830, + "step": 13136, + "time_per_iteration": 2.5261356830596924 + }, + { + "auxiliary_loss_clip": 0.01112291, + "auxiliary_loss_mlp": 0.01036187, + "balance_loss_clip": 1.03754139, + "balance_loss_mlp": 1.0234791, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 2.2669578990135415, + "language_loss": 0.71208918, + "learning_rate": 4.455896208180778e-07, + "loss": 0.73357397, + "num_input_tokens_seen": 283459540, + "step": 13137, + "time_per_iteration": 2.4155349731445312 + }, + { + "auxiliary_loss_clip": 0.01104126, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.03664231, + "balance_loss_mlp": 1.01868606, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.6711437437807006, + "language_loss": 0.73751533, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.75887084, + "num_input_tokens_seen": 283478790, + "step": 13138, + "time_per_iteration": 2.457265853881836 + }, + { + "auxiliary_loss_clip": 0.01066468, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.03719032, + "balance_loss_mlp": 1.01844716, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 2.149827731168992, + "language_loss": 0.68059599, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.70156395, + "num_input_tokens_seen": 283495720, + "step": 13139, + "time_per_iteration": 2.5211362838745117 + }, + { + "auxiliary_loss_clip": 0.01023241, + "auxiliary_loss_mlp": 0.01003689, + "balance_loss_clip": 1.00976491, + "balance_loss_mlp": 1.00252104, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.8498170750344725, + "language_loss": 0.60205382, + "learning_rate": 4.448546830368003e-07, + "loss": 0.6223231, + "num_input_tokens_seen": 283558795, + "step": 13140, + "time_per_iteration": 3.182072162628174 + }, + { + "auxiliary_loss_clip": 0.01106986, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.03769672, + "balance_loss_mlp": 1.02109361, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.7000944921320733, + "language_loss": 0.76008761, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.78149223, + "num_input_tokens_seen": 283579305, + "step": 13141, + "time_per_iteration": 3.991665840148926 + }, + { + "auxiliary_loss_clip": 0.01097005, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.03646195, + "balance_loss_mlp": 1.01949394, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 1.8610218559269482, + "language_loss": 0.68312669, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.70441538, + "num_input_tokens_seen": 283597840, + "step": 13142, + "time_per_iteration": 2.4930179119110107 + }, + { + "auxiliary_loss_clip": 0.00988601, + "auxiliary_loss_mlp": 0.01010513, + "balance_loss_clip": 1.0172286, + "balance_loss_mlp": 1.00913072, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.8203421553660541, + "language_loss": 0.59952366, + "learning_rate": 4.441202759969049e-07, + "loss": 0.61951482, + "num_input_tokens_seen": 283647950, + "step": 13143, + "time_per_iteration": 3.0622165203094482 + }, + { + "auxiliary_loss_clip": 0.01077566, + "auxiliary_loss_mlp": 0.01032187, + "balance_loss_clip": 1.03758168, + "balance_loss_mlp": 1.02001595, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.4622068273483844, + "language_loss": 0.74529731, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76639485, + "num_input_tokens_seen": 283670645, + "step": 13144, + "time_per_iteration": 3.2034802436828613 + }, + { + "auxiliary_loss_clip": 0.01098556, + "auxiliary_loss_mlp": 0.01031771, + "balance_loss_clip": 1.03607714, + "balance_loss_mlp": 1.01874161, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 2.148375051385357, + "language_loss": 0.83436692, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.85567015, + "num_input_tokens_seen": 283688830, + "step": 13145, + "time_per_iteration": 2.5351593494415283 + }, + { + "auxiliary_loss_clip": 0.01088333, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.0336169, + "balance_loss_mlp": 1.01965964, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.9545732641052236, + "language_loss": 0.7290889, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.75027502, + "num_input_tokens_seen": 283708625, + "step": 13146, + "time_per_iteration": 2.501826286315918 + }, + { + "auxiliary_loss_clip": 0.01107461, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.03591585, + "balance_loss_mlp": 1.01985908, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 1.7606279585998483, + "language_loss": 0.7557137, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.77710676, + "num_input_tokens_seen": 283725710, + "step": 13147, + "time_per_iteration": 2.471686601638794 + }, + { + "auxiliary_loss_clip": 0.01089036, + "auxiliary_loss_mlp": 0.01037365, + "balance_loss_clip": 1.03563428, + "balance_loss_mlp": 1.02424002, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 1.6746092497362883, + "language_loss": 0.71910417, + "learning_rate": 4.428974443697087e-07, + "loss": 0.74036825, + "num_input_tokens_seen": 283744150, + "step": 13148, + "time_per_iteration": 2.486349105834961 + }, + { + "auxiliary_loss_clip": 0.01090842, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.03349447, + "balance_loss_mlp": 1.0183444, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 1.8919445658321192, + "language_loss": 0.71586269, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.73707962, + "num_input_tokens_seen": 283764170, + "step": 13149, + "time_per_iteration": 2.5693464279174805 + }, + { + "auxiliary_loss_clip": 0.01072038, + "auxiliary_loss_mlp": 0.01035375, + "balance_loss_clip": 1.03454804, + "balance_loss_mlp": 1.02061105, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 2.459241666707848, + "language_loss": 0.6506266, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67170072, + "num_input_tokens_seen": 283784305, + "step": 13150, + "time_per_iteration": 2.589700937271118 + }, + { + "auxiliary_loss_clip": 0.01103664, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.03521204, + "balance_loss_mlp": 1.01946211, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 1.532853171368158, + "language_loss": 0.69829631, + "learning_rate": 4.421644538650231e-07, + "loss": 0.71964371, + "num_input_tokens_seen": 283804040, + "step": 13151, + "time_per_iteration": 2.481691837310791 + }, + { + "auxiliary_loss_clip": 0.01087547, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.03673768, + "balance_loss_mlp": 1.02185702, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 1.6565763032149325, + "language_loss": 0.70518172, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72640824, + "num_input_tokens_seen": 283827120, + "step": 13152, + "time_per_iteration": 2.694915294647217 + }, + { + "auxiliary_loss_clip": 0.01072389, + "auxiliary_loss_mlp": 0.00783579, + "balance_loss_clip": 1.03537202, + "balance_loss_mlp": 1.00900316, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 1.771473331837769, + "language_loss": 0.7271086, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.74566829, + "num_input_tokens_seen": 283844820, + "step": 13153, + "time_per_iteration": 2.5986404418945312 + }, + { + "auxiliary_loss_clip": 0.01105825, + "auxiliary_loss_mlp": 0.01027764, + "balance_loss_clip": 1.03641272, + "balance_loss_mlp": 1.01568222, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.4732141699951657, + "language_loss": 0.78885978, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81019568, + "num_input_tokens_seen": 283862870, + "step": 13154, + "time_per_iteration": 2.430135726928711 + }, + { + "auxiliary_loss_clip": 0.01099802, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.03656936, + "balance_loss_mlp": 1.01643252, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 2.3202896864526443, + "language_loss": 0.7003839, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72167701, + "num_input_tokens_seen": 283882405, + "step": 13155, + "time_per_iteration": 2.492530345916748 + }, + { + "auxiliary_loss_clip": 0.01106642, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.03670502, + "balance_loss_mlp": 1.01669455, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 1.6288749333965507, + "language_loss": 0.76997286, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.79133093, + "num_input_tokens_seen": 283902070, + "step": 13156, + "time_per_iteration": 2.446924924850464 + }, + { + "auxiliary_loss_clip": 0.01071904, + "auxiliary_loss_mlp": 0.01028777, + "balance_loss_clip": 1.03189445, + "balance_loss_mlp": 1.01630199, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 1.6176294498247645, + "language_loss": 0.65791535, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67892212, + "num_input_tokens_seen": 283924100, + "step": 13157, + "time_per_iteration": 2.6245779991149902 + }, + { + "auxiliary_loss_clip": 0.0109523, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.03461266, + "balance_loss_mlp": 1.02297664, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 1.7769083835316202, + "language_loss": 0.73949814, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76081419, + "num_input_tokens_seen": 283944955, + "step": 13158, + "time_per_iteration": 2.5131125450134277 + }, + { + "auxiliary_loss_clip": 0.01090593, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.03470421, + "balance_loss_mlp": 1.01611137, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 2.1646325732530265, + "language_loss": 0.67133236, + "learning_rate": 4.40212412422309e-07, + "loss": 0.69250941, + "num_input_tokens_seen": 283963125, + "step": 13159, + "time_per_iteration": 2.4795029163360596 + }, + { + "auxiliary_loss_clip": 0.01093162, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.03616095, + "balance_loss_mlp": 1.01932096, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 1.948005949684928, + "language_loss": 0.67448503, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69572431, + "num_input_tokens_seen": 283982850, + "step": 13160, + "time_per_iteration": 2.478736639022827 + }, + { + "auxiliary_loss_clip": 0.01076698, + "auxiliary_loss_mlp": 0.01027706, + "balance_loss_clip": 1.03177845, + "balance_loss_mlp": 1.01739454, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 2.432144149022387, + "language_loss": 0.7306186, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.75166267, + "num_input_tokens_seen": 283998275, + "step": 13161, + "time_per_iteration": 2.4970364570617676 + }, + { + "auxiliary_loss_clip": 0.01076408, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.03485048, + "balance_loss_mlp": 1.01738214, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 2.723981496451493, + "language_loss": 0.7346347, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75569582, + "num_input_tokens_seen": 284018750, + "step": 13162, + "time_per_iteration": 2.529572010040283 + }, + { + "auxiliary_loss_clip": 0.01083709, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.03615499, + "balance_loss_mlp": 1.01667571, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 1.7924277220476235, + "language_loss": 0.71859634, + "learning_rate": 4.392378109401811e-07, + "loss": 0.73972166, + "num_input_tokens_seen": 284037850, + "step": 13163, + "time_per_iteration": 2.5426783561706543 + }, + { + "auxiliary_loss_clip": 0.01070134, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.03325009, + "balance_loss_mlp": 1.01911795, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 1.8718273928293514, + "language_loss": 0.70016432, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.72118914, + "num_input_tokens_seen": 284056380, + "step": 13164, + "time_per_iteration": 2.520493745803833 + }, + { + "auxiliary_loss_clip": 0.01065211, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.0348196, + "balance_loss_mlp": 1.01807642, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 1.9144319277662667, + "language_loss": 0.6636892, + "learning_rate": 4.387508652677177e-07, + "loss": 0.68463707, + "num_input_tokens_seen": 284074945, + "step": 13165, + "time_per_iteration": 2.565697193145752 + }, + { + "auxiliary_loss_clip": 0.01055918, + "auxiliary_loss_mlp": 0.01025302, + "balance_loss_clip": 1.03543246, + "balance_loss_mlp": 1.01461518, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 1.8207589694483788, + "language_loss": 0.72234142, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74315363, + "num_input_tokens_seen": 284092070, + "step": 13166, + "time_per_iteration": 2.533301830291748 + }, + { + "auxiliary_loss_clip": 0.01103766, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.03521764, + "balance_loss_mlp": 1.01955962, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 1.691965327005288, + "language_loss": 0.77659774, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79796648, + "num_input_tokens_seen": 284112255, + "step": 13167, + "time_per_iteration": 2.5208961963653564 + }, + { + "auxiliary_loss_clip": 0.01071453, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.03584671, + "balance_loss_mlp": 1.01721644, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 1.6004389293603498, + "language_loss": 0.84378529, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86478591, + "num_input_tokens_seen": 284132330, + "step": 13168, + "time_per_iteration": 2.5565226078033447 + }, + { + "auxiliary_loss_clip": 0.01106639, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.03712451, + "balance_loss_mlp": 1.01834917, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 1.971303424471412, + "language_loss": 0.72780633, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.74917364, + "num_input_tokens_seen": 284150640, + "step": 13169, + "time_per_iteration": 2.5167088508605957 + }, + { + "auxiliary_loss_clip": 0.01106629, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.03564358, + "balance_loss_mlp": 1.02152824, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 1.8151550616878152, + "language_loss": 0.67684901, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69825989, + "num_input_tokens_seen": 284171910, + "step": 13170, + "time_per_iteration": 3.9719409942626953 + }, + { + "auxiliary_loss_clip": 0.01093914, + "auxiliary_loss_mlp": 0.01023028, + "balance_loss_clip": 1.03469682, + "balance_loss_mlp": 1.01175094, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 1.6365779857229437, + "language_loss": 0.70617974, + "learning_rate": 4.372914494109412e-07, + "loss": 0.72734916, + "num_input_tokens_seen": 284191340, + "step": 13171, + "time_per_iteration": 2.4768433570861816 + }, + { + "auxiliary_loss_clip": 0.01092736, + "auxiliary_loss_mlp": 0.01027201, + "balance_loss_clip": 1.03536153, + "balance_loss_mlp": 1.01502371, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 1.904992849524794, + "language_loss": 0.66878241, + "learning_rate": 4.370484207842553e-07, + "loss": 0.68998182, + "num_input_tokens_seen": 284212495, + "step": 13172, + "time_per_iteration": 3.9697887897491455 + }, + { + "auxiliary_loss_clip": 0.01080224, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.03549528, + "balance_loss_mlp": 1.02185094, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 2.0370204503780154, + "language_loss": 0.79512984, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81627065, + "num_input_tokens_seen": 284230825, + "step": 13173, + "time_per_iteration": 3.9237518310546875 + }, + { + "auxiliary_loss_clip": 0.01067539, + "auxiliary_loss_mlp": 0.01034186, + "balance_loss_clip": 1.03231907, + "balance_loss_mlp": 1.02147222, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 1.8273587430240612, + "language_loss": 0.76658159, + "learning_rate": 4.365625413419365e-07, + "loss": 0.78759879, + "num_input_tokens_seen": 284250365, + "step": 13174, + "time_per_iteration": 2.6060094833374023 + }, + { + "auxiliary_loss_clip": 0.01078265, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.03224182, + "balance_loss_mlp": 1.01978922, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 1.6850920827691889, + "language_loss": 0.71889567, + "learning_rate": 4.363196905447297e-07, + "loss": 0.73999047, + "num_input_tokens_seen": 284269635, + "step": 13175, + "time_per_iteration": 2.574516534805298 + }, + { + "auxiliary_loss_clip": 0.01092909, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.03448963, + "balance_loss_mlp": 1.01759791, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 1.8611115816172499, + "language_loss": 0.59508944, + "learning_rate": 4.360768990424364e-07, + "loss": 0.61631727, + "num_input_tokens_seen": 284288380, + "step": 13176, + "time_per_iteration": 2.451698064804077 + }, + { + "auxiliary_loss_clip": 0.01106869, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.03906846, + "balance_loss_mlp": 1.01872683, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 1.7986649271149069, + "language_loss": 0.73413515, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75551069, + "num_input_tokens_seen": 284306920, + "step": 13177, + "time_per_iteration": 2.4519453048706055 + }, + { + "auxiliary_loss_clip": 0.01087542, + "auxiliary_loss_mlp": 0.01031722, + "balance_loss_clip": 1.03494608, + "balance_loss_mlp": 1.01962256, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 1.9093988702350015, + "language_loss": 0.64183331, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66302598, + "num_input_tokens_seen": 284324700, + "step": 13178, + "time_per_iteration": 2.441577434539795 + }, + { + "auxiliary_loss_clip": 0.01079865, + "auxiliary_loss_mlp": 0.01031732, + "balance_loss_clip": 1.03361952, + "balance_loss_mlp": 1.02102745, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.5421284892907268, + "language_loss": 0.68544, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.70655596, + "num_input_tokens_seen": 284345985, + "step": 13179, + "time_per_iteration": 2.60136342048645 + }, + { + "auxiliary_loss_clip": 0.01102273, + "auxiliary_loss_mlp": 0.01027685, + "balance_loss_clip": 1.03480196, + "balance_loss_mlp": 1.01571596, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 1.9935971334300522, + "language_loss": 0.74359739, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76489699, + "num_input_tokens_seen": 284364475, + "step": 13180, + "time_per_iteration": 3.8719544410705566 + }, + { + "auxiliary_loss_clip": 0.01093669, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.03767753, + "balance_loss_mlp": 1.02230418, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 2.009058378286736, + "language_loss": 0.81420845, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.8354975, + "num_input_tokens_seen": 284382125, + "step": 13181, + "time_per_iteration": 2.4489307403564453 + }, + { + "auxiliary_loss_clip": 0.01075774, + "auxiliary_loss_mlp": 0.01035403, + "balance_loss_clip": 1.03476405, + "balance_loss_mlp": 1.02276635, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 1.779442516650618, + "language_loss": 0.77660525, + "learning_rate": 4.346213957372895e-07, + "loss": 0.79771698, + "num_input_tokens_seen": 284401585, + "step": 13182, + "time_per_iteration": 2.5456299781799316 + }, + { + "auxiliary_loss_clip": 0.01099877, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.03801358, + "balance_loss_mlp": 1.02331448, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 1.850407664163874, + "language_loss": 0.73844552, + "learning_rate": 4.34379019557056e-07, + "loss": 0.75981259, + "num_input_tokens_seen": 284419125, + "step": 13183, + "time_per_iteration": 2.4586498737335205 + }, + { + "auxiliary_loss_clip": 0.01073653, + "auxiliary_loss_mlp": 0.01026646, + "balance_loss_clip": 1.03509855, + "balance_loss_mlp": 1.01337183, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 1.8123674292201173, + "language_loss": 0.68082094, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70182383, + "num_input_tokens_seen": 284440445, + "step": 13184, + "time_per_iteration": 2.6618990898132324 + }, + { + "auxiliary_loss_clip": 0.01066278, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.03717566, + "balance_loss_mlp": 1.02097154, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 1.7362120078361776, + "language_loss": 0.71063006, + "learning_rate": 4.338944453112907e-07, + "loss": 0.7316252, + "num_input_tokens_seen": 284459370, + "step": 13185, + "time_per_iteration": 2.537388563156128 + }, + { + "auxiliary_loss_clip": 0.01090846, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.03662205, + "balance_loss_mlp": 1.01569211, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 3.9185797132809785, + "language_loss": 0.65336668, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67455769, + "num_input_tokens_seen": 284477525, + "step": 13186, + "time_per_iteration": 2.489509344100952 + }, + { + "auxiliary_loss_clip": 0.01087162, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.0338335, + "balance_loss_mlp": 1.02105927, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 1.75431095837258, + "language_loss": 0.76763535, + "learning_rate": 4.334101086130408e-07, + "loss": 0.78883362, + "num_input_tokens_seen": 284496590, + "step": 13187, + "time_per_iteration": 2.5379977226257324 + }, + { + "auxiliary_loss_clip": 0.01084322, + "auxiliary_loss_mlp": 0.01026384, + "balance_loss_clip": 1.03621733, + "balance_loss_mlp": 1.01451683, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 1.921777012413162, + "language_loss": 0.72297192, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.74407899, + "num_input_tokens_seen": 284511470, + "step": 13188, + "time_per_iteration": 2.497368812561035 + }, + { + "auxiliary_loss_clip": 0.01105984, + "auxiliary_loss_mlp": 0.00784344, + "balance_loss_clip": 1.03521633, + "balance_loss_mlp": 1.00993431, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 2.0516926783239966, + "language_loss": 0.63006079, + "learning_rate": 4.329260095357725e-07, + "loss": 0.64896405, + "num_input_tokens_seen": 284531125, + "step": 13189, + "time_per_iteration": 2.448115825653076 + }, + { + "auxiliary_loss_clip": 0.0105789, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.03458059, + "balance_loss_mlp": 1.01847649, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 1.8738646153742748, + "language_loss": 0.72152179, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74239528, + "num_input_tokens_seen": 284549340, + "step": 13190, + "time_per_iteration": 2.5836496353149414 + }, + { + "auxiliary_loss_clip": 0.01091189, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.03757, + "balance_loss_mlp": 1.01726007, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 1.7730795406922149, + "language_loss": 0.73115295, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75234145, + "num_input_tokens_seen": 284567060, + "step": 13191, + "time_per_iteration": 2.5335373878479004 + }, + { + "auxiliary_loss_clip": 0.0109211, + "auxiliary_loss_mlp": 0.01035421, + "balance_loss_clip": 1.0351212, + "balance_loss_mlp": 1.0226897, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 1.8408789420606098, + "language_loss": 0.68683851, + "learning_rate": 4.322003066198219e-07, + "loss": 0.70811379, + "num_input_tokens_seen": 284586600, + "step": 13192, + "time_per_iteration": 2.510612964630127 + }, + { + "auxiliary_loss_clip": 0.01070936, + "auxiliary_loss_mlp": 0.01037031, + "balance_loss_clip": 1.03412068, + "balance_loss_mlp": 1.02485394, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 1.5437493258656407, + "language_loss": 0.74796194, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.7690416, + "num_input_tokens_seen": 284605715, + "step": 13193, + "time_per_iteration": 2.53916072845459 + }, + { + "auxiliary_loss_clip": 0.01092643, + "auxiliary_loss_mlp": 0.01030047, + "balance_loss_clip": 1.03728032, + "balance_loss_mlp": 1.01689816, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 1.4716361325591532, + "language_loss": 0.71890461, + "learning_rate": 4.317168019161741e-07, + "loss": 0.7401315, + "num_input_tokens_seen": 284628540, + "step": 13194, + "time_per_iteration": 2.5463688373565674 + }, + { + "auxiliary_loss_clip": 0.01107877, + "auxiliary_loss_mlp": 0.01033266, + "balance_loss_clip": 1.03650522, + "balance_loss_mlp": 1.02079034, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 1.9906596521576876, + "language_loss": 0.7010386, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72245002, + "num_input_tokens_seen": 284646040, + "step": 13195, + "time_per_iteration": 2.433253288269043 + }, + { + "auxiliary_loss_clip": 0.01053823, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.03632808, + "balance_loss_mlp": 1.01435328, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 1.6788474722170903, + "language_loss": 0.77445751, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79525626, + "num_input_tokens_seen": 284665110, + "step": 13196, + "time_per_iteration": 2.6459922790527344 + }, + { + "auxiliary_loss_clip": 0.01073242, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.03576493, + "balance_loss_mlp": 1.02542233, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 1.5861764964000218, + "language_loss": 0.68631828, + "learning_rate": 4.309919909045268e-07, + "loss": 0.70742667, + "num_input_tokens_seen": 284686515, + "step": 13197, + "time_per_iteration": 2.6409270763397217 + }, + { + "auxiliary_loss_clip": 0.01091843, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.03540528, + "balance_loss_mlp": 1.01679754, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 1.6479064261612526, + "language_loss": 0.64929438, + "learning_rate": 4.30750506215646e-07, + "loss": 0.67049873, + "num_input_tokens_seen": 284707300, + "step": 13198, + "time_per_iteration": 2.5902647972106934 + }, + { + "auxiliary_loss_clip": 0.01056793, + "auxiliary_loss_mlp": 0.01040996, + "balance_loss_clip": 1.03469157, + "balance_loss_mlp": 1.02625012, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 1.9895710891339322, + "language_loss": 0.72520709, + "learning_rate": 4.30509081032864e-07, + "loss": 0.74618506, + "num_input_tokens_seen": 284723545, + "step": 13199, + "time_per_iteration": 2.532724380493164 + }, + { + "auxiliary_loss_clip": 0.01077953, + "auxiliary_loss_mlp": 0.01030496, + "balance_loss_clip": 1.03364444, + "balance_loss_mlp": 1.01874197, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 1.8672598215577085, + "language_loss": 0.8063162, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82740068, + "num_input_tokens_seen": 284742650, + "step": 13200, + "time_per_iteration": 2.5364186763763428 + }, + { + "auxiliary_loss_clip": 0.01091815, + "auxiliary_loss_mlp": 0.01029356, + "balance_loss_clip": 1.03686976, + "balance_loss_mlp": 1.01788771, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 1.640679408302962, + "language_loss": 0.77306491, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.7942766, + "num_input_tokens_seen": 284760955, + "step": 13201, + "time_per_iteration": 2.4687345027923584 + }, + { + "auxiliary_loss_clip": 0.01103096, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.03541076, + "balance_loss_mlp": 1.01979685, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 1.4717353306747516, + "language_loss": 0.67109442, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69244105, + "num_input_tokens_seen": 284780745, + "step": 13202, + "time_per_iteration": 2.548905372619629 + }, + { + "auxiliary_loss_clip": 0.01094116, + "auxiliary_loss_mlp": 0.01031666, + "balance_loss_clip": 1.03601706, + "balance_loss_mlp": 1.01892209, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 1.8146861707796742, + "language_loss": 0.74739659, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.76865447, + "num_input_tokens_seen": 284799000, + "step": 13203, + "time_per_iteration": 2.4939615726470947 + }, + { + "auxiliary_loss_clip": 0.0104977, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.03276849, + "balance_loss_mlp": 1.01925111, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 1.806018109897258, + "language_loss": 0.66436803, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68517268, + "num_input_tokens_seen": 284817450, + "step": 13204, + "time_per_iteration": 2.6724603176116943 + }, + { + "auxiliary_loss_clip": 0.01046284, + "auxiliary_loss_mlp": 0.0102724, + "balance_loss_clip": 1.0355438, + "balance_loss_mlp": 1.01584363, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.3111537033805436, + "language_loss": 0.79516029, + "learning_rate": 4.290617800767438e-07, + "loss": 0.8158955, + "num_input_tokens_seen": 284838865, + "step": 13205, + "time_per_iteration": 2.648261308670044 + }, + { + "auxiliary_loss_clip": 0.01070285, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.03424215, + "balance_loss_mlp": 1.01810575, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 1.9812185113480505, + "language_loss": 0.7787407, + "learning_rate": 4.28820771692858e-07, + "loss": 0.79974407, + "num_input_tokens_seen": 284857975, + "step": 13206, + "time_per_iteration": 2.5571513175964355 + }, + { + "auxiliary_loss_clip": 0.01081585, + "auxiliary_loss_mlp": 0.0103942, + "balance_loss_clip": 1.03598583, + "balance_loss_mlp": 1.02530515, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 1.9458918681852955, + "language_loss": 0.79419506, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81540507, + "num_input_tokens_seen": 284877145, + "step": 13207, + "time_per_iteration": 2.553318738937378 + }, + { + "auxiliary_loss_clip": 0.01068627, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.03629017, + "balance_loss_mlp": 1.01931345, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 1.771933747673076, + "language_loss": 0.83872581, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.85972333, + "num_input_tokens_seen": 284895560, + "step": 13208, + "time_per_iteration": 2.597020387649536 + }, + { + "auxiliary_loss_clip": 0.00994371, + "auxiliary_loss_mlp": 0.01004227, + "balance_loss_clip": 1.00969017, + "balance_loss_mlp": 1.00292766, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.7163224593324753, + "language_loss": 0.58325076, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60323679, + "num_input_tokens_seen": 284963135, + "step": 13209, + "time_per_iteration": 4.748367547988892 + }, + { + "auxiliary_loss_clip": 0.01070072, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.03370738, + "balance_loss_mlp": 1.01837456, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 2.0830480249151884, + "language_loss": 0.62873131, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.64974672, + "num_input_tokens_seen": 284981755, + "step": 13210, + "time_per_iteration": 2.571667194366455 + }, + { + "auxiliary_loss_clip": 0.01088616, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.03571844, + "balance_loss_mlp": 1.02065253, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.5655829167959905, + "language_loss": 0.69090295, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.7121104, + "num_input_tokens_seen": 285003060, + "step": 13211, + "time_per_iteration": 3.9563143253326416 + }, + { + "auxiliary_loss_clip": 0.01096623, + "auxiliary_loss_mlp": 0.01034004, + "balance_loss_clip": 1.03629756, + "balance_loss_mlp": 1.02164829, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 1.7288808238498627, + "language_loss": 0.72254169, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.74384803, + "num_input_tokens_seen": 285021640, + "step": 13212, + "time_per_iteration": 3.8920576572418213 + }, + { + "auxiliary_loss_clip": 0.01091172, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.03522134, + "balance_loss_mlp": 1.01619196, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 1.5978588177964779, + "language_loss": 0.81014895, + "learning_rate": 4.271353817368246e-07, + "loss": 0.83133805, + "num_input_tokens_seen": 285040490, + "step": 13213, + "time_per_iteration": 2.480397939682007 + }, + { + "auxiliary_loss_clip": 0.01098057, + "auxiliary_loss_mlp": 0.0103006, + "balance_loss_clip": 1.03718746, + "balance_loss_mlp": 1.01766229, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 2.090563871805547, + "language_loss": 0.68053931, + "learning_rate": 4.268948502428327e-07, + "loss": 0.70182049, + "num_input_tokens_seen": 285059270, + "step": 13214, + "time_per_iteration": 2.490478754043579 + }, + { + "auxiliary_loss_clip": 0.01102318, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.03578985, + "balance_loss_mlp": 1.01715505, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 2.14047709542354, + "language_loss": 0.72620785, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74751556, + "num_input_tokens_seen": 285075390, + "step": 13215, + "time_per_iteration": 2.431408643722534 + }, + { + "auxiliary_loss_clip": 0.01053908, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.03493834, + "balance_loss_mlp": 1.02115154, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.5629648225577593, + "language_loss": 0.78813583, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.80901545, + "num_input_tokens_seen": 285096290, + "step": 13216, + "time_per_iteration": 2.62217378616333 + }, + { + "auxiliary_loss_clip": 0.01091641, + "auxiliary_loss_mlp": 0.01032388, + "balance_loss_clip": 1.03552306, + "balance_loss_mlp": 1.0201211, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.584101574006816, + "language_loss": 0.73852062, + "learning_rate": 4.261736137111598e-07, + "loss": 0.75976086, + "num_input_tokens_seen": 285116020, + "step": 13217, + "time_per_iteration": 2.5042057037353516 + }, + { + "auxiliary_loss_clip": 0.01075964, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.03357637, + "balance_loss_mlp": 1.02032804, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 1.8088663320061202, + "language_loss": 0.74436778, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76545602, + "num_input_tokens_seen": 285133510, + "step": 13218, + "time_per_iteration": 2.5118370056152344 + }, + { + "auxiliary_loss_clip": 0.01095708, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.03554535, + "balance_loss_mlp": 1.02240777, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 1.9210050251752622, + "language_loss": 0.83219767, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85351062, + "num_input_tokens_seen": 285151690, + "step": 13219, + "time_per_iteration": 3.8837080001831055 + }, + { + "auxiliary_loss_clip": 0.01094231, + "auxiliary_loss_mlp": 0.01040385, + "balance_loss_clip": 1.03591704, + "balance_loss_mlp": 1.02568626, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 3.27541271630113, + "language_loss": 0.75475824, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.77610445, + "num_input_tokens_seen": 285170485, + "step": 13220, + "time_per_iteration": 2.5024161338806152 + }, + { + "auxiliary_loss_clip": 0.01088128, + "auxiliary_loss_mlp": 0.01032374, + "balance_loss_clip": 1.03603351, + "balance_loss_mlp": 1.01991653, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 1.710344383739916, + "language_loss": 0.72549617, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74670118, + "num_input_tokens_seen": 285191050, + "step": 13221, + "time_per_iteration": 2.669774055480957 + }, + { + "auxiliary_loss_clip": 0.01093125, + "auxiliary_loss_mlp": 0.01025907, + "balance_loss_clip": 1.03645897, + "balance_loss_mlp": 1.01458788, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 2.1230476212234985, + "language_loss": 0.74712038, + "learning_rate": 4.249727465395634e-07, + "loss": 0.76831067, + "num_input_tokens_seen": 285208750, + "step": 13222, + "time_per_iteration": 2.444411516189575 + }, + { + "auxiliary_loss_clip": 0.01011684, + "auxiliary_loss_mlp": 0.01001703, + "balance_loss_clip": 1.00916624, + "balance_loss_mlp": 1.00058198, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7666821029573112, + "language_loss": 0.67073858, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69087243, + "num_input_tokens_seen": 285264605, + "step": 13223, + "time_per_iteration": 3.0004827976226807 + }, + { + "auxiliary_loss_clip": 0.01092703, + "auxiliary_loss_mlp": 0.0102909, + "balance_loss_clip": 1.03508735, + "balance_loss_mlp": 1.01639402, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 1.6378725692833767, + "language_loss": 0.71279943, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73401731, + "num_input_tokens_seen": 285283940, + "step": 13224, + "time_per_iteration": 2.4958302974700928 + }, + { + "auxiliary_loss_clip": 0.01031663, + "auxiliary_loss_mlp": 0.01003705, + "balance_loss_clip": 1.00885916, + "balance_loss_mlp": 1.00239396, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.6676627337500398, + "language_loss": 0.55005997, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57041365, + "num_input_tokens_seen": 285349525, + "step": 13225, + "time_per_iteration": 3.135474443435669 + }, + { + "auxiliary_loss_clip": 0.01078766, + "auxiliary_loss_mlp": 0.01021389, + "balance_loss_clip": 1.03261662, + "balance_loss_mlp": 1.01011777, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 1.990812863730598, + "language_loss": 0.65203363, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.67303514, + "num_input_tokens_seen": 285367355, + "step": 13226, + "time_per_iteration": 2.505829334259033 + }, + { + "auxiliary_loss_clip": 0.01060602, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.03619623, + "balance_loss_mlp": 1.02419734, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 2.10313229420545, + "language_loss": 0.70109773, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72206283, + "num_input_tokens_seen": 285386190, + "step": 13227, + "time_per_iteration": 2.7088866233825684 + }, + { + "auxiliary_loss_clip": 0.01068614, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.03524375, + "balance_loss_mlp": 1.01766777, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 1.6615762884527814, + "language_loss": 0.6892125, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71018636, + "num_input_tokens_seen": 285406150, + "step": 13228, + "time_per_iteration": 2.6202423572540283 + }, + { + "auxiliary_loss_clip": 0.01057116, + "auxiliary_loss_mlp": 0.01038555, + "balance_loss_clip": 1.03179586, + "balance_loss_mlp": 1.02570426, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.4161574405640922, + "language_loss": 0.7077204, + "learning_rate": 4.232940412119095e-07, + "loss": 0.72867709, + "num_input_tokens_seen": 285429900, + "step": 13229, + "time_per_iteration": 2.742006778717041 + }, + { + "auxiliary_loss_clip": 0.01099299, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.03892946, + "balance_loss_mlp": 1.02045798, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 1.6952991344465926, + "language_loss": 0.71915197, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.7404741, + "num_input_tokens_seen": 285452555, + "step": 13230, + "time_per_iteration": 2.57804799079895 + }, + { + "auxiliary_loss_clip": 0.01013509, + "auxiliary_loss_mlp": 0.01000038, + "balance_loss_clip": 1.01167798, + "balance_loss_mlp": 0.99892348, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.8919757519885787, + "language_loss": 0.63613057, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65626609, + "num_input_tokens_seen": 285515700, + "step": 13231, + "time_per_iteration": 3.1293840408325195 + }, + { + "auxiliary_loss_clip": 0.01081326, + "auxiliary_loss_mlp": 0.01026459, + "balance_loss_clip": 1.03424132, + "balance_loss_mlp": 1.01477087, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 1.7060056902447116, + "language_loss": 0.69970542, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.72078335, + "num_input_tokens_seen": 285533910, + "step": 13232, + "time_per_iteration": 2.521815299987793 + }, + { + "auxiliary_loss_clip": 0.01093235, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.03515947, + "balance_loss_mlp": 1.01507258, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 2.9374979561657177, + "language_loss": 0.77852917, + "learning_rate": 4.223360961792952e-07, + "loss": 0.79973167, + "num_input_tokens_seen": 285554080, + "step": 13233, + "time_per_iteration": 2.52329421043396 + }, + { + "auxiliary_loss_clip": 0.01094564, + "auxiliary_loss_mlp": 0.01032111, + "balance_loss_clip": 1.03553724, + "balance_loss_mlp": 1.02011251, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 1.92927225006163, + "language_loss": 0.79039514, + "learning_rate": 4.220967594613769e-07, + "loss": 0.81166184, + "num_input_tokens_seen": 285572325, + "step": 13234, + "time_per_iteration": 2.501094102859497 + }, + { + "auxiliary_loss_clip": 0.01082979, + "auxiliary_loss_mlp": 0.00780932, + "balance_loss_clip": 1.03599429, + "balance_loss_mlp": 1.00659692, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 1.5899201878544813, + "language_loss": 0.70076466, + "learning_rate": 4.218574825777077e-07, + "loss": 0.7194038, + "num_input_tokens_seen": 285589770, + "step": 13235, + "time_per_iteration": 2.4995243549346924 + }, + { + "auxiliary_loss_clip": 0.01066829, + "auxiliary_loss_mlp": 0.01027448, + "balance_loss_clip": 1.03611755, + "balance_loss_mlp": 1.01537848, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 1.578202652555452, + "language_loss": 0.6809094, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.7018522, + "num_input_tokens_seen": 285610065, + "step": 13236, + "time_per_iteration": 2.5930373668670654 + }, + { + "auxiliary_loss_clip": 0.01055245, + "auxiliary_loss_mlp": 0.01025982, + "balance_loss_clip": 1.0334146, + "balance_loss_mlp": 1.01381111, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.7392182004996524, + "language_loss": 0.75212389, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77293617, + "num_input_tokens_seen": 285628480, + "step": 13237, + "time_per_iteration": 2.576793670654297 + }, + { + "auxiliary_loss_clip": 0.01094473, + "auxiliary_loss_mlp": 0.01034664, + "balance_loss_clip": 1.03648806, + "balance_loss_mlp": 1.02224243, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 3.2927590053638895, + "language_loss": 0.71859026, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73988163, + "num_input_tokens_seen": 285647805, + "step": 13238, + "time_per_iteration": 2.5159196853637695 + }, + { + "auxiliary_loss_clip": 0.01089385, + "auxiliary_loss_mlp": 0.01026902, + "balance_loss_clip": 1.03478003, + "balance_loss_mlp": 1.01488566, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 1.7024174191986974, + "language_loss": 0.7349658, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.75612867, + "num_input_tokens_seen": 285665505, + "step": 13239, + "time_per_iteration": 2.463737726211548 + }, + { + "auxiliary_loss_clip": 0.01107563, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.03641343, + "balance_loss_mlp": 1.02084136, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 1.8389488216304122, + "language_loss": 0.69676816, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71817309, + "num_input_tokens_seen": 285685855, + "step": 13240, + "time_per_iteration": 2.5436160564422607 + }, + { + "auxiliary_loss_clip": 0.01024932, + "auxiliary_loss_mlp": 0.01002265, + "balance_loss_clip": 1.01187158, + "balance_loss_mlp": 1.00110841, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8894929062282761, + "language_loss": 0.58694494, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60721689, + "num_input_tokens_seen": 285735710, + "step": 13241, + "time_per_iteration": 2.882469654083252 + }, + { + "auxiliary_loss_clip": 0.01074145, + "auxiliary_loss_mlp": 0.01030897, + "balance_loss_clip": 1.03696966, + "balance_loss_mlp": 1.01965594, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 2.4998962379746863, + "language_loss": 0.64260161, + "learning_rate": 4.201842205128772e-07, + "loss": 0.663652, + "num_input_tokens_seen": 285757045, + "step": 13242, + "time_per_iteration": 2.6980950832366943 + }, + { + "auxiliary_loss_clip": 0.01104646, + "auxiliary_loss_mlp": 0.01034399, + "balance_loss_clip": 1.03546596, + "balance_loss_mlp": 1.02191782, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 1.9391778549737408, + "language_loss": 0.75975037, + "learning_rate": 4.199454226296526e-07, + "loss": 0.78114086, + "num_input_tokens_seen": 285776050, + "step": 13243, + "time_per_iteration": 2.4617419242858887 + }, + { + "auxiliary_loss_clip": 0.01077169, + "auxiliary_loss_mlp": 0.01027813, + "balance_loss_clip": 1.03622675, + "balance_loss_mlp": 1.01537967, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.6054770026277123, + "language_loss": 0.7939235, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.81497335, + "num_input_tokens_seen": 285796830, + "step": 13244, + "time_per_iteration": 2.597487211227417 + }, + { + "auxiliary_loss_clip": 0.0109673, + "auxiliary_loss_mlp": 0.01028699, + "balance_loss_clip": 1.03562558, + "balance_loss_mlp": 1.01596761, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 2.0544943145311882, + "language_loss": 0.68209231, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70334661, + "num_input_tokens_seen": 285814755, + "step": 13245, + "time_per_iteration": 2.4767088890075684 + }, + { + "auxiliary_loss_clip": 0.01084122, + "auxiliary_loss_mlp": 0.01032389, + "balance_loss_clip": 1.03559971, + "balance_loss_mlp": 1.02006912, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.4688082024812112, + "language_loss": 0.79095185, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81211698, + "num_input_tokens_seen": 285834255, + "step": 13246, + "time_per_iteration": 2.536513328552246 + }, + { + "auxiliary_loss_clip": 0.01084447, + "auxiliary_loss_mlp": 0.01027008, + "balance_loss_clip": 1.03446627, + "balance_loss_mlp": 1.01505744, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 1.6704279361428402, + "language_loss": 0.65969992, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.6808145, + "num_input_tokens_seen": 285853540, + "step": 13247, + "time_per_iteration": 2.5416057109832764 + }, + { + "auxiliary_loss_clip": 0.01077923, + "auxiliary_loss_mlp": 0.01027265, + "balance_loss_clip": 1.03308988, + "balance_loss_mlp": 1.01664352, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 1.9137652874781979, + "language_loss": 0.71459162, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.73564357, + "num_input_tokens_seen": 285872705, + "step": 13248, + "time_per_iteration": 5.3422019481658936 + }, + { + "auxiliary_loss_clip": 0.01086275, + "auxiliary_loss_mlp": 0.01028068, + "balance_loss_clip": 1.03481734, + "balance_loss_mlp": 1.01577759, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 1.9362713061569736, + "language_loss": 0.75956547, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78070891, + "num_input_tokens_seen": 285890290, + "step": 13249, + "time_per_iteration": 2.548143148422241 + }, + { + "auxiliary_loss_clip": 0.01076376, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.03576922, + "balance_loss_mlp": 1.02138352, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.0231233864144733, + "language_loss": 0.62236667, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.64345825, + "num_input_tokens_seen": 285909190, + "step": 13250, + "time_per_iteration": 2.4835221767425537 + }, + { + "auxiliary_loss_clip": 0.01083085, + "auxiliary_loss_mlp": 0.01024134, + "balance_loss_clip": 1.0347507, + "balance_loss_mlp": 1.01177835, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 2.44142669807278, + "language_loss": 0.71697384, + "learning_rate": 4.180371972938206e-07, + "loss": 0.73804605, + "num_input_tokens_seen": 285927570, + "step": 13251, + "time_per_iteration": 3.9977009296417236 + }, + { + "auxiliary_loss_clip": 0.01108781, + "auxiliary_loss_mlp": 0.0103249, + "balance_loss_clip": 1.03724337, + "balance_loss_mlp": 1.01948452, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 2.9866308401023502, + "language_loss": 0.7325269, + "learning_rate": 4.177989389787624e-07, + "loss": 0.75393957, + "num_input_tokens_seen": 285945810, + "step": 13252, + "time_per_iteration": 2.4762418270111084 + }, + { + "auxiliary_loss_clip": 0.01101281, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.03517151, + "balance_loss_mlp": 1.01860452, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 2.0005914239079665, + "language_loss": 0.66025251, + "learning_rate": 4.175607406609278e-07, + "loss": 0.68157017, + "num_input_tokens_seen": 285964235, + "step": 13253, + "time_per_iteration": 2.5017457008361816 + }, + { + "auxiliary_loss_clip": 0.01079632, + "auxiliary_loss_mlp": 0.01035236, + "balance_loss_clip": 1.03945887, + "balance_loss_mlp": 1.02257013, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 1.4361955491350193, + "language_loss": 0.68002558, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.7011742, + "num_input_tokens_seen": 285983710, + "step": 13254, + "time_per_iteration": 2.5740509033203125 + }, + { + "auxiliary_loss_clip": 0.01092712, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.03440809, + "balance_loss_mlp": 1.0220598, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 1.7871829556532768, + "language_loss": 0.69200301, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71326685, + "num_input_tokens_seen": 286003425, + "step": 13255, + "time_per_iteration": 2.4927141666412354 + }, + { + "auxiliary_loss_clip": 0.01101539, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.03407741, + "balance_loss_mlp": 1.01690722, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 1.8589655606821995, + "language_loss": 0.78862518, + "learning_rate": 4.168465057810733e-07, + "loss": 0.80992496, + "num_input_tokens_seen": 286020130, + "step": 13256, + "time_per_iteration": 2.4588615894317627 + }, + { + "auxiliary_loss_clip": 0.01093001, + "auxiliary_loss_mlp": 0.01027585, + "balance_loss_clip": 1.03631926, + "balance_loss_mlp": 1.01544392, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 1.825487247715329, + "language_loss": 0.65966737, + "learning_rate": 4.166085475424315e-07, + "loss": 0.68087316, + "num_input_tokens_seen": 286040230, + "step": 13257, + "time_per_iteration": 2.486119031906128 + }, + { + "auxiliary_loss_clip": 0.01085986, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.03579354, + "balance_loss_mlp": 1.02010036, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 1.6811890693598976, + "language_loss": 0.72089088, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74207175, + "num_input_tokens_seen": 286059475, + "step": 13258, + "time_per_iteration": 3.8730008602142334 + }, + { + "auxiliary_loss_clip": 0.01096755, + "auxiliary_loss_mlp": 0.01031452, + "balance_loss_clip": 1.03625035, + "balance_loss_mlp": 1.0189414, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 1.6644352046972792, + "language_loss": 0.69202065, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.71330273, + "num_input_tokens_seen": 286077820, + "step": 13259, + "time_per_iteration": 2.4964866638183594 + }, + { + "auxiliary_loss_clip": 0.01090813, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.03520584, + "balance_loss_mlp": 1.01792526, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 2.6200728821374897, + "language_loss": 0.73817325, + "learning_rate": 4.158950331167641e-07, + "loss": 0.7593711, + "num_input_tokens_seen": 286097285, + "step": 13260, + "time_per_iteration": 2.5178604125976562 + }, + { + "auxiliary_loss_clip": 0.01078427, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.03261471, + "balance_loss_mlp": 1.0165565, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 1.702181950932197, + "language_loss": 0.78287929, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80394393, + "num_input_tokens_seen": 286116000, + "step": 13261, + "time_per_iteration": 2.5473992824554443 + }, + { + "auxiliary_loss_clip": 0.01089867, + "auxiliary_loss_mlp": 0.01027049, + "balance_loss_clip": 1.03540909, + "balance_loss_mlp": 1.01646376, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.6181045657758593, + "language_loss": 0.76223749, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78340662, + "num_input_tokens_seen": 286135110, + "step": 13262, + "time_per_iteration": 2.48492693901062 + }, + { + "auxiliary_loss_clip": 0.01075886, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.03774381, + "balance_loss_mlp": 1.01699042, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 2.3572436446602576, + "language_loss": 0.70475858, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.72582334, + "num_input_tokens_seen": 286152835, + "step": 13263, + "time_per_iteration": 2.5712146759033203 + }, + { + "auxiliary_loss_clip": 0.01098736, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.03561234, + "balance_loss_mlp": 1.01851797, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 1.621283840775451, + "language_loss": 0.70957935, + "learning_rate": 4.149445215631153e-07, + "loss": 0.73088384, + "num_input_tokens_seen": 286171785, + "step": 13264, + "time_per_iteration": 2.4827563762664795 + }, + { + "auxiliary_loss_clip": 0.01101903, + "auxiliary_loss_mlp": 0.01031176, + "balance_loss_clip": 1.03533113, + "balance_loss_mlp": 1.0194217, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.9036258143621274, + "language_loss": 0.76808614, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.78941697, + "num_input_tokens_seen": 286190420, + "step": 13265, + "time_per_iteration": 2.4708216190338135 + }, + { + "auxiliary_loss_clip": 0.01072534, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.0350951, + "balance_loss_mlp": 1.01668477, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 1.7953271891447327, + "language_loss": 0.75471693, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77572572, + "num_input_tokens_seen": 286210105, + "step": 13266, + "time_per_iteration": 2.5436112880706787 + }, + { + "auxiliary_loss_clip": 0.0107806, + "auxiliary_loss_mlp": 0.01025556, + "balance_loss_clip": 1.03577304, + "balance_loss_mlp": 1.01386189, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 1.5762933137923194, + "language_loss": 0.84064019, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86167628, + "num_input_tokens_seen": 286228180, + "step": 13267, + "time_per_iteration": 2.526582956314087 + }, + { + "auxiliary_loss_clip": 0.01092371, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.03542495, + "balance_loss_mlp": 1.01865745, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 1.6917749328555682, + "language_loss": 0.7601707, + "learning_rate": 4.139949716968223e-07, + "loss": 0.78140485, + "num_input_tokens_seen": 286247305, + "step": 13268, + "time_per_iteration": 2.4846396446228027 + }, + { + "auxiliary_loss_clip": 0.0110359, + "auxiliary_loss_mlp": 0.01028898, + "balance_loss_clip": 1.03541636, + "balance_loss_mlp": 1.01642859, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.5600846453866883, + "language_loss": 0.77902615, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.80035108, + "num_input_tokens_seen": 286268145, + "step": 13269, + "time_per_iteration": 2.4935390949249268 + }, + { + "auxiliary_loss_clip": 0.01085373, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.03227603, + "balance_loss_mlp": 1.0240922, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.7544193423281385, + "language_loss": 0.82239574, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84361237, + "num_input_tokens_seen": 286286775, + "step": 13270, + "time_per_iteration": 2.483074188232422 + }, + { + "auxiliary_loss_clip": 0.01064749, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.03543353, + "balance_loss_mlp": 1.02169323, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 2.3893799784034333, + "language_loss": 0.59755838, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.6185534, + "num_input_tokens_seen": 286305590, + "step": 13271, + "time_per_iteration": 2.5559189319610596 + }, + { + "auxiliary_loss_clip": 0.01086475, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.03586292, + "balance_loss_mlp": 1.01646376, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 1.603264273973951, + "language_loss": 0.73160207, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75275207, + "num_input_tokens_seen": 286328050, + "step": 13272, + "time_per_iteration": 2.574772596359253 + }, + { + "auxiliary_loss_clip": 0.01042768, + "auxiliary_loss_mlp": 0.01033047, + "balance_loss_clip": 1.03184223, + "balance_loss_mlp": 1.0197854, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 1.749781061764243, + "language_loss": 0.71465576, + "learning_rate": 4.128093876144161e-07, + "loss": 0.73541391, + "num_input_tokens_seen": 286345265, + "step": 13273, + "time_per_iteration": 2.5914793014526367 + }, + { + "auxiliary_loss_clip": 0.01083184, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.0364995, + "balance_loss_mlp": 1.02133656, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 1.7345356106681178, + "language_loss": 0.75649399, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.77766579, + "num_input_tokens_seen": 286364465, + "step": 13274, + "time_per_iteration": 2.5294363498687744 + }, + { + "auxiliary_loss_clip": 0.01051479, + "auxiliary_loss_mlp": 0.01028277, + "balance_loss_clip": 1.03218484, + "balance_loss_mlp": 1.01716113, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.3081115644855235, + "language_loss": 0.7779181, + "learning_rate": 4.12335575223518e-07, + "loss": 0.79871571, + "num_input_tokens_seen": 286385565, + "step": 13275, + "time_per_iteration": 2.6309478282928467 + }, + { + "auxiliary_loss_clip": 0.0109561, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.03588104, + "balance_loss_mlp": 1.02154255, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 1.9308364009838062, + "language_loss": 0.64156985, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66286939, + "num_input_tokens_seen": 286403950, + "step": 13276, + "time_per_iteration": 2.5662143230438232 + }, + { + "auxiliary_loss_clip": 0.0106286, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.0339601, + "balance_loss_mlp": 1.02040827, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 1.6493748511605029, + "language_loss": 0.60804826, + "learning_rate": 4.118620036501945e-07, + "loss": 0.62899709, + "num_input_tokens_seen": 286426160, + "step": 13277, + "time_per_iteration": 2.5986385345458984 + }, + { + "auxiliary_loss_clip": 0.01080978, + "auxiliary_loss_mlp": 0.01035291, + "balance_loss_clip": 1.03588653, + "balance_loss_mlp": 1.02277994, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 1.8630638729775306, + "language_loss": 0.79970646, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.82086915, + "num_input_tokens_seen": 286446610, + "step": 13278, + "time_per_iteration": 2.534487247467041 + }, + { + "auxiliary_loss_clip": 0.01084434, + "auxiliary_loss_mlp": 0.01034561, + "balance_loss_clip": 1.03529048, + "balance_loss_mlp": 1.02203798, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 1.9536461288656146, + "language_loss": 0.63583457, + "learning_rate": 4.113886729662768e-07, + "loss": 0.6570245, + "num_input_tokens_seen": 286465460, + "step": 13279, + "time_per_iteration": 2.535396099090576 + }, + { + "auxiliary_loss_clip": 0.01089529, + "auxiliary_loss_mlp": 0.0102658, + "balance_loss_clip": 1.03578258, + "balance_loss_mlp": 1.01566005, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 1.8358511408257965, + "language_loss": 0.71084499, + "learning_rate": 4.111520979802825e-07, + "loss": 0.73200607, + "num_input_tokens_seen": 286485720, + "step": 13280, + "time_per_iteration": 2.5524742603302 + }, + { + "auxiliary_loss_clip": 0.01065445, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.03397536, + "balance_loss_mlp": 1.02043009, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 1.8005110732560285, + "language_loss": 0.62975854, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.65076309, + "num_input_tokens_seen": 286507465, + "step": 13281, + "time_per_iteration": 2.6515886783599854 + }, + { + "auxiliary_loss_clip": 0.01093673, + "auxiliary_loss_mlp": 0.01033915, + "balance_loss_clip": 1.03361416, + "balance_loss_mlp": 1.02135003, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 2.0003697898732895, + "language_loss": 0.80217367, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.82344949, + "num_input_tokens_seen": 286526345, + "step": 13282, + "time_per_iteration": 2.496497869491577 + }, + { + "auxiliary_loss_clip": 0.01071686, + "auxiliary_loss_mlp": 0.00783428, + "balance_loss_clip": 1.03334069, + "balance_loss_mlp": 1.01083875, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 1.8014220771384368, + "language_loss": 0.71762258, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73617375, + "num_input_tokens_seen": 286544095, + "step": 13283, + "time_per_iteration": 2.5499420166015625 + }, + { + "auxiliary_loss_clip": 0.01089089, + "auxiliary_loss_mlp": 0.01026601, + "balance_loss_clip": 1.03343511, + "balance_loss_mlp": 1.0153234, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 1.8885048269022147, + "language_loss": 0.73305601, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75421292, + "num_input_tokens_seen": 286560960, + "step": 13284, + "time_per_iteration": 2.457382917404175 + }, + { + "auxiliary_loss_clip": 0.01076905, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.03342867, + "balance_loss_mlp": 1.02521741, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 1.4303897242407437, + "language_loss": 0.70467502, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72580826, + "num_input_tokens_seen": 286579865, + "step": 13285, + "time_per_iteration": 2.566587448120117 + }, + { + "auxiliary_loss_clip": 0.01076045, + "auxiliary_loss_mlp": 0.01029261, + "balance_loss_clip": 1.03250575, + "balance_loss_mlp": 1.01773357, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 1.6930351777122357, + "language_loss": 0.73077178, + "learning_rate": 4.097339136128437e-07, + "loss": 0.7518248, + "num_input_tokens_seen": 286597295, + "step": 13286, + "time_per_iteration": 2.485409736633301 + }, + { + "auxiliary_loss_clip": 0.01082288, + "auxiliary_loss_mlp": 0.0103162, + "balance_loss_clip": 1.03469586, + "balance_loss_mlp": 1.01978886, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 1.7727940748981907, + "language_loss": 0.75125241, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.7723915, + "num_input_tokens_seen": 286616270, + "step": 13287, + "time_per_iteration": 5.264992952346802 + }, + { + "auxiliary_loss_clip": 0.01080884, + "auxiliary_loss_mlp": 0.01026153, + "balance_loss_clip": 1.03751624, + "balance_loss_mlp": 1.01467311, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 1.474529186062847, + "language_loss": 0.61799055, + "learning_rate": 4.092616678191863e-07, + "loss": 0.63906091, + "num_input_tokens_seen": 286638315, + "step": 13288, + "time_per_iteration": 2.6056551933288574 + }, + { + "auxiliary_loss_clip": 0.01093123, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.0367918, + "balance_loss_mlp": 1.01809835, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 1.8993816038295326, + "language_loss": 0.69960684, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72082734, + "num_input_tokens_seen": 286658630, + "step": 13289, + "time_per_iteration": 2.52897572517395 + }, + { + "auxiliary_loss_clip": 0.01069323, + "auxiliary_loss_mlp": 0.01034115, + "balance_loss_clip": 1.03562832, + "balance_loss_mlp": 1.02174103, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 1.998427638628982, + "language_loss": 0.62400842, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64504278, + "num_input_tokens_seen": 286676870, + "step": 13290, + "time_per_iteration": 4.05820369720459 + }, + { + "auxiliary_loss_clip": 0.01096334, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.03586972, + "balance_loss_mlp": 1.0175221, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 2.4688692287134515, + "language_loss": 0.71719831, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73846674, + "num_input_tokens_seen": 286694300, + "step": 13291, + "time_per_iteration": 2.529148817062378 + }, + { + "auxiliary_loss_clip": 0.01064974, + "auxiliary_loss_mlp": 0.01025949, + "balance_loss_clip": 1.0340817, + "balance_loss_mlp": 1.01501119, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 1.5208533606290249, + "language_loss": 0.63467246, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65558165, + "num_input_tokens_seen": 286714545, + "step": 13292, + "time_per_iteration": 2.567009687423706 + }, + { + "auxiliary_loss_clip": 0.01091776, + "auxiliary_loss_mlp": 0.0103245, + "balance_loss_clip": 1.03426325, + "balance_loss_mlp": 1.02125025, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 1.9084777367447023, + "language_loss": 0.56018895, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58143121, + "num_input_tokens_seen": 286734525, + "step": 13293, + "time_per_iteration": 2.648062229156494 + }, + { + "auxiliary_loss_clip": 0.0107842, + "auxiliary_loss_mlp": 0.01031636, + "balance_loss_clip": 1.03742754, + "balance_loss_mlp": 1.01972067, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 2.7893960989675644, + "language_loss": 0.71615899, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.73725963, + "num_input_tokens_seen": 286753430, + "step": 13294, + "time_per_iteration": 2.7580318450927734 + }, + { + "auxiliary_loss_clip": 0.01069245, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.03507876, + "balance_loss_mlp": 1.02312791, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 1.7888106865134243, + "language_loss": 0.72384816, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74489814, + "num_input_tokens_seen": 286771915, + "step": 13295, + "time_per_iteration": 2.558220386505127 + }, + { + "auxiliary_loss_clip": 0.01070751, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.03565621, + "balance_loss_mlp": 1.0244441, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 1.6881909113370248, + "language_loss": 0.76558661, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78665173, + "num_input_tokens_seen": 286789835, + "step": 13296, + "time_per_iteration": 2.525397777557373 + }, + { + "auxiliary_loss_clip": 0.01004385, + "auxiliary_loss_mlp": 0.00997161, + "balance_loss_clip": 1.01361716, + "balance_loss_mlp": 0.99598122, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.6948636867311111, + "language_loss": 0.60819817, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62821364, + "num_input_tokens_seen": 286855580, + "step": 13297, + "time_per_iteration": 4.623366117477417 + }, + { + "auxiliary_loss_clip": 0.01082448, + "auxiliary_loss_mlp": 0.01030197, + "balance_loss_clip": 1.03628063, + "balance_loss_mlp": 1.01900351, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 2.123015536565371, + "language_loss": 0.69865847, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.71978498, + "num_input_tokens_seen": 286874360, + "step": 13298, + "time_per_iteration": 2.5298879146575928 + }, + { + "auxiliary_loss_clip": 0.01067275, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.03426349, + "balance_loss_mlp": 1.02180445, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 1.9641277104959538, + "language_loss": 0.75562048, + "learning_rate": 4.066686308212037e-07, + "loss": 0.77664495, + "num_input_tokens_seen": 286891950, + "step": 13299, + "time_per_iteration": 2.53023362159729 + }, + { + "auxiliary_loss_clip": 0.01078013, + "auxiliary_loss_mlp": 0.01030461, + "balance_loss_clip": 1.03389287, + "balance_loss_mlp": 1.01941013, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.6332192841080453, + "language_loss": 0.77607715, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79716188, + "num_input_tokens_seen": 286911725, + "step": 13300, + "time_per_iteration": 2.6062376499176025 + }, + { + "auxiliary_loss_clip": 0.01064062, + "auxiliary_loss_mlp": 0.01039573, + "balance_loss_clip": 1.03187001, + "balance_loss_mlp": 1.02516055, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 2.209297622575398, + "language_loss": 0.63990778, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.6609441, + "num_input_tokens_seen": 286931400, + "step": 13301, + "time_per_iteration": 2.5811424255371094 + }, + { + "auxiliary_loss_clip": 0.01092073, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.03642178, + "balance_loss_mlp": 1.02567172, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 1.606530216888061, + "language_loss": 0.72011799, + "learning_rate": 4.059627072173928e-07, + "loss": 0.74141598, + "num_input_tokens_seen": 286949795, + "step": 13302, + "time_per_iteration": 2.5813241004943848 + }, + { + "auxiliary_loss_clip": 0.01106842, + "auxiliary_loss_mlp": 0.0078726, + "balance_loss_clip": 1.03603709, + "balance_loss_mlp": 1.01529384, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 1.7861110881445073, + "language_loss": 0.8350457, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85398674, + "num_input_tokens_seen": 286968805, + "step": 13303, + "time_per_iteration": 2.4809443950653076 + }, + { + "auxiliary_loss_clip": 0.01101855, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.03551757, + "balance_loss_mlp": 1.01892436, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 1.728649453475977, + "language_loss": 0.58999354, + "learning_rate": 4.054923936969166e-07, + "loss": 0.61131454, + "num_input_tokens_seen": 286990235, + "step": 13304, + "time_per_iteration": 2.5681190490722656 + }, + { + "auxiliary_loss_clip": 0.01105463, + "auxiliary_loss_mlp": 0.0102892, + "balance_loss_clip": 1.03448009, + "balance_loss_mlp": 1.01599777, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 1.5442205248312337, + "language_loss": 0.6893127, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71065658, + "num_input_tokens_seen": 287011060, + "step": 13305, + "time_per_iteration": 2.4815049171447754 + }, + { + "auxiliary_loss_clip": 0.01067717, + "auxiliary_loss_mlp": 0.01028779, + "balance_loss_clip": 1.03604078, + "balance_loss_mlp": 1.01733506, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.5769833032011886, + "language_loss": 0.6946187, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71558368, + "num_input_tokens_seen": 287029215, + "step": 13306, + "time_per_iteration": 2.586491584777832 + }, + { + "auxiliary_loss_clip": 0.01093162, + "auxiliary_loss_mlp": 0.0103286, + "balance_loss_clip": 1.035622, + "balance_loss_mlp": 1.02109456, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 1.6390274856589602, + "language_loss": 0.69741178, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71867198, + "num_input_tokens_seen": 287050855, + "step": 13307, + "time_per_iteration": 2.5763046741485596 + }, + { + "auxiliary_loss_clip": 0.01082871, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.03584003, + "balance_loss_mlp": 1.02325821, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 2.1679408904688895, + "language_loss": 0.7694096, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.79058647, + "num_input_tokens_seen": 287069915, + "step": 13308, + "time_per_iteration": 2.5316720008850098 + }, + { + "auxiliary_loss_clip": 0.01058498, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.03469729, + "balance_loss_mlp": 1.01638341, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.3860982236034685, + "language_loss": 0.78555453, + "learning_rate": 4.0431766816972e-07, + "loss": 0.80643368, + "num_input_tokens_seen": 287091450, + "step": 13309, + "time_per_iteration": 2.6407670974731445 + }, + { + "auxiliary_loss_clip": 0.01030828, + "auxiliary_loss_mlp": 0.01002371, + "balance_loss_clip": 1.00798631, + "balance_loss_mlp": 1.00122106, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.9408338294991487, + "language_loss": 0.64609748, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66642946, + "num_input_tokens_seen": 287148365, + "step": 13310, + "time_per_iteration": 3.0011370182037354 + }, + { + "auxiliary_loss_clip": 0.0109379, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.03590107, + "balance_loss_mlp": 1.0222801, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 1.833716676248296, + "language_loss": 0.82588959, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.84716541, + "num_input_tokens_seen": 287168280, + "step": 13311, + "time_per_iteration": 2.5320956707000732 + }, + { + "auxiliary_loss_clip": 0.0109512, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.03629231, + "balance_loss_mlp": 1.0216893, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 2.6590635902118795, + "language_loss": 0.66220164, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.68348706, + "num_input_tokens_seen": 287185980, + "step": 13312, + "time_per_iteration": 2.4875237941741943 + }, + { + "auxiliary_loss_clip": 0.01109762, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.03905129, + "balance_loss_mlp": 1.02118707, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 1.703268378942902, + "language_loss": 0.75325143, + "learning_rate": 4.033789768462843e-07, + "loss": 0.77468783, + "num_input_tokens_seen": 287203875, + "step": 13313, + "time_per_iteration": 2.500614643096924 + }, + { + "auxiliary_loss_clip": 0.01091577, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.03376472, + "balance_loss_mlp": 1.02019298, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 1.3199652453216375, + "language_loss": 0.75586075, + "learning_rate": 4.031444553532575e-07, + "loss": 0.7771011, + "num_input_tokens_seen": 287226445, + "step": 13314, + "time_per_iteration": 2.560427188873291 + }, + { + "auxiliary_loss_clip": 0.00992168, + "auxiliary_loss_mlp": 0.0100162, + "balance_loss_clip": 1.01326847, + "balance_loss_mlp": 1.00057149, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.8145517156108162, + "language_loss": 0.53761435, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55755222, + "num_input_tokens_seen": 287286240, + "step": 13315, + "time_per_iteration": 3.121859550476074 + }, + { + "auxiliary_loss_clip": 0.01080576, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.03448415, + "balance_loss_mlp": 1.01997709, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 1.5119309087865334, + "language_loss": 0.71077794, + "learning_rate": 4.026755940348603e-07, + "loss": 0.73190445, + "num_input_tokens_seen": 287310265, + "step": 13316, + "time_per_iteration": 2.670164108276367 + }, + { + "auxiliary_loss_clip": 0.01084186, + "auxiliary_loss_mlp": 0.01029162, + "balance_loss_clip": 1.03592563, + "balance_loss_mlp": 1.01689506, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 1.810500196853428, + "language_loss": 0.64524603, + "learning_rate": 4.024412542272706e-07, + "loss": 0.66637951, + "num_input_tokens_seen": 287331610, + "step": 13317, + "time_per_iteration": 2.6386115550994873 + }, + { + "auxiliary_loss_clip": 0.01030593, + "auxiliary_loss_mlp": 0.01005247, + "balance_loss_clip": 1.00785232, + "balance_loss_mlp": 1.00411439, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 0.7873625651663488, + "language_loss": 0.58995062, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.610309, + "num_input_tokens_seen": 287394795, + "step": 13318, + "time_per_iteration": 3.158604621887207 + }, + { + "auxiliary_loss_clip": 0.01070179, + "auxiliary_loss_mlp": 0.01025886, + "balance_loss_clip": 1.03537011, + "balance_loss_mlp": 1.01445413, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 1.640218059856688, + "language_loss": 0.66601396, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68697459, + "num_input_tokens_seen": 287414595, + "step": 13319, + "time_per_iteration": 2.5701730251312256 + }, + { + "auxiliary_loss_clip": 0.01105446, + "auxiliary_loss_mlp": 0.00786595, + "balance_loss_clip": 1.03482199, + "balance_loss_mlp": 1.01220965, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 2.322316088063077, + "language_loss": 0.73790193, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.75682235, + "num_input_tokens_seen": 287434395, + "step": 13320, + "time_per_iteration": 2.5361669063568115 + }, + { + "auxiliary_loss_clip": 0.01091484, + "auxiliary_loss_mlp": 0.01026025, + "balance_loss_clip": 1.0356741, + "balance_loss_mlp": 1.01422358, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 2.4874330352689267, + "language_loss": 0.80477959, + "learning_rate": 4.015045008816138e-07, + "loss": 0.82595462, + "num_input_tokens_seen": 287450590, + "step": 13321, + "time_per_iteration": 2.4656593799591064 + }, + { + "auxiliary_loss_clip": 0.01038328, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.02967834, + "balance_loss_mlp": 1.02192414, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 1.6840438039764396, + "language_loss": 0.66068524, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.68140692, + "num_input_tokens_seen": 287468455, + "step": 13322, + "time_per_iteration": 2.6366305351257324 + }, + { + "auxiliary_loss_clip": 0.01093089, + "auxiliary_loss_mlp": 0.01026597, + "balance_loss_clip": 1.03435409, + "balance_loss_mlp": 1.01495051, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 1.7304915018033709, + "language_loss": 0.77987981, + "learning_rate": 4.010364878639265e-07, + "loss": 0.80107671, + "num_input_tokens_seen": 287486485, + "step": 13323, + "time_per_iteration": 2.4652516841888428 + }, + { + "auxiliary_loss_clip": 0.01105903, + "auxiliary_loss_mlp": 0.01034504, + "balance_loss_clip": 1.03536582, + "balance_loss_mlp": 1.02235067, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 3.278598536802285, + "language_loss": 0.71834219, + "learning_rate": 4.00802572299932e-07, + "loss": 0.73974621, + "num_input_tokens_seen": 287503940, + "step": 13324, + "time_per_iteration": 2.5188510417938232 + }, + { + "auxiliary_loss_clip": 0.01070718, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.03224802, + "balance_loss_mlp": 1.01891005, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 2.1078516340882514, + "language_loss": 0.76646745, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78748918, + "num_input_tokens_seen": 287521660, + "step": 13325, + "time_per_iteration": 5.3655829429626465 + }, + { + "auxiliary_loss_clip": 0.0108614, + "auxiliary_loss_mlp": 0.01026314, + "balance_loss_clip": 1.03312159, + "balance_loss_mlp": 1.0157758, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 1.6978798092074072, + "language_loss": 0.79394805, + "learning_rate": 4.003349231059898e-07, + "loss": 0.8150726, + "num_input_tokens_seen": 287541505, + "step": 13326, + "time_per_iteration": 2.5177502632141113 + }, + { + "auxiliary_loss_clip": 0.01090485, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.03396702, + "balance_loss_mlp": 1.02050436, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 2.912477557845531, + "language_loss": 0.66161281, + "learning_rate": 4.001011894937765e-07, + "loss": 0.68283439, + "num_input_tokens_seen": 287560015, + "step": 13327, + "time_per_iteration": 2.5111331939697266 + }, + { + "auxiliary_loss_clip": 0.01089458, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.0354948, + "balance_loss_mlp": 1.01665151, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 1.8205493290709498, + "language_loss": 0.73709273, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.75826204, + "num_input_tokens_seen": 287579150, + "step": 13328, + "time_per_iteration": 4.1316986083984375 + }, + { + "auxiliary_loss_clip": 0.01054057, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.03644693, + "balance_loss_mlp": 1.01923966, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 1.9041794062794475, + "language_loss": 0.7410742, + "learning_rate": 3.996339042831798e-07, + "loss": 0.76193607, + "num_input_tokens_seen": 287597420, + "step": 13329, + "time_per_iteration": 2.549942970275879 + }, + { + "auxiliary_loss_clip": 0.01023602, + "auxiliary_loss_mlp": 0.01001551, + "balance_loss_clip": 1.00968885, + "balance_loss_mlp": 1.00050759, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.6905274449975086, + "language_loss": 0.52924335, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.54949486, + "num_input_tokens_seen": 287667280, + "step": 13330, + "time_per_iteration": 3.201615810394287 + }, + { + "auxiliary_loss_clip": 0.01080285, + "auxiliary_loss_mlp": 0.01036242, + "balance_loss_clip": 1.03457117, + "balance_loss_mlp": 1.02262878, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 1.6884841419359697, + "language_loss": 0.72646618, + "learning_rate": 3.991668618167519e-07, + "loss": 0.74763143, + "num_input_tokens_seen": 287687375, + "step": 13331, + "time_per_iteration": 2.52752685546875 + }, + { + "auxiliary_loss_clip": 0.01087159, + "auxiliary_loss_mlp": 0.01030235, + "balance_loss_clip": 1.03454638, + "balance_loss_mlp": 1.01954794, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 1.951989496180462, + "language_loss": 0.77215278, + "learning_rate": 3.989334316347401e-07, + "loss": 0.79332674, + "num_input_tokens_seen": 287707895, + "step": 13332, + "time_per_iteration": 2.516373872756958 + }, + { + "auxiliary_loss_clip": 0.0110572, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.03707957, + "balance_loss_mlp": 1.01533175, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 1.7369412907283384, + "language_loss": 0.83228683, + "learning_rate": 3.987000621653338e-07, + "loss": 0.85361993, + "num_input_tokens_seen": 287723990, + "step": 13333, + "time_per_iteration": 2.4531333446502686 + }, + { + "auxiliary_loss_clip": 0.01082802, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.03421903, + "balance_loss_mlp": 1.01433158, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 1.996126962887249, + "language_loss": 0.73358667, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75467908, + "num_input_tokens_seen": 287742380, + "step": 13334, + "time_per_iteration": 3.900388717651367 + }, + { + "auxiliary_loss_clip": 0.01065096, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.03323591, + "balance_loss_mlp": 1.02038431, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 1.8754934942047101, + "language_loss": 0.74536192, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.76634657, + "num_input_tokens_seen": 287760130, + "step": 13335, + "time_per_iteration": 2.55130672454834 + }, + { + "auxiliary_loss_clip": 0.01062914, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.03246295, + "balance_loss_mlp": 1.01859641, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 1.7703856834538236, + "language_loss": 0.75363553, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.77457523, + "num_input_tokens_seen": 287777565, + "step": 13336, + "time_per_iteration": 2.567640781402588 + }, + { + "auxiliary_loss_clip": 0.01072967, + "auxiliary_loss_mlp": 0.01035988, + "balance_loss_clip": 1.03711772, + "balance_loss_mlp": 1.02312565, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 2.0758767605761075, + "language_loss": 0.75050652, + "learning_rate": 3.977671915907068e-07, + "loss": 0.77159613, + "num_input_tokens_seen": 287796310, + "step": 13337, + "time_per_iteration": 2.5829451084136963 + }, + { + "auxiliary_loss_clip": 0.01046574, + "auxiliary_loss_mlp": 0.0078395, + "balance_loss_clip": 1.03871942, + "balance_loss_mlp": 1.00954533, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 1.705399119771899, + "language_loss": 0.80342925, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.82173449, + "num_input_tokens_seen": 287817330, + "step": 13338, + "time_per_iteration": 2.6872522830963135 + }, + { + "auxiliary_loss_clip": 0.01068863, + "auxiliary_loss_mlp": 0.01030846, + "balance_loss_clip": 1.03286004, + "balance_loss_mlp": 1.01784015, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 1.8136746244296729, + "language_loss": 0.74446017, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.76545727, + "num_input_tokens_seen": 287835095, + "step": 13339, + "time_per_iteration": 2.5608227252960205 + }, + { + "auxiliary_loss_clip": 0.01089424, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.03348064, + "balance_loss_mlp": 1.02018821, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 1.6774071267319046, + "language_loss": 0.7927376, + "learning_rate": 3.970681765754775e-07, + "loss": 0.81394547, + "num_input_tokens_seen": 287854595, + "step": 13340, + "time_per_iteration": 2.511946678161621 + }, + { + "auxiliary_loss_clip": 0.01074949, + "auxiliary_loss_mlp": 0.01028007, + "balance_loss_clip": 1.03514183, + "balance_loss_mlp": 1.01667023, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 1.7574466578034205, + "language_loss": 0.67706728, + "learning_rate": 3.968352931252936e-07, + "loss": 0.69809687, + "num_input_tokens_seen": 287876960, + "step": 13341, + "time_per_iteration": 2.6934967041015625 + }, + { + "auxiliary_loss_clip": 0.01012686, + "auxiliary_loss_mlp": 0.01006435, + "balance_loss_clip": 1.00865841, + "balance_loss_mlp": 1.00518882, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.8109848907676462, + "language_loss": 0.61618185, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.6363731, + "num_input_tokens_seen": 287936530, + "step": 13342, + "time_per_iteration": 3.06744122505188 + }, + { + "auxiliary_loss_clip": 0.01086778, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.03697562, + "balance_loss_mlp": 1.02065945, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 1.8333080454217499, + "language_loss": 0.63677675, + "learning_rate": 3.963697086102522e-07, + "loss": 0.65797627, + "num_input_tokens_seen": 287954285, + "step": 13343, + "time_per_iteration": 2.544748067855835 + }, + { + "auxiliary_loss_clip": 0.01077782, + "auxiliary_loss_mlp": 0.01025551, + "balance_loss_clip": 1.03361297, + "balance_loss_mlp": 1.0145241, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 1.8176144429317986, + "language_loss": 0.68549788, + "learning_rate": 3.96137007563051e-07, + "loss": 0.70653117, + "num_input_tokens_seen": 287971595, + "step": 13344, + "time_per_iteration": 2.497309923171997 + }, + { + "auxiliary_loss_clip": 0.01094821, + "auxiliary_loss_mlp": 0.01027528, + "balance_loss_clip": 1.03718472, + "balance_loss_mlp": 1.01524353, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 1.6303371131209852, + "language_loss": 0.70206916, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72329271, + "num_input_tokens_seen": 287992540, + "step": 13345, + "time_per_iteration": 2.561580181121826 + }, + { + "auxiliary_loss_clip": 0.01012069, + "auxiliary_loss_mlp": 0.01004421, + "balance_loss_clip": 1.00843906, + "balance_loss_mlp": 1.00326467, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 0.883928676296958, + "language_loss": 0.62981355, + "learning_rate": 3.956717879334059e-07, + "loss": 0.64997846, + "num_input_tokens_seen": 288052810, + "step": 13346, + "time_per_iteration": 3.2149910926818848 + }, + { + "auxiliary_loss_clip": 0.0107944, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.03697801, + "balance_loss_mlp": 1.02046168, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 1.526223294966733, + "language_loss": 0.72732967, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.74844623, + "num_input_tokens_seen": 288073045, + "step": 13347, + "time_per_iteration": 2.6037845611572266 + }, + { + "auxiliary_loss_clip": 0.01096008, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.03683412, + "balance_loss_mlp": 1.02041054, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 3.005347487713174, + "language_loss": 0.72915757, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.7504437, + "num_input_tokens_seen": 288091165, + "step": 13348, + "time_per_iteration": 2.5179545879364014 + }, + { + "auxiliary_loss_clip": 0.01083995, + "auxiliary_loss_mlp": 0.01025399, + "balance_loss_clip": 1.03721523, + "balance_loss_mlp": 1.01360273, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 2.098107771364205, + "language_loss": 0.75502479, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.77611876, + "num_input_tokens_seen": 288110595, + "step": 13349, + "time_per_iteration": 2.5509252548217773 + }, + { + "auxiliary_loss_clip": 0.01105502, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.03753901, + "balance_loss_mlp": 1.0238142, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 2.01755459234755, + "language_loss": 0.83377945, + "learning_rate": 3.947420787800755e-07, + "loss": 0.85518724, + "num_input_tokens_seen": 288128995, + "step": 13350, + "time_per_iteration": 2.4717376232147217 + }, + { + "auxiliary_loss_clip": 0.01094964, + "auxiliary_loss_mlp": 0.0103375, + "balance_loss_clip": 1.03679359, + "balance_loss_mlp": 1.02193677, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 2.497726803890766, + "language_loss": 0.71650499, + "learning_rate": 3.945098036485679e-07, + "loss": 0.73779213, + "num_input_tokens_seen": 288149265, + "step": 13351, + "time_per_iteration": 2.4903812408447266 + }, + { + "auxiliary_loss_clip": 0.0106559, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.03362751, + "balance_loss_mlp": 1.01760173, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 1.6372182576046679, + "language_loss": 0.61719322, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.63814902, + "num_input_tokens_seen": 288170745, + "step": 13352, + "time_per_iteration": 2.616285562515259 + }, + { + "auxiliary_loss_clip": 0.01093321, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.03710389, + "balance_loss_mlp": 1.02341533, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 1.6198038648675461, + "language_loss": 0.76510072, + "learning_rate": 3.940454360354046e-07, + "loss": 0.78638661, + "num_input_tokens_seen": 288189415, + "step": 13353, + "time_per_iteration": 2.466602325439453 + }, + { + "auxiliary_loss_clip": 0.01047623, + "auxiliary_loss_mlp": 0.01027588, + "balance_loss_clip": 1.03648007, + "balance_loss_mlp": 1.01393247, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 2.479525279796178, + "language_loss": 0.73323971, + "learning_rate": 3.938133435713582e-07, + "loss": 0.75399178, + "num_input_tokens_seen": 288206900, + "step": 13354, + "time_per_iteration": 2.6257808208465576 + }, + { + "auxiliary_loss_clip": 0.01066113, + "auxiliary_loss_mlp": 0.01037891, + "balance_loss_clip": 1.0348922, + "balance_loss_mlp": 1.02481413, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 1.9382328274244234, + "language_loss": 0.65956122, + "learning_rate": 3.935813120140714e-07, + "loss": 0.68060124, + "num_input_tokens_seen": 288224800, + "step": 13355, + "time_per_iteration": 2.550434112548828 + }, + { + "auxiliary_loss_clip": 0.01070555, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.03337026, + "balance_loss_mlp": 1.02015126, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 2.075622788260489, + "language_loss": 0.6896823, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.71072495, + "num_input_tokens_seen": 288249400, + "step": 13356, + "time_per_iteration": 2.8160178661346436 + }, + { + "auxiliary_loss_clip": 0.01057821, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.03479362, + "balance_loss_mlp": 1.01591933, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 1.5973810549953866, + "language_loss": 0.77349156, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79434562, + "num_input_tokens_seen": 288268780, + "step": 13357, + "time_per_iteration": 2.597038984298706 + }, + { + "auxiliary_loss_clip": 0.01068725, + "auxiliary_loss_mlp": 0.01030049, + "balance_loss_clip": 1.03242028, + "balance_loss_mlp": 1.01715004, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 1.4646503966642257, + "language_loss": 0.76954699, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.79053473, + "num_input_tokens_seen": 288290830, + "step": 13358, + "time_per_iteration": 2.6498491764068604 + }, + { + "auxiliary_loss_clip": 0.01093022, + "auxiliary_loss_mlp": 0.01029509, + "balance_loss_clip": 1.03516746, + "balance_loss_mlp": 1.01802373, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 1.7607498497660674, + "language_loss": 0.84897596, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.87020129, + "num_input_tokens_seen": 288308865, + "step": 13359, + "time_per_iteration": 2.494602918624878 + }, + { + "auxiliary_loss_clip": 0.01079035, + "auxiliary_loss_mlp": 0.01026224, + "balance_loss_clip": 1.03665912, + "balance_loss_mlp": 1.01511943, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 1.7605318112990773, + "language_loss": 0.73083961, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75189215, + "num_input_tokens_seen": 288327325, + "step": 13360, + "time_per_iteration": 2.584104061126709 + }, + { + "auxiliary_loss_clip": 0.01105864, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.03630781, + "balance_loss_mlp": 1.01803064, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 1.8986745463187606, + "language_loss": 0.69647849, + "learning_rate": 3.921904022048512e-07, + "loss": 0.71783334, + "num_input_tokens_seen": 288347285, + "step": 13361, + "time_per_iteration": 2.484532594680786 + }, + { + "auxiliary_loss_clip": 0.01108124, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.03573632, + "balance_loss_mlp": 1.02229059, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 2.6031135984086875, + "language_loss": 0.70269454, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72412688, + "num_input_tokens_seen": 288367785, + "step": 13362, + "time_per_iteration": 2.5250771045684814 + }, + { + "auxiliary_loss_clip": 0.01112728, + "auxiliary_loss_mlp": 0.01037203, + "balance_loss_clip": 1.03802633, + "balance_loss_mlp": 1.02315986, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 2.284922339814391, + "language_loss": 0.78394192, + "learning_rate": 3.91727253254452e-07, + "loss": 0.8054412, + "num_input_tokens_seen": 288384135, + "step": 13363, + "time_per_iteration": 3.863267660140991 + }, + { + "auxiliary_loss_clip": 0.01093295, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.03439474, + "balance_loss_mlp": 1.01577783, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 1.9069995209118715, + "language_loss": 0.74891394, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.77013075, + "num_input_tokens_seen": 288403805, + "step": 13364, + "time_per_iteration": 3.926029682159424 + }, + { + "auxiliary_loss_clip": 0.01092619, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.03906858, + "balance_loss_mlp": 1.01751339, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 2.0645978139304915, + "language_loss": 0.60653353, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.62774956, + "num_input_tokens_seen": 288424895, + "step": 13365, + "time_per_iteration": 2.568396806716919 + }, + { + "auxiliary_loss_clip": 0.01083699, + "auxiliary_loss_mlp": 0.01032522, + "balance_loss_clip": 1.03456807, + "balance_loss_mlp": 1.01996303, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 27.646359262791382, + "language_loss": 0.66233498, + "learning_rate": 3.910329872447706e-07, + "loss": 0.68349719, + "num_input_tokens_seen": 288443865, + "step": 13366, + "time_per_iteration": 2.5014655590057373 + }, + { + "auxiliary_loss_clip": 0.01102092, + "auxiliary_loss_mlp": 0.01029092, + "balance_loss_clip": 1.03492355, + "balance_loss_mlp": 1.01702809, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 2.8297758421375376, + "language_loss": 0.74963737, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77094918, + "num_input_tokens_seen": 288461065, + "step": 13367, + "time_per_iteration": 3.846749782562256 + }, + { + "auxiliary_loss_clip": 0.0110262, + "auxiliary_loss_mlp": 0.01027513, + "balance_loss_clip": 1.03518891, + "balance_loss_mlp": 1.01629496, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 1.4466072852806793, + "language_loss": 0.73801064, + "learning_rate": 3.905704482846428e-07, + "loss": 0.75931191, + "num_input_tokens_seen": 288481865, + "step": 13368, + "time_per_iteration": 2.5061404705047607 + }, + { + "auxiliary_loss_clip": 0.0110566, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.03477788, + "balance_loss_mlp": 1.01993585, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 1.895948819279036, + "language_loss": 0.69739908, + "learning_rate": 3.90339270344789e-07, + "loss": 0.7187736, + "num_input_tokens_seen": 288499345, + "step": 13369, + "time_per_iteration": 2.429579973220825 + }, + { + "auxiliary_loss_clip": 0.01082592, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.03410947, + "balance_loss_mlp": 1.01908994, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 1.648630609448983, + "language_loss": 0.73570299, + "learning_rate": 3.901081534434312e-07, + "loss": 0.7568323, + "num_input_tokens_seen": 288517660, + "step": 13370, + "time_per_iteration": 2.5294971466064453 + }, + { + "auxiliary_loss_clip": 0.01082692, + "auxiliary_loss_mlp": 0.0103218, + "balance_loss_clip": 1.0339241, + "balance_loss_mlp": 1.01873326, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 2.874213982351197, + "language_loss": 0.87185812, + "learning_rate": 3.898770975893342e-07, + "loss": 0.89300686, + "num_input_tokens_seen": 288534180, + "step": 13371, + "time_per_iteration": 2.496948003768921 + }, + { + "auxiliary_loss_clip": 0.01097101, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.03483593, + "balance_loss_mlp": 1.01584244, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 2.5891342049582806, + "language_loss": 0.74791694, + "learning_rate": 3.89646102791259e-07, + "loss": 0.76917994, + "num_input_tokens_seen": 288553350, + "step": 13372, + "time_per_iteration": 2.491957664489746 + }, + { + "auxiliary_loss_clip": 0.01063845, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.03591871, + "balance_loss_mlp": 1.01587534, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 2.134721906471902, + "language_loss": 0.79230094, + "learning_rate": 3.894151690579646e-07, + "loss": 0.81323022, + "num_input_tokens_seen": 288571325, + "step": 13373, + "time_per_iteration": 3.9948012828826904 + }, + { + "auxiliary_loss_clip": 0.01080058, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.03522956, + "balance_loss_mlp": 1.0230248, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 1.7751817690024492, + "language_loss": 0.74281162, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76395816, + "num_input_tokens_seen": 288592100, + "step": 13374, + "time_per_iteration": 2.554072618484497 + }, + { + "auxiliary_loss_clip": 0.01057047, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.03215492, + "balance_loss_mlp": 1.02492189, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 1.9378163270766697, + "language_loss": 0.68572056, + "learning_rate": 3.889534848207452e-07, + "loss": 0.70668352, + "num_input_tokens_seen": 288612305, + "step": 13375, + "time_per_iteration": 2.5922799110412598 + }, + { + "auxiliary_loss_clip": 0.01010982, + "auxiliary_loss_mlp": 0.01003688, + "balance_loss_clip": 1.01783705, + "balance_loss_mlp": 1.00241232, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 0.7253296602307644, + "language_loss": 0.55664819, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57679492, + "num_input_tokens_seen": 288676015, + "step": 13376, + "time_per_iteration": 3.249617576599121 + }, + { + "auxiliary_loss_clip": 0.0106207, + "auxiliary_loss_mlp": 0.01033614, + "balance_loss_clip": 1.03444219, + "balance_loss_mlp": 1.02008986, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 1.638577443839273, + "language_loss": 0.72976846, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75072527, + "num_input_tokens_seen": 288696455, + "step": 13377, + "time_per_iteration": 2.6367011070251465 + }, + { + "auxiliary_loss_clip": 0.01091014, + "auxiliary_loss_mlp": 0.0102918, + "balance_loss_clip": 1.03343606, + "balance_loss_mlp": 1.01675868, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 1.9333208146645127, + "language_loss": 0.70256734, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.72376931, + "num_input_tokens_seen": 288715560, + "step": 13378, + "time_per_iteration": 2.528536319732666 + }, + { + "auxiliary_loss_clip": 0.01097237, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.0367986, + "balance_loss_mlp": 1.01628399, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 1.5888929887623413, + "language_loss": 0.69366574, + "learning_rate": 3.880308495088347e-07, + "loss": 0.7149272, + "num_input_tokens_seen": 288739485, + "step": 13379, + "time_per_iteration": 2.6258974075317383 + }, + { + "auxiliary_loss_clip": 0.01110051, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.03821445, + "balance_loss_mlp": 1.01854134, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 2.54464821124022, + "language_loss": 0.76279229, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.78421068, + "num_input_tokens_seen": 288757420, + "step": 13380, + "time_per_iteration": 2.4461846351623535 + }, + { + "auxiliary_loss_clip": 0.01058091, + "auxiliary_loss_mlp": 0.01026033, + "balance_loss_clip": 1.0330379, + "balance_loss_mlp": 1.014768, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 2.5703430379826204, + "language_loss": 0.69191515, + "learning_rate": 3.875698985740887e-07, + "loss": 0.71275634, + "num_input_tokens_seen": 288775535, + "step": 13381, + "time_per_iteration": 2.6355161666870117 + }, + { + "auxiliary_loss_clip": 0.01096964, + "auxiliary_loss_mlp": 0.01035413, + "balance_loss_clip": 1.03715158, + "balance_loss_mlp": 1.02312279, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 1.7927408818103339, + "language_loss": 0.6381799, + "learning_rate": 3.873395148176135e-07, + "loss": 0.65950364, + "num_input_tokens_seen": 288795035, + "step": 13382, + "time_per_iteration": 2.5027074813842773 + }, + { + "auxiliary_loss_clip": 0.01083104, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.03678763, + "balance_loss_mlp": 1.02120996, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 7.044418196142844, + "language_loss": 0.7652936, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.78644967, + "num_input_tokens_seen": 288816270, + "step": 13383, + "time_per_iteration": 2.590162515640259 + }, + { + "auxiliary_loss_clip": 0.01090732, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.03546405, + "balance_loss_mlp": 1.02777815, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 2.2426722541602966, + "language_loss": 0.69919097, + "learning_rate": 3.868789307701381e-07, + "loss": 0.72051048, + "num_input_tokens_seen": 288836050, + "step": 13384, + "time_per_iteration": 2.4977378845214844 + }, + { + "auxiliary_loss_clip": 0.01094484, + "auxiliary_loss_mlp": 0.0103708, + "balance_loss_clip": 1.03389359, + "balance_loss_mlp": 1.02413428, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 2.6131138455864726, + "language_loss": 0.79030651, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.81162214, + "num_input_tokens_seen": 288852900, + "step": 13385, + "time_per_iteration": 2.4689950942993164 + }, + { + "auxiliary_loss_clip": 0.01105687, + "auxiliary_loss_mlp": 0.01034181, + "balance_loss_clip": 1.03552186, + "balance_loss_mlp": 1.02172971, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 1.7381334569539308, + "language_loss": 0.72058332, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74198198, + "num_input_tokens_seen": 288872625, + "step": 13386, + "time_per_iteration": 2.4455037117004395 + }, + { + "auxiliary_loss_clip": 0.01009391, + "auxiliary_loss_mlp": 0.01003217, + "balance_loss_clip": 1.00914633, + "balance_loss_mlp": 1.00205505, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.6736834872445414, + "language_loss": 0.51287585, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53300196, + "num_input_tokens_seen": 288939180, + "step": 13387, + "time_per_iteration": 3.19726300239563 + }, + { + "auxiliary_loss_clip": 0.0110542, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.0349648, + "balance_loss_mlp": 1.01874375, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 1.7523593498265764, + "language_loss": 0.73828721, + "learning_rate": 3.859584967815559e-07, + "loss": 0.75966513, + "num_input_tokens_seen": 288958925, + "step": 13388, + "time_per_iteration": 2.4600484371185303 + }, + { + "auxiliary_loss_clip": 0.01073292, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.03588259, + "balance_loss_mlp": 1.01605117, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 2.0910349843915954, + "language_loss": 0.71611261, + "learning_rate": 3.857285412741411e-07, + "loss": 0.73712122, + "num_input_tokens_seen": 288980935, + "step": 13389, + "time_per_iteration": 2.6151931285858154 + }, + { + "auxiliary_loss_clip": 0.01079584, + "auxiliary_loss_mlp": 0.0103508, + "balance_loss_clip": 1.03768218, + "balance_loss_mlp": 1.02238369, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 2.262386953923954, + "language_loss": 0.82705092, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.84819758, + "num_input_tokens_seen": 288996780, + "step": 13390, + "time_per_iteration": 2.4888815879821777 + }, + { + "auxiliary_loss_clip": 0.01022125, + "auxiliary_loss_mlp": 0.0100259, + "balance_loss_clip": 1.00812674, + "balance_loss_mlp": 1.00140381, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.7756338311084859, + "language_loss": 0.55528748, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57553464, + "num_input_tokens_seen": 289057590, + "step": 13391, + "time_per_iteration": 3.0758655071258545 + }, + { + "auxiliary_loss_clip": 0.01091345, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.03579521, + "balance_loss_mlp": 1.02056086, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 1.5264551630611596, + "language_loss": 0.84814405, + "learning_rate": 3.850390420667762e-07, + "loss": 0.86938024, + "num_input_tokens_seen": 289076285, + "step": 13392, + "time_per_iteration": 2.464813470840454 + }, + { + "auxiliary_loss_clip": 0.01071361, + "auxiliary_loss_mlp": 0.01026384, + "balance_loss_clip": 1.03320336, + "balance_loss_mlp": 1.01491046, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.4276030921994185, + "language_loss": 0.70267677, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72365421, + "num_input_tokens_seen": 289097585, + "step": 13393, + "time_per_iteration": 2.6039328575134277 + }, + { + "auxiliary_loss_clip": 0.01095367, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.03550065, + "balance_loss_mlp": 1.01820743, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 1.9576355407722206, + "language_loss": 0.76198173, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78324521, + "num_input_tokens_seen": 289116890, + "step": 13394, + "time_per_iteration": 2.480156421661377 + }, + { + "auxiliary_loss_clip": 0.01107688, + "auxiliary_loss_mlp": 0.01030641, + "balance_loss_clip": 1.03826773, + "balance_loss_mlp": 1.01890469, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 1.6707155604193806, + "language_loss": 0.648853, + "learning_rate": 3.843500940147304e-07, + "loss": 0.67023635, + "num_input_tokens_seen": 289136670, + "step": 13395, + "time_per_iteration": 2.503253936767578 + }, + { + "auxiliary_loss_clip": 0.01021941, + "auxiliary_loss_mlp": 0.01003154, + "balance_loss_clip": 1.00826669, + "balance_loss_mlp": 1.00198007, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.7620429408536284, + "language_loss": 0.57322359, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59347451, + "num_input_tokens_seen": 289200150, + "step": 13396, + "time_per_iteration": 3.2477612495422363 + }, + { + "auxiliary_loss_clip": 0.01095375, + "auxiliary_loss_mlp": 0.01037161, + "balance_loss_clip": 1.0359447, + "balance_loss_mlp": 1.02401209, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 1.6559970727412714, + "language_loss": 0.77517152, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79649687, + "num_input_tokens_seen": 289218125, + "step": 13397, + "time_per_iteration": 2.4827191829681396 + }, + { + "auxiliary_loss_clip": 0.01090485, + "auxiliary_loss_mlp": 0.01027295, + "balance_loss_clip": 1.03821337, + "balance_loss_mlp": 1.01590443, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 1.74114740982197, + "language_loss": 0.70399547, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72517329, + "num_input_tokens_seen": 289237115, + "step": 13398, + "time_per_iteration": 2.4957783222198486 + }, + { + "auxiliary_loss_clip": 0.01082653, + "auxiliary_loss_mlp": 0.01028805, + "balance_loss_clip": 1.03432429, + "balance_loss_mlp": 1.01742029, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 3.108877753155029, + "language_loss": 0.69269228, + "learning_rate": 3.834323543710805e-07, + "loss": 0.71380687, + "num_input_tokens_seen": 289253635, + "step": 13399, + "time_per_iteration": 2.4747018814086914 + }, + { + "auxiliary_loss_clip": 0.01105769, + "auxiliary_loss_mlp": 0.01036363, + "balance_loss_clip": 1.03641081, + "balance_loss_mlp": 1.0247519, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 2.086437665809118, + "language_loss": 0.72065437, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74207562, + "num_input_tokens_seen": 289270085, + "step": 13400, + "time_per_iteration": 2.4518284797668457 + }, + { + "auxiliary_loss_clip": 0.0109153, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.03255856, + "balance_loss_mlp": 1.01833332, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 3.1195764920304203, + "language_loss": 0.63832927, + "learning_rate": 3.829738523169037e-07, + "loss": 0.65955305, + "num_input_tokens_seen": 289289645, + "step": 13401, + "time_per_iteration": 2.506031036376953 + }, + { + "auxiliary_loss_clip": 0.01093513, + "auxiliary_loss_mlp": 0.01032565, + "balance_loss_clip": 1.03403246, + "balance_loss_mlp": 1.02068567, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 2.0413766331904926, + "language_loss": 0.83736944, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.85863018, + "num_input_tokens_seen": 289306630, + "step": 13402, + "time_per_iteration": 3.88748836517334 + }, + { + "auxiliary_loss_clip": 0.01054202, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.03641808, + "balance_loss_mlp": 1.01970243, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 2.7062480387105357, + "language_loss": 0.68047547, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.70133173, + "num_input_tokens_seen": 289324960, + "step": 13403, + "time_per_iteration": 4.051624536514282 + }, + { + "auxiliary_loss_clip": 0.01067137, + "auxiliary_loss_mlp": 0.00783587, + "balance_loss_clip": 1.03438258, + "balance_loss_mlp": 1.00909245, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 1.7365389480847733, + "language_loss": 0.84814215, + "learning_rate": 3.822865591408084e-07, + "loss": 0.86664939, + "num_input_tokens_seen": 289344980, + "step": 13404, + "time_per_iteration": 2.5914323329925537 + }, + { + "auxiliary_loss_clip": 0.01069097, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.03432369, + "balance_loss_mlp": 1.01997542, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 1.464944311347173, + "language_loss": 0.70487094, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72587055, + "num_input_tokens_seen": 289367500, + "step": 13405, + "time_per_iteration": 2.6530344486236572 + }, + { + "auxiliary_loss_clip": 0.01093679, + "auxiliary_loss_mlp": 0.01025353, + "balance_loss_clip": 1.03617704, + "balance_loss_mlp": 1.01398087, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 2.8450230937305396, + "language_loss": 0.75031948, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77150983, + "num_input_tokens_seen": 289385930, + "step": 13406, + "time_per_iteration": 3.9375033378601074 + }, + { + "auxiliary_loss_clip": 0.01096299, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.03672886, + "balance_loss_mlp": 1.02162194, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 1.4488799515543587, + "language_loss": 0.76265389, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78395975, + "num_input_tokens_seen": 289408025, + "step": 13407, + "time_per_iteration": 2.549997091293335 + }, + { + "auxiliary_loss_clip": 0.01076352, + "auxiliary_loss_mlp": 0.00785515, + "balance_loss_clip": 1.03316903, + "balance_loss_mlp": 1.01067543, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 1.8418156486840802, + "language_loss": 0.73281717, + "learning_rate": 3.81371027093822e-07, + "loss": 0.75143588, + "num_input_tokens_seen": 289426575, + "step": 13408, + "time_per_iteration": 2.50502347946167 + }, + { + "auxiliary_loss_clip": 0.01076618, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.03589356, + "balance_loss_mlp": 1.02006698, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 1.918956520883007, + "language_loss": 0.70756918, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72867334, + "num_input_tokens_seen": 289447760, + "step": 13409, + "time_per_iteration": 2.546689748764038 + }, + { + "auxiliary_loss_clip": 0.01104477, + "auxiliary_loss_mlp": 0.01025187, + "balance_loss_clip": 1.03469849, + "balance_loss_mlp": 1.01278317, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 2.0672452244505677, + "language_loss": 0.77130526, + "learning_rate": 3.809136293070545e-07, + "loss": 0.79260194, + "num_input_tokens_seen": 289463920, + "step": 13410, + "time_per_iteration": 2.4543099403381348 + }, + { + "auxiliary_loss_clip": 0.01093148, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.03596997, + "balance_loss_mlp": 1.01992154, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 2.2403458045294555, + "language_loss": 0.6840862, + "learning_rate": 3.806850225032117e-07, + "loss": 0.70533943, + "num_input_tokens_seen": 289482635, + "step": 13411, + "time_per_iteration": 2.494877815246582 + }, + { + "auxiliary_loss_clip": 0.01071203, + "auxiliary_loss_mlp": 0.01028131, + "balance_loss_clip": 1.03417981, + "balance_loss_mlp": 1.01596546, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 2.1307343396564584, + "language_loss": 0.68136013, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70235348, + "num_input_tokens_seen": 289502040, + "step": 13412, + "time_per_iteration": 3.9720804691314697 + }, + { + "auxiliary_loss_clip": 0.010962, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.03923547, + "balance_loss_mlp": 1.01978803, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 1.5952779005573559, + "language_loss": 0.81734228, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.83863884, + "num_input_tokens_seen": 289520740, + "step": 13413, + "time_per_iteration": 2.4875619411468506 + }, + { + "auxiliary_loss_clip": 0.01089353, + "auxiliary_loss_mlp": 0.01041897, + "balance_loss_clip": 1.03431118, + "balance_loss_mlp": 1.02831316, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 1.9267441179905571, + "language_loss": 0.8497321, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.87104464, + "num_input_tokens_seen": 289535840, + "step": 13414, + "time_per_iteration": 2.493429183959961 + }, + { + "auxiliary_loss_clip": 0.01079702, + "auxiliary_loss_mlp": 0.01031093, + "balance_loss_clip": 1.03368402, + "balance_loss_mlp": 1.01935101, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 1.8762615920537162, + "language_loss": 0.67145783, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.69256574, + "num_input_tokens_seen": 289555205, + "step": 13415, + "time_per_iteration": 2.5439505577087402 + }, + { + "auxiliary_loss_clip": 0.01068708, + "auxiliary_loss_mlp": 0.01024117, + "balance_loss_clip": 1.03347218, + "balance_loss_mlp": 1.01267314, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 1.7666236406041032, + "language_loss": 0.76289964, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.7838279, + "num_input_tokens_seen": 289573000, + "step": 13416, + "time_per_iteration": 2.537405252456665 + }, + { + "auxiliary_loss_clip": 0.01091277, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.03379202, + "balance_loss_mlp": 1.02090335, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 1.4748844327515631, + "language_loss": 0.65509415, + "learning_rate": 3.793146714797086e-07, + "loss": 0.67634046, + "num_input_tokens_seen": 289592625, + "step": 13417, + "time_per_iteration": 2.491903305053711 + }, + { + "auxiliary_loss_clip": 0.01072642, + "auxiliary_loss_mlp": 0.01048617, + "balance_loss_clip": 1.03477871, + "balance_loss_mlp": 1.03518796, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 1.575936612542222, + "language_loss": 0.80540264, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.82661521, + "num_input_tokens_seen": 289610780, + "step": 13418, + "time_per_iteration": 2.575634002685547 + }, + { + "auxiliary_loss_clip": 0.01085257, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.03670061, + "balance_loss_mlp": 1.01654887, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 1.6115370315105824, + "language_loss": 0.85017633, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.87132174, + "num_input_tokens_seen": 289628890, + "step": 13419, + "time_per_iteration": 2.4832308292388916 + }, + { + "auxiliary_loss_clip": 0.01074181, + "auxiliary_loss_mlp": 0.0078695, + "balance_loss_clip": 1.03498042, + "balance_loss_mlp": 1.01431358, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 1.5995880353382983, + "language_loss": 0.75751603, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.77612734, + "num_input_tokens_seen": 289647220, + "step": 13420, + "time_per_iteration": 2.6008808612823486 + }, + { + "auxiliary_loss_clip": 0.01085227, + "auxiliary_loss_mlp": 0.00783162, + "balance_loss_clip": 1.03190923, + "balance_loss_mlp": 1.00888848, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 1.6855881538406192, + "language_loss": 0.78622478, + "learning_rate": 3.784023331462207e-07, + "loss": 0.80490869, + "num_input_tokens_seen": 289665800, + "step": 13421, + "time_per_iteration": 2.524552583694458 + }, + { + "auxiliary_loss_clip": 0.01074514, + "auxiliary_loss_mlp": 0.01025281, + "balance_loss_clip": 1.03675735, + "balance_loss_mlp": 1.01319885, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 1.683589976901725, + "language_loss": 0.80048919, + "learning_rate": 3.78174402269098e-07, + "loss": 0.82148719, + "num_input_tokens_seen": 289682705, + "step": 13422, + "time_per_iteration": 2.5435802936553955 + }, + { + "auxiliary_loss_clip": 0.01103393, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.03422213, + "balance_loss_mlp": 1.0193913, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 1.6750883488668717, + "language_loss": 0.68208474, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.70343059, + "num_input_tokens_seen": 289702920, + "step": 13423, + "time_per_iteration": 2.459404468536377 + }, + { + "auxiliary_loss_clip": 0.01080116, + "auxiliary_loss_mlp": 0.01035992, + "balance_loss_clip": 1.03597116, + "balance_loss_mlp": 1.0233674, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 1.7268804224316383, + "language_loss": 0.80255085, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82371187, + "num_input_tokens_seen": 289723280, + "step": 13424, + "time_per_iteration": 2.550837516784668 + }, + { + "auxiliary_loss_clip": 0.01094859, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.03429639, + "balance_loss_mlp": 1.01643729, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 1.4662367875856352, + "language_loss": 0.78841656, + "learning_rate": 3.774909786710232e-07, + "loss": 0.80965137, + "num_input_tokens_seen": 289743475, + "step": 13425, + "time_per_iteration": 2.5116984844207764 + }, + { + "auxiliary_loss_clip": 0.01073018, + "auxiliary_loss_mlp": 0.01031133, + "balance_loss_clip": 1.03331006, + "balance_loss_mlp": 1.01942658, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 2.411618764588188, + "language_loss": 0.75675893, + "learning_rate": 3.772632938448923e-07, + "loss": 0.77780038, + "num_input_tokens_seen": 289761400, + "step": 13426, + "time_per_iteration": 2.4966671466827393 + }, + { + "auxiliary_loss_clip": 0.01093396, + "auxiliary_loss_mlp": 0.01023491, + "balance_loss_clip": 1.03468275, + "balance_loss_mlp": 1.01238632, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 1.693231110991996, + "language_loss": 0.72957766, + "learning_rate": 3.770356705530997e-07, + "loss": 0.75074655, + "num_input_tokens_seen": 289781025, + "step": 13427, + "time_per_iteration": 2.5264010429382324 + }, + { + "auxiliary_loss_clip": 0.01055804, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.03648925, + "balance_loss_mlp": 1.02065885, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 1.7064019518094282, + "language_loss": 0.70047659, + "learning_rate": 3.768081088042774e-07, + "loss": 0.72137439, + "num_input_tokens_seen": 289798380, + "step": 13428, + "time_per_iteration": 2.6026830673217773 + }, + { + "auxiliary_loss_clip": 0.01082746, + "auxiliary_loss_mlp": 0.0102687, + "balance_loss_clip": 1.0331459, + "balance_loss_mlp": 1.01543772, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 1.8045441457882236, + "language_loss": 0.75076199, + "learning_rate": 3.765806086070544e-07, + "loss": 0.77185816, + "num_input_tokens_seen": 289814515, + "step": 13429, + "time_per_iteration": 2.482254981994629 + }, + { + "auxiliary_loss_clip": 0.01091627, + "auxiliary_loss_mlp": 0.01029735, + "balance_loss_clip": 1.03545237, + "balance_loss_mlp": 1.0181601, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 2.768916179194125, + "language_loss": 0.67096603, + "learning_rate": 3.763531699700568e-07, + "loss": 0.69217968, + "num_input_tokens_seen": 289834315, + "step": 13430, + "time_per_iteration": 2.5008552074432373 + }, + { + "auxiliary_loss_clip": 0.01066664, + "auxiliary_loss_mlp": 0.01027877, + "balance_loss_clip": 1.03568578, + "balance_loss_mlp": 1.01611114, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 1.698210724412682, + "language_loss": 0.80346692, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82441235, + "num_input_tokens_seen": 289853770, + "step": 13431, + "time_per_iteration": 2.542090892791748 + }, + { + "auxiliary_loss_clip": 0.01080042, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.03427076, + "balance_loss_mlp": 1.01626039, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 2.1391201367333386, + "language_loss": 0.80411553, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82520747, + "num_input_tokens_seen": 289870480, + "step": 13432, + "time_per_iteration": 2.5037527084350586 + }, + { + "auxiliary_loss_clip": 0.01080012, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.03870082, + "balance_loss_mlp": 1.01999521, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 1.8506030304519465, + "language_loss": 0.70320427, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.72433531, + "num_input_tokens_seen": 289888275, + "step": 13433, + "time_per_iteration": 2.522468090057373 + }, + { + "auxiliary_loss_clip": 0.01079302, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.03525591, + "balance_loss_mlp": 1.01657331, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 1.6233841021150393, + "language_loss": 0.72751486, + "learning_rate": 3.754440311967828e-07, + "loss": 0.74859381, + "num_input_tokens_seen": 289911495, + "step": 13434, + "time_per_iteration": 2.661424398422241 + }, + { + "auxiliary_loss_clip": 0.01067776, + "auxiliary_loss_mlp": 0.01028668, + "balance_loss_clip": 1.03768873, + "balance_loss_mlp": 1.01646113, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 4.08174125061118, + "language_loss": 0.68102163, + "learning_rate": 3.752169004902361e-07, + "loss": 0.70198607, + "num_input_tokens_seen": 289930045, + "step": 13435, + "time_per_iteration": 2.5333874225616455 + }, + { + "auxiliary_loss_clip": 0.01064306, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.03654099, + "balance_loss_mlp": 1.01864266, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 1.5764106753399945, + "language_loss": 0.75116932, + "learning_rate": 3.749898313956279e-07, + "loss": 0.7721445, + "num_input_tokens_seen": 289950815, + "step": 13436, + "time_per_iteration": 2.6830732822418213 + }, + { + "auxiliary_loss_clip": 0.01101594, + "auxiliary_loss_mlp": 0.0102967, + "balance_loss_clip": 1.03435826, + "balance_loss_mlp": 1.01730788, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 1.7519629244472612, + "language_loss": 0.70325857, + "learning_rate": 3.747628239215674e-07, + "loss": 0.72457123, + "num_input_tokens_seen": 289971730, + "step": 13437, + "time_per_iteration": 2.5000627040863037 + }, + { + "auxiliary_loss_clip": 0.01085395, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.03809178, + "balance_loss_mlp": 1.01968384, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 1.6085573392273413, + "language_loss": 0.72808737, + "learning_rate": 3.745358780766636e-07, + "loss": 0.74925536, + "num_input_tokens_seen": 289992995, + "step": 13438, + "time_per_iteration": 2.5887696743011475 + }, + { + "auxiliary_loss_clip": 0.010814, + "auxiliary_loss_mlp": 0.01029826, + "balance_loss_clip": 1.03543901, + "balance_loss_mlp": 1.01803637, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 1.9089901041630044, + "language_loss": 0.77073938, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79185164, + "num_input_tokens_seen": 290009405, + "step": 13439, + "time_per_iteration": 2.5075926780700684 + }, + { + "auxiliary_loss_clip": 0.01104023, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.03549266, + "balance_loss_mlp": 1.02220356, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 1.558408542716528, + "language_loss": 0.7884897, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.8098734, + "num_input_tokens_seen": 290031085, + "step": 13440, + "time_per_iteration": 2.510411500930786 + }, + { + "auxiliary_loss_clip": 0.01084131, + "auxiliary_loss_mlp": 0.00785353, + "balance_loss_clip": 1.03496099, + "balance_loss_mlp": 1.01234186, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 7.468712660480382, + "language_loss": 0.58883297, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.60752779, + "num_input_tokens_seen": 290048670, + "step": 13441, + "time_per_iteration": 5.350764751434326 + }, + { + "auxiliary_loss_clip": 0.01092841, + "auxiliary_loss_mlp": 0.01032253, + "balance_loss_clip": 1.03412437, + "balance_loss_mlp": 1.01970029, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 1.9449709556750394, + "language_loss": 0.76070184, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.78195274, + "num_input_tokens_seen": 290064085, + "step": 13442, + "time_per_iteration": 2.4739067554473877 + }, + { + "auxiliary_loss_clip": 0.01082439, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.03511381, + "balance_loss_mlp": 1.01801896, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 1.4150970132857144, + "language_loss": 0.70640761, + "learning_rate": 3.734020735906169e-07, + "loss": 0.72753286, + "num_input_tokens_seen": 290086255, + "step": 13443, + "time_per_iteration": 2.6232006549835205 + }, + { + "auxiliary_loss_clip": 0.01062298, + "auxiliary_loss_mlp": 0.01030582, + "balance_loss_clip": 1.03704715, + "balance_loss_mlp": 1.01870322, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 1.7571397466860874, + "language_loss": 0.82402211, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.84495091, + "num_input_tokens_seen": 290103995, + "step": 13444, + "time_per_iteration": 3.9211857318878174 + }, + { + "auxiliary_loss_clip": 0.01007476, + "auxiliary_loss_mlp": 0.00761876, + "balance_loss_clip": 1.01642799, + "balance_loss_mlp": 0.99956733, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 1.12124287238449, + "language_loss": 0.53639859, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55409211, + "num_input_tokens_seen": 290157245, + "step": 13445, + "time_per_iteration": 3.014122247695923 + }, + { + "auxiliary_loss_clip": 0.01068396, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.0362072, + "balance_loss_mlp": 1.0195775, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 3.051317029308302, + "language_loss": 0.72407377, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.74508262, + "num_input_tokens_seen": 290174970, + "step": 13446, + "time_per_iteration": 2.5582425594329834 + }, + { + "auxiliary_loss_clip": 0.0108102, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.03554094, + "balance_loss_mlp": 1.02103996, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 1.6813885176281118, + "language_loss": 0.71171558, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73287159, + "num_input_tokens_seen": 290194395, + "step": 13447, + "time_per_iteration": 2.5343635082244873 + }, + { + "auxiliary_loss_clip": 0.01041226, + "auxiliary_loss_mlp": 0.01033975, + "balance_loss_clip": 1.0328629, + "balance_loss_mlp": 1.01984847, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 2.106140692522029, + "language_loss": 0.74815637, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.76890838, + "num_input_tokens_seen": 290209200, + "step": 13448, + "time_per_iteration": 2.5801353454589844 + }, + { + "auxiliary_loss_clip": 0.01030218, + "auxiliary_loss_mlp": 0.01001979, + "balance_loss_clip": 1.00744534, + "balance_loss_mlp": 1.00067353, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 0.858274181069563, + "language_loss": 0.63867569, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.65899765, + "num_input_tokens_seen": 290274565, + "step": 13449, + "time_per_iteration": 3.098720073699951 + }, + { + "auxiliary_loss_clip": 0.01098404, + "auxiliary_loss_mlp": 0.01028155, + "balance_loss_clip": 1.03885007, + "balance_loss_mlp": 1.01491094, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 1.6523478310728965, + "language_loss": 0.73980927, + "learning_rate": 3.718173381422105e-07, + "loss": 0.76107484, + "num_input_tokens_seen": 290293630, + "step": 13450, + "time_per_iteration": 3.8686087131500244 + }, + { + "auxiliary_loss_clip": 0.01080681, + "auxiliary_loss_mlp": 0.00786114, + "balance_loss_clip": 1.03282404, + "balance_loss_mlp": 1.01202238, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 1.7816325469559289, + "language_loss": 0.74050063, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.75916862, + "num_input_tokens_seen": 290311450, + "step": 13451, + "time_per_iteration": 2.5282537937164307 + }, + { + "auxiliary_loss_clip": 0.01082283, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.0331763, + "balance_loss_mlp": 1.01661754, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 1.7642485045176768, + "language_loss": 0.80379581, + "learning_rate": 3.713651121244543e-07, + "loss": 0.82492566, + "num_input_tokens_seen": 290330165, + "step": 13452, + "time_per_iteration": 2.508531332015991 + }, + { + "auxiliary_loss_clip": 0.01096691, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.03636551, + "balance_loss_mlp": 1.02264214, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 1.5704841381602948, + "language_loss": 0.78501844, + "learning_rate": 3.711390917482875e-07, + "loss": 0.80633175, + "num_input_tokens_seen": 290350815, + "step": 13453, + "time_per_iteration": 2.5479650497436523 + }, + { + "auxiliary_loss_clip": 0.01053357, + "auxiliary_loss_mlp": 0.01031621, + "balance_loss_clip": 1.03145123, + "balance_loss_mlp": 1.01869321, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 2.873101353610194, + "language_loss": 0.77234232, + "learning_rate": 3.709131331386892e-07, + "loss": 0.79319209, + "num_input_tokens_seen": 290367380, + "step": 13454, + "time_per_iteration": 2.5402090549468994 + }, + { + "auxiliary_loss_clip": 0.01073785, + "auxiliary_loss_mlp": 0.01031142, + "balance_loss_clip": 1.03669643, + "balance_loss_mlp": 1.01814246, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 2.669215904540028, + "language_loss": 0.76466572, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.78571504, + "num_input_tokens_seen": 290387965, + "step": 13455, + "time_per_iteration": 2.6208949089050293 + }, + { + "auxiliary_loss_clip": 0.01078595, + "auxiliary_loss_mlp": 0.0103231, + "balance_loss_clip": 1.0331955, + "balance_loss_mlp": 1.01932812, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 1.8141928330035448, + "language_loss": 0.78921223, + "learning_rate": 3.70461401253471e-07, + "loss": 0.81032121, + "num_input_tokens_seen": 290404150, + "step": 13456, + "time_per_iteration": 2.4786179065704346 + }, + { + "auxiliary_loss_clip": 0.01105476, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.0380429, + "balance_loss_mlp": 1.02474427, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 1.8386846667132055, + "language_loss": 0.71925366, + "learning_rate": 3.702356279949801e-07, + "loss": 0.74067235, + "num_input_tokens_seen": 290422370, + "step": 13457, + "time_per_iteration": 2.5039196014404297 + }, + { + "auxiliary_loss_clip": 0.01082018, + "auxiliary_loss_mlp": 0.01028426, + "balance_loss_clip": 1.03408682, + "balance_loss_mlp": 1.01717234, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 1.7572415395313743, + "language_loss": 0.72606707, + "learning_rate": 3.700099165373176e-07, + "loss": 0.74717152, + "num_input_tokens_seen": 290442645, + "step": 13458, + "time_per_iteration": 2.509395122528076 + }, + { + "auxiliary_loss_clip": 0.01094003, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.03518605, + "balance_loss_mlp": 1.02222633, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 6.613367883645332, + "language_loss": 0.79333818, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.81462216, + "num_input_tokens_seen": 290458520, + "step": 13459, + "time_per_iteration": 2.4819753170013428 + }, + { + "auxiliary_loss_clip": 0.01079732, + "auxiliary_loss_mlp": 0.01025171, + "balance_loss_clip": 1.03627229, + "balance_loss_mlp": 1.01233184, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 1.996948170732086, + "language_loss": 0.7995311, + "learning_rate": 3.695586790587113e-07, + "loss": 0.82058024, + "num_input_tokens_seen": 290474465, + "step": 13460, + "time_per_iteration": 2.5079989433288574 + }, + { + "auxiliary_loss_clip": 0.0108349, + "auxiliary_loss_mlp": 0.01033416, + "balance_loss_clip": 1.03395259, + "balance_loss_mlp": 1.02035093, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 1.8966764052107896, + "language_loss": 0.8457011, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86687016, + "num_input_tokens_seen": 290492060, + "step": 13461, + "time_per_iteration": 2.530317544937134 + }, + { + "auxiliary_loss_clip": 0.01096876, + "auxiliary_loss_mlp": 0.01037091, + "balance_loss_clip": 1.03728783, + "balance_loss_mlp": 1.02394819, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 1.7300795018869375, + "language_loss": 0.76377636, + "learning_rate": 3.69107688886096e-07, + "loss": 0.78511602, + "num_input_tokens_seen": 290511510, + "step": 13462, + "time_per_iteration": 2.5187902450561523 + }, + { + "auxiliary_loss_clip": 0.01085222, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.03907633, + "balance_loss_mlp": 1.02297354, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 1.6757498685713699, + "language_loss": 0.83107698, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.85229218, + "num_input_tokens_seen": 290530035, + "step": 13463, + "time_per_iteration": 2.570019245147705 + }, + { + "auxiliary_loss_clip": 0.01103852, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.03602386, + "balance_loss_mlp": 1.02386463, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 1.815064688933616, + "language_loss": 0.62185377, + "learning_rate": 3.686569460878779e-07, + "loss": 0.64324516, + "num_input_tokens_seen": 290548245, + "step": 13464, + "time_per_iteration": 2.419294834136963 + }, + { + "auxiliary_loss_clip": 0.01103214, + "auxiliary_loss_mlp": 0.01029491, + "balance_loss_clip": 1.03595531, + "balance_loss_mlp": 1.01804662, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 1.5152508608648634, + "language_loss": 0.61854351, + "learning_rate": 3.684316674755341e-07, + "loss": 0.63987052, + "num_input_tokens_seen": 290568625, + "step": 13465, + "time_per_iteration": 2.4879698753356934 + }, + { + "auxiliary_loss_clip": 0.01095709, + "auxiliary_loss_mlp": 0.01035672, + "balance_loss_clip": 1.03835309, + "balance_loss_mlp": 1.02337599, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 2.4631838056043285, + "language_loss": 0.81999326, + "learning_rate": 3.682064507324256e-07, + "loss": 0.8413071, + "num_input_tokens_seen": 290586575, + "step": 13466, + "time_per_iteration": 2.459765672683716 + }, + { + "auxiliary_loss_clip": 0.01089907, + "auxiliary_loss_mlp": 0.00788407, + "balance_loss_clip": 1.0361135, + "balance_loss_mlp": 1.01686001, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 1.9302829005677207, + "language_loss": 0.76190174, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.78068489, + "num_input_tokens_seen": 290606790, + "step": 13467, + "time_per_iteration": 2.5741984844207764 + }, + { + "auxiliary_loss_clip": 0.01070125, + "auxiliary_loss_mlp": 0.01026328, + "balance_loss_clip": 1.03079128, + "balance_loss_mlp": 1.01400757, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 1.6515073553760422, + "language_loss": 0.79379702, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81476152, + "num_input_tokens_seen": 290625525, + "step": 13468, + "time_per_iteration": 2.5347628593444824 + }, + { + "auxiliary_loss_clip": 0.01092132, + "auxiliary_loss_mlp": 0.01028504, + "balance_loss_clip": 1.03517032, + "balance_loss_mlp": 1.0168395, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 1.8602259438843383, + "language_loss": 0.67125869, + "learning_rate": 3.675311718038978e-07, + "loss": 0.69246507, + "num_input_tokens_seen": 290644935, + "step": 13469, + "time_per_iteration": 2.4925785064697266 + }, + { + "auxiliary_loss_clip": 0.01005176, + "auxiliary_loss_mlp": 0.01016335, + "balance_loss_clip": 1.01454377, + "balance_loss_mlp": 1.01491094, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.7012125986289238, + "language_loss": 0.54697812, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56719327, + "num_input_tokens_seen": 290710735, + "step": 13470, + "time_per_iteration": 3.2326226234436035 + }, + { + "auxiliary_loss_clip": 0.0110376, + "auxiliary_loss_mlp": 0.01032374, + "balance_loss_clip": 1.03468418, + "balance_loss_mlp": 1.02063775, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 1.7133434799370288, + "language_loss": 0.69548899, + "learning_rate": 3.670812953542279e-07, + "loss": 0.7168504, + "num_input_tokens_seen": 290729565, + "step": 13471, + "time_per_iteration": 2.454738140106201 + }, + { + "auxiliary_loss_clip": 0.01096443, + "auxiliary_loss_mlp": 0.01027477, + "balance_loss_clip": 1.03712296, + "balance_loss_mlp": 1.01586008, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 1.8888712990381693, + "language_loss": 0.79671299, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.81795216, + "num_input_tokens_seen": 290749360, + "step": 13472, + "time_per_iteration": 2.5263824462890625 + }, + { + "auxiliary_loss_clip": 0.01022152, + "auxiliary_loss_mlp": 0.01005157, + "balance_loss_clip": 1.00793135, + "balance_loss_mlp": 1.00386381, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.748138137928801, + "language_loss": 0.57797754, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59825063, + "num_input_tokens_seen": 290812145, + "step": 13473, + "time_per_iteration": 3.054250717163086 + }, + { + "auxiliary_loss_clip": 0.01054752, + "auxiliary_loss_mlp": 0.01028679, + "balance_loss_clip": 1.03534043, + "balance_loss_mlp": 1.01650167, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 1.7327519745334876, + "language_loss": 0.73618472, + "learning_rate": 3.664069451043399e-07, + "loss": 0.75701904, + "num_input_tokens_seen": 290829845, + "step": 13474, + "time_per_iteration": 2.5743112564086914 + }, + { + "auxiliary_loss_clip": 0.01094536, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.03770804, + "balance_loss_mlp": 1.02439654, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 1.707475092931714, + "language_loss": 0.78764009, + "learning_rate": 3.661822855683723e-07, + "loss": 0.80895334, + "num_input_tokens_seen": 290848815, + "step": 13475, + "time_per_iteration": 2.4810335636138916 + }, + { + "auxiliary_loss_clip": 0.01092139, + "auxiliary_loss_mlp": 0.01033739, + "balance_loss_clip": 1.03471994, + "balance_loss_mlp": 1.0220623, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 1.550863660263918, + "language_loss": 0.75135654, + "learning_rate": 3.659576879869364e-07, + "loss": 0.77261531, + "num_input_tokens_seen": 290868580, + "step": 13476, + "time_per_iteration": 2.530224084854126 + }, + { + "auxiliary_loss_clip": 0.01087086, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.03534293, + "balance_loss_mlp": 1.02340794, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 2.1682605014044207, + "language_loss": 0.74054408, + "learning_rate": 3.657331523685485e-07, + "loss": 0.76178104, + "num_input_tokens_seen": 290883540, + "step": 13477, + "time_per_iteration": 2.4677090644836426 + }, + { + "auxiliary_loss_clip": 0.01086216, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.03608203, + "balance_loss_mlp": 1.02320707, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 2.06521305020464, + "language_loss": 0.69511408, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.716326, + "num_input_tokens_seen": 290901560, + "step": 13478, + "time_per_iteration": 2.5183346271514893 + }, + { + "auxiliary_loss_clip": 0.01029985, + "auxiliary_loss_mlp": 0.01005614, + "balance_loss_clip": 1.00714529, + "balance_loss_mlp": 1.00433862, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.6879278010860218, + "language_loss": 0.52166045, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54201639, + "num_input_tokens_seen": 290959185, + "step": 13479, + "time_per_iteration": 4.409225702285767 + }, + { + "auxiliary_loss_clip": 0.01061499, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.03574347, + "balance_loss_mlp": 1.01821303, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 2.3654490178000445, + "language_loss": 0.71154708, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73247159, + "num_input_tokens_seen": 290979585, + "step": 13480, + "time_per_iteration": 3.939840316772461 + }, + { + "auxiliary_loss_clip": 0.0110535, + "auxiliary_loss_mlp": 0.01033585, + "balance_loss_clip": 1.03607583, + "balance_loss_mlp": 1.02175355, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 1.724476872368226, + "language_loss": 0.7970953, + "learning_rate": 3.648356296957327e-07, + "loss": 0.81848466, + "num_input_tokens_seen": 291000865, + "step": 13481, + "time_per_iteration": 2.495518445968628 + }, + { + "auxiliary_loss_clip": 0.0108088, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.03411293, + "balance_loss_mlp": 1.02155876, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 1.6465191533476875, + "language_loss": 0.72511113, + "learning_rate": 3.646114040202548e-07, + "loss": 0.74625355, + "num_input_tokens_seen": 291018285, + "step": 13482, + "time_per_iteration": 2.477145195007324 + }, + { + "auxiliary_loss_clip": 0.01047882, + "auxiliary_loss_mlp": 0.01025158, + "balance_loss_clip": 1.03425491, + "balance_loss_mlp": 1.01296318, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 2.3981835504458635, + "language_loss": 0.65720528, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.67793566, + "num_input_tokens_seen": 291035745, + "step": 13483, + "time_per_iteration": 3.993739604949951 + }, + { + "auxiliary_loss_clip": 0.01077558, + "auxiliary_loss_mlp": 0.0103231, + "balance_loss_clip": 1.03226781, + "balance_loss_mlp": 1.01907146, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 1.562378538015318, + "language_loss": 0.7635231, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78462172, + "num_input_tokens_seen": 291053280, + "step": 13484, + "time_per_iteration": 2.538999557495117 + }, + { + "auxiliary_loss_clip": 0.0108704, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.03529298, + "balance_loss_mlp": 1.02181602, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 1.6606371645948337, + "language_loss": 0.7209937, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74221498, + "num_input_tokens_seen": 291072855, + "step": 13485, + "time_per_iteration": 2.6343650817871094 + }, + { + "auxiliary_loss_clip": 0.01058938, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.0313189, + "balance_loss_mlp": 1.0196923, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 1.8824491943826365, + "language_loss": 0.75819612, + "learning_rate": 3.637151215443308e-07, + "loss": 0.77911687, + "num_input_tokens_seen": 291090285, + "step": 13486, + "time_per_iteration": 2.5341413021087646 + }, + { + "auxiliary_loss_clip": 0.01086423, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.03612077, + "balance_loss_mlp": 1.01828218, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 2.0111423069326153, + "language_loss": 0.71953094, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74069971, + "num_input_tokens_seen": 291107675, + "step": 13487, + "time_per_iteration": 2.5038161277770996 + }, + { + "auxiliary_loss_clip": 0.01042588, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.03599226, + "balance_loss_mlp": 1.02201664, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 1.7958456300690162, + "language_loss": 0.84113014, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86189097, + "num_input_tokens_seen": 291126900, + "step": 13488, + "time_per_iteration": 2.701881170272827 + }, + { + "auxiliary_loss_clip": 0.01106916, + "auxiliary_loss_mlp": 0.01031909, + "balance_loss_clip": 1.03649688, + "balance_loss_mlp": 1.01937437, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 1.7998419748350067, + "language_loss": 0.73870081, + "learning_rate": 3.630435611625502e-07, + "loss": 0.76008904, + "num_input_tokens_seen": 291145285, + "step": 13489, + "time_per_iteration": 3.87990665435791 + }, + { + "auxiliary_loss_clip": 0.01060544, + "auxiliary_loss_mlp": 0.00782417, + "balance_loss_clip": 1.03533447, + "balance_loss_mlp": 1.00914299, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 1.5931549164924441, + "language_loss": 0.71697903, + "learning_rate": 3.628198318377453e-07, + "loss": 0.7354086, + "num_input_tokens_seen": 291163485, + "step": 13490, + "time_per_iteration": 2.5707807540893555 + }, + { + "auxiliary_loss_clip": 0.01073224, + "auxiliary_loss_mlp": 0.01048269, + "balance_loss_clip": 1.03517735, + "balance_loss_mlp": 1.03305173, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 1.9824391329981224, + "language_loss": 0.71231234, + "learning_rate": 3.625961645949762e-07, + "loss": 0.7335273, + "num_input_tokens_seen": 291182215, + "step": 13491, + "time_per_iteration": 2.5717296600341797 + }, + { + "auxiliary_loss_clip": 0.01104483, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.03486896, + "balance_loss_mlp": 1.01777148, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 1.3102271601740754, + "language_loss": 0.67793602, + "learning_rate": 3.623725594427245e-07, + "loss": 0.6992811, + "num_input_tokens_seen": 291203145, + "step": 13492, + "time_per_iteration": 2.4872841835021973 + }, + { + "auxiliary_loss_clip": 0.01062552, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.03749609, + "balance_loss_mlp": 1.0184505, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 1.8016541171844693, + "language_loss": 0.72058529, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.7415179, + "num_input_tokens_seen": 291220600, + "step": 13493, + "time_per_iteration": 2.631155252456665 + }, + { + "auxiliary_loss_clip": 0.01088338, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.03285217, + "balance_loss_mlp": 1.02892447, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 1.5812226710503685, + "language_loss": 0.70608079, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72738874, + "num_input_tokens_seen": 291241195, + "step": 13494, + "time_per_iteration": 2.5480966567993164 + }, + { + "auxiliary_loss_clip": 0.0109842, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.03720641, + "balance_loss_mlp": 1.02025878, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 2.298252046677942, + "language_loss": 0.76666605, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.78798938, + "num_input_tokens_seen": 291258715, + "step": 13495, + "time_per_iteration": 2.5338222980499268 + }, + { + "auxiliary_loss_clip": 0.01085563, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.03513908, + "balance_loss_mlp": 1.02063286, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 2.1458204717416125, + "language_loss": 0.79555148, + "learning_rate": 3.614787599084417e-07, + "loss": 0.8167392, + "num_input_tokens_seen": 291278030, + "step": 13496, + "time_per_iteration": 2.5588297843933105 + }, + { + "auxiliary_loss_clip": 0.01095101, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.03487217, + "balance_loss_mlp": 1.02254128, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 1.6391168878361722, + "language_loss": 0.71166933, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73297989, + "num_input_tokens_seen": 291296740, + "step": 13497, + "time_per_iteration": 2.504910707473755 + }, + { + "auxiliary_loss_clip": 0.01070887, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.03443825, + "balance_loss_mlp": 1.02035248, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 1.7138158703376067, + "language_loss": 0.76797378, + "learning_rate": 3.610322329047508e-07, + "loss": 0.7890054, + "num_input_tokens_seen": 291318730, + "step": 13498, + "time_per_iteration": 2.5896711349487305 + }, + { + "auxiliary_loss_clip": 0.01104917, + "auxiliary_loss_mlp": 0.01033714, + "balance_loss_clip": 1.0350976, + "balance_loss_mlp": 1.02147174, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 1.8346128630326792, + "language_loss": 0.83932143, + "learning_rate": 3.608090626234055e-07, + "loss": 0.86070776, + "num_input_tokens_seen": 291336755, + "step": 13499, + "time_per_iteration": 2.4593067169189453 + }, + { + "auxiliary_loss_clip": 0.01071585, + "auxiliary_loss_mlp": 0.01029635, + "balance_loss_clip": 1.0340178, + "balance_loss_mlp": 1.01600969, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 1.478332255748856, + "language_loss": 0.7634784, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.7844907, + "num_input_tokens_seen": 291356795, + "step": 13500, + "time_per_iteration": 2.5429863929748535 + }, + { + "auxiliary_loss_clip": 0.01009823, + "auxiliary_loss_mlp": 0.01001923, + "balance_loss_clip": 1.00815535, + "balance_loss_mlp": 1.00069511, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.8041681984532766, + "language_loss": 0.59894121, + "learning_rate": 3.603629085440303e-07, + "loss": 0.61905861, + "num_input_tokens_seen": 291416005, + "step": 13501, + "time_per_iteration": 3.1992151737213135 + }, + { + "auxiliary_loss_clip": 0.01090532, + "auxiliary_loss_mlp": 0.01027251, + "balance_loss_clip": 1.03479815, + "balance_loss_mlp": 1.01560998, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 1.812669804547587, + "language_loss": 0.79299885, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.81417668, + "num_input_tokens_seen": 291434870, + "step": 13502, + "time_per_iteration": 2.5064752101898193 + }, + { + "auxiliary_loss_clip": 0.0107928, + "auxiliary_loss_mlp": 0.01039213, + "balance_loss_clip": 1.0335567, + "balance_loss_mlp": 1.02579582, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 1.8222646766497628, + "language_loss": 0.71218187, + "learning_rate": 3.599170031654635e-07, + "loss": 0.73336679, + "num_input_tokens_seen": 291452230, + "step": 13503, + "time_per_iteration": 2.5078284740448 + }, + { + "auxiliary_loss_clip": 0.01075117, + "auxiliary_loss_mlp": 0.01031684, + "balance_loss_clip": 1.03420186, + "balance_loss_mlp": 1.01793957, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 1.7149152056760173, + "language_loss": 0.67949259, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.70056063, + "num_input_tokens_seen": 291477425, + "step": 13504, + "time_per_iteration": 2.7182419300079346 + }, + { + "auxiliary_loss_clip": 0.01079532, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.03539801, + "balance_loss_mlp": 1.01796365, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 1.9108291380399665, + "language_loss": 0.74560642, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76671439, + "num_input_tokens_seen": 291501070, + "step": 13505, + "time_per_iteration": 2.8055641651153564 + }, + { + "auxiliary_loss_clip": 0.01083962, + "auxiliary_loss_mlp": 0.01028016, + "balance_loss_clip": 1.0357759, + "balance_loss_mlp": 1.01444459, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 1.8377068530784149, + "language_loss": 0.72351646, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.7446363, + "num_input_tokens_seen": 291524945, + "step": 13506, + "time_per_iteration": 2.6256158351898193 + }, + { + "auxiliary_loss_clip": 0.01110258, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.03618622, + "balance_loss_mlp": 1.01736999, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 2.286956278737283, + "language_loss": 0.76284623, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78424561, + "num_input_tokens_seen": 291544605, + "step": 13507, + "time_per_iteration": 2.5013647079467773 + }, + { + "auxiliary_loss_clip": 0.01105628, + "auxiliary_loss_mlp": 0.01026437, + "balance_loss_clip": 1.03398085, + "balance_loss_mlp": 1.01435494, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 1.5975871882106458, + "language_loss": 0.69977462, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.7210952, + "num_input_tokens_seen": 291563850, + "step": 13508, + "time_per_iteration": 2.456258773803711 + }, + { + "auxiliary_loss_clip": 0.01093072, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.03516197, + "balance_loss_mlp": 1.01886106, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 3.3546064926186117, + "language_loss": 0.75630605, + "learning_rate": 3.585807799107785e-07, + "loss": 0.77754569, + "num_input_tokens_seen": 291581730, + "step": 13509, + "time_per_iteration": 2.5372109413146973 + }, + { + "auxiliary_loss_clip": 0.0110764, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.03623891, + "balance_loss_mlp": 1.01870751, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 1.6912726136113183, + "language_loss": 0.76919103, + "learning_rate": 3.58358293835491e-07, + "loss": 0.79058325, + "num_input_tokens_seen": 291601225, + "step": 13510, + "time_per_iteration": 2.4911715984344482 + }, + { + "auxiliary_loss_clip": 0.01096388, + "auxiliary_loss_mlp": 0.01034948, + "balance_loss_clip": 1.03491104, + "balance_loss_mlp": 1.0214715, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 1.7936784902429423, + "language_loss": 0.70019907, + "learning_rate": 3.581358700114212e-07, + "loss": 0.72151244, + "num_input_tokens_seen": 291616995, + "step": 13511, + "time_per_iteration": 2.478416919708252 + }, + { + "auxiliary_loss_clip": 0.0108759, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.03726733, + "balance_loss_mlp": 1.02233362, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 1.55542383695618, + "language_loss": 0.79775178, + "learning_rate": 3.57913508447004e-07, + "loss": 0.818973, + "num_input_tokens_seen": 291636145, + "step": 13512, + "time_per_iteration": 2.5116941928863525 + }, + { + "auxiliary_loss_clip": 0.01092251, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.0342021, + "balance_loss_mlp": 1.02025247, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 1.8320881960268358, + "language_loss": 0.63341737, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65465701, + "num_input_tokens_seen": 291662440, + "step": 13513, + "time_per_iteration": 2.8999693393707275 + }, + { + "auxiliary_loss_clip": 0.01058155, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.03473771, + "balance_loss_mlp": 1.02126169, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 1.6412105492341407, + "language_loss": 0.71628612, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.73720956, + "num_input_tokens_seen": 291680950, + "step": 13514, + "time_per_iteration": 2.615200996398926 + }, + { + "auxiliary_loss_clip": 0.01067839, + "auxiliary_loss_mlp": 0.0102964, + "balance_loss_clip": 1.03342426, + "balance_loss_mlp": 1.01760554, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 1.593813484702048, + "language_loss": 0.62784863, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.64882338, + "num_input_tokens_seen": 291702395, + "step": 13515, + "time_per_iteration": 2.5983948707580566 + }, + { + "auxiliary_loss_clip": 0.01099002, + "auxiliary_loss_mlp": 0.0078276, + "balance_loss_clip": 1.03380692, + "balance_loss_mlp": 1.01055503, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 1.5750285597602072, + "language_loss": 0.7534306, + "learning_rate": 3.570246849544616e-07, + "loss": 0.77224827, + "num_input_tokens_seen": 291721135, + "step": 13516, + "time_per_iteration": 2.4811291694641113 + }, + { + "auxiliary_loss_clip": 0.01064808, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.03614044, + "balance_loss_mlp": 1.01800644, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 1.4882806905598813, + "language_loss": 0.91422576, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.93517494, + "num_input_tokens_seen": 291741235, + "step": 13517, + "time_per_iteration": 2.62703537940979 + }, + { + "auxiliary_loss_clip": 0.01097365, + "auxiliary_loss_mlp": 0.00785466, + "balance_loss_clip": 1.0381844, + "balance_loss_mlp": 1.01018977, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 1.4841449349530553, + "language_loss": 0.78627187, + "learning_rate": 3.565806469852244e-07, + "loss": 0.8051002, + "num_input_tokens_seen": 291761430, + "step": 13518, + "time_per_iteration": 3.889030933380127 + }, + { + "auxiliary_loss_clip": 0.0109342, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.03610229, + "balance_loss_mlp": 1.01869369, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 1.611181259856512, + "language_loss": 0.78869343, + "learning_rate": 3.56358721474336e-07, + "loss": 0.80992424, + "num_input_tokens_seen": 291781755, + "step": 13519, + "time_per_iteration": 2.5348429679870605 + }, + { + "auxiliary_loss_clip": 0.01104895, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.03388977, + "balance_loss_mlp": 1.02483499, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 1.5842883800319059, + "language_loss": 0.7020728, + "learning_rate": 3.561368582904905e-07, + "loss": 0.72349328, + "num_input_tokens_seen": 291804410, + "step": 13520, + "time_per_iteration": 3.924917221069336 + }, + { + "auxiliary_loss_clip": 0.0108721, + "auxiliary_loss_mlp": 0.01029831, + "balance_loss_clip": 1.03474569, + "balance_loss_mlp": 1.01796412, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 1.4739678049306009, + "language_loss": 0.72440779, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.74557817, + "num_input_tokens_seen": 291823285, + "step": 13521, + "time_per_iteration": 2.5779337882995605 + }, + { + "auxiliary_loss_clip": 0.01096626, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.0351212, + "balance_loss_mlp": 1.01834524, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 1.8198246600742918, + "language_loss": 0.69780672, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.71907967, + "num_input_tokens_seen": 291845305, + "step": 13522, + "time_per_iteration": 3.965945243835449 + }, + { + "auxiliary_loss_clip": 0.01088897, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.03521621, + "balance_loss_mlp": 1.023157, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 1.519661192449538, + "language_loss": 0.70685053, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72808349, + "num_input_tokens_seen": 291863715, + "step": 13523, + "time_per_iteration": 2.5087697505950928 + }, + { + "auxiliary_loss_clip": 0.01092145, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.03326011, + "balance_loss_mlp": 1.01700783, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 2.3394092640195483, + "language_loss": 0.71266526, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.73388231, + "num_input_tokens_seen": 291880735, + "step": 13524, + "time_per_iteration": 2.434755563735962 + }, + { + "auxiliary_loss_clip": 0.0109314, + "auxiliary_loss_mlp": 0.01026305, + "balance_loss_clip": 1.03505576, + "balance_loss_mlp": 1.0149622, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 1.845289447033173, + "language_loss": 0.62384439, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64503884, + "num_input_tokens_seen": 291900535, + "step": 13525, + "time_per_iteration": 2.5842974185943604 + }, + { + "auxiliary_loss_clip": 0.01085084, + "auxiliary_loss_mlp": 0.01036999, + "balance_loss_clip": 1.03664339, + "balance_loss_mlp": 1.02538776, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 1.4695521410933603, + "language_loss": 0.65372926, + "learning_rate": 3.548069885262628e-07, + "loss": 0.67495012, + "num_input_tokens_seen": 291919760, + "step": 13526, + "time_per_iteration": 2.6480839252471924 + }, + { + "auxiliary_loss_clip": 0.01079962, + "auxiliary_loss_mlp": 0.01029138, + "balance_loss_clip": 1.03423381, + "balance_loss_mlp": 1.01761639, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 2.026405696812929, + "language_loss": 0.74924183, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.77033281, + "num_input_tokens_seen": 291938915, + "step": 13527, + "time_per_iteration": 2.5842063426971436 + }, + { + "auxiliary_loss_clip": 0.01101781, + "auxiliary_loss_mlp": 0.01028369, + "balance_loss_clip": 1.03387499, + "balance_loss_mlp": 1.01675773, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 1.7179378036664423, + "language_loss": 0.70519549, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.72649699, + "num_input_tokens_seen": 291958145, + "step": 13528, + "time_per_iteration": 3.8957135677337646 + }, + { + "auxiliary_loss_clip": 0.01104764, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.03471267, + "balance_loss_mlp": 1.01787364, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 1.9692142502019614, + "language_loss": 0.68746084, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.70880497, + "num_input_tokens_seen": 291976860, + "step": 13529, + "time_per_iteration": 2.4437363147735596 + }, + { + "auxiliary_loss_clip": 0.01093295, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.03613162, + "balance_loss_mlp": 1.0194844, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 1.3146574939297855, + "language_loss": 0.77465117, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79589152, + "num_input_tokens_seen": 291998085, + "step": 13530, + "time_per_iteration": 2.5460402965545654 + }, + { + "auxiliary_loss_clip": 0.01092972, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.03550076, + "balance_loss_mlp": 1.01879334, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 1.8515373636013563, + "language_loss": 0.81803614, + "learning_rate": 3.537004792574052e-07, + "loss": 0.83928001, + "num_input_tokens_seen": 292016585, + "step": 13531, + "time_per_iteration": 2.4839963912963867 + }, + { + "auxiliary_loss_clip": 0.01081878, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.03491652, + "balance_loss_mlp": 1.01951718, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 8.520391249064106, + "language_loss": 0.71506333, + "learning_rate": 3.534793646536065e-07, + "loss": 0.73621738, + "num_input_tokens_seen": 292033255, + "step": 13532, + "time_per_iteration": 2.4987785816192627 + }, + { + "auxiliary_loss_clip": 0.01071094, + "auxiliary_loss_mlp": 0.01028684, + "balance_loss_clip": 1.03400362, + "balance_loss_mlp": 1.01697183, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 1.8225666960864597, + "language_loss": 0.76695126, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.78794903, + "num_input_tokens_seen": 292051800, + "step": 13533, + "time_per_iteration": 2.5464775562286377 + }, + { + "auxiliary_loss_clip": 0.01108855, + "auxiliary_loss_mlp": 0.00783941, + "balance_loss_clip": 1.03598082, + "balance_loss_mlp": 1.00973785, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 2.055988204989701, + "language_loss": 0.76259983, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.78152776, + "num_input_tokens_seen": 292072215, + "step": 13534, + "time_per_iteration": 2.475858449935913 + }, + { + "auxiliary_loss_clip": 0.01094646, + "auxiliary_loss_mlp": 0.01025133, + "balance_loss_clip": 1.03644407, + "balance_loss_mlp": 1.01457083, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 2.0431229740037296, + "language_loss": 0.9310838, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.95228159, + "num_input_tokens_seen": 292088830, + "step": 13535, + "time_per_iteration": 2.4759867191314697 + }, + { + "auxiliary_loss_clip": 0.01065046, + "auxiliary_loss_mlp": 0.01025381, + "balance_loss_clip": 1.03605533, + "balance_loss_mlp": 1.0139432, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 1.755710562588575, + "language_loss": 0.70697522, + "learning_rate": 3.52595530684499e-07, + "loss": 0.72787952, + "num_input_tokens_seen": 292109225, + "step": 13536, + "time_per_iteration": 2.6056900024414062 + }, + { + "auxiliary_loss_clip": 0.01069448, + "auxiliary_loss_mlp": 0.01033083, + "balance_loss_clip": 1.0354321, + "balance_loss_mlp": 1.02050614, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 1.782050419702371, + "language_loss": 0.75821096, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.77923632, + "num_input_tokens_seen": 292129660, + "step": 13537, + "time_per_iteration": 2.5904340744018555 + }, + { + "auxiliary_loss_clip": 0.01079081, + "auxiliary_loss_mlp": 0.01034866, + "balance_loss_clip": 1.03461862, + "balance_loss_mlp": 1.02218175, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 1.4133561619925668, + "language_loss": 0.76404613, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78518564, + "num_input_tokens_seen": 292149090, + "step": 13538, + "time_per_iteration": 2.539875030517578 + }, + { + "auxiliary_loss_clip": 0.01087134, + "auxiliary_loss_mlp": 0.01027777, + "balance_loss_clip": 1.0333184, + "balance_loss_mlp": 1.01637423, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 1.5985171328388066, + "language_loss": 0.78040045, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.80154955, + "num_input_tokens_seen": 292169260, + "step": 13539, + "time_per_iteration": 2.4967734813690186 + }, + { + "auxiliary_loss_clip": 0.0106309, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.03829801, + "balance_loss_mlp": 1.02015424, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 2.5391137604485783, + "language_loss": 0.66308838, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.6840322, + "num_input_tokens_seen": 292188145, + "step": 13540, + "time_per_iteration": 2.7033023834228516 + }, + { + "auxiliary_loss_clip": 0.01096167, + "auxiliary_loss_mlp": 0.01030975, + "balance_loss_clip": 1.03661299, + "balance_loss_mlp": 1.01987624, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 1.5519137507032548, + "language_loss": 0.67653126, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69780266, + "num_input_tokens_seen": 292212135, + "step": 13541, + "time_per_iteration": 2.564786434173584 + }, + { + "auxiliary_loss_clip": 0.01101681, + "auxiliary_loss_mlp": 0.01032876, + "balance_loss_clip": 1.0338068, + "balance_loss_mlp": 1.0203886, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 1.7719689065839233, + "language_loss": 0.68919861, + "learning_rate": 3.512716539904355e-07, + "loss": 0.71054423, + "num_input_tokens_seen": 292230645, + "step": 13542, + "time_per_iteration": 2.4474687576293945 + }, + { + "auxiliary_loss_clip": 0.01107201, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.03469825, + "balance_loss_mlp": 1.0209012, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 2.912462067731349, + "language_loss": 0.79462683, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.81603658, + "num_input_tokens_seen": 292243540, + "step": 13543, + "time_per_iteration": 2.3833887577056885 + }, + { + "auxiliary_loss_clip": 0.01078032, + "auxiliary_loss_mlp": 0.01037846, + "balance_loss_clip": 1.03728092, + "balance_loss_mlp": 1.02463722, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 2.090323159783881, + "language_loss": 0.77462351, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.79578233, + "num_input_tokens_seen": 292261715, + "step": 13544, + "time_per_iteration": 2.5320842266082764 + }, + { + "auxiliary_loss_clip": 0.01112318, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.03751612, + "balance_loss_mlp": 1.01809263, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 3.2820749636441406, + "language_loss": 0.73307717, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.75452036, + "num_input_tokens_seen": 292275080, + "step": 13545, + "time_per_iteration": 2.401869773864746 + }, + { + "auxiliary_loss_clip": 0.0109159, + "auxiliary_loss_mlp": 0.01029866, + "balance_loss_clip": 1.0350585, + "balance_loss_mlp": 1.01804614, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 1.6890185228899988, + "language_loss": 0.77024072, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.79145527, + "num_input_tokens_seen": 292294635, + "step": 13546, + "time_per_iteration": 2.4939804077148438 + }, + { + "auxiliary_loss_clip": 0.01096359, + "auxiliary_loss_mlp": 0.01031826, + "balance_loss_clip": 1.03724396, + "balance_loss_mlp": 1.02047157, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 3.5037280246615166, + "language_loss": 0.70575327, + "learning_rate": 3.501701426337178e-07, + "loss": 0.72703511, + "num_input_tokens_seen": 292312695, + "step": 13547, + "time_per_iteration": 2.470778226852417 + }, + { + "auxiliary_loss_clip": 0.01107384, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.03612947, + "balance_loss_mlp": 1.01920462, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 2.346199082109253, + "language_loss": 0.70938724, + "learning_rate": 3.49950028014111e-07, + "loss": 0.73078847, + "num_input_tokens_seen": 292332005, + "step": 13548, + "time_per_iteration": 2.4791665077209473 + }, + { + "auxiliary_loss_clip": 0.01095544, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.03949201, + "balance_loss_mlp": 1.02166986, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 2.184491874714152, + "language_loss": 0.7702992, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.79160351, + "num_input_tokens_seen": 292348365, + "step": 13549, + "time_per_iteration": 2.4777755737304688 + }, + { + "auxiliary_loss_clip": 0.01106883, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.03761077, + "balance_loss_mlp": 1.01922441, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 1.981281576171727, + "language_loss": 0.70936334, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.73074281, + "num_input_tokens_seen": 292368050, + "step": 13550, + "time_per_iteration": 2.4631240367889404 + }, + { + "auxiliary_loss_clip": 0.01089189, + "auxiliary_loss_mlp": 0.01026405, + "balance_loss_clip": 1.03480601, + "balance_loss_mlp": 1.01476479, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 1.7136808585282315, + "language_loss": 0.7237668, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.74492276, + "num_input_tokens_seen": 292385315, + "step": 13551, + "time_per_iteration": 2.4771616458892822 + }, + { + "auxiliary_loss_clip": 0.01067841, + "auxiliary_loss_mlp": 0.01036273, + "balance_loss_clip": 1.03745413, + "balance_loss_mlp": 1.02292776, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 1.9190173988509487, + "language_loss": 0.68067896, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.70172012, + "num_input_tokens_seen": 292403375, + "step": 13552, + "time_per_iteration": 2.5980613231658936 + }, + { + "auxiliary_loss_clip": 0.01103376, + "auxiliary_loss_mlp": 0.01039227, + "balance_loss_clip": 1.03444028, + "balance_loss_mlp": 1.02730072, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 1.9177677032248008, + "language_loss": 0.82577753, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84720349, + "num_input_tokens_seen": 292419260, + "step": 13553, + "time_per_iteration": 2.4379117488861084 + }, + { + "auxiliary_loss_clip": 0.01092757, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.03333664, + "balance_loss_mlp": 1.01551712, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 2.0469017364000694, + "language_loss": 0.67756009, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.69876611, + "num_input_tokens_seen": 292436095, + "step": 13554, + "time_per_iteration": 2.457012414932251 + }, + { + "auxiliary_loss_clip": 0.01079537, + "auxiliary_loss_mlp": 0.01036998, + "balance_loss_clip": 1.03630757, + "balance_loss_mlp": 1.02286577, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 1.9934282604056042, + "language_loss": 0.66547179, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68663716, + "num_input_tokens_seen": 292457190, + "step": 13555, + "time_per_iteration": 2.5819034576416016 + }, + { + "auxiliary_loss_clip": 0.01097736, + "auxiliary_loss_mlp": 0.01033368, + "balance_loss_clip": 1.03538704, + "balance_loss_mlp": 1.02070785, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 2.1312635431446947, + "language_loss": 0.73022044, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75153154, + "num_input_tokens_seen": 292474300, + "step": 13556, + "time_per_iteration": 2.489006519317627 + }, + { + "auxiliary_loss_clip": 0.01094839, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.03802407, + "balance_loss_mlp": 1.01787317, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 1.5875672247731132, + "language_loss": 0.80572408, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.82696253, + "num_input_tokens_seen": 292492420, + "step": 13557, + "time_per_iteration": 3.8833038806915283 + }, + { + "auxiliary_loss_clip": 0.01087686, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.03637493, + "balance_loss_mlp": 1.02120507, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 1.691390466246969, + "language_loss": 0.65936208, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.68057525, + "num_input_tokens_seen": 292512895, + "step": 13558, + "time_per_iteration": 3.944021224975586 + }, + { + "auxiliary_loss_clip": 0.01029646, + "auxiliary_loss_mlp": 0.01002517, + "balance_loss_clip": 1.00677872, + "balance_loss_mlp": 1.00128353, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 0.7897487522819733, + "language_loss": 0.56933069, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.5896523, + "num_input_tokens_seen": 292566580, + "step": 13559, + "time_per_iteration": 2.9919676780700684 + }, + { + "auxiliary_loss_clip": 0.01018139, + "auxiliary_loss_mlp": 0.01020772, + "balance_loss_clip": 1.01424897, + "balance_loss_mlp": 1.01937759, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.6816132319102937, + "language_loss": 0.55299163, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57338071, + "num_input_tokens_seen": 292621490, + "step": 13560, + "time_per_iteration": 4.406762361526489 + }, + { + "auxiliary_loss_clip": 0.01080119, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.03228021, + "balance_loss_mlp": 1.01683259, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 1.650364870230622, + "language_loss": 0.6755268, + "learning_rate": 3.470942348696948e-07, + "loss": 0.69661129, + "num_input_tokens_seen": 292638660, + "step": 13561, + "time_per_iteration": 2.4823617935180664 + }, + { + "auxiliary_loss_clip": 0.0109665, + "auxiliary_loss_mlp": 0.01032573, + "balance_loss_clip": 1.03571463, + "balance_loss_mlp": 1.02083707, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 1.5872994748801734, + "language_loss": 0.81438971, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83568192, + "num_input_tokens_seen": 292658545, + "step": 13562, + "time_per_iteration": 2.5219523906707764 + }, + { + "auxiliary_loss_clip": 0.01078577, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.03492498, + "balance_loss_mlp": 1.01721156, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 1.530478104990854, + "language_loss": 0.71842325, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.73950064, + "num_input_tokens_seen": 292678460, + "step": 13563, + "time_per_iteration": 2.548308849334717 + }, + { + "auxiliary_loss_clip": 0.01026628, + "auxiliary_loss_mlp": 0.01027793, + "balance_loss_clip": 1.03701079, + "balance_loss_mlp": 1.01469779, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 1.5818168745473526, + "language_loss": 0.70124841, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.72179264, + "num_input_tokens_seen": 292699815, + "step": 13564, + "time_per_iteration": 2.859358549118042 + }, + { + "auxiliary_loss_clip": 0.01078795, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.03570282, + "balance_loss_mlp": 1.01577377, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 2.277187121338606, + "language_loss": 0.70516682, + "learning_rate": 3.462176595017854e-07, + "loss": 0.72623026, + "num_input_tokens_seen": 292717370, + "step": 13565, + "time_per_iteration": 2.8851611614227295 + }, + { + "auxiliary_loss_clip": 0.01094003, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.03637087, + "balance_loss_mlp": 1.02175736, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 1.6217543820642966, + "language_loss": 0.79120386, + "learning_rate": 3.459986724180188e-07, + "loss": 0.81248146, + "num_input_tokens_seen": 292737110, + "step": 13566, + "time_per_iteration": 3.900590658187866 + }, + { + "auxiliary_loss_clip": 0.01083001, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.03693962, + "balance_loss_mlp": 1.02127564, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 1.6011830008830816, + "language_loss": 0.82385451, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84500873, + "num_input_tokens_seen": 292756510, + "step": 13567, + "time_per_iteration": 2.5866122245788574 + }, + { + "auxiliary_loss_clip": 0.0110162, + "auxiliary_loss_mlp": 0.01026908, + "balance_loss_clip": 1.0354867, + "balance_loss_mlp": 1.01633406, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 2.0364336640505796, + "language_loss": 0.79666567, + "learning_rate": 3.455608864184771e-07, + "loss": 0.81795096, + "num_input_tokens_seen": 292776710, + "step": 13568, + "time_per_iteration": 2.451289415359497 + }, + { + "auxiliary_loss_clip": 0.0108058, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.03389859, + "balance_loss_mlp": 1.0182054, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 1.7727788120967456, + "language_loss": 0.77277756, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79387963, + "num_input_tokens_seen": 292794350, + "step": 13569, + "time_per_iteration": 2.5868709087371826 + }, + { + "auxiliary_loss_clip": 0.01101606, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.0342871, + "balance_loss_mlp": 1.02450454, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 1.9361530607548736, + "language_loss": 0.58307368, + "learning_rate": 3.451233513649199e-07, + "loss": 0.6044448, + "num_input_tokens_seen": 292814005, + "step": 13570, + "time_per_iteration": 2.4858391284942627 + }, + { + "auxiliary_loss_clip": 0.01096255, + "auxiliary_loss_mlp": 0.01040101, + "balance_loss_clip": 1.03535354, + "balance_loss_mlp": 1.02698827, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 1.7553400442274638, + "language_loss": 0.82363737, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84500086, + "num_input_tokens_seen": 292833485, + "step": 13571, + "time_per_iteration": 2.4974112510681152 + }, + { + "auxiliary_loss_clip": 0.01077592, + "auxiliary_loss_mlp": 0.01041896, + "balance_loss_clip": 1.03449714, + "balance_loss_mlp": 1.02922988, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 2.039274900691867, + "language_loss": 0.78633189, + "learning_rate": 3.446860673237142e-07, + "loss": 0.80752671, + "num_input_tokens_seen": 292848045, + "step": 13572, + "time_per_iteration": 2.473050117492676 + }, + { + "auxiliary_loss_clip": 0.01104715, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.03492844, + "balance_loss_mlp": 1.01947284, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 1.5746125153205963, + "language_loss": 0.65153193, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.67288917, + "num_input_tokens_seen": 292869965, + "step": 13573, + "time_per_iteration": 2.4947822093963623 + }, + { + "auxiliary_loss_clip": 0.01070467, + "auxiliary_loss_mlp": 0.01029537, + "balance_loss_clip": 1.03731322, + "balance_loss_mlp": 1.01864767, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 1.4473009587498493, + "language_loss": 0.75349635, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77449644, + "num_input_tokens_seen": 292889680, + "step": 13574, + "time_per_iteration": 2.655406951904297 + }, + { + "auxiliary_loss_clip": 0.01096649, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.03601217, + "balance_loss_mlp": 1.02149057, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 1.6481025268391045, + "language_loss": 0.59924006, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.6205439, + "num_input_tokens_seen": 292912360, + "step": 13575, + "time_per_iteration": 2.5666818618774414 + }, + { + "auxiliary_loss_clip": 0.01027042, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.03236318, + "balance_loss_mlp": 1.02124012, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 1.8122412548274236, + "language_loss": 0.7423265, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.7629503, + "num_input_tokens_seen": 292928325, + "step": 13576, + "time_per_iteration": 2.6165342330932617 + }, + { + "auxiliary_loss_clip": 0.01010756, + "auxiliary_loss_mlp": 0.01002977, + "balance_loss_clip": 1.00870681, + "balance_loss_mlp": 1.00184441, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.8273644605478022, + "language_loss": 0.58575475, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60589218, + "num_input_tokens_seen": 292992795, + "step": 13577, + "time_per_iteration": 3.1288561820983887 + }, + { + "auxiliary_loss_clip": 0.01052373, + "auxiliary_loss_mlp": 0.01028773, + "balance_loss_clip": 1.03184271, + "balance_loss_mlp": 1.01651216, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 1.66057969196801, + "language_loss": 0.70970964, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73052114, + "num_input_tokens_seen": 293011950, + "step": 13578, + "time_per_iteration": 2.6052675247192383 + }, + { + "auxiliary_loss_clip": 0.01062408, + "auxiliary_loss_mlp": 0.01028978, + "balance_loss_clip": 1.03313494, + "balance_loss_mlp": 1.01755786, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 1.7132886166005252, + "language_loss": 0.73684269, + "learning_rate": 3.431575508590172e-07, + "loss": 0.75775659, + "num_input_tokens_seen": 293030175, + "step": 13579, + "time_per_iteration": 2.537247657775879 + }, + { + "auxiliary_loss_clip": 0.01105385, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.03530133, + "balance_loss_mlp": 1.01761258, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 1.943652330688153, + "language_loss": 0.78824568, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.80958951, + "num_input_tokens_seen": 293047980, + "step": 13580, + "time_per_iteration": 2.467878818511963 + }, + { + "auxiliary_loss_clip": 0.01067024, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.03240097, + "balance_loss_mlp": 1.02484465, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 1.677196354206817, + "language_loss": 0.68868029, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.70972407, + "num_input_tokens_seen": 293067030, + "step": 13581, + "time_per_iteration": 2.535973072052002 + }, + { + "auxiliary_loss_clip": 0.01104102, + "auxiliary_loss_mlp": 0.0102708, + "balance_loss_clip": 1.0353446, + "balance_loss_mlp": 1.0152719, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 1.714409022979012, + "language_loss": 0.59906179, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.62037361, + "num_input_tokens_seen": 293085575, + "step": 13582, + "time_per_iteration": 2.4696834087371826 + }, + { + "auxiliary_loss_clip": 0.01068726, + "auxiliary_loss_mlp": 0.0078309, + "balance_loss_clip": 1.03412199, + "balance_loss_mlp": 1.00901771, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 1.340799806078612, + "language_loss": 0.82214427, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84066248, + "num_input_tokens_seen": 293108200, + "step": 13583, + "time_per_iteration": 2.6061742305755615 + }, + { + "auxiliary_loss_clip": 0.01082284, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.0332098, + "balance_loss_mlp": 1.01864338, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 2.9637558515009395, + "language_loss": 0.74376541, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76489353, + "num_input_tokens_seen": 293126020, + "step": 13584, + "time_per_iteration": 2.4934563636779785 + }, + { + "auxiliary_loss_clip": 0.01096263, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.03844726, + "balance_loss_mlp": 1.01558614, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 1.5663492417279683, + "language_loss": 0.74493474, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.7661733, + "num_input_tokens_seen": 293144620, + "step": 13585, + "time_per_iteration": 2.4752769470214844 + }, + { + "auxiliary_loss_clip": 0.010778, + "auxiliary_loss_mlp": 0.01032337, + "balance_loss_clip": 1.03607142, + "balance_loss_mlp": 1.0201714, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 1.7796479867452657, + "language_loss": 0.69661337, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71771473, + "num_input_tokens_seen": 293162850, + "step": 13586, + "time_per_iteration": 2.526461362838745 + }, + { + "auxiliary_loss_clip": 0.01045522, + "auxiliary_loss_mlp": 0.01038424, + "balance_loss_clip": 1.03319073, + "balance_loss_mlp": 1.02633035, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 7.028573781724938, + "language_loss": 0.60927391, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.63011336, + "num_input_tokens_seen": 293181620, + "step": 13587, + "time_per_iteration": 2.5828168392181396 + }, + { + "auxiliary_loss_clip": 0.01095264, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.03410578, + "balance_loss_mlp": 1.01874101, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 2.7308603473180444, + "language_loss": 0.68997347, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.71123904, + "num_input_tokens_seen": 293200270, + "step": 13588, + "time_per_iteration": 2.559894561767578 + }, + { + "auxiliary_loss_clip": 0.01080969, + "auxiliary_loss_mlp": 0.01035938, + "balance_loss_clip": 1.03496826, + "balance_loss_mlp": 1.02207422, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 1.4902606645104202, + "language_loss": 0.73181069, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.75297976, + "num_input_tokens_seen": 293218960, + "step": 13589, + "time_per_iteration": 2.5112459659576416 + }, + { + "auxiliary_loss_clip": 0.01088059, + "auxiliary_loss_mlp": 0.01030566, + "balance_loss_clip": 1.03600883, + "balance_loss_mlp": 1.01877594, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 1.8514627965744839, + "language_loss": 0.73420519, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75539142, + "num_input_tokens_seen": 293236450, + "step": 13590, + "time_per_iteration": 2.5109808444976807 + }, + { + "auxiliary_loss_clip": 0.01108094, + "auxiliary_loss_mlp": 0.0103311, + "balance_loss_clip": 1.03582394, + "balance_loss_mlp": 1.01953816, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 2.1641791270026256, + "language_loss": 0.65585053, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67726254, + "num_input_tokens_seen": 293256480, + "step": 13591, + "time_per_iteration": 2.5616204738616943 + }, + { + "auxiliary_loss_clip": 0.01106105, + "auxiliary_loss_mlp": 0.01034616, + "balance_loss_clip": 1.03462934, + "balance_loss_mlp": 1.02267134, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 2.850677771664054, + "language_loss": 0.6832099, + "learning_rate": 3.403270471641373e-07, + "loss": 0.70461708, + "num_input_tokens_seen": 293274960, + "step": 13592, + "time_per_iteration": 2.4719650745391846 + }, + { + "auxiliary_loss_clip": 0.0108137, + "auxiliary_loss_mlp": 0.01028903, + "balance_loss_clip": 1.0340271, + "balance_loss_mlp": 1.0159626, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 1.9055449407063152, + "language_loss": 0.66370976, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68481249, + "num_input_tokens_seen": 293295945, + "step": 13593, + "time_per_iteration": 2.56549334526062 + }, + { + "auxiliary_loss_clip": 0.01092839, + "auxiliary_loss_mlp": 0.0103202, + "balance_loss_clip": 1.03388548, + "balance_loss_mlp": 1.02058208, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 2.0465152479210214, + "language_loss": 0.69500703, + "learning_rate": 3.398925286280188e-07, + "loss": 0.71625566, + "num_input_tokens_seen": 293313300, + "step": 13594, + "time_per_iteration": 2.4884448051452637 + }, + { + "auxiliary_loss_clip": 0.01105543, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_clip": 1.03545403, + "balance_loss_mlp": 1.01696074, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 2.5974578288372605, + "language_loss": 0.65663415, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.67797887, + "num_input_tokens_seen": 293333085, + "step": 13595, + "time_per_iteration": 3.850938081741333 + }, + { + "auxiliary_loss_clip": 0.0107287, + "auxiliary_loss_mlp": 0.01028601, + "balance_loss_clip": 1.03698587, + "balance_loss_mlp": 1.01627469, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 1.5409967877897346, + "language_loss": 0.78580201, + "learning_rate": 3.394582618976658e-07, + "loss": 0.8068167, + "num_input_tokens_seen": 293351895, + "step": 13596, + "time_per_iteration": 4.027684211730957 + }, + { + "auxiliary_loss_clip": 0.01077059, + "auxiliary_loss_mlp": 0.01027169, + "balance_loss_clip": 1.03183174, + "balance_loss_mlp": 1.01476562, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 3.727220442715241, + "language_loss": 0.58424461, + "learning_rate": 3.392412229802362e-07, + "loss": 0.60528696, + "num_input_tokens_seen": 293371165, + "step": 13597, + "time_per_iteration": 2.6559948921203613 + }, + { + "auxiliary_loss_clip": 0.01061061, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.03440976, + "balance_loss_mlp": 1.01978731, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 1.9424358582043217, + "language_loss": 0.82464492, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84556997, + "num_input_tokens_seen": 293391150, + "step": 13598, + "time_per_iteration": 2.591569185256958 + }, + { + "auxiliary_loss_clip": 0.01043308, + "auxiliary_loss_mlp": 0.01027802, + "balance_loss_clip": 1.03484011, + "balance_loss_mlp": 1.0163343, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 1.9952123747554074, + "language_loss": 0.82665056, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.84736168, + "num_input_tokens_seen": 293409440, + "step": 13599, + "time_per_iteration": 4.190845251083374 + }, + { + "auxiliary_loss_clip": 0.01055064, + "auxiliary_loss_mlp": 0.01040916, + "balance_loss_clip": 1.03189421, + "balance_loss_mlp": 1.02794051, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 1.7443460623456057, + "language_loss": 0.83873987, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.85969961, + "num_input_tokens_seen": 293428995, + "step": 13600, + "time_per_iteration": 2.776707410812378 + }, + { + "auxiliary_loss_clip": 0.0107071, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.03402996, + "balance_loss_mlp": 1.02128649, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 1.7714872267482222, + "language_loss": 0.7403214, + "learning_rate": 3.383736971541766e-07, + "loss": 0.7613641, + "num_input_tokens_seen": 293449155, + "step": 13601, + "time_per_iteration": 2.5699915885925293 + }, + { + "auxiliary_loss_clip": 0.01073289, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.03535736, + "balance_loss_mlp": 1.01702225, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 2.168196644548913, + "language_loss": 0.67939508, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.70041972, + "num_input_tokens_seen": 293466125, + "step": 13602, + "time_per_iteration": 2.537374496459961 + }, + { + "auxiliary_loss_clip": 0.010636, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.03252363, + "balance_loss_mlp": 1.01946378, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 2.200777772428991, + "language_loss": 0.83731377, + "learning_rate": 3.379403122624718e-07, + "loss": 0.85827208, + "num_input_tokens_seen": 293481345, + "step": 13603, + "time_per_iteration": 2.500786781311035 + }, + { + "auxiliary_loss_clip": 0.01056635, + "auxiliary_loss_mlp": 0.01024563, + "balance_loss_clip": 1.03612053, + "balance_loss_mlp": 1.01325011, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 1.680642849720776, + "language_loss": 0.68960428, + "learning_rate": 3.377237143507159e-07, + "loss": 0.7104162, + "num_input_tokens_seen": 293502330, + "step": 13604, + "time_per_iteration": 2.635911226272583 + }, + { + "auxiliary_loss_clip": 0.01085188, + "auxiliary_loss_mlp": 0.01035662, + "balance_loss_clip": 1.03779221, + "balance_loss_mlp": 1.02321625, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 2.4624179208286563, + "language_loss": 0.74442029, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76562881, + "num_input_tokens_seen": 293521415, + "step": 13605, + "time_per_iteration": 3.895524501800537 + }, + { + "auxiliary_loss_clip": 0.01065079, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.03815806, + "balance_loss_mlp": 1.02389622, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 1.8686400263906888, + "language_loss": 0.7424525, + "learning_rate": 3.372907076364666e-07, + "loss": 0.76345801, + "num_input_tokens_seen": 293539245, + "step": 13606, + "time_per_iteration": 2.540588140487671 + }, + { + "auxiliary_loss_clip": 0.01103171, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.035748, + "balance_loss_mlp": 1.01847482, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 1.8433118813121732, + "language_loss": 0.65450025, + "learning_rate": 3.370742988503916e-07, + "loss": 0.67583251, + "num_input_tokens_seen": 293560640, + "step": 13607, + "time_per_iteration": 2.5554890632629395 + }, + { + "auxiliary_loss_clip": 0.01081825, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.0347898, + "balance_loss_mlp": 1.01719666, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 1.7730750084785125, + "language_loss": 0.70206821, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72318286, + "num_input_tokens_seen": 293579465, + "step": 13608, + "time_per_iteration": 2.554417371749878 + }, + { + "auxiliary_loss_clip": 0.01089459, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.03296065, + "balance_loss_mlp": 1.0212245, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 2.045575366651012, + "language_loss": 0.79400682, + "learning_rate": 3.366416704613735e-07, + "loss": 0.81523144, + "num_input_tokens_seen": 293600540, + "step": 13609, + "time_per_iteration": 2.5795366764068604 + }, + { + "auxiliary_loss_clip": 0.01012259, + "auxiliary_loss_mlp": 0.01002389, + "balance_loss_clip": 1.01799703, + "balance_loss_mlp": 1.00126886, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.753442347892229, + "language_loss": 0.55942738, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.57957387, + "num_input_tokens_seen": 293665160, + "step": 13610, + "time_per_iteration": 3.23667311668396 + }, + { + "auxiliary_loss_clip": 0.01041775, + "auxiliary_loss_mlp": 0.00784647, + "balance_loss_clip": 1.03211439, + "balance_loss_mlp": 1.01157618, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 1.833400791965003, + "language_loss": 0.77604067, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79430485, + "num_input_tokens_seen": 293683995, + "step": 13611, + "time_per_iteration": 2.7209906578063965 + }, + { + "auxiliary_loss_clip": 0.01071679, + "auxiliary_loss_mlp": 0.01036092, + "balance_loss_clip": 1.03465891, + "balance_loss_mlp": 1.0224781, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 2.1169755605568987, + "language_loss": 0.77135003, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.79242772, + "num_input_tokens_seen": 293704115, + "step": 13612, + "time_per_iteration": 2.6320464611053467 + }, + { + "auxiliary_loss_clip": 0.01068066, + "auxiliary_loss_mlp": 0.01024943, + "balance_loss_clip": 1.03301203, + "balance_loss_mlp": 1.01334381, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 1.792583756654176, + "language_loss": 0.86167949, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.88260961, + "num_input_tokens_seen": 293722225, + "step": 13613, + "time_per_iteration": 2.619736671447754 + }, + { + "auxiliary_loss_clip": 0.01093632, + "auxiliary_loss_mlp": 0.01041825, + "balance_loss_clip": 1.03639579, + "balance_loss_mlp": 1.03053021, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 2.318425706684189, + "language_loss": 0.72771609, + "learning_rate": 3.355612034397746e-07, + "loss": 0.7490707, + "num_input_tokens_seen": 293743995, + "step": 13614, + "time_per_iteration": 2.5543928146362305 + }, + { + "auxiliary_loss_clip": 0.01082808, + "auxiliary_loss_mlp": 0.01037069, + "balance_loss_clip": 1.03510046, + "balance_loss_mlp": 1.02466547, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 1.482047088190772, + "language_loss": 0.80885154, + "learning_rate": 3.353452993497479e-07, + "loss": 0.83005035, + "num_input_tokens_seen": 293764935, + "step": 13615, + "time_per_iteration": 2.6628401279449463 + }, + { + "auxiliary_loss_clip": 0.01090227, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.03264427, + "balance_loss_mlp": 1.01922512, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 1.8363085873263219, + "language_loss": 0.75634378, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.77756476, + "num_input_tokens_seen": 293784035, + "step": 13616, + "time_per_iteration": 2.5567097663879395 + }, + { + "auxiliary_loss_clip": 0.01063369, + "auxiliary_loss_mlp": 0.01039674, + "balance_loss_clip": 1.03028727, + "balance_loss_mlp": 1.02616143, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 1.679621033385655, + "language_loss": 0.75378454, + "learning_rate": 3.349136805494979e-07, + "loss": 0.7748149, + "num_input_tokens_seen": 293803360, + "step": 13617, + "time_per_iteration": 2.6009321212768555 + }, + { + "auxiliary_loss_clip": 0.01075107, + "auxiliary_loss_mlp": 0.01029677, + "balance_loss_clip": 1.03401208, + "balance_loss_mlp": 1.01868558, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 2.040995170042844, + "language_loss": 0.68264806, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70369589, + "num_input_tokens_seen": 293821325, + "step": 13618, + "time_per_iteration": 2.5578482151031494 + }, + { + "auxiliary_loss_clip": 0.01083308, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.03670156, + "balance_loss_mlp": 1.01835728, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 1.9561942113004454, + "language_loss": 0.69766688, + "learning_rate": 3.344823143102058e-07, + "loss": 0.71880811, + "num_input_tokens_seen": 293840315, + "step": 13619, + "time_per_iteration": 2.633636474609375 + }, + { + "auxiliary_loss_clip": 0.01043623, + "auxiliary_loss_mlp": 0.01025042, + "balance_loss_clip": 1.03748441, + "balance_loss_mlp": 1.01344264, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 1.837527021084957, + "language_loss": 0.73614979, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.75683641, + "num_input_tokens_seen": 293855685, + "step": 13620, + "time_per_iteration": 2.6340267658233643 + }, + { + "auxiliary_loss_clip": 0.01075958, + "auxiliary_loss_mlp": 0.00786201, + "balance_loss_clip": 1.03217959, + "balance_loss_mlp": 1.0104897, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 1.6486186623549157, + "language_loss": 0.76210439, + "learning_rate": 3.340512006973011e-07, + "loss": 0.78072596, + "num_input_tokens_seen": 293875540, + "step": 13621, + "time_per_iteration": 2.640263080596924 + }, + { + "auxiliary_loss_clip": 0.01078672, + "auxiliary_loss_mlp": 0.01027392, + "balance_loss_clip": 1.03168559, + "balance_loss_mlp": 1.01482105, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 2.1561168377058753, + "language_loss": 0.65768003, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.67874062, + "num_input_tokens_seen": 293896570, + "step": 13622, + "time_per_iteration": 2.627437114715576 + }, + { + "auxiliary_loss_clip": 0.01107787, + "auxiliary_loss_mlp": 0.01029177, + "balance_loss_clip": 1.03781343, + "balance_loss_mlp": 1.01586723, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 1.7817095338263493, + "language_loss": 0.74814135, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.76951098, + "num_input_tokens_seen": 293914680, + "step": 13623, + "time_per_iteration": 2.476555109024048 + }, + { + "auxiliary_loss_clip": 0.01081814, + "auxiliary_loss_mlp": 0.01038298, + "balance_loss_clip": 1.03331041, + "balance_loss_mlp": 1.02517951, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 2.0050310649594176, + "language_loss": 0.63451326, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.65571439, + "num_input_tokens_seen": 293936480, + "step": 13624, + "time_per_iteration": 2.736358165740967 + }, + { + "auxiliary_loss_clip": 0.01101912, + "auxiliary_loss_mlp": 0.01033466, + "balance_loss_clip": 1.03441083, + "balance_loss_mlp": 1.0222007, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 1.521089496069259, + "language_loss": 0.78236181, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80371559, + "num_input_tokens_seen": 293957815, + "step": 13625, + "time_per_iteration": 2.4797842502593994 + }, + { + "auxiliary_loss_clip": 0.01099048, + "auxiliary_loss_mlp": 0.00784895, + "balance_loss_clip": 1.03452766, + "balance_loss_mlp": 1.00940824, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 1.9766321856734506, + "language_loss": 0.76074558, + "learning_rate": 3.329745223345244e-07, + "loss": 0.779585, + "num_input_tokens_seen": 293975440, + "step": 13626, + "time_per_iteration": 2.536447048187256 + }, + { + "auxiliary_loss_clip": 0.01093194, + "auxiliary_loss_mlp": 0.01035542, + "balance_loss_clip": 1.03544378, + "balance_loss_mlp": 1.02440178, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 1.466530954930497, + "language_loss": 0.73513758, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.7564249, + "num_input_tokens_seen": 293997540, + "step": 13627, + "time_per_iteration": 2.522843599319458 + }, + { + "auxiliary_loss_clip": 0.01105925, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.03607559, + "balance_loss_mlp": 1.02100408, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 1.6861431815739538, + "language_loss": 0.68878531, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.71017683, + "num_input_tokens_seen": 294017030, + "step": 13628, + "time_per_iteration": 2.4668054580688477 + }, + { + "auxiliary_loss_clip": 0.01083921, + "auxiliary_loss_mlp": 0.01036271, + "balance_loss_clip": 1.03582215, + "balance_loss_mlp": 1.02216899, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 1.5530157048371536, + "language_loss": 0.85365415, + "learning_rate": 3.323292738168171e-07, + "loss": 0.874856, + "num_input_tokens_seen": 294035700, + "step": 13629, + "time_per_iteration": 2.4703521728515625 + }, + { + "auxiliary_loss_clip": 0.01103533, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_clip": 1.03544593, + "balance_loss_mlp": 1.01521683, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 2.1003452826714963, + "language_loss": 0.73666281, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.75796503, + "num_input_tokens_seen": 294049730, + "step": 13630, + "time_per_iteration": 2.4221086502075195 + }, + { + "auxiliary_loss_clip": 0.01091703, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.03456259, + "balance_loss_mlp": 1.02123427, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 1.7995922731098632, + "language_loss": 0.71850216, + "learning_rate": 3.31899424315957e-07, + "loss": 0.73976231, + "num_input_tokens_seen": 294066545, + "step": 13631, + "time_per_iteration": 2.459794521331787 + }, + { + "auxiliary_loss_clip": 0.01103851, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.03425431, + "balance_loss_mlp": 1.02022195, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 1.5320132319241526, + "language_loss": 0.76738596, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.78874195, + "num_input_tokens_seen": 294087455, + "step": 13632, + "time_per_iteration": 2.515244960784912 + }, + { + "auxiliary_loss_clip": 0.01078361, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.03277707, + "balance_loss_mlp": 1.0226593, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 1.971867432745887, + "language_loss": 0.65728033, + "learning_rate": 3.314698278332588e-07, + "loss": 0.67841178, + "num_input_tokens_seen": 294107480, + "step": 13633, + "time_per_iteration": 2.5458459854125977 + }, + { + "auxiliary_loss_clip": 0.01088514, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.03541207, + "balance_loss_mlp": 1.02165961, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 1.5127356923893243, + "language_loss": 0.75862938, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.77984285, + "num_input_tokens_seen": 294130115, + "step": 13634, + "time_per_iteration": 4.373814105987549 + }, + { + "auxiliary_loss_clip": 0.01043686, + "auxiliary_loss_mlp": 0.00782276, + "balance_loss_clip": 1.03310311, + "balance_loss_mlp": 1.00758839, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 1.9151579934068332, + "language_loss": 0.82046998, + "learning_rate": 3.310404844338841e-07, + "loss": 0.83872962, + "num_input_tokens_seen": 294148495, + "step": 13635, + "time_per_iteration": 4.030940771102905 + }, + { + "auxiliary_loss_clip": 0.01091461, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.03377008, + "balance_loss_mlp": 1.01882255, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 1.6111697814940498, + "language_loss": 0.75836396, + "learning_rate": 3.308259076607949e-07, + "loss": 0.77959919, + "num_input_tokens_seen": 294169595, + "step": 13636, + "time_per_iteration": 2.515103816986084 + }, + { + "auxiliary_loss_clip": 0.01072291, + "auxiliary_loss_mlp": 0.01030688, + "balance_loss_clip": 1.03309703, + "balance_loss_mlp": 1.01873708, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 2.1047594237921263, + "language_loss": 0.81038451, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.83141422, + "num_input_tokens_seen": 294183885, + "step": 13637, + "time_per_iteration": 2.5411343574523926 + }, + { + "auxiliary_loss_clip": 0.01093053, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.03653443, + "balance_loss_mlp": 1.01873136, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 2.3438146275047993, + "language_loss": 0.71505821, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.73629451, + "num_input_tokens_seen": 294200150, + "step": 13638, + "time_per_iteration": 3.957608699798584 + }, + { + "auxiliary_loss_clip": 0.01058661, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.03301513, + "balance_loss_mlp": 1.02053261, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 2.072550967816083, + "language_loss": 0.79594857, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.81688303, + "num_input_tokens_seen": 294220385, + "step": 13639, + "time_per_iteration": 2.6242294311523438 + }, + { + "auxiliary_loss_clip": 0.01058942, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.03252077, + "balance_loss_mlp": 1.01811492, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 1.7876894018643867, + "language_loss": 0.78850585, + "learning_rate": 3.299682336022589e-07, + "loss": 0.80939651, + "num_input_tokens_seen": 294239355, + "step": 13640, + "time_per_iteration": 2.5684409141540527 + }, + { + "auxiliary_loss_clip": 0.0107034, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.03231859, + "balance_loss_mlp": 1.02021098, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 1.9513420401955777, + "language_loss": 0.63180339, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65283614, + "num_input_tokens_seen": 294259395, + "step": 13641, + "time_per_iteration": 2.6585452556610107 + }, + { + "auxiliary_loss_clip": 0.01050604, + "auxiliary_loss_mlp": 0.01026391, + "balance_loss_clip": 1.0349462, + "balance_loss_mlp": 1.01391602, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 1.8706528312155397, + "language_loss": 0.73666883, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75743878, + "num_input_tokens_seen": 294277365, + "step": 13642, + "time_per_iteration": 2.592912197113037 + }, + { + "auxiliary_loss_clip": 0.01079795, + "auxiliary_loss_mlp": 0.01029752, + "balance_loss_clip": 1.03697014, + "balance_loss_mlp": 1.01834989, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 1.53471131852201, + "language_loss": 0.7009362, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72203165, + "num_input_tokens_seen": 294297555, + "step": 13643, + "time_per_iteration": 2.5977113246917725 + }, + { + "auxiliary_loss_clip": 0.01092534, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.03551948, + "balance_loss_mlp": 1.02079034, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 1.8633377340604833, + "language_loss": 0.65547168, + "learning_rate": 3.291115727880256e-07, + "loss": 0.67672122, + "num_input_tokens_seen": 294317600, + "step": 13644, + "time_per_iteration": 3.9514551162719727 + }, + { + "auxiliary_loss_clip": 0.01073664, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.03498697, + "balance_loss_mlp": 1.02239954, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 1.3976702383308246, + "language_loss": 0.70830333, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.72938359, + "num_input_tokens_seen": 294340215, + "step": 13645, + "time_per_iteration": 2.575320243835449 + }, + { + "auxiliary_loss_clip": 0.01078416, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.03438485, + "balance_loss_mlp": 1.01761174, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 1.8967401147659033, + "language_loss": 0.71471405, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73579168, + "num_input_tokens_seen": 294358590, + "step": 13646, + "time_per_iteration": 2.5499613285064697 + }, + { + "auxiliary_loss_clip": 0.01086156, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.03536224, + "balance_loss_mlp": 1.02135706, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 2.2342565405831243, + "language_loss": 0.78389859, + "learning_rate": 3.284697424316132e-07, + "loss": 0.8050971, + "num_input_tokens_seen": 294375825, + "step": 13647, + "time_per_iteration": 2.503262519836426 + }, + { + "auxiliary_loss_clip": 0.01101734, + "auxiliary_loss_mlp": 0.01031698, + "balance_loss_clip": 1.03617597, + "balance_loss_mlp": 1.01999116, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 1.2986811431378495, + "language_loss": 0.67869347, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.70002782, + "num_input_tokens_seen": 294398500, + "step": 13648, + "time_per_iteration": 2.5273921489715576 + }, + { + "auxiliary_loss_clip": 0.01078492, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.03319144, + "balance_loss_mlp": 1.01599288, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 1.7185618064278274, + "language_loss": 0.80244482, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82351232, + "num_input_tokens_seen": 294418840, + "step": 13649, + "time_per_iteration": 2.5790982246398926 + }, + { + "auxiliary_loss_clip": 0.01096329, + "auxiliary_loss_mlp": 0.01032914, + "balance_loss_clip": 1.0376687, + "balance_loss_mlp": 1.01972342, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 1.6657161046104314, + "language_loss": 0.68771458, + "learning_rate": 3.278284825365396e-07, + "loss": 0.70900702, + "num_input_tokens_seen": 294438215, + "step": 13650, + "time_per_iteration": 2.512507200241089 + }, + { + "auxiliary_loss_clip": 0.01087127, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.03741634, + "balance_loss_mlp": 1.02141809, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 2.269813050640077, + "language_loss": 0.61126071, + "learning_rate": 3.276148560452001e-07, + "loss": 0.632478, + "num_input_tokens_seen": 294455260, + "step": 13651, + "time_per_iteration": 2.4748425483703613 + }, + { + "auxiliary_loss_clip": 0.01068374, + "auxiliary_loss_mlp": 0.00788703, + "balance_loss_clip": 1.03637385, + "balance_loss_mlp": 1.01188755, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 2.5836343012278626, + "language_loss": 0.71848238, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.73705316, + "num_input_tokens_seen": 294473205, + "step": 13652, + "time_per_iteration": 2.5552477836608887 + }, + { + "auxiliary_loss_clip": 0.01078249, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.034899, + "balance_loss_mlp": 1.01794577, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 1.8181644722760613, + "language_loss": 0.72845685, + "learning_rate": 3.271877933216558e-07, + "loss": 0.74952364, + "num_input_tokens_seen": 294490645, + "step": 13653, + "time_per_iteration": 2.4920341968536377 + }, + { + "auxiliary_loss_clip": 0.01068152, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.03554118, + "balance_loss_mlp": 1.0202868, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 2.2113260100534338, + "language_loss": 0.62941128, + "learning_rate": 3.269743571056451e-07, + "loss": 0.65043008, + "num_input_tokens_seen": 294513500, + "step": 13654, + "time_per_iteration": 2.7248647212982178 + }, + { + "auxiliary_loss_clip": 0.01076452, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.03236246, + "balance_loss_mlp": 1.01723218, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 1.4513552023399998, + "language_loss": 0.69906998, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72012818, + "num_input_tokens_seen": 294535710, + "step": 13655, + "time_per_iteration": 2.59395170211792 + }, + { + "auxiliary_loss_clip": 0.01080275, + "auxiliary_loss_mlp": 0.01034668, + "balance_loss_clip": 1.03584003, + "balance_loss_mlp": 1.02259779, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 4.588960162398, + "language_loss": 0.81951815, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84066761, + "num_input_tokens_seen": 294554055, + "step": 13656, + "time_per_iteration": 2.521470308303833 + }, + { + "auxiliary_loss_clip": 0.01076554, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.03485394, + "balance_loss_mlp": 1.01975489, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 2.1739220150291065, + "language_loss": 0.74113584, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.76221794, + "num_input_tokens_seen": 294570390, + "step": 13657, + "time_per_iteration": 2.501217842102051 + }, + { + "auxiliary_loss_clip": 0.01072262, + "auxiliary_loss_mlp": 0.01034336, + "balance_loss_clip": 1.03376555, + "balance_loss_mlp": 1.0221591, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 1.7048304578190787, + "language_loss": 0.55807281, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.57913876, + "num_input_tokens_seen": 294593050, + "step": 13658, + "time_per_iteration": 2.608198404312134 + }, + { + "auxiliary_loss_clip": 0.01050198, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.03344584, + "balance_loss_mlp": 1.01796949, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 2.0928158106623447, + "language_loss": 0.78947854, + "learning_rate": 3.259081278068805e-07, + "loss": 0.81028163, + "num_input_tokens_seen": 294608550, + "step": 13659, + "time_per_iteration": 2.590679883956909 + }, + { + "auxiliary_loss_clip": 0.01087505, + "auxiliary_loss_mlp": 0.01024892, + "balance_loss_clip": 1.03294051, + "balance_loss_mlp": 1.01449049, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 1.5781714304596575, + "language_loss": 0.60070872, + "learning_rate": 3.256950723599887e-07, + "loss": 0.62183273, + "num_input_tokens_seen": 294630380, + "step": 13660, + "time_per_iteration": 2.6639597415924072 + }, + { + "auxiliary_loss_clip": 0.01092637, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.0353483, + "balance_loss_mlp": 1.02031183, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 2.554792591164921, + "language_loss": 0.73019308, + "learning_rate": 3.254820804029075e-07, + "loss": 0.75145614, + "num_input_tokens_seen": 294648655, + "step": 13661, + "time_per_iteration": 2.4931888580322266 + }, + { + "auxiliary_loss_clip": 0.01094715, + "auxiliary_loss_mlp": 0.0103066, + "balance_loss_clip": 1.03349352, + "balance_loss_mlp": 1.01852453, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 2.9717399954792474, + "language_loss": 0.7487275, + "learning_rate": 3.252691519437143e-07, + "loss": 0.76998127, + "num_input_tokens_seen": 294666915, + "step": 13662, + "time_per_iteration": 2.4629268646240234 + }, + { + "auxiliary_loss_clip": 0.01030024, + "auxiliary_loss_mlp": 0.01003392, + "balance_loss_clip": 1.00718331, + "balance_loss_mlp": 1.00215781, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.7649024242008559, + "language_loss": 0.5404439, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56077802, + "num_input_tokens_seen": 294731545, + "step": 13663, + "time_per_iteration": 3.1974599361419678 + }, + { + "auxiliary_loss_clip": 0.01059223, + "auxiliary_loss_mlp": 0.01035769, + "balance_loss_clip": 1.03294849, + "balance_loss_mlp": 1.02291214, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 2.0237780159437397, + "language_loss": 0.65492028, + "learning_rate": 3.248434855512838e-07, + "loss": 0.67587018, + "num_input_tokens_seen": 294748745, + "step": 13664, + "time_per_iteration": 2.5572309494018555 + }, + { + "auxiliary_loss_clip": 0.01080509, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.03517604, + "balance_loss_mlp": 1.01896787, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 1.4339926563262932, + "language_loss": 0.74935794, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77046204, + "num_input_tokens_seen": 294768955, + "step": 13665, + "time_per_iteration": 2.582526445388794 + }, + { + "auxiliary_loss_clip": 0.01089758, + "auxiliary_loss_mlp": 0.00785812, + "balance_loss_clip": 1.03671193, + "balance_loss_mlp": 1.01436138, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 2.207159008959412, + "language_loss": 0.65348387, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.67223954, + "num_input_tokens_seen": 294789250, + "step": 13666, + "time_per_iteration": 2.6086766719818115 + }, + { + "auxiliary_loss_clip": 0.01055472, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.03491378, + "balance_loss_mlp": 1.01772332, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 1.680661279867439, + "language_loss": 0.77221292, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.7930572, + "num_input_tokens_seen": 294809760, + "step": 13667, + "time_per_iteration": 2.637730836868286 + }, + { + "auxiliary_loss_clip": 0.01072407, + "auxiliary_loss_mlp": 0.01034544, + "balance_loss_clip": 1.03586555, + "balance_loss_mlp": 1.02197945, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 1.8985613780444102, + "language_loss": 0.77127242, + "learning_rate": 3.239929150961773e-07, + "loss": 0.79234195, + "num_input_tokens_seen": 294826495, + "step": 13668, + "time_per_iteration": 2.5103647708892822 + }, + { + "auxiliary_loss_clip": 0.01057666, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.03337479, + "balance_loss_mlp": 1.01808226, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 2.0650135392651205, + "language_loss": 0.73470962, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.75558209, + "num_input_tokens_seen": 294845370, + "step": 13669, + "time_per_iteration": 2.6220126152038574 + }, + { + "auxiliary_loss_clip": 0.01092746, + "auxiliary_loss_mlp": 0.01024615, + "balance_loss_clip": 1.03550231, + "balance_loss_mlp": 1.01335621, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 1.6693339922808688, + "language_loss": 0.78677434, + "learning_rate": 3.235680111625161e-07, + "loss": 0.80794787, + "num_input_tokens_seen": 294863740, + "step": 13670, + "time_per_iteration": 2.467154026031494 + }, + { + "auxiliary_loss_clip": 0.01097959, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.03723431, + "balance_loss_mlp": 1.01961184, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 1.802106749376981, + "language_loss": 0.7481057, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.76941049, + "num_input_tokens_seen": 294882815, + "step": 13671, + "time_per_iteration": 2.5314815044403076 + }, + { + "auxiliary_loss_clip": 0.0109922, + "auxiliary_loss_mlp": 0.0103018, + "balance_loss_clip": 1.03662634, + "balance_loss_mlp": 1.01744294, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 1.7123744114769295, + "language_loss": 0.7639032, + "learning_rate": 3.23143361510728e-07, + "loss": 0.7851972, + "num_input_tokens_seen": 294901985, + "step": 13672, + "time_per_iteration": 3.9099624156951904 + }, + { + "auxiliary_loss_clip": 0.01054271, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.03330731, + "balance_loss_mlp": 1.01970768, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 2.810477976613284, + "language_loss": 0.74700928, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.76787937, + "num_input_tokens_seen": 294919705, + "step": 13673, + "time_per_iteration": 2.548940658569336 + }, + { + "auxiliary_loss_clip": 0.01084029, + "auxiliary_loss_mlp": 0.01033828, + "balance_loss_clip": 1.03578603, + "balance_loss_mlp": 1.02157962, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 1.6044431154163046, + "language_loss": 0.79409277, + "learning_rate": 3.227189662052254e-07, + "loss": 0.81527132, + "num_input_tokens_seen": 294939900, + "step": 13674, + "time_per_iteration": 3.907717227935791 + }, + { + "auxiliary_loss_clip": 0.01079418, + "auxiliary_loss_mlp": 0.01034711, + "balance_loss_clip": 1.03271472, + "balance_loss_mlp": 1.0226891, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 1.8118395529851583, + "language_loss": 0.70332783, + "learning_rate": 3.225068639524484e-07, + "loss": 0.72446913, + "num_input_tokens_seen": 294959110, + "step": 13675, + "time_per_iteration": 2.4971230030059814 + }, + { + "auxiliary_loss_clip": 0.0108582, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.03312349, + "balance_loss_mlp": 1.02460146, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 1.5667996081462106, + "language_loss": 0.74571884, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76694477, + "num_input_tokens_seen": 294978660, + "step": 13676, + "time_per_iteration": 2.4892139434814453 + }, + { + "auxiliary_loss_clip": 0.01076757, + "auxiliary_loss_mlp": 0.0103037, + "balance_loss_clip": 1.03434634, + "balance_loss_mlp": 1.01877689, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 1.7154149904632143, + "language_loss": 0.80215526, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.82322657, + "num_input_tokens_seen": 294998075, + "step": 13677, + "time_per_iteration": 4.0228049755096436 + }, + { + "auxiliary_loss_clip": 0.01089259, + "auxiliary_loss_mlp": 0.01034711, + "balance_loss_clip": 1.03358412, + "balance_loss_mlp": 1.02207518, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 1.7338524056658107, + "language_loss": 0.69760221, + "learning_rate": 3.218709388905245e-07, + "loss": 0.71884191, + "num_input_tokens_seen": 295015950, + "step": 13678, + "time_per_iteration": 2.4472744464874268 + }, + { + "auxiliary_loss_clip": 0.01102003, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.0340749, + "balance_loss_mlp": 1.02105546, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 1.5677654115587853, + "language_loss": 0.71698064, + "learning_rate": 3.216590911288133e-07, + "loss": 0.73833418, + "num_input_tokens_seen": 295036800, + "step": 13679, + "time_per_iteration": 2.5593512058258057 + }, + { + "auxiliary_loss_clip": 0.01074924, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.03245723, + "balance_loss_mlp": 1.01858592, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 1.9068074787432832, + "language_loss": 0.70029783, + "learning_rate": 3.214473070099564e-07, + "loss": 0.72136462, + "num_input_tokens_seen": 295055300, + "step": 13680, + "time_per_iteration": 2.492856502532959 + }, + { + "auxiliary_loss_clip": 0.01072729, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.03610277, + "balance_loss_mlp": 1.02216935, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 2.187967599563728, + "language_loss": 0.59917259, + "learning_rate": 3.21235586541986e-07, + "loss": 0.62023288, + "num_input_tokens_seen": 295076420, + "step": 13681, + "time_per_iteration": 2.6170737743377686 + }, + { + "auxiliary_loss_clip": 0.01080055, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.0335288, + "balance_loss_mlp": 1.02433515, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 1.58199692836229, + "language_loss": 0.697366, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.71853781, + "num_input_tokens_seen": 295100540, + "step": 13682, + "time_per_iteration": 4.143878936767578 + }, + { + "auxiliary_loss_clip": 0.01104473, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.03428376, + "balance_loss_mlp": 1.01860011, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 1.9859477843324669, + "language_loss": 0.78903198, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81039357, + "num_input_tokens_seen": 295120180, + "step": 13683, + "time_per_iteration": 2.4602532386779785 + }, + { + "auxiliary_loss_clip": 0.01099442, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.03440166, + "balance_loss_mlp": 1.01994634, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 1.9244657083526502, + "language_loss": 0.86636102, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88766158, + "num_input_tokens_seen": 295138530, + "step": 13684, + "time_per_iteration": 2.485523223876953 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.0102894, + "balance_loss_clip": 1.0339402, + "balance_loss_mlp": 1.0176146, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 1.4509080662168439, + "language_loss": 0.79602337, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.8173086, + "num_input_tokens_seen": 295160260, + "step": 13685, + "time_per_iteration": 2.4974024295806885 + }, + { + "auxiliary_loss_clip": 0.01066236, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.03304815, + "balance_loss_mlp": 1.01768613, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 2.3427416594496715, + "language_loss": 0.69118285, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.71214104, + "num_input_tokens_seen": 295177055, + "step": 13686, + "time_per_iteration": 2.5387625694274902 + }, + { + "auxiliary_loss_clip": 0.01070681, + "auxiliary_loss_mlp": 0.01029449, + "balance_loss_clip": 1.03311229, + "balance_loss_mlp": 1.0176115, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 1.868689744868676, + "language_loss": 0.7795707, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80057198, + "num_input_tokens_seen": 295193870, + "step": 13687, + "time_per_iteration": 2.514427900314331 + }, + { + "auxiliary_loss_clip": 0.01092386, + "auxiliary_loss_mlp": 0.01029468, + "balance_loss_clip": 1.03418863, + "balance_loss_mlp": 1.01702273, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 1.9423968478733573, + "language_loss": 0.72330236, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.7445209, + "num_input_tokens_seen": 295211040, + "step": 13688, + "time_per_iteration": 2.482572317123413 + }, + { + "auxiliary_loss_clip": 0.01103779, + "auxiliary_loss_mlp": 0.00782343, + "balance_loss_clip": 1.03535485, + "balance_loss_mlp": 1.00832474, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 1.609548191771626, + "language_loss": 0.73445487, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.75331616, + "num_input_tokens_seen": 295231300, + "step": 13689, + "time_per_iteration": 2.4818973541259766 + }, + { + "auxiliary_loss_clip": 0.01094857, + "auxiliary_loss_mlp": 0.01031768, + "balance_loss_clip": 1.03474307, + "balance_loss_mlp": 1.01971042, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 3.100656229255575, + "language_loss": 0.69278246, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.71404874, + "num_input_tokens_seen": 295251045, + "step": 13690, + "time_per_iteration": 2.4873337745666504 + }, + { + "auxiliary_loss_clip": 0.01058357, + "auxiliary_loss_mlp": 0.01037232, + "balance_loss_clip": 1.03189528, + "balance_loss_mlp": 1.02351689, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 8.077849253777021, + "language_loss": 0.8530916, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87404752, + "num_input_tokens_seen": 295270225, + "step": 13691, + "time_per_iteration": 2.5592188835144043 + }, + { + "auxiliary_loss_clip": 0.01096161, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.03564537, + "balance_loss_mlp": 1.02096891, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 1.8553114097976604, + "language_loss": 0.7695936, + "learning_rate": 3.189108646472252e-07, + "loss": 0.79088306, + "num_input_tokens_seen": 295288950, + "step": 13692, + "time_per_iteration": 2.50541090965271 + }, + { + "auxiliary_loss_clip": 0.01091517, + "auxiliary_loss_mlp": 0.01025823, + "balance_loss_clip": 1.03505886, + "balance_loss_mlp": 1.01414609, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.5158864203035978, + "language_loss": 0.7168805, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73805386, + "num_input_tokens_seen": 295309405, + "step": 13693, + "time_per_iteration": 2.4617600440979004 + }, + { + "auxiliary_loss_clip": 0.01066569, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.03268075, + "balance_loss_mlp": 1.02244401, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 1.3225183433813361, + "language_loss": 0.83800626, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.85900664, + "num_input_tokens_seen": 295331115, + "step": 13694, + "time_per_iteration": 2.5937135219573975 + }, + { + "auxiliary_loss_clip": 0.01041593, + "auxiliary_loss_mlp": 0.01027318, + "balance_loss_clip": 1.03370857, + "balance_loss_mlp": 1.01516485, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 1.7202923222517448, + "language_loss": 0.77389884, + "learning_rate": 3.182781878250118e-07, + "loss": 0.79458797, + "num_input_tokens_seen": 295350495, + "step": 13695, + "time_per_iteration": 2.612786293029785 + }, + { + "auxiliary_loss_clip": 0.01082968, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.03539205, + "balance_loss_mlp": 1.02094924, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 2.3466821973900442, + "language_loss": 0.8065179, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.82767355, + "num_input_tokens_seen": 295368225, + "step": 13696, + "time_per_iteration": 2.5230674743652344 + }, + { + "auxiliary_loss_clip": 0.01021469, + "auxiliary_loss_mlp": 0.00999699, + "balance_loss_clip": 1.00818098, + "balance_loss_mlp": 0.99845356, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.7374254051037429, + "language_loss": 0.63926387, + "learning_rate": 3.178567221188393e-07, + "loss": 0.65947556, + "num_input_tokens_seen": 295430035, + "step": 13697, + "time_per_iteration": 3.153226375579834 + }, + { + "auxiliary_loss_clip": 0.01066504, + "auxiliary_loss_mlp": 0.01026182, + "balance_loss_clip": 1.03372622, + "balance_loss_mlp": 1.01573324, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 1.5927256151393094, + "language_loss": 0.72704273, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.74796963, + "num_input_tokens_seen": 295447765, + "step": 13698, + "time_per_iteration": 2.523817777633667 + }, + { + "auxiliary_loss_clip": 0.01059584, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.03027296, + "balance_loss_mlp": 1.0258038, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 2.093301578173963, + "language_loss": 0.71874332, + "learning_rate": 3.174355115608305e-07, + "loss": 0.7397489, + "num_input_tokens_seen": 295464810, + "step": 13699, + "time_per_iteration": 2.523703098297119 + }, + { + "auxiliary_loss_clip": 0.01080444, + "auxiliary_loss_mlp": 0.01028774, + "balance_loss_clip": 1.03302503, + "balance_loss_mlp": 1.01710892, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 1.978993727333208, + "language_loss": 0.81838828, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.8394804, + "num_input_tokens_seen": 295482605, + "step": 13700, + "time_per_iteration": 2.499953031539917 + }, + { + "auxiliary_loss_clip": 0.01082097, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.03448617, + "balance_loss_mlp": 1.02209175, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 1.6378758960861395, + "language_loss": 0.72911727, + "learning_rate": 3.170145562148763e-07, + "loss": 0.75027412, + "num_input_tokens_seen": 295503780, + "step": 13701, + "time_per_iteration": 2.5373096466064453 + }, + { + "auxiliary_loss_clip": 0.01092889, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.0329392, + "balance_loss_mlp": 1.02123058, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 1.8900752714771187, + "language_loss": 0.68994635, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.71121436, + "num_input_tokens_seen": 295522035, + "step": 13702, + "time_per_iteration": 2.5409021377563477 + }, + { + "auxiliary_loss_clip": 0.01056981, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.03499293, + "balance_loss_mlp": 1.01635909, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 1.615780496698445, + "language_loss": 0.7487765, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.76963091, + "num_input_tokens_seen": 295541190, + "step": 13703, + "time_per_iteration": 2.5793471336364746 + }, + { + "auxiliary_loss_clip": 0.01108196, + "auxiliary_loss_mlp": 0.01038802, + "balance_loss_clip": 1.03539348, + "balance_loss_mlp": 1.02518249, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 1.7831004340503915, + "language_loss": 0.69609576, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.71756566, + "num_input_tokens_seen": 295558860, + "step": 13704, + "time_per_iteration": 2.5044496059417725 + }, + { + "auxiliary_loss_clip": 0.01100895, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.03351855, + "balance_loss_mlp": 1.01824343, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 1.9697894718771298, + "language_loss": 0.63909256, + "learning_rate": 3.161734114144916e-07, + "loss": 0.6603992, + "num_input_tokens_seen": 295578155, + "step": 13705, + "time_per_iteration": 2.4783847332000732 + }, + { + "auxiliary_loss_clip": 0.01106058, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.0354917, + "balance_loss_mlp": 1.01595867, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 1.6315001131099398, + "language_loss": 0.69382554, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71516871, + "num_input_tokens_seen": 295599170, + "step": 13706, + "time_per_iteration": 2.4862122535705566 + }, + { + "auxiliary_loss_clip": 0.0107947, + "auxiliary_loss_mlp": 0.01036227, + "balance_loss_clip": 1.03523088, + "balance_loss_mlp": 1.0229888, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 1.8288316897918095, + "language_loss": 0.69483256, + "learning_rate": 3.157532220876475e-07, + "loss": 0.71598953, + "num_input_tokens_seen": 295617465, + "step": 13707, + "time_per_iteration": 2.483332872390747 + }, + { + "auxiliary_loss_clip": 0.01068883, + "auxiliary_loss_mlp": 0.01039615, + "balance_loss_clip": 1.0328238, + "balance_loss_mlp": 1.02593517, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 1.7328909880435355, + "language_loss": 0.78894472, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81002969, + "num_input_tokens_seen": 295634960, + "step": 13708, + "time_per_iteration": 2.5995748043060303 + }, + { + "auxiliary_loss_clip": 0.01088136, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.0324707, + "balance_loss_mlp": 1.01811671, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 2.349072748548048, + "language_loss": 0.68543988, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.70663404, + "num_input_tokens_seen": 295652725, + "step": 13709, + "time_per_iteration": 2.4535481929779053 + }, + { + "auxiliary_loss_clip": 0.0106199, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.03290117, + "balance_loss_mlp": 1.019032, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 1.7986063604465334, + "language_loss": 0.82310599, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84403223, + "num_input_tokens_seen": 295671195, + "step": 13710, + "time_per_iteration": 2.5865705013275146 + }, + { + "auxiliary_loss_clip": 0.01092754, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.03430986, + "balance_loss_mlp": 1.01969016, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 1.8642906690199172, + "language_loss": 0.78436685, + "learning_rate": 3.149136098993257e-07, + "loss": 0.80561441, + "num_input_tokens_seen": 295689130, + "step": 13711, + "time_per_iteration": 3.9194467067718506 + }, + { + "auxiliary_loss_clip": 0.0107078, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.0339545, + "balance_loss_mlp": 1.01800323, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 1.7782303627751168, + "language_loss": 0.65944815, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.6804595, + "num_input_tokens_seen": 295706385, + "step": 13712, + "time_per_iteration": 2.5072569847106934 + }, + { + "auxiliary_loss_clip": 0.01094069, + "auxiliary_loss_mlp": 0.01029002, + "balance_loss_clip": 1.03536391, + "balance_loss_mlp": 1.0175277, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 1.8237788780923978, + "language_loss": 0.74183524, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76306593, + "num_input_tokens_seen": 295727925, + "step": 13713, + "time_per_iteration": 3.979630708694458 + }, + { + "auxiliary_loss_clip": 0.01091726, + "auxiliary_loss_mlp": 0.01029317, + "balance_loss_clip": 1.03298283, + "balance_loss_mlp": 1.01747346, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 2.1918878791904195, + "language_loss": 0.81352758, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83473796, + "num_input_tokens_seen": 295744420, + "step": 13714, + "time_per_iteration": 2.434802770614624 + }, + { + "auxiliary_loss_clip": 0.01095187, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.03786731, + "balance_loss_mlp": 1.02196693, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 1.7143449370553712, + "language_loss": 0.66366428, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.6849618, + "num_input_tokens_seen": 295765105, + "step": 13715, + "time_per_iteration": 3.9443399906158447 + }, + { + "auxiliary_loss_clip": 0.01074572, + "auxiliary_loss_mlp": 0.01030018, + "balance_loss_clip": 1.03460586, + "balance_loss_mlp": 1.01728106, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 2.0134213377978587, + "language_loss": 0.74945915, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.77050507, + "num_input_tokens_seen": 295784200, + "step": 13716, + "time_per_iteration": 2.568373203277588 + }, + { + "auxiliary_loss_clip": 0.01001942, + "auxiliary_loss_mlp": 0.01002397, + "balance_loss_clip": 1.01367021, + "balance_loss_mlp": 1.00121677, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7202419950721755, + "language_loss": 0.58977371, + "learning_rate": 3.136561087351175e-07, + "loss": 0.60981709, + "num_input_tokens_seen": 295846555, + "step": 13717, + "time_per_iteration": 3.259536027908325 + }, + { + "auxiliary_loss_clip": 0.01090692, + "auxiliary_loss_mlp": 0.00783384, + "balance_loss_clip": 1.03577244, + "balance_loss_mlp": 1.00851965, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 2.013133513603316, + "language_loss": 0.79507363, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.8138144, + "num_input_tokens_seen": 295863425, + "step": 13718, + "time_per_iteration": 2.473606586456299 + }, + { + "auxiliary_loss_clip": 0.01089487, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.03392982, + "balance_loss_mlp": 1.01825237, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 1.648434828225315, + "language_loss": 0.68957287, + "learning_rate": 3.132374531662778e-07, + "loss": 0.71076238, + "num_input_tokens_seen": 295880925, + "step": 13719, + "time_per_iteration": 2.4307472705841064 + }, + { + "auxiliary_loss_clip": 0.01075414, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.03289819, + "balance_loss_mlp": 1.01958036, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 2.3958643974517235, + "language_loss": 0.69568467, + "learning_rate": 3.13028221321197e-07, + "loss": 0.71677899, + "num_input_tokens_seen": 295898205, + "step": 13720, + "time_per_iteration": 2.468945026397705 + }, + { + "auxiliary_loss_clip": 0.01037839, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.03569627, + "balance_loss_mlp": 1.01640284, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 1.6743324292830122, + "language_loss": 0.75658166, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.77724445, + "num_input_tokens_seen": 295918130, + "step": 13721, + "time_per_iteration": 4.2036614418029785 + }, + { + "auxiliary_loss_clip": 0.01053507, + "auxiliary_loss_mlp": 0.01027709, + "balance_loss_clip": 1.03540945, + "balance_loss_mlp": 1.01618183, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 1.8088309522601314, + "language_loss": 0.77921855, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.80003071, + "num_input_tokens_seen": 295937760, + "step": 13722, + "time_per_iteration": 2.8400325775146484 + }, + { + "auxiliary_loss_clip": 0.01101682, + "auxiliary_loss_mlp": 0.01034255, + "balance_loss_clip": 1.03517365, + "balance_loss_mlp": 1.02239323, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 1.8696950077764947, + "language_loss": 0.62869155, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.650051, + "num_input_tokens_seen": 295957585, + "step": 13723, + "time_per_iteration": 2.5200655460357666 + }, + { + "auxiliary_loss_clip": 0.01105754, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.03584373, + "balance_loss_mlp": 1.02116573, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 1.4692067555428319, + "language_loss": 0.74447739, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76586574, + "num_input_tokens_seen": 295977135, + "step": 13724, + "time_per_iteration": 2.4444024562835693 + }, + { + "auxiliary_loss_clip": 0.01071195, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.03405833, + "balance_loss_mlp": 1.02178621, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 1.7510220113255721, + "language_loss": 0.64093155, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.6619854, + "num_input_tokens_seen": 295996265, + "step": 13725, + "time_per_iteration": 2.618880033493042 + }, + { + "auxiliary_loss_clip": 0.01077037, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.03335309, + "balance_loss_mlp": 1.01893699, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 1.6585041995631224, + "language_loss": 0.81898594, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.84007013, + "num_input_tokens_seen": 296014745, + "step": 13726, + "time_per_iteration": 2.5210978984832764 + }, + { + "auxiliary_loss_clip": 0.01085416, + "auxiliary_loss_mlp": 0.01035761, + "balance_loss_clip": 1.03054428, + "balance_loss_mlp": 1.02407885, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 3.4632511965942445, + "language_loss": 0.70374268, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72495449, + "num_input_tokens_seen": 296036960, + "step": 13727, + "time_per_iteration": 2.5621700286865234 + }, + { + "auxiliary_loss_clip": 0.0109617, + "auxiliary_loss_mlp": 0.01031064, + "balance_loss_clip": 1.03564608, + "balance_loss_mlp": 1.01821899, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 1.9043862282464106, + "language_loss": 0.62805068, + "learning_rate": 3.113566701515036e-07, + "loss": 0.64932305, + "num_input_tokens_seen": 296056540, + "step": 13728, + "time_per_iteration": 2.477975606918335 + }, + { + "auxiliary_loss_clip": 0.01088485, + "auxiliary_loss_mlp": 0.01031677, + "balance_loss_clip": 1.03745556, + "balance_loss_mlp": 1.01922524, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 1.722085643201409, + "language_loss": 0.71140647, + "learning_rate": 3.111480143230092e-07, + "loss": 0.73260808, + "num_input_tokens_seen": 296077950, + "step": 13729, + "time_per_iteration": 2.563001871109009 + }, + { + "auxiliary_loss_clip": 0.01013483, + "auxiliary_loss_mlp": 0.01001801, + "balance_loss_clip": 1.00940061, + "balance_loss_mlp": 1.00049531, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 0.8637344057388846, + "language_loss": 0.62780476, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64795756, + "num_input_tokens_seen": 296127060, + "step": 13730, + "time_per_iteration": 2.9335312843322754 + }, + { + "auxiliary_loss_clip": 0.01054594, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.03356457, + "balance_loss_mlp": 1.02323031, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 2.3652174757975866, + "language_loss": 0.63461334, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65551466, + "num_input_tokens_seen": 296147775, + "step": 13731, + "time_per_iteration": 2.761463165283203 + }, + { + "auxiliary_loss_clip": 0.0106972, + "auxiliary_loss_mlp": 0.00787729, + "balance_loss_clip": 1.03322649, + "balance_loss_mlp": 1.01110899, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 2.0806822124928184, + "language_loss": 0.69765472, + "learning_rate": 3.105224311177812e-07, + "loss": 0.7162292, + "num_input_tokens_seen": 296163560, + "step": 13732, + "time_per_iteration": 2.4991185665130615 + }, + { + "auxiliary_loss_clip": 0.01094547, + "auxiliary_loss_mlp": 0.01039712, + "balance_loss_clip": 1.03424692, + "balance_loss_mlp": 1.02729595, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 2.493802120757609, + "language_loss": 0.70620871, + "learning_rate": 3.103140315024817e-07, + "loss": 0.72755134, + "num_input_tokens_seen": 296178730, + "step": 13733, + "time_per_iteration": 2.4632673263549805 + }, + { + "auxiliary_loss_clip": 0.01101348, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.03382945, + "balance_loss_mlp": 1.01761103, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 1.5210412149745005, + "language_loss": 0.82556534, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84687561, + "num_input_tokens_seen": 296200175, + "step": 13734, + "time_per_iteration": 2.4904592037200928 + }, + { + "auxiliary_loss_clip": 0.01072549, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.03144789, + "balance_loss_mlp": 1.02042294, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 1.8850320151325466, + "language_loss": 0.82668889, + "learning_rate": 3.098974244989676e-07, + "loss": 0.84774166, + "num_input_tokens_seen": 296219305, + "step": 13735, + "time_per_iteration": 2.5090553760528564 + }, + { + "auxiliary_loss_clip": 0.0109691, + "auxiliary_loss_mlp": 0.01029121, + "balance_loss_clip": 1.03692651, + "balance_loss_mlp": 1.01808822, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 1.8286896147125027, + "language_loss": 0.70855117, + "learning_rate": 3.096892171265497e-07, + "loss": 0.72981143, + "num_input_tokens_seen": 296236945, + "step": 13736, + "time_per_iteration": 2.459522247314453 + }, + { + "auxiliary_loss_clip": 0.01022068, + "auxiliary_loss_mlp": 0.0100264, + "balance_loss_clip": 1.00921702, + "balance_loss_mlp": 1.00138271, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.8498606825105989, + "language_loss": 0.67961681, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.69986391, + "num_input_tokens_seen": 296294685, + "step": 13737, + "time_per_iteration": 3.0983591079711914 + }, + { + "auxiliary_loss_clip": 0.0108054, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.03396869, + "balance_loss_mlp": 1.02228808, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 1.782762228498345, + "language_loss": 0.69691104, + "learning_rate": 3.0927299467987e-07, + "loss": 0.7180562, + "num_input_tokens_seen": 296314790, + "step": 13738, + "time_per_iteration": 2.529181957244873 + }, + { + "auxiliary_loss_clip": 0.01087054, + "auxiliary_loss_mlp": 0.01031581, + "balance_loss_clip": 1.03654051, + "balance_loss_mlp": 1.01796687, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 2.4324392874212193, + "language_loss": 0.62767243, + "learning_rate": 3.090649796213911e-07, + "loss": 0.64885885, + "num_input_tokens_seen": 296335355, + "step": 13739, + "time_per_iteration": 2.6497867107391357 + }, + { + "auxiliary_loss_clip": 0.01012283, + "auxiliary_loss_mlp": 0.01001245, + "balance_loss_clip": 1.00888562, + "balance_loss_mlp": 1.00006521, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.8241126698395173, + "language_loss": 0.59363127, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61376655, + "num_input_tokens_seen": 296399885, + "step": 13740, + "time_per_iteration": 3.1825897693634033 + }, + { + "auxiliary_loss_clip": 0.01108456, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.03647816, + "balance_loss_mlp": 1.02022004, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 1.8603904323101035, + "language_loss": 0.75504613, + "learning_rate": 3.086491418735959e-07, + "loss": 0.77646458, + "num_input_tokens_seen": 296417660, + "step": 13741, + "time_per_iteration": 2.4529848098754883 + }, + { + "auxiliary_loss_clip": 0.01091534, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.0343591, + "balance_loss_mlp": 1.02131224, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 1.8763066507107642, + "language_loss": 0.62486029, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.64610553, + "num_input_tokens_seen": 296438255, + "step": 13742, + "time_per_iteration": 2.620152711868286 + }, + { + "auxiliary_loss_clip": 0.01061697, + "auxiliary_loss_mlp": 0.01036615, + "balance_loss_clip": 1.03373218, + "balance_loss_mlp": 1.02297115, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 2.772873236102213, + "language_loss": 0.66005862, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.68104172, + "num_input_tokens_seen": 296454485, + "step": 13743, + "time_per_iteration": 2.5487794876098633 + }, + { + "auxiliary_loss_clip": 0.01083939, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.0339421, + "balance_loss_mlp": 1.02177501, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 1.9548101938603908, + "language_loss": 0.67138809, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.69256794, + "num_input_tokens_seen": 296473740, + "step": 13744, + "time_per_iteration": 2.598337411880493 + }, + { + "auxiliary_loss_clip": 0.01072062, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.03692698, + "balance_loss_mlp": 1.0175612, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 1.7775720241861561, + "language_loss": 0.75306547, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77407295, + "num_input_tokens_seen": 296493355, + "step": 13745, + "time_per_iteration": 2.5919029712677 + }, + { + "auxiliary_loss_clip": 0.01073594, + "auxiliary_loss_mlp": 0.00784532, + "balance_loss_clip": 1.03445387, + "balance_loss_mlp": 1.00914145, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 1.8691808651503856, + "language_loss": 0.78841329, + "learning_rate": 3.076106700253709e-07, + "loss": 0.80699456, + "num_input_tokens_seen": 296510520, + "step": 13746, + "time_per_iteration": 2.535961627960205 + }, + { + "auxiliary_loss_clip": 0.01099254, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.03758848, + "balance_loss_mlp": 1.02335191, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 1.9835930709111191, + "language_loss": 0.67544699, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.69680113, + "num_input_tokens_seen": 296528265, + "step": 13747, + "time_per_iteration": 2.4539108276367188 + }, + { + "auxiliary_loss_clip": 0.01090791, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.0341264, + "balance_loss_mlp": 1.02752149, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 1.962960738608889, + "language_loss": 0.75489652, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.77622104, + "num_input_tokens_seen": 296547810, + "step": 13748, + "time_per_iteration": 2.5020344257354736 + }, + { + "auxiliary_loss_clip": 0.01065613, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.03699827, + "balance_loss_mlp": 1.02113724, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 1.7292026949672679, + "language_loss": 0.63622987, + "learning_rate": 3.069883569603102e-07, + "loss": 0.65720177, + "num_input_tokens_seen": 296565940, + "step": 13749, + "time_per_iteration": 2.530625820159912 + }, + { + "auxiliary_loss_clip": 0.01078913, + "auxiliary_loss_mlp": 0.01026547, + "balance_loss_clip": 1.03227162, + "balance_loss_mlp": 1.01492405, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 1.8517908756232848, + "language_loss": 0.73478484, + "learning_rate": 3.067810476598132e-07, + "loss": 0.75583947, + "num_input_tokens_seen": 296585090, + "step": 13750, + "time_per_iteration": 4.19384503364563 + }, + { + "auxiliary_loss_clip": 0.01094817, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.03591979, + "balance_loss_mlp": 1.02435076, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 1.8908309914416515, + "language_loss": 0.66124713, + "learning_rate": 3.065738025663496e-07, + "loss": 0.68256301, + "num_input_tokens_seen": 296604950, + "step": 13751, + "time_per_iteration": 3.8950278759002686 + }, + { + "auxiliary_loss_clip": 0.01076547, + "auxiliary_loss_mlp": 0.01028408, + "balance_loss_clip": 1.03204966, + "balance_loss_mlp": 1.01760149, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 1.434714228303696, + "language_loss": 0.60291088, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.62396038, + "num_input_tokens_seen": 296627780, + "step": 13752, + "time_per_iteration": 2.67452335357666 + }, + { + "auxiliary_loss_clip": 0.01021134, + "auxiliary_loss_mlp": 0.01002853, + "balance_loss_clip": 1.00888729, + "balance_loss_mlp": 1.00165546, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 0.7697279688427325, + "language_loss": 0.57408082, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59432065, + "num_input_tokens_seen": 296683850, + "step": 13753, + "time_per_iteration": 3.139103889465332 + }, + { + "auxiliary_loss_clip": 0.00993979, + "auxiliary_loss_mlp": 0.00763476, + "balance_loss_clip": 1.01328897, + "balance_loss_mlp": 1.00198162, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.6991419744479438, + "language_loss": 0.55042869, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.56800324, + "num_input_tokens_seen": 296741420, + "step": 13754, + "time_per_iteration": 4.670770645141602 + }, + { + "auxiliary_loss_clip": 0.01060498, + "auxiliary_loss_mlp": 0.01036217, + "balance_loss_clip": 1.03172314, + "balance_loss_mlp": 1.02561903, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 1.902709589106428, + "language_loss": 0.6907562, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.71172333, + "num_input_tokens_seen": 296759620, + "step": 13755, + "time_per_iteration": 2.783496141433716 + }, + { + "auxiliary_loss_clip": 0.01058689, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.03397417, + "balance_loss_mlp": 1.01745272, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 1.930453879655492, + "language_loss": 0.69983971, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.7207104, + "num_input_tokens_seen": 296777275, + "step": 13756, + "time_per_iteration": 2.563490867614746 + }, + { + "auxiliary_loss_clip": 0.01094853, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.03713751, + "balance_loss_mlp": 1.01839161, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 1.8496955098919012, + "language_loss": 0.72195375, + "learning_rate": 3.053316807931623e-07, + "loss": 0.7432037, + "num_input_tokens_seen": 296796655, + "step": 13757, + "time_per_iteration": 2.5056746006011963 + }, + { + "auxiliary_loss_clip": 0.01097083, + "auxiliary_loss_mlp": 0.01031585, + "balance_loss_clip": 1.03586292, + "balance_loss_mlp": 1.01759017, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 2.408019644756395, + "language_loss": 0.69047558, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.71176219, + "num_input_tokens_seen": 296813705, + "step": 13758, + "time_per_iteration": 2.453334093093872 + }, + { + "auxiliary_loss_clip": 0.01079008, + "auxiliary_loss_mlp": 0.01029853, + "balance_loss_clip": 1.03284168, + "balance_loss_mlp": 1.01833153, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 1.5399483567234775, + "language_loss": 0.69934583, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.72043443, + "num_input_tokens_seen": 296833985, + "step": 13759, + "time_per_iteration": 3.9600484371185303 + }, + { + "auxiliary_loss_clip": 0.01081206, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.03398252, + "balance_loss_mlp": 1.01810598, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 1.698754179967429, + "language_loss": 0.70726383, + "learning_rate": 3.047114873375161e-07, + "loss": 0.72837877, + "num_input_tokens_seen": 296850150, + "step": 13760, + "time_per_iteration": 2.4885857105255127 + }, + { + "auxiliary_loss_clip": 0.01062052, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.03340101, + "balance_loss_mlp": 1.02166164, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 1.6967136424717588, + "language_loss": 0.77847427, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79943001, + "num_input_tokens_seen": 296869585, + "step": 13761, + "time_per_iteration": 2.567659854888916 + }, + { + "auxiliary_loss_clip": 0.01068397, + "auxiliary_loss_mlp": 0.01030687, + "balance_loss_clip": 1.03444731, + "balance_loss_mlp": 1.02010703, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 1.737762983649448, + "language_loss": 0.7002281, + "learning_rate": 3.042983464482387e-07, + "loss": 0.72121894, + "num_input_tokens_seen": 296887710, + "step": 13762, + "time_per_iteration": 2.5412840843200684 + }, + { + "auxiliary_loss_clip": 0.01049564, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.03199434, + "balance_loss_mlp": 1.01861298, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 1.7867537035933192, + "language_loss": 0.69872546, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.71952486, + "num_input_tokens_seen": 296906265, + "step": 13763, + "time_per_iteration": 2.599407434463501 + }, + { + "auxiliary_loss_clip": 0.01013088, + "auxiliary_loss_mlp": 0.01000778, + "balance_loss_clip": 1.01533723, + "balance_loss_mlp": 0.9994961, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 0.8395302397199025, + "language_loss": 0.65137666, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67151541, + "num_input_tokens_seen": 296971290, + "step": 13764, + "time_per_iteration": 3.254702091217041 + }, + { + "auxiliary_loss_clip": 0.01094302, + "auxiliary_loss_mlp": 0.01032949, + "balance_loss_clip": 1.03555942, + "balance_loss_mlp": 1.02062845, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 2.1923458216558034, + "language_loss": 0.7760756, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.79734808, + "num_input_tokens_seen": 296989060, + "step": 13765, + "time_per_iteration": 2.4713244438171387 + }, + { + "auxiliary_loss_clip": 0.01050868, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.03419542, + "balance_loss_mlp": 1.01722991, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 1.5064561121929607, + "language_loss": 0.62473631, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64554381, + "num_input_tokens_seen": 297011300, + "step": 13766, + "time_per_iteration": 2.672788143157959 + }, + { + "auxiliary_loss_clip": 0.01069769, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.0358423, + "balance_loss_mlp": 1.018543, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 1.6333018619521407, + "language_loss": 0.82568395, + "learning_rate": 3.03266619632609e-07, + "loss": 0.84668982, + "num_input_tokens_seen": 297030350, + "step": 13767, + "time_per_iteration": 2.557683229446411 + }, + { + "auxiliary_loss_clip": 0.01078804, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.03945065, + "balance_loss_mlp": 1.0163641, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 1.5494839278570682, + "language_loss": 0.69281024, + "learning_rate": 3.030604672590964e-07, + "loss": 0.7138806, + "num_input_tokens_seen": 297049710, + "step": 13768, + "time_per_iteration": 2.5780515670776367 + }, + { + "auxiliary_loss_clip": 0.01045041, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.03281331, + "balance_loss_mlp": 1.02141929, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 1.8816555559976995, + "language_loss": 0.74724287, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76803583, + "num_input_tokens_seen": 297070510, + "step": 13769, + "time_per_iteration": 2.6421194076538086 + }, + { + "auxiliary_loss_clip": 0.01080093, + "auxiliary_loss_mlp": 0.01029957, + "balance_loss_clip": 1.03425241, + "balance_loss_mlp": 1.01829815, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 1.7921071783067353, + "language_loss": 0.74576354, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76686406, + "num_input_tokens_seen": 297092585, + "step": 13770, + "time_per_iteration": 2.6709301471710205 + }, + { + "auxiliary_loss_clip": 0.01074229, + "auxiliary_loss_mlp": 0.01032244, + "balance_loss_clip": 1.03519368, + "balance_loss_mlp": 1.01975131, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 1.6819123251030312, + "language_loss": 0.75783736, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.77890205, + "num_input_tokens_seen": 297110055, + "step": 13771, + "time_per_iteration": 2.5439581871032715 + }, + { + "auxiliary_loss_clip": 0.01103143, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.03453314, + "balance_loss_mlp": 1.01994419, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 1.590593537573226, + "language_loss": 0.72566205, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.74700952, + "num_input_tokens_seen": 297132170, + "step": 13772, + "time_per_iteration": 2.582756996154785 + }, + { + "auxiliary_loss_clip": 0.01081964, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.03480983, + "balance_loss_mlp": 1.0183382, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 2.501083071461501, + "language_loss": 0.74649966, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.76762795, + "num_input_tokens_seen": 297149515, + "step": 13773, + "time_per_iteration": 2.504692316055298 + }, + { + "auxiliary_loss_clip": 0.01058183, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.03691006, + "balance_loss_mlp": 1.02077711, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 1.7383643570029466, + "language_loss": 0.75776052, + "learning_rate": 3.01824904601915e-07, + "loss": 0.77866042, + "num_input_tokens_seen": 297170320, + "step": 13774, + "time_per_iteration": 2.6227054595947266 + }, + { + "auxiliary_loss_clip": 0.0107624, + "auxiliary_loss_mlp": 0.0078386, + "balance_loss_clip": 1.03709388, + "balance_loss_mlp": 1.01075077, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 1.702503160144713, + "language_loss": 0.75307691, + "learning_rate": 3.01619202829249e-07, + "loss": 0.77167785, + "num_input_tokens_seen": 297189935, + "step": 13775, + "time_per_iteration": 2.5441153049468994 + }, + { + "auxiliary_loss_clip": 0.01107543, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.03484106, + "balance_loss_mlp": 1.01955688, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 4.621910496719665, + "language_loss": 0.73738182, + "learning_rate": 3.01413565459353e-07, + "loss": 0.75878179, + "num_input_tokens_seen": 297210885, + "step": 13776, + "time_per_iteration": 2.5880653858184814 + }, + { + "auxiliary_loss_clip": 0.01050752, + "auxiliary_loss_mlp": 0.01032762, + "balance_loss_clip": 1.02933276, + "balance_loss_mlp": 1.01912427, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 1.900685674268679, + "language_loss": 0.77318406, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.79401922, + "num_input_tokens_seen": 297228500, + "step": 13777, + "time_per_iteration": 2.562140703201294 + }, + { + "auxiliary_loss_clip": 0.01091418, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.03588605, + "balance_loss_mlp": 1.01757383, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 1.5145547550286076, + "language_loss": 0.82535315, + "learning_rate": 3.010024839590604e-07, + "loss": 0.84655178, + "num_input_tokens_seen": 297249470, + "step": 13778, + "time_per_iteration": 2.530228853225708 + }, + { + "auxiliary_loss_clip": 0.01087691, + "auxiliary_loss_mlp": 0.01024147, + "balance_loss_clip": 1.03319943, + "balance_loss_mlp": 1.01250076, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 1.8060068025394622, + "language_loss": 0.74942815, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.77054656, + "num_input_tokens_seen": 297265970, + "step": 13779, + "time_per_iteration": 2.4607298374176025 + }, + { + "auxiliary_loss_clip": 0.0100385, + "auxiliary_loss_mlp": 0.01002813, + "balance_loss_clip": 1.00879514, + "balance_loss_mlp": 1.00147188, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.7660120474984011, + "language_loss": 0.56719732, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58726394, + "num_input_tokens_seen": 297325525, + "step": 13780, + "time_per_iteration": 3.2000057697296143 + }, + { + "auxiliary_loss_clip": 0.0106311, + "auxiliary_loss_mlp": 0.01026504, + "balance_loss_clip": 1.03430843, + "balance_loss_mlp": 1.01378989, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 1.5863784598446904, + "language_loss": 0.79727173, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.81816792, + "num_input_tokens_seen": 297345025, + "step": 13781, + "time_per_iteration": 2.595160722732544 + }, + { + "auxiliary_loss_clip": 0.01062029, + "auxiliary_loss_mlp": 0.01028864, + "balance_loss_clip": 1.03728259, + "balance_loss_mlp": 1.0159297, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 2.8468546744039984, + "language_loss": 0.75193179, + "learning_rate": 3.001810941346543e-07, + "loss": 0.77284068, + "num_input_tokens_seen": 297363570, + "step": 13782, + "time_per_iteration": 2.610928773880005 + }, + { + "auxiliary_loss_clip": 0.0109132, + "auxiliary_loss_mlp": 0.01026811, + "balance_loss_clip": 1.03297091, + "balance_loss_mlp": 1.01511705, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 1.7688811977450976, + "language_loss": 0.7619791, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78316045, + "num_input_tokens_seen": 297385385, + "step": 13783, + "time_per_iteration": 2.543686866760254 + }, + { + "auxiliary_loss_clip": 0.01103373, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.03487945, + "balance_loss_mlp": 1.01486063, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 1.5471168193635902, + "language_loss": 0.73771727, + "learning_rate": 2.997707859351304e-07, + "loss": 0.75901508, + "num_input_tokens_seen": 297403950, + "step": 13784, + "time_per_iteration": 2.5256195068359375 + }, + { + "auxiliary_loss_clip": 0.01095283, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.03363323, + "balance_loss_mlp": 1.01906776, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 1.4846059376341723, + "language_loss": 0.69877672, + "learning_rate": 2.99565728540772e-07, + "loss": 0.720047, + "num_input_tokens_seen": 297424565, + "step": 13785, + "time_per_iteration": 2.585559368133545 + }, + { + "auxiliary_loss_clip": 0.01080353, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.03636718, + "balance_loss_mlp": 1.01797187, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 1.4327602047461956, + "language_loss": 0.68304807, + "learning_rate": 2.993607356270516e-07, + "loss": 0.70415169, + "num_input_tokens_seen": 297445180, + "step": 13786, + "time_per_iteration": 2.564075231552124 + }, + { + "auxiliary_loss_clip": 0.01064921, + "auxiliary_loss_mlp": 0.01036734, + "balance_loss_clip": 1.03533983, + "balance_loss_mlp": 1.02462244, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 1.7445751539365408, + "language_loss": 0.76785195, + "learning_rate": 2.991558072017426e-07, + "loss": 0.78886855, + "num_input_tokens_seen": 297463790, + "step": 13787, + "time_per_iteration": 2.6029598712921143 + }, + { + "auxiliary_loss_clip": 0.01092682, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.03568709, + "balance_loss_mlp": 1.02399325, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 1.6825528578457583, + "language_loss": 0.80367959, + "learning_rate": 2.989509432726163e-07, + "loss": 0.82496208, + "num_input_tokens_seen": 297480100, + "step": 13788, + "time_per_iteration": 3.91154146194458 + }, + { + "auxiliary_loss_clip": 0.01079389, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.03679669, + "balance_loss_mlp": 1.02001524, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 1.6036075158245275, + "language_loss": 0.70807004, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.72917706, + "num_input_tokens_seen": 297499890, + "step": 13789, + "time_per_iteration": 4.1907854080200195 + }, + { + "auxiliary_loss_clip": 0.0107124, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.03079355, + "balance_loss_mlp": 1.01590967, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 1.717551870176779, + "language_loss": 0.68069875, + "learning_rate": 2.985414089339813e-07, + "loss": 0.7016964, + "num_input_tokens_seen": 297521440, + "step": 13790, + "time_per_iteration": 2.718311071395874 + }, + { + "auxiliary_loss_clip": 0.01094901, + "auxiliary_loss_mlp": 0.01028559, + "balance_loss_clip": 1.03483725, + "balance_loss_mlp": 1.01526749, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 1.8102400257496356, + "language_loss": 0.77587438, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.79710895, + "num_input_tokens_seen": 297539920, + "step": 13791, + "time_per_iteration": 2.5219216346740723 + }, + { + "auxiliary_loss_clip": 0.01078139, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.03471017, + "balance_loss_mlp": 1.01570153, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 1.4760769734780976, + "language_loss": 0.6996274, + "learning_rate": 2.981321326732651e-07, + "loss": 0.72069615, + "num_input_tokens_seen": 297560000, + "step": 13792, + "time_per_iteration": 3.9157023429870605 + }, + { + "auxiliary_loss_clip": 0.01082569, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.03436422, + "balance_loss_mlp": 1.02202034, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 2.2196112551783695, + "language_loss": 0.6511718, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.67234069, + "num_input_tokens_seen": 297579300, + "step": 13793, + "time_per_iteration": 2.5637457370758057 + }, + { + "auxiliary_loss_clip": 0.01049124, + "auxiliary_loss_mlp": 0.01035507, + "balance_loss_clip": 1.03253913, + "balance_loss_mlp": 1.02180433, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 1.7392558847816544, + "language_loss": 0.66164374, + "learning_rate": 2.977231145525461e-07, + "loss": 0.68248999, + "num_input_tokens_seen": 297598095, + "step": 13794, + "time_per_iteration": 2.613039016723633 + }, + { + "auxiliary_loss_clip": 0.01102841, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.03349638, + "balance_loss_mlp": 1.02089703, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 1.9679588470097593, + "language_loss": 0.66119969, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68256193, + "num_input_tokens_seen": 297615955, + "step": 13795, + "time_per_iteration": 2.4749884605407715 + }, + { + "auxiliary_loss_clip": 0.01012265, + "auxiliary_loss_mlp": 0.01042572, + "balance_loss_clip": 1.03154027, + "balance_loss_mlp": 1.02895796, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 1.7900305576602134, + "language_loss": 0.66302365, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68357205, + "num_input_tokens_seen": 297636285, + "step": 13796, + "time_per_iteration": 2.789785623550415 + }, + { + "auxiliary_loss_clip": 0.01055113, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.03075981, + "balance_loss_mlp": 1.02253842, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 1.7011874035943528, + "language_loss": 0.71888673, + "learning_rate": 2.971100715196666e-07, + "loss": 0.73979759, + "num_input_tokens_seen": 297653315, + "step": 13797, + "time_per_iteration": 2.688265085220337 + }, + { + "auxiliary_loss_clip": 0.01035813, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.03291166, + "balance_loss_mlp": 1.01884604, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 1.7243802918064146, + "language_loss": 0.72436374, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74503028, + "num_input_tokens_seen": 297673480, + "step": 13798, + "time_per_iteration": 4.167632341384888 + }, + { + "auxiliary_loss_clip": 0.01065658, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.0317421, + "balance_loss_mlp": 1.01960325, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 1.5710904312101868, + "language_loss": 0.75959647, + "learning_rate": 2.967016990202822e-07, + "loss": 0.78057361, + "num_input_tokens_seen": 297693250, + "step": 13799, + "time_per_iteration": 2.795722246170044 + }, + { + "auxiliary_loss_clip": 0.01103542, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.03575134, + "balance_loss_mlp": 1.0208317, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 1.7785422132184336, + "language_loss": 0.67173135, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.69309396, + "num_input_tokens_seen": 297710975, + "step": 13800, + "time_per_iteration": 2.444925308227539 + }, + { + "auxiliary_loss_clip": 0.01068756, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.036057, + "balance_loss_mlp": 1.02204311, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 1.6222500305311753, + "language_loss": 0.74729437, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.76833701, + "num_input_tokens_seen": 297730860, + "step": 13801, + "time_per_iteration": 2.6141347885131836 + }, + { + "auxiliary_loss_clip": 0.01061981, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.03683972, + "balance_loss_mlp": 1.01760316, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 1.513138674732876, + "language_loss": 0.73562396, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.75653219, + "num_input_tokens_seen": 297749765, + "step": 13802, + "time_per_iteration": 2.6218268871307373 + }, + { + "auxiliary_loss_clip": 0.01084699, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.03463674, + "balance_loss_mlp": 1.02008963, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 1.579773479724805, + "language_loss": 0.74645382, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.76762092, + "num_input_tokens_seen": 297770380, + "step": 13803, + "time_per_iteration": 2.554824113845825 + }, + { + "auxiliary_loss_clip": 0.01093958, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.03604555, + "balance_loss_mlp": 1.01892471, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 1.6946292556468103, + "language_loss": 0.79357982, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81482422, + "num_input_tokens_seen": 297789440, + "step": 13804, + "time_per_iteration": 2.4915390014648438 + }, + { + "auxiliary_loss_clip": 0.01102306, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.03466928, + "balance_loss_mlp": 1.01827717, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 1.672390215027454, + "language_loss": 0.7325207, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75383854, + "num_input_tokens_seen": 297810425, + "step": 13805, + "time_per_iteration": 2.5287089347839355 + }, + { + "auxiliary_loss_clip": 0.01095143, + "auxiliary_loss_mlp": 0.00781485, + "balance_loss_clip": 1.03554392, + "balance_loss_mlp": 1.00623059, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 2.097453678602502, + "language_loss": 0.77370495, + "learning_rate": 2.952744302396906e-07, + "loss": 0.79247117, + "num_input_tokens_seen": 297827680, + "step": 13806, + "time_per_iteration": 2.4910733699798584 + }, + { + "auxiliary_loss_clip": 0.01097062, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.03546321, + "balance_loss_mlp": 1.02112675, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 1.6402176903222554, + "language_loss": 0.63401306, + "learning_rate": 2.950707932112444e-07, + "loss": 0.6553269, + "num_input_tokens_seen": 297848005, + "step": 13807, + "time_per_iteration": 2.4969959259033203 + }, + { + "auxiliary_loss_clip": 0.01092079, + "auxiliary_loss_mlp": 0.01029069, + "balance_loss_clip": 1.03859377, + "balance_loss_mlp": 1.01729679, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 1.7440578161079283, + "language_loss": 0.7306065, + "learning_rate": 2.948672208338847e-07, + "loss": 0.751818, + "num_input_tokens_seen": 297866730, + "step": 13808, + "time_per_iteration": 2.510754346847534 + }, + { + "auxiliary_loss_clip": 0.01090179, + "auxiliary_loss_mlp": 0.01045223, + "balance_loss_clip": 1.03915524, + "balance_loss_mlp": 1.03149557, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 1.7964508937190282, + "language_loss": 0.66378236, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.68513632, + "num_input_tokens_seen": 297886390, + "step": 13809, + "time_per_iteration": 2.570053815841675 + }, + { + "auxiliary_loss_clip": 0.01105757, + "auxiliary_loss_mlp": 0.01025009, + "balance_loss_clip": 1.03547847, + "balance_loss_mlp": 1.01345801, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 1.9334740234690653, + "language_loss": 0.74251157, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76381922, + "num_input_tokens_seen": 297905110, + "step": 13810, + "time_per_iteration": 2.44181227684021 + }, + { + "auxiliary_loss_clip": 0.01070984, + "auxiliary_loss_mlp": 0.0103511, + "balance_loss_clip": 1.03474879, + "balance_loss_mlp": 1.02404118, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 1.8897016823174166, + "language_loss": 0.81176174, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83282268, + "num_input_tokens_seen": 297925460, + "step": 13811, + "time_per_iteration": 2.5757362842559814 + }, + { + "auxiliary_loss_clip": 0.01072704, + "auxiliary_loss_mlp": 0.01044877, + "balance_loss_clip": 1.03539741, + "balance_loss_mlp": 1.03185368, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 1.8374575824274413, + "language_loss": 0.73499203, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.75616789, + "num_input_tokens_seen": 297941760, + "step": 13812, + "time_per_iteration": 2.5577752590179443 + }, + { + "auxiliary_loss_clip": 0.01077602, + "auxiliary_loss_mlp": 0.01027388, + "balance_loss_clip": 1.03615165, + "balance_loss_mlp": 1.01551461, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 2.0168874191529795, + "language_loss": 0.78330684, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80435669, + "num_input_tokens_seen": 297959745, + "step": 13813, + "time_per_iteration": 2.5610148906707764 + }, + { + "auxiliary_loss_clip": 0.01049527, + "auxiliary_loss_mlp": 0.0078333, + "balance_loss_clip": 1.03454852, + "balance_loss_mlp": 1.01110053, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 2.0718368028024976, + "language_loss": 0.70974678, + "learning_rate": 2.93647144674658e-07, + "loss": 0.72807539, + "num_input_tokens_seen": 297977665, + "step": 13814, + "time_per_iteration": 2.6469643115997314 + }, + { + "auxiliary_loss_clip": 0.01112308, + "auxiliary_loss_mlp": 0.0104137, + "balance_loss_clip": 1.03606236, + "balance_loss_mlp": 1.0270766, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 2.579497449831375, + "language_loss": 0.67571127, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.69724804, + "num_input_tokens_seen": 297993525, + "step": 13815, + "time_per_iteration": 2.430887460708618 + }, + { + "auxiliary_loss_clip": 0.01091878, + "auxiliary_loss_mlp": 0.0102957, + "balance_loss_clip": 1.03755379, + "balance_loss_mlp": 1.01752949, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 1.7966631091219092, + "language_loss": 0.75682199, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.77803648, + "num_input_tokens_seen": 298012920, + "step": 13816, + "time_per_iteration": 2.5072364807128906 + }, + { + "auxiliary_loss_clip": 0.01070114, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.03294337, + "balance_loss_mlp": 1.0224328, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 1.920358366740514, + "language_loss": 0.81662488, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83766115, + "num_input_tokens_seen": 298033310, + "step": 13817, + "time_per_iteration": 2.571570873260498 + }, + { + "auxiliary_loss_clip": 0.0109636, + "auxiliary_loss_mlp": 0.01037353, + "balance_loss_clip": 1.0369544, + "balance_loss_mlp": 1.0242219, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 1.711400714973582, + "language_loss": 0.7787714, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80010855, + "num_input_tokens_seen": 298053530, + "step": 13818, + "time_per_iteration": 2.5033414363861084 + }, + { + "auxiliary_loss_clip": 0.01088483, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.03772843, + "balance_loss_mlp": 1.02243519, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 1.7619217796044915, + "language_loss": 0.82070011, + "learning_rate": 2.926321938606453e-07, + "loss": 0.8419345, + "num_input_tokens_seen": 298069305, + "step": 13819, + "time_per_iteration": 2.518442153930664 + }, + { + "auxiliary_loss_clip": 0.01020673, + "auxiliary_loss_mlp": 0.01002921, + "balance_loss_clip": 1.00827646, + "balance_loss_mlp": 1.00174642, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.7740868584755801, + "language_loss": 0.56237477, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58261067, + "num_input_tokens_seen": 298125830, + "step": 13820, + "time_per_iteration": 3.118082046508789 + }, + { + "auxiliary_loss_clip": 0.01091261, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.03431511, + "balance_loss_mlp": 1.01473784, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 1.8591452375605304, + "language_loss": 0.68536413, + "learning_rate": 2.922266666860831e-07, + "loss": 0.70654309, + "num_input_tokens_seen": 298142320, + "step": 13821, + "time_per_iteration": 2.459219217300415 + }, + { + "auxiliary_loss_clip": 0.01041116, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.03171325, + "balance_loss_mlp": 1.02345252, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 1.7266902105749462, + "language_loss": 0.69199753, + "learning_rate": 2.920240002333625e-07, + "loss": 0.71278024, + "num_input_tokens_seen": 298161845, + "step": 13822, + "time_per_iteration": 2.659160852432251 + }, + { + "auxiliary_loss_clip": 0.01061073, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.03485012, + "balance_loss_mlp": 1.0238719, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 1.8004188325569508, + "language_loss": 0.62330663, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64427042, + "num_input_tokens_seen": 298184165, + "step": 13823, + "time_per_iteration": 2.6619222164154053 + }, + { + "auxiliary_loss_clip": 0.01012857, + "auxiliary_loss_mlp": 0.01003457, + "balance_loss_clip": 1.00944126, + "balance_loss_mlp": 1.00211608, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 0.8640424218083849, + "language_loss": 0.6194942, + "learning_rate": 2.916188616354669e-07, + "loss": 0.63965732, + "num_input_tokens_seen": 298251720, + "step": 13824, + "time_per_iteration": 3.2104811668395996 + }, + { + "auxiliary_loss_clip": 0.01104163, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.03566754, + "balance_loss_mlp": 1.02095318, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 1.6416754556244744, + "language_loss": 0.7407347, + "learning_rate": 2.914163895056552e-07, + "loss": 0.76210153, + "num_input_tokens_seen": 298271910, + "step": 13825, + "time_per_iteration": 2.470891237258911 + }, + { + "auxiliary_loss_clip": 0.0106237, + "auxiliary_loss_mlp": 0.00784129, + "balance_loss_clip": 1.03376889, + "balance_loss_mlp": 1.0080713, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 2.964410229079954, + "language_loss": 0.80369389, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.82215887, + "num_input_tokens_seen": 298288105, + "step": 13826, + "time_per_iteration": 3.95770001411438 + }, + { + "auxiliary_loss_clip": 0.01104597, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.03528357, + "balance_loss_mlp": 1.02198648, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 1.6786912884258096, + "language_loss": 0.67957854, + "learning_rate": 2.910116396226914e-07, + "loss": 0.70097017, + "num_input_tokens_seen": 298307600, + "step": 13827, + "time_per_iteration": 2.506308078765869 + }, + { + "auxiliary_loss_clip": 0.01091566, + "auxiliary_loss_mlp": 0.01030623, + "balance_loss_clip": 1.03189003, + "balance_loss_mlp": 1.01947069, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 1.7289220191035004, + "language_loss": 0.74355602, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.7647779, + "num_input_tokens_seen": 298323055, + "step": 13828, + "time_per_iteration": 4.398099899291992 + }, + { + "auxiliary_loss_clip": 0.01068505, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.03110981, + "balance_loss_mlp": 1.02451599, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 1.587930216290016, + "language_loss": 0.67384195, + "learning_rate": 2.906071489597657e-07, + "loss": 0.69490117, + "num_input_tokens_seen": 298346950, + "step": 13829, + "time_per_iteration": 2.765392303466797 + }, + { + "auxiliary_loss_clip": 0.01079545, + "auxiliary_loss_mlp": 0.01029366, + "balance_loss_clip": 1.03434873, + "balance_loss_mlp": 1.01675344, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 1.6465641042700396, + "language_loss": 0.82832253, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.84941155, + "num_input_tokens_seen": 298366315, + "step": 13830, + "time_per_iteration": 2.541916608810425 + }, + { + "auxiliary_loss_clip": 0.01092817, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.03513122, + "balance_loss_mlp": 1.02185738, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 2.265703257620473, + "language_loss": 0.74169832, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.7629596, + "num_input_tokens_seen": 298385185, + "step": 13831, + "time_per_iteration": 3.9343345165252686 + }, + { + "auxiliary_loss_clip": 0.01106135, + "auxiliary_loss_mlp": 0.01037668, + "balance_loss_clip": 1.03678167, + "balance_loss_mlp": 1.02428615, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 1.7406278027767697, + "language_loss": 0.71125388, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73269188, + "num_input_tokens_seen": 298402335, + "step": 13832, + "time_per_iteration": 2.4564247131347656 + }, + { + "auxiliary_loss_clip": 0.01076447, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.0330745, + "balance_loss_mlp": 1.01934624, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 1.6165300137387997, + "language_loss": 0.84605062, + "learning_rate": 2.897989455393979e-07, + "loss": 0.86712581, + "num_input_tokens_seen": 298423370, + "step": 13833, + "time_per_iteration": 2.5408313274383545 + }, + { + "auxiliary_loss_clip": 0.01087161, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.03641653, + "balance_loss_mlp": 1.02314949, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 1.616143626168053, + "language_loss": 0.76063037, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.78185987, + "num_input_tokens_seen": 298444835, + "step": 13834, + "time_per_iteration": 2.5705676078796387 + }, + { + "auxiliary_loss_clip": 0.01100134, + "auxiliary_loss_mlp": 0.0078275, + "balance_loss_clip": 1.03345323, + "balance_loss_mlp": 1.00883603, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 2.0382646882263553, + "language_loss": 0.79657006, + "learning_rate": 2.893952329045459e-07, + "loss": 0.81539893, + "num_input_tokens_seen": 298461845, + "step": 13835, + "time_per_iteration": 2.42704176902771 + }, + { + "auxiliary_loss_clip": 0.01098765, + "auxiliary_loss_mlp": 0.01035844, + "balance_loss_clip": 1.03730965, + "balance_loss_mlp": 1.02221799, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 1.9011712807554872, + "language_loss": 0.80940855, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.83075464, + "num_input_tokens_seen": 298479095, + "step": 13836, + "time_per_iteration": 3.959326982498169 + }, + { + "auxiliary_loss_clip": 0.01081484, + "auxiliary_loss_mlp": 0.01028271, + "balance_loss_clip": 1.03530109, + "balance_loss_mlp": 1.01702929, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 2.1507823156936947, + "language_loss": 0.77705002, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.79814756, + "num_input_tokens_seen": 298494475, + "step": 13837, + "time_per_iteration": 2.58900785446167 + }, + { + "auxiliary_loss_clip": 0.01109476, + "auxiliary_loss_mlp": 0.01026331, + "balance_loss_clip": 1.03574419, + "balance_loss_mlp": 1.01305699, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 2.0208029252008335, + "language_loss": 0.83265197, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85401005, + "num_input_tokens_seen": 298513185, + "step": 13838, + "time_per_iteration": 2.4313042163848877 + }, + { + "auxiliary_loss_clip": 0.01077408, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.03346229, + "balance_loss_mlp": 1.01806569, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 1.819072298477902, + "language_loss": 0.74392223, + "learning_rate": 2.885885860916795e-07, + "loss": 0.76500916, + "num_input_tokens_seen": 298531885, + "step": 13839, + "time_per_iteration": 2.5341439247131348 + }, + { + "auxiliary_loss_clip": 0.0109244, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.03589034, + "balance_loss_mlp": 1.02057719, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 1.5668461158895621, + "language_loss": 0.67932194, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.70057917, + "num_input_tokens_seen": 298554905, + "step": 13840, + "time_per_iteration": 2.566868543624878 + }, + { + "auxiliary_loss_clip": 0.01055573, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.03243506, + "balance_loss_mlp": 1.02180028, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 1.980460929420664, + "language_loss": 0.79520911, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81611472, + "num_input_tokens_seen": 298571185, + "step": 13841, + "time_per_iteration": 2.544855833053589 + }, + { + "auxiliary_loss_clip": 0.0106488, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.03826475, + "balance_loss_mlp": 1.01561713, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 2.061346407831117, + "language_loss": 0.68137437, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70229936, + "num_input_tokens_seen": 298588505, + "step": 13842, + "time_per_iteration": 2.51357364654541 + }, + { + "auxiliary_loss_clip": 0.01085373, + "auxiliary_loss_mlp": 0.0103034, + "balance_loss_clip": 1.03674889, + "balance_loss_mlp": 1.01733971, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 1.617530683105261, + "language_loss": 0.73072314, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75188029, + "num_input_tokens_seen": 298609295, + "step": 13843, + "time_per_iteration": 2.576111316680908 + }, + { + "auxiliary_loss_clip": 0.01073764, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.03537178, + "balance_loss_mlp": 1.01881194, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 1.9310246798231765, + "language_loss": 0.77574241, + "learning_rate": 2.875817378128975e-07, + "loss": 0.79679215, + "num_input_tokens_seen": 298625765, + "step": 13844, + "time_per_iteration": 2.511235237121582 + }, + { + "auxiliary_loss_clip": 0.01009275, + "auxiliary_loss_mlp": 0.01004387, + "balance_loss_clip": 1.0090301, + "balance_loss_mlp": 1.0033263, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.7746739475899296, + "language_loss": 0.55243498, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57257158, + "num_input_tokens_seen": 298683005, + "step": 13845, + "time_per_iteration": 3.039168119430542 + }, + { + "auxiliary_loss_clip": 0.01096529, + "auxiliary_loss_mlp": 0.01043849, + "balance_loss_clip": 1.03501606, + "balance_loss_mlp": 1.03061688, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 1.5738952989929795, + "language_loss": 0.75534993, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77675366, + "num_input_tokens_seen": 298703060, + "step": 13846, + "time_per_iteration": 2.530256509780884 + }, + { + "auxiliary_loss_clip": 0.0105329, + "auxiliary_loss_mlp": 0.01029845, + "balance_loss_clip": 1.03352284, + "balance_loss_mlp": 1.01528943, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 1.5826268172048579, + "language_loss": 0.78763586, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.80846721, + "num_input_tokens_seen": 298721765, + "step": 13847, + "time_per_iteration": 2.6231958866119385 + }, + { + "auxiliary_loss_clip": 0.01047177, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.03402328, + "balance_loss_mlp": 1.01719463, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 1.7718381518640798, + "language_loss": 0.74497104, + "learning_rate": 2.867774279753175e-07, + "loss": 0.76572627, + "num_input_tokens_seen": 298740825, + "step": 13848, + "time_per_iteration": 2.6224231719970703 + }, + { + "auxiliary_loss_clip": 0.0109331, + "auxiliary_loss_mlp": 0.01028276, + "balance_loss_clip": 1.03567147, + "balance_loss_mlp": 1.01674891, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 1.9043477972370058, + "language_loss": 0.63249213, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.65370804, + "num_input_tokens_seen": 298758515, + "step": 13849, + "time_per_iteration": 2.4877092838287354 + }, + { + "auxiliary_loss_clip": 0.0108169, + "auxiliary_loss_mlp": 0.01030732, + "balance_loss_clip": 1.03374314, + "balance_loss_mlp": 1.01834035, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 2.083819047575844, + "language_loss": 0.79501992, + "learning_rate": 2.863756628194638e-07, + "loss": 0.81614411, + "num_input_tokens_seen": 298776375, + "step": 13850, + "time_per_iteration": 2.540769338607788 + }, + { + "auxiliary_loss_clip": 0.0106377, + "auxiliary_loss_mlp": 0.0103013, + "balance_loss_clip": 1.03361177, + "balance_loss_mlp": 1.01963317, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 1.7643637275965747, + "language_loss": 0.78142476, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80236375, + "num_input_tokens_seen": 298795135, + "step": 13851, + "time_per_iteration": 2.6344423294067383 + }, + { + "auxiliary_loss_clip": 0.01024286, + "auxiliary_loss_mlp": 0.01002848, + "balance_loss_clip": 1.01124871, + "balance_loss_mlp": 1.00162017, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.7605292237615213, + "language_loss": 0.55796814, + "learning_rate": 2.859741575868344e-07, + "loss": 0.57823956, + "num_input_tokens_seen": 298855475, + "step": 13852, + "time_per_iteration": 3.1143434047698975 + }, + { + "auxiliary_loss_clip": 0.01093278, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.0362761, + "balance_loss_mlp": 1.02005041, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 2.4400994335026964, + "language_loss": 0.67189711, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69314981, + "num_input_tokens_seen": 298875875, + "step": 13853, + "time_per_iteration": 2.595062255859375 + }, + { + "auxiliary_loss_clip": 0.01078878, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.03696406, + "balance_loss_mlp": 1.01920259, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 3.477963594565376, + "language_loss": 0.7839483, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80505109, + "num_input_tokens_seen": 298895950, + "step": 13854, + "time_per_iteration": 2.5508546829223633 + }, + { + "auxiliary_loss_clip": 0.01029396, + "auxiliary_loss_mlp": 0.01002099, + "balance_loss_clip": 1.00651121, + "balance_loss_mlp": 1.00091922, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.7607812289757289, + "language_loss": 0.5863871, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60670203, + "num_input_tokens_seen": 298955770, + "step": 13855, + "time_per_iteration": 2.952234983444214 + }, + { + "auxiliary_loss_clip": 0.0109144, + "auxiliary_loss_mlp": 0.01027962, + "balance_loss_clip": 1.03478849, + "balance_loss_mlp": 1.01604092, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 1.701846098515264, + "language_loss": 0.71314585, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.73433989, + "num_input_tokens_seen": 298976545, + "step": 13856, + "time_per_iteration": 2.5239620208740234 + }, + { + "auxiliary_loss_clip": 0.01095465, + "auxiliary_loss_mlp": 0.01030126, + "balance_loss_clip": 1.03698039, + "balance_loss_mlp": 1.01823473, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 1.5725319289184696, + "language_loss": 0.75440431, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77566016, + "num_input_tokens_seen": 298996750, + "step": 13857, + "time_per_iteration": 2.5327212810516357 + }, + { + "auxiliary_loss_clip": 0.01054019, + "auxiliary_loss_mlp": 0.01025877, + "balance_loss_clip": 1.03555489, + "balance_loss_mlp": 1.01554775, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 1.5189819957092017, + "language_loss": 0.73512679, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75592577, + "num_input_tokens_seen": 299014895, + "step": 13858, + "time_per_iteration": 2.618290424346924 + }, + { + "auxiliary_loss_clip": 0.01106863, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.03421783, + "balance_loss_mlp": 1.02169085, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 1.7130368332875345, + "language_loss": 0.73048264, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75189304, + "num_input_tokens_seen": 299032855, + "step": 13859, + "time_per_iteration": 2.4309744834899902 + }, + { + "auxiliary_loss_clip": 0.01088037, + "auxiliary_loss_mlp": 0.01025161, + "balance_loss_clip": 1.03379607, + "balance_loss_mlp": 1.01396132, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 1.5834021951615072, + "language_loss": 0.78791153, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.80904353, + "num_input_tokens_seen": 299052055, + "step": 13860, + "time_per_iteration": 2.55118465423584 + }, + { + "auxiliary_loss_clip": 0.01031671, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.03313291, + "balance_loss_mlp": 1.01856136, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 1.4079100894360363, + "language_loss": 0.82080173, + "learning_rate": 2.841706022218644e-07, + "loss": 0.84142166, + "num_input_tokens_seen": 299075285, + "step": 13861, + "time_per_iteration": 2.7116944789886475 + }, + { + "auxiliary_loss_clip": 0.01106311, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.0372932, + "balance_loss_mlp": 1.02054954, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 3.6154981789090668, + "language_loss": 0.78941476, + "learning_rate": 2.839705324021806e-07, + "loss": 0.81080765, + "num_input_tokens_seen": 299092520, + "step": 13862, + "time_per_iteration": 2.438392162322998 + }, + { + "auxiliary_loss_clip": 0.01094642, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.03429842, + "balance_loss_mlp": 1.01993585, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 2.1038970219088577, + "language_loss": 0.75296247, + "learning_rate": 2.83770527654505e-07, + "loss": 0.77423275, + "num_input_tokens_seen": 299109450, + "step": 13863, + "time_per_iteration": 2.4769136905670166 + }, + { + "auxiliary_loss_clip": 0.01052962, + "auxiliary_loss_mlp": 0.007851, + "balance_loss_clip": 1.03492415, + "balance_loss_mlp": 1.00957131, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 2.130234188607242, + "language_loss": 0.75570917, + "learning_rate": 2.835705879864232e-07, + "loss": 0.77408981, + "num_input_tokens_seen": 299129540, + "step": 13864, + "time_per_iteration": 2.667254686355591 + }, + { + "auxiliary_loss_clip": 0.01082085, + "auxiliary_loss_mlp": 0.01034364, + "balance_loss_clip": 1.03492522, + "balance_loss_mlp": 1.02142942, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 1.9931325777105322, + "language_loss": 0.69256824, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71373272, + "num_input_tokens_seen": 299148670, + "step": 13865, + "time_per_iteration": 4.108296632766724 + }, + { + "auxiliary_loss_clip": 0.01094064, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.03567505, + "balance_loss_mlp": 1.02170408, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 1.9767910396076345, + "language_loss": 0.75635284, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77762848, + "num_input_tokens_seen": 299169330, + "step": 13866, + "time_per_iteration": 4.078115224838257 + }, + { + "auxiliary_loss_clip": 0.01012891, + "auxiliary_loss_mlp": 0.01013991, + "balance_loss_clip": 1.01313329, + "balance_loss_mlp": 1.01253116, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.8901576806567469, + "language_loss": 0.63053864, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65080756, + "num_input_tokens_seen": 299220980, + "step": 13867, + "time_per_iteration": 3.0569379329681396 + }, + { + "auxiliary_loss_clip": 0.01079657, + "auxiliary_loss_mlp": 0.01032234, + "balance_loss_clip": 1.03479934, + "balance_loss_mlp": 1.02143991, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 2.423548606391088, + "language_loss": 0.72072333, + "learning_rate": 2.827714802616301e-07, + "loss": 0.74184227, + "num_input_tokens_seen": 299240130, + "step": 13868, + "time_per_iteration": 2.5332343578338623 + }, + { + "auxiliary_loss_clip": 0.01081056, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.03814507, + "balance_loss_mlp": 1.01782227, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 1.475703808554861, + "language_loss": 0.80486089, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82597256, + "num_input_tokens_seen": 299260705, + "step": 13869, + "time_per_iteration": 2.554749011993408 + }, + { + "auxiliary_loss_clip": 0.01096348, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.03676331, + "balance_loss_mlp": 1.02152383, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 1.6507595584300727, + "language_loss": 0.82589674, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84719741, + "num_input_tokens_seen": 299278925, + "step": 13870, + "time_per_iteration": 3.8960795402526855 + }, + { + "auxiliary_loss_clip": 0.01080182, + "auxiliary_loss_mlp": 0.01028251, + "balance_loss_clip": 1.03505039, + "balance_loss_mlp": 1.01568639, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 2.541368547540725, + "language_loss": 0.70527685, + "learning_rate": 2.821728331750264e-07, + "loss": 0.72636122, + "num_input_tokens_seen": 299291580, + "step": 13871, + "time_per_iteration": 2.4710822105407715 + }, + { + "auxiliary_loss_clip": 0.01092252, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.03607607, + "balance_loss_mlp": 1.01919293, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 1.8044358301572225, + "language_loss": 0.68934268, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.7105732, + "num_input_tokens_seen": 299310385, + "step": 13872, + "time_per_iteration": 2.502723455429077 + }, + { + "auxiliary_loss_clip": 0.01081147, + "auxiliary_loss_mlp": 0.01025923, + "balance_loss_clip": 1.03466702, + "balance_loss_mlp": 1.01437747, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 1.9107878354892271, + "language_loss": 0.73293805, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75400871, + "num_input_tokens_seen": 299327660, + "step": 13873, + "time_per_iteration": 2.505671977996826 + }, + { + "auxiliary_loss_clip": 0.01081725, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.03444195, + "balance_loss_mlp": 1.01992953, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 1.9721039835565375, + "language_loss": 0.75145042, + "learning_rate": 2.81574772350013e-07, + "loss": 0.77260554, + "num_input_tokens_seen": 299343685, + "step": 13874, + "time_per_iteration": 3.913151979446411 + }, + { + "auxiliary_loss_clip": 0.01074839, + "auxiliary_loss_mlp": 0.01026649, + "balance_loss_clip": 1.03305435, + "balance_loss_mlp": 1.01549101, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 1.7795245170499925, + "language_loss": 0.66055781, + "learning_rate": 2.813755490573118e-07, + "loss": 0.68157268, + "num_input_tokens_seen": 299363305, + "step": 13875, + "time_per_iteration": 2.564612865447998 + }, + { + "auxiliary_loss_clip": 0.01061669, + "auxiliary_loss_mlp": 0.01037671, + "balance_loss_clip": 1.03508067, + "balance_loss_mlp": 1.02536869, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 1.9364064207484857, + "language_loss": 0.79824018, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.81923354, + "num_input_tokens_seen": 299382630, + "step": 13876, + "time_per_iteration": 2.595637083053589 + }, + { + "auxiliary_loss_clip": 0.01088402, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.03363967, + "balance_loss_mlp": 1.02479601, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 1.8512335187722102, + "language_loss": 0.87282461, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89409113, + "num_input_tokens_seen": 299402385, + "step": 13877, + "time_per_iteration": 2.5206453800201416 + }, + { + "auxiliary_loss_clip": 0.01068984, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.03490162, + "balance_loss_mlp": 1.01751506, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 1.7847799874166084, + "language_loss": 0.69390225, + "learning_rate": 2.807782702318828e-07, + "loss": 0.71488237, + "num_input_tokens_seen": 299419820, + "step": 13878, + "time_per_iteration": 2.5035104751586914 + }, + { + "auxiliary_loss_clip": 0.01081611, + "auxiliary_loss_mlp": 0.01028885, + "balance_loss_clip": 1.03513813, + "balance_loss_mlp": 1.01743484, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 1.93915079639133, + "language_loss": 0.79414845, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81525344, + "num_input_tokens_seen": 299436265, + "step": 13879, + "time_per_iteration": 2.5225470066070557 + }, + { + "auxiliary_loss_clip": 0.01053377, + "auxiliary_loss_mlp": 0.01030574, + "balance_loss_clip": 1.03475571, + "balance_loss_mlp": 1.02007759, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 2.2829282878134793, + "language_loss": 0.83192712, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85276663, + "num_input_tokens_seen": 299451660, + "step": 13880, + "time_per_iteration": 2.5448951721191406 + }, + { + "auxiliary_loss_clip": 0.01085892, + "auxiliary_loss_mlp": 0.01029446, + "balance_loss_clip": 1.03489232, + "balance_loss_mlp": 1.01794839, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 1.472788727312601, + "language_loss": 0.78227401, + "learning_rate": 2.80181578143982e-07, + "loss": 0.80342734, + "num_input_tokens_seen": 299472070, + "step": 13881, + "time_per_iteration": 2.5989532470703125 + }, + { + "auxiliary_loss_clip": 0.01060914, + "auxiliary_loss_mlp": 0.01026204, + "balance_loss_clip": 1.03454423, + "balance_loss_mlp": 1.01561821, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 2.3211582985882, + "language_loss": 0.77961683, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.80048805, + "num_input_tokens_seen": 299486725, + "step": 13882, + "time_per_iteration": 2.505530595779419 + }, + { + "auxiliary_loss_clip": 0.01066294, + "auxiliary_loss_mlp": 0.01047789, + "balance_loss_clip": 1.03276217, + "balance_loss_mlp": 1.03438401, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 1.6966508090055339, + "language_loss": 0.80450737, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82564825, + "num_input_tokens_seen": 299505435, + "step": 13883, + "time_per_iteration": 2.5603532791137695 + }, + { + "auxiliary_loss_clip": 0.0109173, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.0357182, + "balance_loss_mlp": 1.01749778, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 1.8003296327033074, + "language_loss": 0.74219984, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76341522, + "num_input_tokens_seen": 299523555, + "step": 13884, + "time_per_iteration": 2.4747161865234375 + }, + { + "auxiliary_loss_clip": 0.01089756, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.03673244, + "balance_loss_mlp": 1.0210278, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 1.8330064031024795, + "language_loss": 0.70304674, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72428876, + "num_input_tokens_seen": 299541660, + "step": 13885, + "time_per_iteration": 2.5866193771362305 + }, + { + "auxiliary_loss_clip": 0.01071466, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.03420997, + "balance_loss_mlp": 1.01707125, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 2.181688033740228, + "language_loss": 0.70081067, + "learning_rate": 2.791883957449912e-07, + "loss": 0.72181851, + "num_input_tokens_seen": 299562465, + "step": 13886, + "time_per_iteration": 2.6421046257019043 + }, + { + "auxiliary_loss_clip": 0.01067265, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.03324401, + "balance_loss_mlp": 1.02054238, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 1.8577313011952379, + "language_loss": 0.78977764, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81078339, + "num_input_tokens_seen": 299582700, + "step": 13887, + "time_per_iteration": 2.6188101768493652 + }, + { + "auxiliary_loss_clip": 0.01086795, + "auxiliary_loss_mlp": 0.00785695, + "balance_loss_clip": 1.0364598, + "balance_loss_mlp": 1.0102936, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 2.2957056243692335, + "language_loss": 0.64211953, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.66084439, + "num_input_tokens_seen": 299600310, + "step": 13888, + "time_per_iteration": 2.5228805541992188 + }, + { + "auxiliary_loss_clip": 0.0108195, + "auxiliary_loss_mlp": 0.01028478, + "balance_loss_clip": 1.03353035, + "balance_loss_mlp": 1.01628852, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 2.186513991467204, + "language_loss": 0.66695374, + "learning_rate": 2.785932692855244e-07, + "loss": 0.68805802, + "num_input_tokens_seen": 299617025, + "step": 13889, + "time_per_iteration": 2.5198302268981934 + }, + { + "auxiliary_loss_clip": 0.01085435, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.03172445, + "balance_loss_mlp": 1.01662469, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 1.8138056790320118, + "language_loss": 0.68607134, + "learning_rate": 2.783950243408399e-07, + "loss": 0.70721161, + "num_input_tokens_seen": 299633050, + "step": 13890, + "time_per_iteration": 2.476487398147583 + }, + { + "auxiliary_loss_clip": 0.01084932, + "auxiliary_loss_mlp": 0.0103621, + "balance_loss_clip": 1.03664231, + "balance_loss_mlp": 1.02437282, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 2.283780037424424, + "language_loss": 0.59399426, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.61520571, + "num_input_tokens_seen": 299646445, + "step": 13891, + "time_per_iteration": 2.503169059753418 + }, + { + "auxiliary_loss_clip": 0.01094638, + "auxiliary_loss_mlp": 0.0102716, + "balance_loss_clip": 1.03630149, + "balance_loss_mlp": 1.01553726, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 1.7903443509030557, + "language_loss": 0.71742284, + "learning_rate": 2.779987303092846e-07, + "loss": 0.73864079, + "num_input_tokens_seen": 299662665, + "step": 13892, + "time_per_iteration": 2.5205421447753906 + }, + { + "auxiliary_loss_clip": 0.01100877, + "auxiliary_loss_mlp": 0.01029976, + "balance_loss_clip": 1.03415322, + "balance_loss_mlp": 1.01791167, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 1.5547044601312565, + "language_loss": 0.65830636, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.6796149, + "num_input_tokens_seen": 299683585, + "step": 13893, + "time_per_iteration": 2.5199155807495117 + }, + { + "auxiliary_loss_clip": 0.01081033, + "auxiliary_loss_mlp": 0.01026748, + "balance_loss_clip": 1.03322959, + "balance_loss_mlp": 1.01526856, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 2.059871882482799, + "language_loss": 0.78463113, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.80570894, + "num_input_tokens_seen": 299702680, + "step": 13894, + "time_per_iteration": 2.5039217472076416 + }, + { + "auxiliary_loss_clip": 0.01085461, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.03394771, + "balance_loss_mlp": 1.01836336, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 1.6405816990444655, + "language_loss": 0.72453892, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74570036, + "num_input_tokens_seen": 299721050, + "step": 13895, + "time_per_iteration": 2.513826370239258 + }, + { + "auxiliary_loss_clip": 0.01091558, + "auxiliary_loss_mlp": 0.01039436, + "balance_loss_clip": 1.03588152, + "balance_loss_mlp": 1.0260905, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 2.030057774241665, + "language_loss": 0.71964276, + "learning_rate": 2.772069258877667e-07, + "loss": 0.74095273, + "num_input_tokens_seen": 299738255, + "step": 13896, + "time_per_iteration": 2.5036683082580566 + }, + { + "auxiliary_loss_clip": 0.01092716, + "auxiliary_loss_mlp": 0.01031574, + "balance_loss_clip": 1.03503382, + "balance_loss_mlp": 1.01978397, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 2.1632513583616846, + "language_loss": 0.58833677, + "learning_rate": 2.770091380848423e-07, + "loss": 0.60957968, + "num_input_tokens_seen": 299761315, + "step": 13897, + "time_per_iteration": 2.733217477798462 + }, + { + "auxiliary_loss_clip": 0.01029532, + "auxiliary_loss_mlp": 0.00764137, + "balance_loss_clip": 1.00654733, + "balance_loss_mlp": 1.00675249, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.6938136141987127, + "language_loss": 0.57626593, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59420264, + "num_input_tokens_seen": 299828735, + "step": 13898, + "time_per_iteration": 3.13100528717041 + }, + { + "auxiliary_loss_clip": 0.01097078, + "auxiliary_loss_mlp": 0.01036137, + "balance_loss_clip": 1.0362078, + "balance_loss_mlp": 1.02306545, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 2.466985144977586, + "language_loss": 0.79815131, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.81948346, + "num_input_tokens_seen": 299848395, + "step": 13899, + "time_per_iteration": 2.501960277557373 + }, + { + "auxiliary_loss_clip": 0.0110481, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.03544998, + "balance_loss_mlp": 1.02098072, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 1.5597706485444327, + "language_loss": 0.68635029, + "learning_rate": 2.764161667219749e-07, + "loss": 0.70772231, + "num_input_tokens_seen": 299871665, + "step": 13900, + "time_per_iteration": 2.669365167617798 + }, + { + "auxiliary_loss_clip": 0.01082798, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.0392127, + "balance_loss_mlp": 1.01846409, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 1.370612517600156, + "language_loss": 0.70928055, + "learning_rate": 2.762186403079716e-07, + "loss": 0.730407, + "num_input_tokens_seen": 299891960, + "step": 13901, + "time_per_iteration": 2.554680585861206 + }, + { + "auxiliary_loss_clip": 0.0106153, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.03442097, + "balance_loss_mlp": 1.02460766, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 2.0228589356738453, + "language_loss": 0.799573, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82055861, + "num_input_tokens_seen": 299905070, + "step": 13902, + "time_per_iteration": 2.5529561042785645 + }, + { + "auxiliary_loss_clip": 0.01092139, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.0354079, + "balance_loss_mlp": 1.01937795, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 1.463148152753567, + "language_loss": 0.62521505, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64645064, + "num_input_tokens_seen": 299925130, + "step": 13903, + "time_per_iteration": 3.8545761108398438 + }, + { + "auxiliary_loss_clip": 0.01080627, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.03430605, + "balance_loss_mlp": 1.02481532, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 1.847160944925841, + "language_loss": 0.74571294, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.7668947, + "num_input_tokens_seen": 299943845, + "step": 13904, + "time_per_iteration": 2.548473358154297 + }, + { + "auxiliary_loss_clip": 0.01076117, + "auxiliary_loss_mlp": 0.01028854, + "balance_loss_clip": 1.03214836, + "balance_loss_mlp": 1.01680231, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 1.7533997525022829, + "language_loss": 0.72720921, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.74825895, + "num_input_tokens_seen": 299961620, + "step": 13905, + "time_per_iteration": 3.906623601913452 + }, + { + "auxiliary_loss_clip": 0.01092753, + "auxiliary_loss_mlp": 0.01034268, + "balance_loss_clip": 1.0366708, + "balance_loss_mlp": 1.02333105, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 1.608054588213091, + "language_loss": 0.66458142, + "learning_rate": 2.752319888771e-07, + "loss": 0.68585157, + "num_input_tokens_seen": 299982170, + "step": 13906, + "time_per_iteration": 2.499828815460205 + }, + { + "auxiliary_loss_clip": 0.01091793, + "auxiliary_loss_mlp": 0.01028962, + "balance_loss_clip": 1.03503489, + "balance_loss_mlp": 1.01725519, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 1.468936455740421, + "language_loss": 0.74027067, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76147825, + "num_input_tokens_seen": 300001330, + "step": 13907, + "time_per_iteration": 2.507209539413452 + }, + { + "auxiliary_loss_clip": 0.01072263, + "auxiliary_loss_mlp": 0.01034414, + "balance_loss_clip": 1.03366566, + "balance_loss_mlp": 1.02233791, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 1.606919285973566, + "language_loss": 0.75386679, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.77493346, + "num_input_tokens_seen": 300020645, + "step": 13908, + "time_per_iteration": 4.042942523956299 + }, + { + "auxiliary_loss_clip": 0.01096012, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.03608489, + "balance_loss_mlp": 1.01945901, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 2.218825363529686, + "language_loss": 0.71497732, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.73626602, + "num_input_tokens_seen": 300039945, + "step": 13909, + "time_per_iteration": 2.5318310260772705 + }, + { + "auxiliary_loss_clip": 0.01106626, + "auxiliary_loss_mlp": 0.00784047, + "balance_loss_clip": 1.03529763, + "balance_loss_mlp": 1.00910175, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 1.8327195021422595, + "language_loss": 0.73062122, + "learning_rate": 2.744438449482338e-07, + "loss": 0.74952799, + "num_input_tokens_seen": 300058260, + "step": 13910, + "time_per_iteration": 2.4225032329559326 + }, + { + "auxiliary_loss_clip": 0.01089592, + "auxiliary_loss_mlp": 0.00784057, + "balance_loss_clip": 1.03404248, + "balance_loss_mlp": 1.00765872, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 1.6823936594978595, + "language_loss": 0.73290712, + "learning_rate": 2.742469725305001e-07, + "loss": 0.7516436, + "num_input_tokens_seen": 300076720, + "step": 13911, + "time_per_iteration": 2.489025115966797 + }, + { + "auxiliary_loss_clip": 0.01084984, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.03482008, + "balance_loss_mlp": 1.02791464, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 2.093106860618108, + "language_loss": 0.78895736, + "learning_rate": 2.740501655534946e-07, + "loss": 0.8102051, + "num_input_tokens_seen": 300092950, + "step": 13912, + "time_per_iteration": 2.585545539855957 + }, + { + "auxiliary_loss_clip": 0.0109712, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.0381577, + "balance_loss_mlp": 1.02193522, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 1.748465039759902, + "language_loss": 0.79009497, + "learning_rate": 2.738534240246797e-07, + "loss": 0.8113994, + "num_input_tokens_seen": 300110950, + "step": 13913, + "time_per_iteration": 3.9602251052856445 + }, + { + "auxiliary_loss_clip": 0.01091791, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.03370547, + "balance_loss_mlp": 1.02223492, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 1.9147541466787323, + "language_loss": 0.72987872, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75114071, + "num_input_tokens_seen": 300128705, + "step": 13914, + "time_per_iteration": 2.560734987258911 + }, + { + "auxiliary_loss_clip": 0.01057013, + "auxiliary_loss_mlp": 0.01028994, + "balance_loss_clip": 1.0362581, + "balance_loss_mlp": 1.0168705, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 1.591561488744853, + "language_loss": 0.71130073, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73216081, + "num_input_tokens_seen": 300148635, + "step": 13915, + "time_per_iteration": 2.6519644260406494 + }, + { + "auxiliary_loss_clip": 0.0107284, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.0339992, + "balance_loss_mlp": 1.02046824, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 1.9263281553753617, + "language_loss": 0.7234177, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.74446273, + "num_input_tokens_seen": 300165490, + "step": 13916, + "time_per_iteration": 2.614776372909546 + }, + { + "auxiliary_loss_clip": 0.0107219, + "auxiliary_loss_mlp": 0.00782171, + "balance_loss_clip": 1.03526783, + "balance_loss_mlp": 1.00768495, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 1.9438688982750534, + "language_loss": 0.74707592, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.76561958, + "num_input_tokens_seen": 300182130, + "step": 13917, + "time_per_iteration": 2.575924873352051 + }, + { + "auxiliary_loss_clip": 0.01101537, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.03658068, + "balance_loss_mlp": 1.0187459, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 1.7982244206097922, + "language_loss": 0.78647578, + "learning_rate": 2.728706983644933e-07, + "loss": 0.80779088, + "num_input_tokens_seen": 300203050, + "step": 13918, + "time_per_iteration": 2.561345338821411 + }, + { + "auxiliary_loss_clip": 0.01059896, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_clip": 1.03438079, + "balance_loss_mlp": 1.02118444, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 1.5559761182797092, + "language_loss": 0.68017256, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70110142, + "num_input_tokens_seen": 300224380, + "step": 13919, + "time_per_iteration": 2.6836633682250977 + }, + { + "auxiliary_loss_clip": 0.01086504, + "auxiliary_loss_mlp": 0.0103474, + "balance_loss_clip": 1.03212237, + "balance_loss_mlp": 1.02231812, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 1.7097027352243663, + "language_loss": 0.73852497, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.75973737, + "num_input_tokens_seen": 300242915, + "step": 13920, + "time_per_iteration": 2.5117666721343994 + }, + { + "auxiliary_loss_clip": 0.01083805, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.03408778, + "balance_loss_mlp": 1.01815772, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 1.960167735947292, + "language_loss": 0.6860438, + "learning_rate": 2.722818488237566e-07, + "loss": 0.70718837, + "num_input_tokens_seen": 300261905, + "step": 13921, + "time_per_iteration": 2.547477960586548 + }, + { + "auxiliary_loss_clip": 0.01097502, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.03624272, + "balance_loss_mlp": 1.0238471, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 1.8894399067133132, + "language_loss": 0.85623074, + "learning_rate": 2.720856966640801e-07, + "loss": 0.87755984, + "num_input_tokens_seen": 300281145, + "step": 13922, + "time_per_iteration": 2.4766440391540527 + }, + { + "auxiliary_loss_clip": 0.01061632, + "auxiliary_loss_mlp": 0.00782387, + "balance_loss_clip": 1.03312671, + "balance_loss_mlp": 1.00959134, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 1.7009172480493002, + "language_loss": 0.71826839, + "learning_rate": 2.71889610027088e-07, + "loss": 0.73670852, + "num_input_tokens_seen": 300301610, + "step": 13923, + "time_per_iteration": 2.585829973220825 + }, + { + "auxiliary_loss_clip": 0.01078167, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.0351634, + "balance_loss_mlp": 1.01702404, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 1.8693050149404153, + "language_loss": 0.76247787, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.78355747, + "num_input_tokens_seen": 300319420, + "step": 13924, + "time_per_iteration": 2.540253162384033 + }, + { + "auxiliary_loss_clip": 0.01081174, + "auxiliary_loss_mlp": 0.01024728, + "balance_loss_clip": 1.03393817, + "balance_loss_mlp": 1.01284266, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 1.4855306657387035, + "language_loss": 0.64792436, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.6689834, + "num_input_tokens_seen": 300341325, + "step": 13925, + "time_per_iteration": 2.586538314819336 + }, + { + "auxiliary_loss_clip": 0.01083067, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.03688991, + "balance_loss_mlp": 1.01943815, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 1.5332352944294216, + "language_loss": 0.74442226, + "learning_rate": 2.713017433265543e-07, + "loss": 0.765571, + "num_input_tokens_seen": 300361620, + "step": 13926, + "time_per_iteration": 2.5796051025390625 + }, + { + "auxiliary_loss_clip": 0.01094462, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.03693533, + "balance_loss_mlp": 1.02055407, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 1.7498484354281776, + "language_loss": 0.71085656, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73213089, + "num_input_tokens_seen": 300378675, + "step": 13927, + "time_per_iteration": 2.4699366092681885 + }, + { + "auxiliary_loss_clip": 0.01008536, + "auxiliary_loss_mlp": 0.01002315, + "balance_loss_clip": 1.00784945, + "balance_loss_mlp": 1.00118244, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 0.709560756023474, + "language_loss": 0.58822608, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.6083346, + "num_input_tokens_seen": 300449740, + "step": 13928, + "time_per_iteration": 3.261688232421875 + }, + { + "auxiliary_loss_clip": 0.01068119, + "auxiliary_loss_mlp": 0.0103781, + "balance_loss_clip": 1.03757548, + "balance_loss_mlp": 1.02457142, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 1.7361076025926911, + "language_loss": 0.6992979, + "learning_rate": 2.707144665977068e-07, + "loss": 0.72035718, + "num_input_tokens_seen": 300470000, + "step": 13929, + "time_per_iteration": 2.5716052055358887 + }, + { + "auxiliary_loss_clip": 0.01094615, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.03534102, + "balance_loss_mlp": 1.01505494, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 1.4357393147939819, + "language_loss": 0.66971534, + "learning_rate": 2.705188388275574e-07, + "loss": 0.69093668, + "num_input_tokens_seen": 300494975, + "step": 13930, + "time_per_iteration": 2.6938843727111816 + }, + { + "auxiliary_loss_clip": 0.01063236, + "auxiliary_loss_mlp": 0.01027354, + "balance_loss_clip": 1.03734064, + "balance_loss_mlp": 1.01552224, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 1.6162014239351166, + "language_loss": 0.71177053, + "learning_rate": 2.703232766395067e-07, + "loss": 0.73267639, + "num_input_tokens_seen": 300513175, + "step": 13931, + "time_per_iteration": 2.5953662395477295 + }, + { + "auxiliary_loss_clip": 0.01068282, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.03158212, + "balance_loss_mlp": 1.0201385, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 1.58174218051321, + "language_loss": 0.71856117, + "learning_rate": 2.701277800409705e-07, + "loss": 0.73956692, + "num_input_tokens_seen": 300533770, + "step": 13932, + "time_per_iteration": 2.554008960723877 + }, + { + "auxiliary_loss_clip": 0.0103908, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.03232956, + "balance_loss_mlp": 1.02050209, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 2.0348648832372316, + "language_loss": 0.66496444, + "learning_rate": 2.699323490393628e-07, + "loss": 0.6856665, + "num_input_tokens_seen": 300552995, + "step": 13933, + "time_per_iteration": 2.6921565532684326 + }, + { + "auxiliary_loss_clip": 0.01074007, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.03422058, + "balance_loss_mlp": 1.02747893, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 2.0740067893872185, + "language_loss": 0.76653624, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78766817, + "num_input_tokens_seen": 300570275, + "step": 13934, + "time_per_iteration": 2.492398738861084 + }, + { + "auxiliary_loss_clip": 0.01093979, + "auxiliary_loss_mlp": 0.01027898, + "balance_loss_clip": 1.03797972, + "balance_loss_mlp": 1.01608396, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 1.5593046782831266, + "language_loss": 0.77276808, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79398692, + "num_input_tokens_seen": 300590875, + "step": 13935, + "time_per_iteration": 2.5162315368652344 + }, + { + "auxiliary_loss_clip": 0.01063486, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.03644395, + "balance_loss_mlp": 1.01746464, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 3.002655436567441, + "language_loss": 0.55746806, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.57839435, + "num_input_tokens_seen": 300607490, + "step": 13936, + "time_per_iteration": 2.5022873878479004 + }, + { + "auxiliary_loss_clip": 0.01087211, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.03261173, + "balance_loss_mlp": 1.02069592, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 1.7398262323607545, + "language_loss": 0.89500827, + "learning_rate": 2.691512811503882e-07, + "loss": 0.91621017, + "num_input_tokens_seen": 300623635, + "step": 13937, + "time_per_iteration": 2.4685986042022705 + }, + { + "auxiliary_loss_clip": 0.01095015, + "auxiliary_loss_mlp": 0.01029308, + "balance_loss_clip": 1.03565812, + "balance_loss_mlp": 1.01747704, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 1.6732919989714432, + "language_loss": 0.81909549, + "learning_rate": 2.689561782445313e-07, + "loss": 0.84033871, + "num_input_tokens_seen": 300643835, + "step": 13938, + "time_per_iteration": 2.496149778366089 + }, + { + "auxiliary_loss_clip": 0.01096508, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.03545737, + "balance_loss_mlp": 1.01774406, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 1.7334849160631554, + "language_loss": 0.70441115, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.7256794, + "num_input_tokens_seen": 300662500, + "step": 13939, + "time_per_iteration": 2.5023746490478516 + }, + { + "auxiliary_loss_clip": 0.01074238, + "auxiliary_loss_mlp": 0.01038421, + "balance_loss_clip": 1.03484201, + "balance_loss_mlp": 1.02502775, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 1.6437890385274052, + "language_loss": 0.76120734, + "learning_rate": 2.6856616936428e-07, + "loss": 0.78233391, + "num_input_tokens_seen": 300681480, + "step": 13940, + "time_per_iteration": 2.580244541168213 + }, + { + "auxiliary_loss_clip": 0.01090319, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.03547549, + "balance_loss_mlp": 1.02160907, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 1.9761393224990804, + "language_loss": 0.76670063, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.78794312, + "num_input_tokens_seen": 300699165, + "step": 13941, + "time_per_iteration": 2.5179035663604736 + }, + { + "auxiliary_loss_clip": 0.01062542, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.03330958, + "balance_loss_mlp": 1.01706195, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 2.098510605216128, + "language_loss": 0.73601764, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75693905, + "num_input_tokens_seen": 300714615, + "step": 13942, + "time_per_iteration": 3.983585834503174 + }, + { + "auxiliary_loss_clip": 0.01062537, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.03512776, + "balance_loss_mlp": 1.02234006, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 1.4196172572646153, + "language_loss": 0.79349184, + "learning_rate": 2.679816484834554e-07, + "loss": 0.81447589, + "num_input_tokens_seen": 300734860, + "step": 13943, + "time_per_iteration": 3.9789185523986816 + }, + { + "auxiliary_loss_clip": 0.01053736, + "auxiliary_loss_mlp": 0.01030665, + "balance_loss_clip": 1.03159308, + "balance_loss_mlp": 1.01898861, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 1.9165464582437033, + "language_loss": 0.84908515, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.86992919, + "num_input_tokens_seen": 300752735, + "step": 13944, + "time_per_iteration": 2.5898139476776123 + }, + { + "auxiliary_loss_clip": 0.01017139, + "auxiliary_loss_mlp": 0.00762318, + "balance_loss_clip": 1.0065372, + "balance_loss_mlp": 1.0033344, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.7029654488139638, + "language_loss": 0.5023905, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52018511, + "num_input_tokens_seen": 300820760, + "step": 13945, + "time_per_iteration": 3.2068533897399902 + }, + { + "auxiliary_loss_clip": 0.01058623, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.0344497, + "balance_loss_mlp": 1.01924479, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 1.5479795236038068, + "language_loss": 0.65014857, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67104387, + "num_input_tokens_seen": 300840025, + "step": 13946, + "time_per_iteration": 2.631258010864258 + }, + { + "auxiliary_loss_clip": 0.01054132, + "auxiliary_loss_mlp": 0.0103326, + "balance_loss_clip": 1.03125811, + "balance_loss_mlp": 1.02062368, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 1.6923282852596198, + "language_loss": 0.67612338, + "learning_rate": 2.672032068397829e-07, + "loss": 0.69699728, + "num_input_tokens_seen": 300860380, + "step": 13947, + "time_per_iteration": 4.000232934951782 + }, + { + "auxiliary_loss_clip": 0.01079025, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.03455901, + "balance_loss_mlp": 1.02030659, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 1.5194661912900052, + "language_loss": 0.69700718, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.71813023, + "num_input_tokens_seen": 300881895, + "step": 13948, + "time_per_iteration": 2.6137001514434814 + }, + { + "auxiliary_loss_clip": 0.01078556, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.03425431, + "balance_loss_mlp": 1.02185082, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 1.7931961525172722, + "language_loss": 0.84667385, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.86778349, + "num_input_tokens_seen": 300901575, + "step": 13949, + "time_per_iteration": 2.560199737548828 + }, + { + "auxiliary_loss_clip": 0.01076927, + "auxiliary_loss_mlp": 0.01025536, + "balance_loss_clip": 1.03681612, + "balance_loss_mlp": 1.01393104, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 3.0973921407735823, + "language_loss": 0.70326722, + "learning_rate": 2.66620065513385e-07, + "loss": 0.7242918, + "num_input_tokens_seen": 300919735, + "step": 13950, + "time_per_iteration": 2.521230459213257 + }, + { + "auxiliary_loss_clip": 0.01093724, + "auxiliary_loss_mlp": 0.01027066, + "balance_loss_clip": 1.03604341, + "balance_loss_mlp": 1.01494241, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 2.270612636817188, + "language_loss": 0.64484739, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.66605532, + "num_input_tokens_seen": 300939150, + "step": 13951, + "time_per_iteration": 2.491940975189209 + }, + { + "auxiliary_loss_clip": 0.01094124, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.03597164, + "balance_loss_mlp": 1.0177083, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 3.0508049334135254, + "language_loss": 0.70184088, + "learning_rate": 2.662316332665393e-07, + "loss": 0.72307414, + "num_input_tokens_seen": 300959730, + "step": 13952, + "time_per_iteration": 3.932929277420044 + }, + { + "auxiliary_loss_clip": 0.01090912, + "auxiliary_loss_mlp": 0.0102734, + "balance_loss_clip": 1.035182, + "balance_loss_mlp": 1.01589584, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 1.9667533939445316, + "language_loss": 0.72458172, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.7457642, + "num_input_tokens_seen": 300976120, + "step": 13953, + "time_per_iteration": 2.4690165519714355 + }, + { + "auxiliary_loss_clip": 0.01039961, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.03356886, + "balance_loss_mlp": 1.01727414, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 2.0272414186837606, + "language_loss": 0.68074054, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70144808, + "num_input_tokens_seen": 300995080, + "step": 13954, + "time_per_iteration": 2.614821195602417 + }, + { + "auxiliary_loss_clip": 0.01083701, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.03689027, + "balance_loss_mlp": 1.01990318, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 1.7864248424906386, + "language_loss": 0.73158157, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75272906, + "num_input_tokens_seen": 301012920, + "step": 13955, + "time_per_iteration": 2.5112876892089844 + }, + { + "auxiliary_loss_clip": 0.01039473, + "auxiliary_loss_mlp": 0.01030339, + "balance_loss_clip": 1.03133523, + "balance_loss_mlp": 1.01797748, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 2.345639350732659, + "language_loss": 0.66278088, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.68347907, + "num_input_tokens_seen": 301028875, + "step": 13956, + "time_per_iteration": 2.6299521923065186 + }, + { + "auxiliary_loss_clip": 0.01094636, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.03535938, + "balance_loss_mlp": 1.02067065, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 1.6916541217753913, + "language_loss": 0.79714274, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.81841922, + "num_input_tokens_seen": 301050115, + "step": 13957, + "time_per_iteration": 2.516850709915161 + }, + { + "auxiliary_loss_clip": 0.00993873, + "auxiliary_loss_mlp": 0.01008596, + "balance_loss_clip": 1.02593207, + "balance_loss_mlp": 1.00723076, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.7634375852212392, + "language_loss": 0.53354728, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55357194, + "num_input_tokens_seen": 301114155, + "step": 13958, + "time_per_iteration": 3.3439841270446777 + }, + { + "auxiliary_loss_clip": 0.01093186, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.03598833, + "balance_loss_mlp": 1.01810694, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 1.810273997402119, + "language_loss": 0.73224294, + "learning_rate": 2.648741917459574e-07, + "loss": 0.75347912, + "num_input_tokens_seen": 301133150, + "step": 13959, + "time_per_iteration": 2.50077486038208 + }, + { + "auxiliary_loss_clip": 0.01068807, + "auxiliary_loss_mlp": 0.01024245, + "balance_loss_clip": 1.03745222, + "balance_loss_mlp": 1.01326549, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 1.778078653647245, + "language_loss": 0.55045342, + "learning_rate": 2.646805346545169e-07, + "loss": 0.57138395, + "num_input_tokens_seen": 301153600, + "step": 13960, + "time_per_iteration": 2.621269941329956 + }, + { + "auxiliary_loss_clip": 0.01003728, + "auxiliary_loss_mlp": 0.01002952, + "balance_loss_clip": 1.00954449, + "balance_loss_mlp": 1.00183129, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 0.773854739093631, + "language_loss": 0.60693908, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.62700588, + "num_input_tokens_seen": 301214335, + "step": 13961, + "time_per_iteration": 3.245011806488037 + }, + { + "auxiliary_loss_clip": 0.01049391, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.03092337, + "balance_loss_mlp": 1.02413058, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 2.013106193141199, + "language_loss": 0.68211877, + "learning_rate": 2.642934178894405e-07, + "loss": 0.70297706, + "num_input_tokens_seen": 301228960, + "step": 13962, + "time_per_iteration": 2.5503060817718506 + }, + { + "auxiliary_loss_clip": 0.01072379, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.03278184, + "balance_loss_mlp": 1.01891851, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 1.8004332406285057, + "language_loss": 0.73236167, + "learning_rate": 2.640999582304841e-07, + "loss": 0.75339389, + "num_input_tokens_seen": 301245875, + "step": 13963, + "time_per_iteration": 2.5347201824188232 + }, + { + "auxiliary_loss_clip": 0.0108082, + "auxiliary_loss_mlp": 0.01034277, + "balance_loss_clip": 1.03283024, + "balance_loss_mlp": 1.02285051, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 1.5284985692084379, + "language_loss": 0.76600415, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78715515, + "num_input_tokens_seen": 301265550, + "step": 13964, + "time_per_iteration": 2.5668647289276123 + }, + { + "auxiliary_loss_clip": 0.01085647, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.03607941, + "balance_loss_mlp": 1.02073944, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 2.418753717242034, + "language_loss": 0.78081113, + "learning_rate": 2.637132363964161e-07, + "loss": 0.80200529, + "num_input_tokens_seen": 301282035, + "step": 13965, + "time_per_iteration": 2.5139474868774414 + }, + { + "auxiliary_loss_clip": 0.01085956, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.03328383, + "balance_loss_mlp": 1.0206728, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 1.6206627418033916, + "language_loss": 0.65721214, + "learning_rate": 2.635199742359684e-07, + "loss": 0.67838955, + "num_input_tokens_seen": 301305210, + "step": 13966, + "time_per_iteration": 2.618116617202759 + }, + { + "auxiliary_loss_clip": 0.01081813, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.03488588, + "balance_loss_mlp": 1.01802099, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 1.6745473593323255, + "language_loss": 0.74263608, + "learning_rate": 2.633267779230177e-07, + "loss": 0.76375049, + "num_input_tokens_seen": 301324885, + "step": 13967, + "time_per_iteration": 2.589142322540283 + }, + { + "auxiliary_loss_clip": 0.01077357, + "auxiliary_loss_mlp": 0.01029236, + "balance_loss_clip": 1.03583789, + "balance_loss_mlp": 1.01732087, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 2.119495905415382, + "language_loss": 0.82924747, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85031343, + "num_input_tokens_seen": 301343070, + "step": 13968, + "time_per_iteration": 2.5202994346618652 + }, + { + "auxiliary_loss_clip": 0.01085164, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.03675699, + "balance_loss_mlp": 1.01857305, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 1.9470270704890418, + "language_loss": 0.77535272, + "learning_rate": 2.629405828689075e-07, + "loss": 0.79650688, + "num_input_tokens_seen": 301359280, + "step": 13969, + "time_per_iteration": 2.5028836727142334 + }, + { + "auxiliary_loss_clip": 0.01085424, + "auxiliary_loss_mlp": 0.01026436, + "balance_loss_clip": 1.03443527, + "balance_loss_mlp": 1.01371026, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 2.017098627491471, + "language_loss": 0.77053446, + "learning_rate": 2.627475841423923e-07, + "loss": 0.7916531, + "num_input_tokens_seen": 301376465, + "step": 13970, + "time_per_iteration": 2.571639060974121 + }, + { + "auxiliary_loss_clip": 0.01081698, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.03393054, + "balance_loss_mlp": 1.02491724, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 1.8289167374169204, + "language_loss": 0.72192836, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74311304, + "num_input_tokens_seen": 301396000, + "step": 13971, + "time_per_iteration": 2.5542821884155273 + }, + { + "auxiliary_loss_clip": 0.01082372, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.03391576, + "balance_loss_mlp": 1.01792026, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 1.9133947573434664, + "language_loss": 0.77420157, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79532856, + "num_input_tokens_seen": 301413160, + "step": 13972, + "time_per_iteration": 2.4767229557037354 + }, + { + "auxiliary_loss_clip": 0.01038683, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.03441715, + "balance_loss_mlp": 1.02083683, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 1.3656360380535346, + "language_loss": 0.68362391, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70434463, + "num_input_tokens_seen": 301433325, + "step": 13973, + "time_per_iteration": 2.641472578048706 + }, + { + "auxiliary_loss_clip": 0.01082863, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_clip": 1.03468347, + "balance_loss_mlp": 1.01550186, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 1.8998949556439089, + "language_loss": 0.78131318, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80241698, + "num_input_tokens_seen": 301450265, + "step": 13974, + "time_per_iteration": 2.5020694732666016 + }, + { + "auxiliary_loss_clip": 0.01087934, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.03528869, + "balance_loss_mlp": 1.01534462, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1.5417594706075974, + "language_loss": 0.72944021, + "learning_rate": 2.617835788078868e-07, + "loss": 0.7505914, + "num_input_tokens_seen": 301470760, + "step": 13975, + "time_per_iteration": 2.521355152130127 + }, + { + "auxiliary_loss_clip": 0.01082701, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.03402424, + "balance_loss_mlp": 1.01937795, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 1.6902734931603942, + "language_loss": 0.72679693, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74794126, + "num_input_tokens_seen": 301489425, + "step": 13976, + "time_per_iteration": 2.4927144050598145 + }, + { + "auxiliary_loss_clip": 0.01100953, + "auxiliary_loss_mlp": 0.00785152, + "balance_loss_clip": 1.03382206, + "balance_loss_mlp": 1.01363933, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 1.7799803976507613, + "language_loss": 0.71813053, + "learning_rate": 2.61398438016311e-07, + "loss": 0.73699158, + "num_input_tokens_seen": 301508885, + "step": 13977, + "time_per_iteration": 2.475717306137085 + }, + { + "auxiliary_loss_clip": 0.01090377, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.03180933, + "balance_loss_mlp": 1.01714611, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 1.606603836356689, + "language_loss": 0.68375093, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70494199, + "num_input_tokens_seen": 301533780, + "step": 13978, + "time_per_iteration": 2.6183300018310547 + }, + { + "auxiliary_loss_clip": 0.01068185, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.03433323, + "balance_loss_mlp": 1.01850593, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 1.8049160861709796, + "language_loss": 0.7800808, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80106515, + "num_input_tokens_seen": 301551775, + "step": 13979, + "time_per_iteration": 2.540816307067871 + }, + { + "auxiliary_loss_clip": 0.01088555, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.03693974, + "balance_loss_mlp": 1.01704335, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 2.2874958593142316, + "language_loss": 0.78039193, + "learning_rate": 2.60821221306778e-07, + "loss": 0.80156612, + "num_input_tokens_seen": 301570495, + "step": 13980, + "time_per_iteration": 2.458989381790161 + }, + { + "auxiliary_loss_clip": 0.01068566, + "auxiliary_loss_mlp": 0.01027334, + "balance_loss_clip": 1.03439975, + "balance_loss_mlp": 1.01623607, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 1.728087695129275, + "language_loss": 0.86739993, + "learning_rate": 2.606289476268757e-07, + "loss": 0.88835895, + "num_input_tokens_seen": 301591705, + "step": 13981, + "time_per_iteration": 3.987117290496826 + }, + { + "auxiliary_loss_clip": 0.01093165, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.03544235, + "balance_loss_mlp": 1.01940739, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 1.771615046201088, + "language_loss": 0.67289615, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.69413853, + "num_input_tokens_seen": 301611670, + "step": 13982, + "time_per_iteration": 3.9059979915618896 + }, + { + "auxiliary_loss_clip": 0.01056478, + "auxiliary_loss_mlp": 0.01034489, + "balance_loss_clip": 1.03617597, + "balance_loss_mlp": 1.02078009, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 2.537158398339885, + "language_loss": 0.68061149, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70152116, + "num_input_tokens_seen": 301632540, + "step": 13983, + "time_per_iteration": 2.6372921466827393 + }, + { + "auxiliary_loss_clip": 0.01064651, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.02943587, + "balance_loss_mlp": 1.01949549, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 1.8411722007302604, + "language_loss": 0.79041213, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.81137943, + "num_input_tokens_seen": 301651480, + "step": 13984, + "time_per_iteration": 2.604414701461792 + }, + { + "auxiliary_loss_clip": 0.01089281, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.03198361, + "balance_loss_mlp": 1.01797962, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 2.7126331163489503, + "language_loss": 0.60068929, + "learning_rate": 2.598605125513842e-07, + "loss": 0.62188196, + "num_input_tokens_seen": 301670010, + "step": 13985, + "time_per_iteration": 3.874861001968384 + }, + { + "auxiliary_loss_clip": 0.01062576, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.03302062, + "balance_loss_mlp": 1.01551032, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 2.383817988528982, + "language_loss": 0.8203634, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.84127033, + "num_input_tokens_seen": 301689785, + "step": 13986, + "time_per_iteration": 2.6269054412841797 + }, + { + "auxiliary_loss_clip": 0.01082853, + "auxiliary_loss_mlp": 0.00782968, + "balance_loss_clip": 1.0376066, + "balance_loss_mlp": 1.01029432, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 1.4359836520669882, + "language_loss": 0.65803772, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.676696, + "num_input_tokens_seen": 301712225, + "step": 13987, + "time_per_iteration": 2.671391725540161 + }, + { + "auxiliary_loss_clip": 0.01103273, + "auxiliary_loss_mlp": 0.00784682, + "balance_loss_clip": 1.03514886, + "balance_loss_mlp": 1.01276839, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 1.9435432036470026, + "language_loss": 0.67448819, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69336772, + "num_input_tokens_seen": 301730955, + "step": 13988, + "time_per_iteration": 2.538529634475708 + }, + { + "auxiliary_loss_clip": 0.01093438, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.03819144, + "balance_loss_mlp": 1.02176225, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 2.149081050016251, + "language_loss": 0.80806673, + "learning_rate": 2.590931332560622e-07, + "loss": 0.82934606, + "num_input_tokens_seen": 301746930, + "step": 13989, + "time_per_iteration": 2.53222393989563 + }, + { + "auxiliary_loss_clip": 0.01091207, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.03337288, + "balance_loss_mlp": 1.01594567, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 1.6713849724804943, + "language_loss": 0.75513458, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.77632564, + "num_input_tokens_seen": 301766945, + "step": 13990, + "time_per_iteration": 2.5384159088134766 + }, + { + "auxiliary_loss_clip": 0.01086121, + "auxiliary_loss_mlp": 0.01030257, + "balance_loss_clip": 1.03212571, + "balance_loss_mlp": 1.01887822, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 1.5709025546696422, + "language_loss": 0.8107233, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.83188713, + "num_input_tokens_seen": 301785460, + "step": 13991, + "time_per_iteration": 3.9016592502593994 + }, + { + "auxiliary_loss_clip": 0.01064524, + "auxiliary_loss_mlp": 0.01027877, + "balance_loss_clip": 1.03518748, + "balance_loss_mlp": 1.01655817, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 2.1036693991442967, + "language_loss": 0.70923877, + "learning_rate": 2.585182919204105e-07, + "loss": 0.73016274, + "num_input_tokens_seen": 301804180, + "step": 13992, + "time_per_iteration": 2.551100730895996 + }, + { + "auxiliary_loss_clip": 0.01069445, + "auxiliary_loss_mlp": 0.01025118, + "balance_loss_clip": 1.03370273, + "balance_loss_mlp": 1.01325631, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 1.5564021866570537, + "language_loss": 0.7678535, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78879911, + "num_input_tokens_seen": 301823670, + "step": 13993, + "time_per_iteration": 2.5669591426849365 + }, + { + "auxiliary_loss_clip": 0.01098042, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.03394711, + "balance_loss_mlp": 1.02056968, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 2.0745629802378973, + "language_loss": 0.74213797, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76346058, + "num_input_tokens_seen": 301845890, + "step": 13994, + "time_per_iteration": 2.521392583847046 + }, + { + "auxiliary_loss_clip": 0.01091004, + "auxiliary_loss_mlp": 0.01029074, + "balance_loss_clip": 1.03465939, + "balance_loss_mlp": 1.01829708, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 1.583370449752617, + "language_loss": 0.59623295, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.61743367, + "num_input_tokens_seen": 301863985, + "step": 13995, + "time_per_iteration": 2.4629693031311035 + }, + { + "auxiliary_loss_clip": 0.01092112, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.03461838, + "balance_loss_mlp": 1.01974773, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 1.6909463511267706, + "language_loss": 0.71822989, + "learning_rate": 2.577527613603163e-07, + "loss": 0.73947549, + "num_input_tokens_seen": 301882765, + "step": 13996, + "time_per_iteration": 2.5143539905548096 + }, + { + "auxiliary_loss_clip": 0.01078346, + "auxiliary_loss_mlp": 0.01027263, + "balance_loss_clip": 1.03374696, + "balance_loss_mlp": 1.01547337, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 1.9078117280203712, + "language_loss": 0.63923883, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66029489, + "num_input_tokens_seen": 301902720, + "step": 13997, + "time_per_iteration": 2.5474588871002197 + }, + { + "auxiliary_loss_clip": 0.01079045, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.03478193, + "balance_loss_mlp": 1.01873231, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 2.962944862750636, + "language_loss": 0.81865853, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.83976293, + "num_input_tokens_seen": 301921245, + "step": 13998, + "time_per_iteration": 2.513605833053589 + }, + { + "auxiliary_loss_clip": 0.01094261, + "auxiliary_loss_mlp": 0.00782892, + "balance_loss_clip": 1.03592634, + "balance_loss_mlp": 1.00878632, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 1.9519178945856186, + "language_loss": 0.80074787, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.81951934, + "num_input_tokens_seen": 301942320, + "step": 13999, + "time_per_iteration": 2.545562982559204 + }, + { + "auxiliary_loss_clip": 0.01093369, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.03459144, + "balance_loss_mlp": 1.0189991, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 2.2467735973041174, + "language_loss": 0.66802597, + "learning_rate": 2.569882878592096e-07, + "loss": 0.68928123, + "num_input_tokens_seen": 301963110, + "step": 14000, + "time_per_iteration": 2.531163215637207 + }, + { + "auxiliary_loss_clip": 0.01097972, + "auxiliary_loss_mlp": 0.01030185, + "balance_loss_clip": 1.03605437, + "balance_loss_mlp": 1.0178225, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 1.4579318033897304, + "language_loss": 0.79422134, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81550288, + "num_input_tokens_seen": 301984915, + "step": 14001, + "time_per_iteration": 2.5269150733947754 + }, + { + "auxiliary_loss_clip": 0.01040679, + "auxiliary_loss_mlp": 0.01028356, + "balance_loss_clip": 1.03467679, + "balance_loss_mlp": 1.01754951, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 1.5895008759139129, + "language_loss": 0.7858814, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80657172, + "num_input_tokens_seen": 302004095, + "step": 14002, + "time_per_iteration": 2.6270828247070312 + }, + { + "auxiliary_loss_clip": 0.01056785, + "auxiliary_loss_mlp": 0.0078021, + "balance_loss_clip": 1.03304863, + "balance_loss_mlp": 1.00545847, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 1.4247947456424408, + "language_loss": 0.78281999, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.8011899, + "num_input_tokens_seen": 302027250, + "step": 14003, + "time_per_iteration": 2.653022527694702 + }, + { + "auxiliary_loss_clip": 0.01077054, + "auxiliary_loss_mlp": 0.01028494, + "balance_loss_clip": 1.03611934, + "balance_loss_mlp": 1.01657319, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 1.698481793283611, + "language_loss": 0.65640002, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.67745548, + "num_input_tokens_seen": 302046950, + "step": 14004, + "time_per_iteration": 2.540910005569458 + }, + { + "auxiliary_loss_clip": 0.01093576, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.03463483, + "balance_loss_mlp": 1.01649106, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 1.9232798663906334, + "language_loss": 0.76172906, + "learning_rate": 2.560341831785724e-07, + "loss": 0.78295636, + "num_input_tokens_seen": 302065470, + "step": 14005, + "time_per_iteration": 2.5526506900787354 + }, + { + "auxiliary_loss_clip": 0.01066479, + "auxiliary_loss_mlp": 0.00784196, + "balance_loss_clip": 1.03173184, + "balance_loss_mlp": 1.00902939, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 1.6247743149622682, + "language_loss": 0.77698839, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79549515, + "num_input_tokens_seen": 302083190, + "step": 14006, + "time_per_iteration": 2.565410852432251 + }, + { + "auxiliary_loss_clip": 0.01092755, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.03482866, + "balance_loss_mlp": 1.02033162, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 1.9569419504515364, + "language_loss": 0.77069658, + "learning_rate": 2.556530041751932e-07, + "loss": 0.79194552, + "num_input_tokens_seen": 302098820, + "step": 14007, + "time_per_iteration": 2.4760329723358154 + }, + { + "auxiliary_loss_clip": 0.0108419, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.03531241, + "balance_loss_mlp": 1.01743114, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 1.9739930871568838, + "language_loss": 0.6567415, + "learning_rate": 2.554625138886102e-07, + "loss": 0.67788172, + "num_input_tokens_seen": 302117075, + "step": 14008, + "time_per_iteration": 2.6162703037261963 + }, + { + "auxiliary_loss_clip": 0.01019803, + "auxiliary_loss_mlp": 0.01004488, + "balance_loss_clip": 1.0065589, + "balance_loss_mlp": 1.00331962, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.7118714307314838, + "language_loss": 0.56933463, + "learning_rate": 2.552720897550631e-07, + "loss": 0.5895775, + "num_input_tokens_seen": 302179735, + "step": 14009, + "time_per_iteration": 3.172305107116699 + }, + { + "auxiliary_loss_clip": 0.01040635, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.03151071, + "balance_loss_mlp": 1.02041328, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 1.395751042761149, + "language_loss": 0.77996492, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80068666, + "num_input_tokens_seen": 302202055, + "step": 14010, + "time_per_iteration": 2.6547160148620605 + }, + { + "auxiliary_loss_clip": 0.01107418, + "auxiliary_loss_mlp": 0.01034284, + "balance_loss_clip": 1.03675103, + "balance_loss_mlp": 1.02132034, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 1.954332654757083, + "language_loss": 0.72234249, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74375945, + "num_input_tokens_seen": 302221360, + "step": 14011, + "time_per_iteration": 2.4374656677246094 + }, + { + "auxiliary_loss_clip": 0.01087721, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.03464723, + "balance_loss_mlp": 1.02218711, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 1.7702280373626171, + "language_loss": 0.83927464, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86048859, + "num_input_tokens_seen": 302240715, + "step": 14012, + "time_per_iteration": 2.522848606109619 + }, + { + "auxiliary_loss_clip": 0.01096023, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.03311956, + "balance_loss_mlp": 1.01934028, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 1.8967647532420013, + "language_loss": 0.6783185, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.69957352, + "num_input_tokens_seen": 302260950, + "step": 14013, + "time_per_iteration": 2.471808433532715 + }, + { + "auxiliary_loss_clip": 0.01108591, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.03620195, + "balance_loss_mlp": 1.01728415, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 5.062969196461233, + "language_loss": 0.78640604, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.80778694, + "num_input_tokens_seen": 302277500, + "step": 14014, + "time_per_iteration": 2.4459071159362793 + }, + { + "auxiliary_loss_clip": 0.01073563, + "auxiliary_loss_mlp": 0.00784769, + "balance_loss_clip": 1.03401959, + "balance_loss_mlp": 1.01200593, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 1.748955673945598, + "language_loss": 0.67746276, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.69604611, + "num_input_tokens_seen": 302297930, + "step": 14015, + "time_per_iteration": 2.5660958290100098 + }, + { + "auxiliary_loss_clip": 0.01103092, + "auxiliary_loss_mlp": 0.01030047, + "balance_loss_clip": 1.03508639, + "balance_loss_mlp": 1.01726747, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 2.216353873602798, + "language_loss": 0.76214898, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.78348029, + "num_input_tokens_seen": 302315735, + "step": 14016, + "time_per_iteration": 2.4463095664978027 + }, + { + "auxiliary_loss_clip": 0.01080701, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.03428483, + "balance_loss_mlp": 1.02158642, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 1.751734608495614, + "language_loss": 0.79380459, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81494546, + "num_input_tokens_seen": 302332790, + "step": 14017, + "time_per_iteration": 2.534085750579834 + }, + { + "auxiliary_loss_clip": 0.01081577, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.03504086, + "balance_loss_mlp": 1.01871085, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 1.9772314494697347, + "language_loss": 0.62621224, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.64733589, + "num_input_tokens_seen": 302346490, + "step": 14018, + "time_per_iteration": 2.4998939037323 + }, + { + "auxiliary_loss_clip": 0.01090614, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.03597367, + "balance_loss_mlp": 1.01869774, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 1.8371737500607932, + "language_loss": 0.79249346, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81369841, + "num_input_tokens_seen": 302363235, + "step": 14019, + "time_per_iteration": 2.4686081409454346 + }, + { + "auxiliary_loss_clip": 0.01062825, + "auxiliary_loss_mlp": 0.01039431, + "balance_loss_clip": 1.03201985, + "balance_loss_mlp": 1.02476883, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 1.7723844039959444, + "language_loss": 0.78175735, + "learning_rate": 2.531817924498265e-07, + "loss": 0.80277997, + "num_input_tokens_seen": 302383270, + "step": 14020, + "time_per_iteration": 3.9817473888397217 + }, + { + "auxiliary_loss_clip": 0.01089369, + "auxiliary_loss_mlp": 0.01026378, + "balance_loss_clip": 1.03532529, + "balance_loss_mlp": 1.01473689, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 1.5355613316661234, + "language_loss": 0.71187341, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.73303092, + "num_input_tokens_seen": 302401355, + "step": 14021, + "time_per_iteration": 3.8699326515197754 + }, + { + "auxiliary_loss_clip": 0.0108313, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.03807497, + "balance_loss_mlp": 1.0241245, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 1.8048883217073266, + "language_loss": 0.69811922, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.71930623, + "num_input_tokens_seen": 302419515, + "step": 14022, + "time_per_iteration": 2.5486457347869873 + }, + { + "auxiliary_loss_clip": 0.01048686, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.03414106, + "balance_loss_mlp": 1.02338731, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 1.9902692796211197, + "language_loss": 0.72391009, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74476129, + "num_input_tokens_seen": 302438280, + "step": 14023, + "time_per_iteration": 2.629359483718872 + }, + { + "auxiliary_loss_clip": 0.01094393, + "auxiliary_loss_mlp": 0.01034424, + "balance_loss_clip": 1.03677046, + "balance_loss_mlp": 1.02166855, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 1.537324373175969, + "language_loss": 0.66834784, + "learning_rate": 2.524236710204559e-07, + "loss": 0.68963599, + "num_input_tokens_seen": 302460860, + "step": 14024, + "time_per_iteration": 4.082241058349609 + }, + { + "auxiliary_loss_clip": 0.01089786, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.0365355, + "balance_loss_mlp": 1.01845229, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 1.7845263323708536, + "language_loss": 0.81111634, + "learning_rate": 2.522343063158261e-07, + "loss": 0.83231801, + "num_input_tokens_seen": 302476980, + "step": 14025, + "time_per_iteration": 2.476935863494873 + }, + { + "auxiliary_loss_clip": 0.01087684, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.03390479, + "balance_loss_mlp": 1.02139461, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 1.4883081501714226, + "language_loss": 0.77732062, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.79851007, + "num_input_tokens_seen": 302496380, + "step": 14026, + "time_per_iteration": 2.521749496459961 + }, + { + "auxiliary_loss_clip": 0.01078573, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.03299022, + "balance_loss_mlp": 1.02280331, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 1.3895842195320856, + "language_loss": 0.82448435, + "learning_rate": 2.518557757400945e-07, + "loss": 0.84561783, + "num_input_tokens_seen": 302516845, + "step": 14027, + "time_per_iteration": 2.5673789978027344 + }, + { + "auxiliary_loss_clip": 0.01078259, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.03378427, + "balance_loss_mlp": 1.01898289, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 1.8099156712001558, + "language_loss": 0.56103534, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58212209, + "num_input_tokens_seen": 302538865, + "step": 14028, + "time_per_iteration": 2.677839517593384 + }, + { + "auxiliary_loss_clip": 0.01078483, + "auxiliary_loss_mlp": 0.01027121, + "balance_loss_clip": 1.03435373, + "balance_loss_mlp": 1.015957, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 1.7237342874627883, + "language_loss": 0.6350342, + "learning_rate": 2.51477510323578e-07, + "loss": 0.65609026, + "num_input_tokens_seen": 302557970, + "step": 14029, + "time_per_iteration": 2.5209009647369385 + }, + { + "auxiliary_loss_clip": 0.01099711, + "auxiliary_loss_mlp": 0.0103116, + "balance_loss_clip": 1.03546858, + "balance_loss_mlp": 1.02019882, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 1.5717717025227453, + "language_loss": 0.75454021, + "learning_rate": 2.51288477067956e-07, + "loss": 0.77584887, + "num_input_tokens_seen": 302578915, + "step": 14030, + "time_per_iteration": 3.918175458908081 + }, + { + "auxiliary_loss_clip": 0.01083598, + "auxiliary_loss_mlp": 0.01032796, + "balance_loss_clip": 1.03629732, + "balance_loss_mlp": 1.02082109, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 1.6951699207400748, + "language_loss": 0.82762295, + "learning_rate": 2.510995101236502e-07, + "loss": 0.84878689, + "num_input_tokens_seen": 302596300, + "step": 14031, + "time_per_iteration": 2.512314558029175 + }, + { + "auxiliary_loss_clip": 0.01078878, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.03276193, + "balance_loss_mlp": 1.01758575, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 1.8272565967415897, + "language_loss": 0.80015987, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82123613, + "num_input_tokens_seen": 302614975, + "step": 14032, + "time_per_iteration": 2.5044937133789062 + }, + { + "auxiliary_loss_clip": 0.01068522, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.0336535, + "balance_loss_mlp": 1.02047241, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 1.5749652101396596, + "language_loss": 0.756598, + "learning_rate": 2.507217751976478e-07, + "loss": 0.77762556, + "num_input_tokens_seen": 302636415, + "step": 14033, + "time_per_iteration": 2.5657293796539307 + }, + { + "auxiliary_loss_clip": 0.01065545, + "auxiliary_loss_mlp": 0.01035736, + "balance_loss_clip": 1.03338599, + "balance_loss_mlp": 1.02447701, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 1.7226434210672155, + "language_loss": 0.838121, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85913384, + "num_input_tokens_seen": 302653605, + "step": 14034, + "time_per_iteration": 2.501718282699585 + }, + { + "auxiliary_loss_clip": 0.01068517, + "auxiliary_loss_mlp": 0.01030907, + "balance_loss_clip": 1.03270578, + "balance_loss_mlp": 1.01714396, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 1.4612808226077214, + "language_loss": 0.78432345, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80531764, + "num_input_tokens_seen": 302673965, + "step": 14035, + "time_per_iteration": 2.632803440093994 + }, + { + "auxiliary_loss_clip": 0.01089127, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.03444612, + "balance_loss_mlp": 1.01995182, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 1.3332181286345353, + "language_loss": 0.72261345, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74382228, + "num_input_tokens_seen": 302695560, + "step": 14036, + "time_per_iteration": 2.6750521659851074 + }, + { + "auxiliary_loss_clip": 0.01098724, + "auxiliary_loss_mlp": 0.01025102, + "balance_loss_clip": 1.03474045, + "balance_loss_mlp": 1.01498175, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 1.6817880078587948, + "language_loss": 0.69759822, + "learning_rate": 2.49967101396557e-07, + "loss": 0.71883655, + "num_input_tokens_seen": 302713480, + "step": 14037, + "time_per_iteration": 2.5505809783935547 + }, + { + "auxiliary_loss_clip": 0.0110179, + "auxiliary_loss_mlp": 0.01027988, + "balance_loss_clip": 1.03467941, + "balance_loss_mlp": 1.01638937, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 1.5956128940459593, + "language_loss": 0.68912709, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.7104249, + "num_input_tokens_seen": 302736860, + "step": 14038, + "time_per_iteration": 2.558093786239624 + }, + { + "auxiliary_loss_clip": 0.01055691, + "auxiliary_loss_mlp": 0.01038713, + "balance_loss_clip": 1.03224111, + "balance_loss_mlp": 1.02611232, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 1.5304174722996131, + "language_loss": 0.7633636, + "learning_rate": 2.49590162635938e-07, + "loss": 0.7843076, + "num_input_tokens_seen": 302757745, + "step": 14039, + "time_per_iteration": 2.6460976600646973 + }, + { + "auxiliary_loss_clip": 0.01110009, + "auxiliary_loss_mlp": 0.01027604, + "balance_loss_clip": 1.0379467, + "balance_loss_mlp": 1.01541507, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 4.108290851526094, + "language_loss": 0.79172647, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.8131026, + "num_input_tokens_seen": 302774885, + "step": 14040, + "time_per_iteration": 2.4590845108032227 + }, + { + "auxiliary_loss_clip": 0.01071719, + "auxiliary_loss_mlp": 0.01031416, + "balance_loss_clip": 1.035303, + "balance_loss_mlp": 1.01897073, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 2.2654785945407006, + "language_loss": 0.69492865, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71596003, + "num_input_tokens_seen": 302791035, + "step": 14041, + "time_per_iteration": 2.5832502841949463 + }, + { + "auxiliary_loss_clip": 0.01084328, + "auxiliary_loss_mlp": 0.01037117, + "balance_loss_clip": 1.03454852, + "balance_loss_mlp": 1.02505922, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 1.7918305014415317, + "language_loss": 0.68602771, + "learning_rate": 2.490252523307341e-07, + "loss": 0.70724219, + "num_input_tokens_seen": 302808650, + "step": 14042, + "time_per_iteration": 2.474485397338867 + }, + { + "auxiliary_loss_clip": 0.01078959, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.03403521, + "balance_loss_mlp": 1.01946449, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 1.7800166503663042, + "language_loss": 0.74578238, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.76687801, + "num_input_tokens_seen": 302824605, + "step": 14043, + "time_per_iteration": 2.5255541801452637 + }, + { + "auxiliary_loss_clip": 0.01102315, + "auxiliary_loss_mlp": 0.00781344, + "balance_loss_clip": 1.03519988, + "balance_loss_mlp": 1.00631893, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 2.0457704920313566, + "language_loss": 0.72132307, + "learning_rate": 2.486489774343865e-07, + "loss": 0.74015963, + "num_input_tokens_seen": 302840170, + "step": 14044, + "time_per_iteration": 2.4234249591827393 + }, + { + "auxiliary_loss_clip": 0.01079433, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.03339744, + "balance_loss_mlp": 1.01685143, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 1.60500141705661, + "language_loss": 0.74767393, + "learning_rate": 2.484609395997559e-07, + "loss": 0.76875591, + "num_input_tokens_seen": 302858320, + "step": 14045, + "time_per_iteration": 2.589599609375 + }, + { + "auxiliary_loss_clip": 0.01080834, + "auxiliary_loss_mlp": 0.0078306, + "balance_loss_clip": 1.03113544, + "balance_loss_mlp": 1.01022029, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 1.7360266960947157, + "language_loss": 0.78676397, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80540299, + "num_input_tokens_seen": 302875255, + "step": 14046, + "time_per_iteration": 2.471883773803711 + }, + { + "auxiliary_loss_clip": 0.0107737, + "auxiliary_loss_mlp": 0.01037527, + "balance_loss_clip": 1.03496397, + "balance_loss_mlp": 1.02389526, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 1.897780690441939, + "language_loss": 0.78452039, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.80566931, + "num_input_tokens_seen": 302894690, + "step": 14047, + "time_per_iteration": 2.544961929321289 + }, + { + "auxiliary_loss_clip": 0.01080596, + "auxiliary_loss_mlp": 0.01029735, + "balance_loss_clip": 1.03685689, + "balance_loss_mlp": 1.01825547, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 1.813930869940709, + "language_loss": 0.72309071, + "learning_rate": 2.478972246355935e-07, + "loss": 0.74419403, + "num_input_tokens_seen": 302912405, + "step": 14048, + "time_per_iteration": 2.5702719688415527 + }, + { + "auxiliary_loss_clip": 0.01023983, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.03570652, + "balance_loss_mlp": 1.02266169, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 1.5439193109987108, + "language_loss": 0.73356086, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75414348, + "num_input_tokens_seen": 302932525, + "step": 14049, + "time_per_iteration": 2.8575172424316406 + }, + { + "auxiliary_loss_clip": 0.0101932, + "auxiliary_loss_mlp": 0.00761628, + "balance_loss_clip": 1.00609303, + "balance_loss_mlp": 1.0011059, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 0.8045335048243902, + "language_loss": 0.6062845, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62409389, + "num_input_tokens_seen": 302991285, + "step": 14050, + "time_per_iteration": 3.266545534133911 + }, + { + "auxiliary_loss_clip": 0.01078514, + "auxiliary_loss_mlp": 0.00786401, + "balance_loss_clip": 1.03199875, + "balance_loss_mlp": 1.01073647, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 2.3587522518857824, + "language_loss": 0.72230273, + "learning_rate": 2.473341076306303e-07, + "loss": 0.7409519, + "num_input_tokens_seen": 303009515, + "step": 14051, + "time_per_iteration": 2.582123041152954 + }, + { + "auxiliary_loss_clip": 0.01090675, + "auxiliary_loss_mlp": 0.01026639, + "balance_loss_clip": 1.03469658, + "balance_loss_mlp": 1.01496887, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 1.8581439430208904, + "language_loss": 0.74280465, + "learning_rate": 2.471465348753547e-07, + "loss": 0.76397777, + "num_input_tokens_seen": 303026905, + "step": 14052, + "time_per_iteration": 2.4827253818511963 + }, + { + "auxiliary_loss_clip": 0.01072305, + "auxiliary_loss_mlp": 0.01024554, + "balance_loss_clip": 1.035115, + "balance_loss_mlp": 1.01431346, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 1.7915473137272608, + "language_loss": 0.73883557, + "learning_rate": 2.469590285884575e-07, + "loss": 0.75980413, + "num_input_tokens_seen": 303045245, + "step": 14053, + "time_per_iteration": 2.5451111793518066 + }, + { + "auxiliary_loss_clip": 0.01086003, + "auxiliary_loss_mlp": 0.0102769, + "balance_loss_clip": 1.03313637, + "balance_loss_mlp": 1.01578736, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 1.9209065089637507, + "language_loss": 0.73994362, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76108062, + "num_input_tokens_seen": 303065205, + "step": 14054, + "time_per_iteration": 2.4609224796295166 + }, + { + "auxiliary_loss_clip": 0.01096468, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.03490734, + "balance_loss_mlp": 1.01819539, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 1.4921050930352815, + "language_loss": 0.78246403, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.8037312, + "num_input_tokens_seen": 303088250, + "step": 14055, + "time_per_iteration": 2.6078484058380127 + }, + { + "auxiliary_loss_clip": 0.01089834, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.03365564, + "balance_loss_mlp": 1.01949859, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 1.7872732960435973, + "language_loss": 0.73167539, + "learning_rate": 2.463969086091302e-07, + "loss": 0.75288177, + "num_input_tokens_seen": 303109280, + "step": 14056, + "time_per_iteration": 2.489408254623413 + }, + { + "auxiliary_loss_clip": 0.01097155, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.03612912, + "balance_loss_mlp": 1.0234791, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 1.9907839074685436, + "language_loss": 0.67089581, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.6922282, + "num_input_tokens_seen": 303126075, + "step": 14057, + "time_per_iteration": 2.5094311237335205 + }, + { + "auxiliary_loss_clip": 0.01056951, + "auxiliary_loss_mlp": 0.01028736, + "balance_loss_clip": 1.03584182, + "balance_loss_mlp": 1.0165652, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 1.6147150786226652, + "language_loss": 0.77387506, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79473186, + "num_input_tokens_seen": 303146920, + "step": 14058, + "time_per_iteration": 2.5946662425994873 + }, + { + "auxiliary_loss_clip": 0.01103458, + "auxiliary_loss_mlp": 0.01032766, + "balance_loss_clip": 1.03438926, + "balance_loss_mlp": 1.02108383, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 1.5967886267850524, + "language_loss": 0.69922948, + "learning_rate": 2.45835387101033e-07, + "loss": 0.72059178, + "num_input_tokens_seen": 303167885, + "step": 14059, + "time_per_iteration": 4.245558500289917 + }, + { + "auxiliary_loss_clip": 0.01108122, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.03610373, + "balance_loss_mlp": 1.0234071, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 2.2691220217651638, + "language_loss": 0.57473195, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.59617448, + "num_input_tokens_seen": 303185000, + "step": 14060, + "time_per_iteration": 3.8222591876983643 + }, + { + "auxiliary_loss_clip": 0.01085145, + "auxiliary_loss_mlp": 0.01032963, + "balance_loss_clip": 1.03363276, + "balance_loss_mlp": 1.02002835, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 1.6864479877151286, + "language_loss": 0.75851357, + "learning_rate": 2.454613720076277e-07, + "loss": 0.77969462, + "num_input_tokens_seen": 303205210, + "step": 14061, + "time_per_iteration": 2.565387725830078 + }, + { + "auxiliary_loss_clip": 0.01085786, + "auxiliary_loss_mlp": 0.01025654, + "balance_loss_clip": 1.03513992, + "balance_loss_mlp": 1.01338172, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 2.0180007894499505, + "language_loss": 0.70896006, + "learning_rate": 2.452744642558013e-07, + "loss": 0.73007452, + "num_input_tokens_seen": 303224655, + "step": 14062, + "time_per_iteration": 2.5023956298828125 + }, + { + "auxiliary_loss_clip": 0.01001815, + "auxiliary_loss_mlp": 0.01002755, + "balance_loss_clip": 1.0267868, + "balance_loss_mlp": 1.00160432, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6338222040276487, + "language_loss": 0.52665424, + "learning_rate": 2.450876230433432e-07, + "loss": 0.54669994, + "num_input_tokens_seen": 303289645, + "step": 14063, + "time_per_iteration": 4.707958698272705 + }, + { + "auxiliary_loss_clip": 0.01061753, + "auxiliary_loss_mlp": 0.0102503, + "balance_loss_clip": 1.03666127, + "balance_loss_mlp": 1.01452804, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 1.8195613933235573, + "language_loss": 0.82294381, + "learning_rate": 2.449008483773378e-07, + "loss": 0.84381163, + "num_input_tokens_seen": 303308350, + "step": 14064, + "time_per_iteration": 3.1661176681518555 + }, + { + "auxiliary_loss_clip": 0.01097077, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.03815269, + "balance_loss_mlp": 1.0195986, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 1.8367215900267515, + "language_loss": 0.72588086, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74717331, + "num_input_tokens_seen": 303325230, + "step": 14065, + "time_per_iteration": 2.4744772911071777 + }, + { + "auxiliary_loss_clip": 0.01069658, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.03472042, + "balance_loss_mlp": 1.0191077, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 1.5100051932201923, + "language_loss": 0.77498901, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79599094, + "num_input_tokens_seen": 303345810, + "step": 14066, + "time_per_iteration": 2.6412084102630615 + }, + { + "auxiliary_loss_clip": 0.01075142, + "auxiliary_loss_mlp": 0.01029962, + "balance_loss_clip": 1.03725457, + "balance_loss_mlp": 1.01823139, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 1.4751618865835265, + "language_loss": 0.69787908, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.71893013, + "num_input_tokens_seen": 303365140, + "step": 14067, + "time_per_iteration": 2.5382070541381836 + }, + { + "auxiliary_loss_clip": 0.01065746, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.03177154, + "balance_loss_mlp": 1.01575208, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 3.384315473705079, + "language_loss": 0.71017116, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.73110342, + "num_input_tokens_seen": 303386150, + "step": 14068, + "time_per_iteration": 2.741776466369629 + }, + { + "auxiliary_loss_clip": 0.00999167, + "auxiliary_loss_mlp": 0.01002843, + "balance_loss_clip": 1.00795293, + "balance_loss_mlp": 1.00168061, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.6922893585955606, + "language_loss": 0.60522616, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62524617, + "num_input_tokens_seen": 303453770, + "step": 14069, + "time_per_iteration": 4.645868539810181 + }, + { + "auxiliary_loss_clip": 0.01078333, + "auxiliary_loss_mlp": 0.01026773, + "balance_loss_clip": 1.0350312, + "balance_loss_mlp": 1.01559138, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 1.5825425501155832, + "language_loss": 0.74604785, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.7670989, + "num_input_tokens_seen": 303474520, + "step": 14070, + "time_per_iteration": 2.530360221862793 + }, + { + "auxiliary_loss_clip": 0.01056579, + "auxiliary_loss_mlp": 0.01034527, + "balance_loss_clip": 1.03294134, + "balance_loss_mlp": 1.02181911, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 1.6307845257461309, + "language_loss": 0.67315912, + "learning_rate": 2.435952896106039e-07, + "loss": 0.69407022, + "num_input_tokens_seen": 303497345, + "step": 14071, + "time_per_iteration": 2.720252513885498 + }, + { + "auxiliary_loss_clip": 0.01019171, + "auxiliary_loss_mlp": 0.00760002, + "balance_loss_clip": 1.00662971, + "balance_loss_mlp": 0.99795038, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7336920617435406, + "language_loss": 0.6105181, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.62830979, + "num_input_tokens_seen": 303554890, + "step": 14072, + "time_per_iteration": 2.968027353286743 + }, + { + "auxiliary_loss_clip": 0.01064339, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.03523159, + "balance_loss_mlp": 1.01768565, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 2.0818188172639926, + "language_loss": 0.72158712, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.74253792, + "num_input_tokens_seen": 303574380, + "step": 14073, + "time_per_iteration": 2.594224214553833 + }, + { + "auxiliary_loss_clip": 0.01088675, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.03770626, + "balance_loss_mlp": 1.02368295, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 2.017372376353228, + "language_loss": 0.77925032, + "learning_rate": 2.430367633291155e-07, + "loss": 0.80050182, + "num_input_tokens_seen": 303594910, + "step": 14074, + "time_per_iteration": 2.6565566062927246 + }, + { + "auxiliary_loss_clip": 0.01094427, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.03641844, + "balance_loss_mlp": 1.02201462, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 2.4252636067830537, + "language_loss": 0.7522229, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.77350795, + "num_input_tokens_seen": 303613520, + "step": 14075, + "time_per_iteration": 2.589517593383789 + }, + { + "auxiliary_loss_clip": 0.01079856, + "auxiliary_loss_mlp": 0.01028867, + "balance_loss_clip": 1.03401709, + "balance_loss_mlp": 1.01647484, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 1.9468372890477397, + "language_loss": 0.72926062, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.75034785, + "num_input_tokens_seen": 303631225, + "step": 14076, + "time_per_iteration": 2.5452277660369873 + }, + { + "auxiliary_loss_clip": 0.01086269, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.0349685, + "balance_loss_mlp": 1.02010369, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 1.9621082074976173, + "language_loss": 0.77829742, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.79947734, + "num_input_tokens_seen": 303649175, + "step": 14077, + "time_per_iteration": 2.5348451137542725 + }, + { + "auxiliary_loss_clip": 0.01072122, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.03480339, + "balance_loss_mlp": 1.02135575, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 1.902937237994731, + "language_loss": 0.75022137, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77127719, + "num_input_tokens_seen": 303665915, + "step": 14078, + "time_per_iteration": 2.5139570236206055 + }, + { + "auxiliary_loss_clip": 0.01062122, + "auxiliary_loss_mlp": 0.01024651, + "balance_loss_clip": 1.03420722, + "balance_loss_mlp": 1.01269984, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 2.314330182511338, + "language_loss": 0.85242188, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.87328959, + "num_input_tokens_seen": 303679985, + "step": 14079, + "time_per_iteration": 2.581977128982544 + }, + { + "auxiliary_loss_clip": 0.01079252, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.03505743, + "balance_loss_mlp": 1.02228546, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 1.9099246251347561, + "language_loss": 0.59306639, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61421645, + "num_input_tokens_seen": 303698470, + "step": 14080, + "time_per_iteration": 2.600745439529419 + }, + { + "auxiliary_loss_clip": 0.01079317, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.03448796, + "balance_loss_mlp": 1.01712227, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 2.6851014224414365, + "language_loss": 0.65955001, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.68063939, + "num_input_tokens_seen": 303716415, + "step": 14081, + "time_per_iteration": 2.5805532932281494 + }, + { + "auxiliary_loss_clip": 0.01090857, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.03397918, + "balance_loss_mlp": 1.02480984, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 1.850476146642062, + "language_loss": 0.72766614, + "learning_rate": 2.41550291894576e-07, + "loss": 0.74894631, + "num_input_tokens_seen": 303734490, + "step": 14082, + "time_per_iteration": 2.509004592895508 + }, + { + "auxiliary_loss_clip": 0.01051641, + "auxiliary_loss_mlp": 0.01031349, + "balance_loss_clip": 1.0312283, + "balance_loss_mlp": 1.01800978, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 1.8547312696622418, + "language_loss": 0.76433325, + "learning_rate": 2.413647829539809e-07, + "loss": 0.78516316, + "num_input_tokens_seen": 303752310, + "step": 14083, + "time_per_iteration": 2.5720412731170654 + }, + { + "auxiliary_loss_clip": 0.01057657, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.03230369, + "balance_loss_mlp": 1.01969028, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 1.7358334638441375, + "language_loss": 0.66073859, + "learning_rate": 2.411793407010092e-07, + "loss": 0.68164968, + "num_input_tokens_seen": 303776065, + "step": 14084, + "time_per_iteration": 2.6330409049987793 + }, + { + "auxiliary_loss_clip": 0.01060923, + "auxiliary_loss_mlp": 0.01029156, + "balance_loss_clip": 1.03507411, + "balance_loss_mlp": 1.01795053, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 2.0470669525957907, + "language_loss": 0.69878274, + "learning_rate": 2.409939651426938e-07, + "loss": 0.71968353, + "num_input_tokens_seen": 303793500, + "step": 14085, + "time_per_iteration": 2.56280255317688 + }, + { + "auxiliary_loss_clip": 0.01062133, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.03360188, + "balance_loss_mlp": 1.01862931, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 1.5405991236685308, + "language_loss": 0.70902568, + "learning_rate": 2.408086562860634e-07, + "loss": 0.7299484, + "num_input_tokens_seen": 303814835, + "step": 14086, + "time_per_iteration": 2.652015209197998 + }, + { + "auxiliary_loss_clip": 0.01086322, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.03268695, + "balance_loss_mlp": 1.02003527, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 1.7096072556629764, + "language_loss": 0.74442172, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.7656073, + "num_input_tokens_seen": 303834505, + "step": 14087, + "time_per_iteration": 2.475109100341797 + }, + { + "auxiliary_loss_clip": 0.01081854, + "auxiliary_loss_mlp": 0.01022362, + "balance_loss_clip": 1.03669381, + "balance_loss_mlp": 1.01069188, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 1.4968730906851149, + "language_loss": 0.74009812, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.76114023, + "num_input_tokens_seen": 303855050, + "step": 14088, + "time_per_iteration": 2.5299978256225586 + }, + { + "auxiliary_loss_clip": 0.01094735, + "auxiliary_loss_mlp": 0.01033037, + "balance_loss_clip": 1.0351814, + "balance_loss_mlp": 1.02082419, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 1.9672384279518031, + "language_loss": 0.71826535, + "learning_rate": 2.402531299965387e-07, + "loss": 0.73954308, + "num_input_tokens_seen": 303875635, + "step": 14089, + "time_per_iteration": 2.478297710418701 + }, + { + "auxiliary_loss_clip": 0.01102755, + "auxiliary_loss_mlp": 0.01027323, + "balance_loss_clip": 1.03644717, + "balance_loss_mlp": 1.01602244, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 1.4374383668614616, + "language_loss": 0.79151487, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81281567, + "num_input_tokens_seen": 303896750, + "step": 14090, + "time_per_iteration": 2.483769416809082 + }, + { + "auxiliary_loss_clip": 0.0104949, + "auxiliary_loss_mlp": 0.01040797, + "balance_loss_clip": 1.03132999, + "balance_loss_mlp": 1.02661681, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 1.9623267959299169, + "language_loss": 0.77041411, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.79131699, + "num_input_tokens_seen": 303915435, + "step": 14091, + "time_per_iteration": 2.590689182281494 + }, + { + "auxiliary_loss_clip": 0.01028129, + "auxiliary_loss_mlp": 0.01006477, + "balance_loss_clip": 1.00521851, + "balance_loss_mlp": 1.00532103, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.819561073868323, + "language_loss": 0.59415406, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61450011, + "num_input_tokens_seen": 303977245, + "step": 14092, + "time_per_iteration": 3.131436347961426 + }, + { + "auxiliary_loss_clip": 0.01077946, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.03286695, + "balance_loss_mlp": 1.021492, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 1.8844147639178976, + "language_loss": 0.70294654, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72406769, + "num_input_tokens_seen": 303996055, + "step": 14093, + "time_per_iteration": 2.4793708324432373 + }, + { + "auxiliary_loss_clip": 0.01100473, + "auxiliary_loss_mlp": 0.01026116, + "balance_loss_clip": 1.03390956, + "balance_loss_mlp": 1.01492202, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 1.8885159574751447, + "language_loss": 0.83456242, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.85582829, + "num_input_tokens_seen": 304012205, + "step": 14094, + "time_per_iteration": 2.4148824214935303 + }, + { + "auxiliary_loss_clip": 0.01091184, + "auxiliary_loss_mlp": 0.01032408, + "balance_loss_clip": 1.03609872, + "balance_loss_mlp": 1.02073133, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 1.6116688491759743, + "language_loss": 0.71133077, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73256671, + "num_input_tokens_seen": 304033475, + "step": 14095, + "time_per_iteration": 2.503856897354126 + }, + { + "auxiliary_loss_clip": 0.01090766, + "auxiliary_loss_mlp": 0.00783318, + "balance_loss_clip": 1.03490758, + "balance_loss_mlp": 1.01004636, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 1.5784662606792474, + "language_loss": 0.80846167, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.82720256, + "num_input_tokens_seen": 304051845, + "step": 14096, + "time_per_iteration": 2.5179052352905273 + }, + { + "auxiliary_loss_clip": 0.0109456, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.03461075, + "balance_loss_mlp": 1.01728272, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 1.6929097450202593, + "language_loss": 0.77757597, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79882252, + "num_input_tokens_seen": 304069965, + "step": 14097, + "time_per_iteration": 3.943601369857788 + }, + { + "auxiliary_loss_clip": 0.01072052, + "auxiliary_loss_mlp": 0.01023717, + "balance_loss_clip": 1.03351152, + "balance_loss_mlp": 1.01252949, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 1.7864901917689309, + "language_loss": 0.80571234, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82667011, + "num_input_tokens_seen": 304086805, + "step": 14098, + "time_per_iteration": 2.5659918785095215 + }, + { + "auxiliary_loss_clip": 0.01087599, + "auxiliary_loss_mlp": 0.00783696, + "balance_loss_clip": 1.03403211, + "balance_loss_mlp": 1.00955796, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 4.575822037304887, + "language_loss": 0.71859336, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.73730624, + "num_input_tokens_seen": 304105865, + "step": 14099, + "time_per_iteration": 3.871368646621704 + }, + { + "auxiliary_loss_clip": 0.01091198, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.03305697, + "balance_loss_mlp": 1.01914155, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 1.974146275469105, + "language_loss": 0.63342202, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.65465462, + "num_input_tokens_seen": 304128300, + "step": 14100, + "time_per_iteration": 2.5426323413848877 + }, + { + "auxiliary_loss_clip": 0.01094262, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.03503227, + "balance_loss_mlp": 1.01858664, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 2.671279564174652, + "language_loss": 0.73419583, + "learning_rate": 2.380370324111085e-07, + "loss": 0.75545108, + "num_input_tokens_seen": 304143695, + "step": 14101, + "time_per_iteration": 2.502648115158081 + }, + { + "auxiliary_loss_clip": 0.01092979, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.03321958, + "balance_loss_mlp": 1.01896381, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 1.5297736407856617, + "language_loss": 0.71154082, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73277664, + "num_input_tokens_seen": 304165800, + "step": 14102, + "time_per_iteration": 3.9205312728881836 + }, + { + "auxiliary_loss_clip": 0.01079097, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.03364706, + "balance_loss_mlp": 1.01986098, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 2.3070508981337126, + "language_loss": 0.81704056, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.83816302, + "num_input_tokens_seen": 304182910, + "step": 14103, + "time_per_iteration": 2.5113487243652344 + }, + { + "auxiliary_loss_clip": 0.01103902, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.03582418, + "balance_loss_mlp": 1.01943874, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 2.162711540947283, + "language_loss": 0.78385139, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80519831, + "num_input_tokens_seen": 304200175, + "step": 14104, + "time_per_iteration": 2.475825786590576 + }, + { + "auxiliary_loss_clip": 0.01096695, + "auxiliary_loss_mlp": 0.010358, + "balance_loss_clip": 1.0373385, + "balance_loss_mlp": 1.02362323, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 1.8437295592627636, + "language_loss": 0.78845543, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.80978036, + "num_input_tokens_seen": 304217775, + "step": 14105, + "time_per_iteration": 2.468998432159424 + }, + { + "auxiliary_loss_clip": 0.01079045, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.03435636, + "balance_loss_mlp": 1.02273726, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 1.8096154208018684, + "language_loss": 0.50279027, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.52395195, + "num_input_tokens_seen": 304235760, + "step": 14106, + "time_per_iteration": 2.522883892059326 + }, + { + "auxiliary_loss_clip": 0.01069365, + "auxiliary_loss_mlp": 0.01029256, + "balance_loss_clip": 1.03344262, + "balance_loss_mlp": 1.01793694, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 1.8970926906185903, + "language_loss": 0.76042354, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.78140968, + "num_input_tokens_seen": 304253985, + "step": 14107, + "time_per_iteration": 3.924323558807373 + }, + { + "auxiliary_loss_clip": 0.0107135, + "auxiliary_loss_mlp": 0.01027528, + "balance_loss_clip": 1.03385162, + "balance_loss_mlp": 1.01578546, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 1.5984378804915558, + "language_loss": 0.73499352, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.75598228, + "num_input_tokens_seen": 304276785, + "step": 14108, + "time_per_iteration": 2.642639398574829 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.01027726, + "balance_loss_clip": 1.03567994, + "balance_loss_mlp": 1.01363564, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 1.870347301287449, + "language_loss": 0.72766477, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74895626, + "num_input_tokens_seen": 304296310, + "step": 14109, + "time_per_iteration": 2.4398763179779053 + }, + { + "auxiliary_loss_clip": 0.01042856, + "auxiliary_loss_mlp": 0.01032999, + "balance_loss_clip": 1.03353143, + "balance_loss_mlp": 1.02061296, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 2.1400919358585164, + "language_loss": 0.74439973, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.7651583, + "num_input_tokens_seen": 304311715, + "step": 14110, + "time_per_iteration": 2.5858349800109863 + }, + { + "auxiliary_loss_clip": 0.01046008, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.03512549, + "balance_loss_mlp": 1.02256703, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 1.7337658214913485, + "language_loss": 0.76130813, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78210437, + "num_input_tokens_seen": 304331910, + "step": 14111, + "time_per_iteration": 2.6229629516601562 + }, + { + "auxiliary_loss_clip": 0.01102598, + "auxiliary_loss_mlp": 0.01027497, + "balance_loss_clip": 1.0354147, + "balance_loss_mlp": 1.01644635, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 1.5497110591815617, + "language_loss": 0.67754102, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.69884193, + "num_input_tokens_seen": 304351405, + "step": 14112, + "time_per_iteration": 2.4923081398010254 + }, + { + "auxiliary_loss_clip": 0.01090615, + "auxiliary_loss_mlp": 0.01036272, + "balance_loss_clip": 1.03407049, + "balance_loss_mlp": 1.02447009, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 1.384651642319795, + "language_loss": 0.73778749, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75905633, + "num_input_tokens_seen": 304372935, + "step": 14113, + "time_per_iteration": 2.529430627822876 + }, + { + "auxiliary_loss_clip": 0.01064873, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.03446817, + "balance_loss_mlp": 1.01931524, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 1.92221842147391, + "language_loss": 0.66772324, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.6886797, + "num_input_tokens_seen": 304393070, + "step": 14114, + "time_per_iteration": 2.565100908279419 + }, + { + "auxiliary_loss_clip": 0.01106674, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.03681612, + "balance_loss_mlp": 1.0191685, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 1.8981617987180666, + "language_loss": 0.78846258, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.80984485, + "num_input_tokens_seen": 304411195, + "step": 14115, + "time_per_iteration": 2.4441282749176025 + }, + { + "auxiliary_loss_clip": 0.01102948, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.03415108, + "balance_loss_mlp": 1.01697552, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 1.7782009621532118, + "language_loss": 0.78745461, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.80876839, + "num_input_tokens_seen": 304429425, + "step": 14116, + "time_per_iteration": 2.442976236343384 + }, + { + "auxiliary_loss_clip": 0.01094916, + "auxiliary_loss_mlp": 0.01032614, + "balance_loss_clip": 1.03463531, + "balance_loss_mlp": 1.0201869, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 2.6104210410277124, + "language_loss": 0.68558306, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.70685834, + "num_input_tokens_seen": 304447460, + "step": 14117, + "time_per_iteration": 2.462104082107544 + }, + { + "auxiliary_loss_clip": 0.01084778, + "auxiliary_loss_mlp": 0.0102776, + "balance_loss_clip": 1.03276443, + "balance_loss_mlp": 1.0155232, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 1.862336326254606, + "language_loss": 0.64688456, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.66801, + "num_input_tokens_seen": 304468230, + "step": 14118, + "time_per_iteration": 2.5493640899658203 + }, + { + "auxiliary_loss_clip": 0.01062264, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.03357267, + "balance_loss_mlp": 1.01753736, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 1.5751926904734792, + "language_loss": 0.73267996, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75359195, + "num_input_tokens_seen": 304484860, + "step": 14119, + "time_per_iteration": 2.5180680751800537 + }, + { + "auxiliary_loss_clip": 0.01071494, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.033023, + "balance_loss_mlp": 1.01802492, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 1.7581163751766475, + "language_loss": 0.7798084, + "learning_rate": 2.345478926864446e-07, + "loss": 0.80083239, + "num_input_tokens_seen": 304503575, + "step": 14120, + "time_per_iteration": 2.5387110710144043 + }, + { + "auxiliary_loss_clip": 0.01089149, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.03632271, + "balance_loss_mlp": 1.01776624, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 1.8378986923723706, + "language_loss": 0.7545563, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.7757445, + "num_input_tokens_seen": 304525005, + "step": 14121, + "time_per_iteration": 2.487318992614746 + }, + { + "auxiliary_loss_clip": 0.00996109, + "auxiliary_loss_mlp": 0.01004582, + "balance_loss_clip": 1.00595069, + "balance_loss_mlp": 1.00344372, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 0.8067539868655578, + "language_loss": 0.60167289, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.62167978, + "num_input_tokens_seen": 304585220, + "step": 14122, + "time_per_iteration": 3.1411499977111816 + }, + { + "auxiliary_loss_clip": 0.01093562, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.03564131, + "balance_loss_mlp": 1.01931667, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 1.7270855822877715, + "language_loss": 0.79793346, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.81917471, + "num_input_tokens_seen": 304604665, + "step": 14123, + "time_per_iteration": 2.545786142349243 + }, + { + "auxiliary_loss_clip": 0.01089983, + "auxiliary_loss_mlp": 0.01028208, + "balance_loss_clip": 1.03614223, + "balance_loss_mlp": 1.01680517, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 1.9199175568145632, + "language_loss": 0.83032608, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.85150802, + "num_input_tokens_seen": 304620600, + "step": 14124, + "time_per_iteration": 2.475848436355591 + }, + { + "auxiliary_loss_clip": 0.01058745, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.03689098, + "balance_loss_mlp": 1.01782489, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 3.8320789001463247, + "language_loss": 0.7185483, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.73943162, + "num_input_tokens_seen": 304639540, + "step": 14125, + "time_per_iteration": 2.6198043823242188 + }, + { + "auxiliary_loss_clip": 0.01107971, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.03599644, + "balance_loss_mlp": 1.02102971, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 1.704454913903163, + "language_loss": 0.73566735, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.75708789, + "num_input_tokens_seen": 304660595, + "step": 14126, + "time_per_iteration": 2.4594645500183105 + }, + { + "auxiliary_loss_clip": 0.01055853, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.03201306, + "balance_loss_mlp": 1.02572167, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 1.4167191825258882, + "language_loss": 0.67398006, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69491744, + "num_input_tokens_seen": 304679580, + "step": 14127, + "time_per_iteration": 2.5765721797943115 + }, + { + "auxiliary_loss_clip": 0.01071268, + "auxiliary_loss_mlp": 0.00785168, + "balance_loss_clip": 1.03353012, + "balance_loss_mlp": 1.01074445, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 1.6488876732713063, + "language_loss": 0.69278514, + "learning_rate": 2.330860086502211e-07, + "loss": 0.71134955, + "num_input_tokens_seen": 304698385, + "step": 14128, + "time_per_iteration": 2.535161018371582 + }, + { + "auxiliary_loss_clip": 0.01077983, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.0334444, + "balance_loss_mlp": 1.02043128, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 1.7334074051398476, + "language_loss": 0.77590233, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.79701012, + "num_input_tokens_seen": 304715430, + "step": 14129, + "time_per_iteration": 2.5002689361572266 + }, + { + "auxiliary_loss_clip": 0.0103863, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.03381157, + "balance_loss_mlp": 1.01816177, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 1.9278608632691003, + "language_loss": 0.68187523, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.70256245, + "num_input_tokens_seen": 304734345, + "step": 14130, + "time_per_iteration": 2.6199803352355957 + }, + { + "auxiliary_loss_clip": 0.01094014, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.03515518, + "balance_loss_mlp": 1.01947713, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 1.6706637887512492, + "language_loss": 0.71418852, + "learning_rate": 2.3253890747186e-07, + "loss": 0.73544508, + "num_input_tokens_seen": 304755030, + "step": 14131, + "time_per_iteration": 2.5323610305786133 + }, + { + "auxiliary_loss_clip": 0.01071694, + "auxiliary_loss_mlp": 0.01029549, + "balance_loss_clip": 1.03387475, + "balance_loss_mlp": 1.01832557, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 1.8390961951195566, + "language_loss": 0.68433177, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.7053442, + "num_input_tokens_seen": 304774320, + "step": 14132, + "time_per_iteration": 2.5700089931488037 + }, + { + "auxiliary_loss_clip": 0.0110059, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.03353918, + "balance_loss_mlp": 1.02375758, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 1.563905314171235, + "language_loss": 0.7038548, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72520888, + "num_input_tokens_seen": 304795355, + "step": 14133, + "time_per_iteration": 2.4921376705169678 + }, + { + "auxiliary_loss_clip": 0.00993693, + "auxiliary_loss_mlp": 0.00762417, + "balance_loss_clip": 1.00844145, + "balance_loss_mlp": 0.99891168, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.727809195650896, + "language_loss": 0.5767802, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59434128, + "num_input_tokens_seen": 304863915, + "step": 14134, + "time_per_iteration": 3.286475896835327 + }, + { + "auxiliary_loss_clip": 0.01069784, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.03596151, + "balance_loss_mlp": 1.01795363, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 2.229311010484275, + "language_loss": 0.79247069, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.81347102, + "num_input_tokens_seen": 304881555, + "step": 14135, + "time_per_iteration": 2.5837128162384033 + }, + { + "auxiliary_loss_clip": 0.01092354, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.03611732, + "balance_loss_mlp": 1.01659679, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 2.0056862609359296, + "language_loss": 0.63309658, + "learning_rate": 2.316284127127044e-07, + "loss": 0.65431011, + "num_input_tokens_seen": 304898760, + "step": 14136, + "time_per_iteration": 3.8931891918182373 + }, + { + "auxiliary_loss_clip": 0.01096501, + "auxiliary_loss_mlp": 0.01029849, + "balance_loss_clip": 1.03618443, + "balance_loss_mlp": 1.0169028, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 2.0511069003768543, + "language_loss": 0.83951402, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.86077756, + "num_input_tokens_seen": 304915465, + "step": 14137, + "time_per_iteration": 3.866767406463623 + }, + { + "auxiliary_loss_clip": 0.01071969, + "auxiliary_loss_mlp": 0.01026371, + "balance_loss_clip": 1.03540802, + "balance_loss_mlp": 1.01562428, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 1.9649068662486422, + "language_loss": 0.78583372, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.80681711, + "num_input_tokens_seen": 304933190, + "step": 14138, + "time_per_iteration": 2.5760514736175537 + }, + { + "auxiliary_loss_clip": 0.01093, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.03564286, + "balance_loss_mlp": 1.01710558, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 1.6660429519414741, + "language_loss": 0.64471149, + "learning_rate": 2.310829204839073e-07, + "loss": 0.6659258, + "num_input_tokens_seen": 304951110, + "step": 14139, + "time_per_iteration": 2.4487147331237793 + }, + { + "auxiliary_loss_clip": 0.01063618, + "auxiliary_loss_mlp": 0.01026415, + "balance_loss_clip": 1.03653753, + "balance_loss_mlp": 1.01513791, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 1.6033089485906502, + "language_loss": 0.70879722, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.72969759, + "num_input_tokens_seen": 304969095, + "step": 14140, + "time_per_iteration": 2.5398292541503906 + }, + { + "auxiliary_loss_clip": 0.01065768, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.0346024, + "balance_loss_mlp": 1.01883113, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 1.8218745827263967, + "language_loss": 0.63942623, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.66039038, + "num_input_tokens_seen": 304989315, + "step": 14141, + "time_per_iteration": 4.006561994552612 + }, + { + "auxiliary_loss_clip": 0.01072577, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.03523278, + "balance_loss_mlp": 1.01802242, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 1.4740217319572082, + "language_loss": 0.70721501, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.72823727, + "num_input_tokens_seen": 305011020, + "step": 14142, + "time_per_iteration": 2.6839096546173096 + }, + { + "auxiliary_loss_clip": 0.01055598, + "auxiliary_loss_mlp": 0.01027796, + "balance_loss_clip": 1.03139699, + "balance_loss_mlp": 1.0165602, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 1.4192175292575733, + "language_loss": 0.6556952, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.67652917, + "num_input_tokens_seen": 305033550, + "step": 14143, + "time_per_iteration": 2.639082431793213 + }, + { + "auxiliary_loss_clip": 0.01074749, + "auxiliary_loss_mlp": 0.00781614, + "balance_loss_clip": 1.0339905, + "balance_loss_mlp": 1.00609374, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 2.072766840753015, + "language_loss": 0.6798799, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.69844353, + "num_input_tokens_seen": 305052885, + "step": 14144, + "time_per_iteration": 2.5525529384613037 + }, + { + "auxiliary_loss_clip": 0.01044867, + "auxiliary_loss_mlp": 0.01034892, + "balance_loss_clip": 1.03326249, + "balance_loss_mlp": 1.02241123, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 1.9086751241251951, + "language_loss": 0.6453703, + "learning_rate": 2.299937473050777e-07, + "loss": 0.66616791, + "num_input_tokens_seen": 305071995, + "step": 14145, + "time_per_iteration": 2.6046791076660156 + }, + { + "auxiliary_loss_clip": 0.01082816, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.03450787, + "balance_loss_mlp": 1.02082324, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 1.7277721483402266, + "language_loss": 0.85777175, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.87893391, + "num_input_tokens_seen": 305090190, + "step": 14146, + "time_per_iteration": 3.9808478355407715 + }, + { + "auxiliary_loss_clip": 0.01101287, + "auxiliary_loss_mlp": 0.01028423, + "balance_loss_clip": 1.03360415, + "balance_loss_mlp": 1.01690149, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 1.6439957464903876, + "language_loss": 0.83649123, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.85778832, + "num_input_tokens_seen": 305109355, + "step": 14147, + "time_per_iteration": 2.442958354949951 + }, + { + "auxiliary_loss_clip": 0.01094397, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.03535521, + "balance_loss_mlp": 1.01884389, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 2.557360920053027, + "language_loss": 0.85710573, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.87836331, + "num_input_tokens_seen": 305124165, + "step": 14148, + "time_per_iteration": 2.4650378227233887 + }, + { + "auxiliary_loss_clip": 0.01081239, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.03401756, + "balance_loss_mlp": 1.01605463, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 1.6225686915835031, + "language_loss": 0.71912521, + "learning_rate": 2.292689741370204e-07, + "loss": 0.74022382, + "num_input_tokens_seen": 305143940, + "step": 14149, + "time_per_iteration": 2.521343231201172 + }, + { + "auxiliary_loss_clip": 0.01081693, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.03470421, + "balance_loss_mlp": 1.01926017, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 1.58267397094711, + "language_loss": 0.76001811, + "learning_rate": 2.290879486935804e-07, + "loss": 0.7811476, + "num_input_tokens_seen": 305163505, + "step": 14150, + "time_per_iteration": 2.55377197265625 + }, + { + "auxiliary_loss_clip": 0.01064761, + "auxiliary_loss_mlp": 0.01036739, + "balance_loss_clip": 1.03445184, + "balance_loss_mlp": 1.02496099, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 1.6288550917396096, + "language_loss": 0.7220161, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74303114, + "num_input_tokens_seen": 305182325, + "step": 14151, + "time_per_iteration": 2.5338752269744873 + }, + { + "auxiliary_loss_clip": 0.00985951, + "auxiliary_loss_mlp": 0.01006058, + "balance_loss_clip": 1.01238275, + "balance_loss_mlp": 1.00481272, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.883326108758271, + "language_loss": 0.59638959, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.6163097, + "num_input_tokens_seen": 305230775, + "step": 14152, + "time_per_iteration": 3.02317214012146 + }, + { + "auxiliary_loss_clip": 0.01009283, + "auxiliary_loss_mlp": 0.01004267, + "balance_loss_clip": 1.00553918, + "balance_loss_mlp": 1.00297928, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.6911959934000129, + "language_loss": 0.61171091, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63184643, + "num_input_tokens_seen": 305296000, + "step": 14153, + "time_per_iteration": 3.244988203048706 + }, + { + "auxiliary_loss_clip": 0.01093757, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.03437066, + "balance_loss_mlp": 1.0185771, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 2.916465364309451, + "language_loss": 0.80777705, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.82902694, + "num_input_tokens_seen": 305314705, + "step": 14154, + "time_per_iteration": 2.517843723297119 + }, + { + "auxiliary_loss_clip": 0.01065902, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.03301489, + "balance_loss_mlp": 1.02285171, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 1.603262251837608, + "language_loss": 0.79350364, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81449801, + "num_input_tokens_seen": 305333870, + "step": 14155, + "time_per_iteration": 2.5622315406799316 + }, + { + "auxiliary_loss_clip": 0.0106923, + "auxiliary_loss_mlp": 0.01026522, + "balance_loss_clip": 1.03393579, + "balance_loss_mlp": 1.01457739, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 1.6473491984066035, + "language_loss": 0.70981133, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.73076886, + "num_input_tokens_seen": 305352780, + "step": 14156, + "time_per_iteration": 2.5768892765045166 + }, + { + "auxiliary_loss_clip": 0.01061885, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.03518224, + "balance_loss_mlp": 1.01813912, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 1.7672695944681798, + "language_loss": 0.73470837, + "learning_rate": 2.278226512621386e-07, + "loss": 0.75562054, + "num_input_tokens_seen": 305371370, + "step": 14157, + "time_per_iteration": 2.558565378189087 + }, + { + "auxiliary_loss_clip": 0.01033577, + "auxiliary_loss_mlp": 0.01025899, + "balance_loss_clip": 1.03344107, + "balance_loss_mlp": 1.01559913, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 3.3276033001405962, + "language_loss": 0.79525733, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.81585211, + "num_input_tokens_seen": 305387955, + "step": 14158, + "time_per_iteration": 2.6828107833862305 + }, + { + "auxiliary_loss_clip": 0.01089831, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.03405762, + "balance_loss_mlp": 1.02249408, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 2.1500544650160993, + "language_loss": 0.79548883, + "learning_rate": 2.27461742417828e-07, + "loss": 0.81674457, + "num_input_tokens_seen": 305406285, + "step": 14159, + "time_per_iteration": 2.476229190826416 + }, + { + "auxiliary_loss_clip": 0.01084187, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.03510988, + "balance_loss_mlp": 1.02166533, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 1.6967227287271132, + "language_loss": 0.71410012, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73527706, + "num_input_tokens_seen": 305424500, + "step": 14160, + "time_per_iteration": 2.533555030822754 + }, + { + "auxiliary_loss_clip": 0.01098521, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.03594351, + "balance_loss_mlp": 1.02097583, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 1.9799193221414308, + "language_loss": 0.70296329, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72428888, + "num_input_tokens_seen": 305442990, + "step": 14161, + "time_per_iteration": 2.5875399112701416 + }, + { + "auxiliary_loss_clip": 0.01093621, + "auxiliary_loss_mlp": 0.01028803, + "balance_loss_clip": 1.03238595, + "balance_loss_mlp": 1.01719809, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 2.119349827986808, + "language_loss": 0.77529407, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.79651833, + "num_input_tokens_seen": 305463065, + "step": 14162, + "time_per_iteration": 2.540135622024536 + }, + { + "auxiliary_loss_clip": 0.01095944, + "auxiliary_loss_mlp": 0.01033887, + "balance_loss_clip": 1.03711891, + "balance_loss_mlp": 1.02172804, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 1.8020287799985892, + "language_loss": 0.76500493, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.78630328, + "num_input_tokens_seen": 305489070, + "step": 14163, + "time_per_iteration": 2.6160409450531006 + }, + { + "auxiliary_loss_clip": 0.01014062, + "auxiliary_loss_mlp": 0.01000156, + "balance_loss_clip": 1.0113554, + "balance_loss_mlp": 0.99902326, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.6991597176287263, + "language_loss": 0.55055022, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57069242, + "num_input_tokens_seen": 305551490, + "step": 14164, + "time_per_iteration": 3.183668613433838 + }, + { + "auxiliary_loss_clip": 0.01093667, + "auxiliary_loss_mlp": 0.01035691, + "balance_loss_clip": 1.03393924, + "balance_loss_mlp": 1.02346039, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 9.771071496145192, + "language_loss": 0.72608566, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.74737918, + "num_input_tokens_seen": 305570535, + "step": 14165, + "time_per_iteration": 2.482961416244507 + }, + { + "auxiliary_loss_clip": 0.0106528, + "auxiliary_loss_mlp": 0.01026883, + "balance_loss_clip": 1.03492188, + "balance_loss_mlp": 1.01509309, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 1.6397680816421436, + "language_loss": 0.67177027, + "learning_rate": 2.26200679088697e-07, + "loss": 0.69269192, + "num_input_tokens_seen": 305590800, + "step": 14166, + "time_per_iteration": 2.5696189403533936 + }, + { + "auxiliary_loss_clip": 0.01079157, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.0313096, + "balance_loss_mlp": 1.0198462, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 1.900345302549379, + "language_loss": 0.73409176, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75519788, + "num_input_tokens_seen": 305609495, + "step": 14167, + "time_per_iteration": 2.521881341934204 + }, + { + "auxiliary_loss_clip": 0.01102356, + "auxiliary_loss_mlp": 0.0102876, + "balance_loss_clip": 1.0347811, + "balance_loss_mlp": 1.01754189, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 1.6668330261377853, + "language_loss": 0.80008006, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82139117, + "num_input_tokens_seen": 305629420, + "step": 14168, + "time_per_iteration": 2.483902931213379 + }, + { + "auxiliary_loss_clip": 0.0110223, + "auxiliary_loss_mlp": 0.0102482, + "balance_loss_clip": 1.0341897, + "balance_loss_mlp": 1.01381731, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 2.0470967222298695, + "language_loss": 0.76563877, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.78690922, + "num_input_tokens_seen": 305649835, + "step": 14169, + "time_per_iteration": 2.5062568187713623 + }, + { + "auxiliary_loss_clip": 0.01106872, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.03632689, + "balance_loss_mlp": 1.01736569, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 1.6683032167453562, + "language_loss": 0.63710928, + "learning_rate": 2.254815511000452e-07, + "loss": 0.65847301, + "num_input_tokens_seen": 305668840, + "step": 14170, + "time_per_iteration": 2.4369845390319824 + }, + { + "auxiliary_loss_clip": 0.01091008, + "auxiliary_loss_mlp": 0.01025454, + "balance_loss_clip": 1.03321993, + "balance_loss_mlp": 1.0138607, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 2.10093560909819, + "language_loss": 0.8623715, + "learning_rate": 2.253019373106384e-07, + "loss": 0.8835361, + "num_input_tokens_seen": 305686955, + "step": 14171, + "time_per_iteration": 2.4422199726104736 + }, + { + "auxiliary_loss_clip": 0.01089177, + "auxiliary_loss_mlp": 0.01034556, + "balance_loss_clip": 1.03435445, + "balance_loss_mlp": 1.0227896, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 1.7458461064980582, + "language_loss": 0.54680502, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.5680424, + "num_input_tokens_seen": 305706290, + "step": 14172, + "time_per_iteration": 2.5270235538482666 + }, + { + "auxiliary_loss_clip": 0.01078712, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.03345823, + "balance_loss_mlp": 1.01848149, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 2.471421680359514, + "language_loss": 0.69415915, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.71523213, + "num_input_tokens_seen": 305723835, + "step": 14173, + "time_per_iteration": 2.5105814933776855 + }, + { + "auxiliary_loss_clip": 0.01082208, + "auxiliary_loss_mlp": 0.00783462, + "balance_loss_clip": 1.03396213, + "balance_loss_mlp": 1.00942802, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 3.282163716807284, + "language_loss": 0.76738739, + "learning_rate": 2.247634997500205e-07, + "loss": 0.78604412, + "num_input_tokens_seen": 305741655, + "step": 14174, + "time_per_iteration": 2.5092735290527344 + }, + { + "auxiliary_loss_clip": 0.01070018, + "auxiliary_loss_mlp": 0.00783539, + "balance_loss_clip": 1.03308535, + "balance_loss_mlp": 1.00826263, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 1.6499097440724706, + "language_loss": 0.81897455, + "learning_rate": 2.245841551883676e-07, + "loss": 0.83751017, + "num_input_tokens_seen": 305761890, + "step": 14175, + "time_per_iteration": 3.9617342948913574 + }, + { + "auxiliary_loss_clip": 0.01108328, + "auxiliary_loss_mlp": 0.01029078, + "balance_loss_clip": 1.03704512, + "balance_loss_mlp": 1.01651335, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 2.2944007779187254, + "language_loss": 0.6548093, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.67618328, + "num_input_tokens_seen": 305779190, + "step": 14176, + "time_per_iteration": 2.420305013656616 + }, + { + "auxiliary_loss_clip": 0.01080118, + "auxiliary_loss_mlp": 0.00784199, + "balance_loss_clip": 1.03374219, + "balance_loss_mlp": 1.01085472, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 2.017664337243453, + "language_loss": 0.78486341, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.80350661, + "num_input_tokens_seen": 305799870, + "step": 14177, + "time_per_iteration": 3.954669237136841 + }, + { + "auxiliary_loss_clip": 0.01082854, + "auxiliary_loss_mlp": 0.01030355, + "balance_loss_clip": 1.03528214, + "balance_loss_mlp": 1.01830328, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 1.6315630990400944, + "language_loss": 0.73356998, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.75470209, + "num_input_tokens_seen": 305819695, + "step": 14178, + "time_per_iteration": 2.6070971488952637 + }, + { + "auxiliary_loss_clip": 0.01069076, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.03593159, + "balance_loss_mlp": 1.02086949, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 1.684953669805899, + "language_loss": 0.74868852, + "learning_rate": 2.238674502491935e-07, + "loss": 0.76970536, + "num_input_tokens_seen": 305837270, + "step": 14179, + "time_per_iteration": 4.092430591583252 + }, + { + "auxiliary_loss_clip": 0.01102136, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.03537166, + "balance_loss_mlp": 1.01700521, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 2.1660338673778354, + "language_loss": 0.81879807, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.84010231, + "num_input_tokens_seen": 305855250, + "step": 14180, + "time_per_iteration": 2.482255458831787 + }, + { + "auxiliary_loss_clip": 0.01046749, + "auxiliary_loss_mlp": 0.0103557, + "balance_loss_clip": 1.03277707, + "balance_loss_mlp": 1.02357149, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 3.172850014138972, + "language_loss": 0.60997713, + "learning_rate": 2.235095018591815e-07, + "loss": 0.63080037, + "num_input_tokens_seen": 305875660, + "step": 14181, + "time_per_iteration": 2.611671209335327 + }, + { + "auxiliary_loss_clip": 0.0110334, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.03664684, + "balance_loss_mlp": 1.02436495, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 2.087761396730017, + "language_loss": 0.72542822, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74681723, + "num_input_tokens_seen": 305892415, + "step": 14182, + "time_per_iteration": 2.433070659637451 + }, + { + "auxiliary_loss_clip": 0.01053108, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.03531647, + "balance_loss_mlp": 1.01987755, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 1.4770399771783616, + "language_loss": 0.70880985, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.72965842, + "num_input_tokens_seen": 305912665, + "step": 14183, + "time_per_iteration": 2.64776349067688 + }, + { + "auxiliary_loss_clip": 0.01077573, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.03716302, + "balance_loss_mlp": 1.01803553, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 2.0572949274127708, + "language_loss": 0.73037577, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.75144196, + "num_input_tokens_seen": 305931515, + "step": 14184, + "time_per_iteration": 2.594871997833252 + }, + { + "auxiliary_loss_clip": 0.01105089, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.03631616, + "balance_loss_mlp": 1.02095699, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 1.7823785791244964, + "language_loss": 0.76605058, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.7874288, + "num_input_tokens_seen": 305949965, + "step": 14185, + "time_per_iteration": 3.901979446411133 + }, + { + "auxiliary_loss_clip": 0.01062103, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.03219318, + "balance_loss_mlp": 1.01797807, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 2.1101798272930683, + "language_loss": 0.79664183, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.81757545, + "num_input_tokens_seen": 305967820, + "step": 14186, + "time_per_iteration": 2.5126733779907227 + }, + { + "auxiliary_loss_clip": 0.01079413, + "auxiliary_loss_mlp": 0.01028017, + "balance_loss_clip": 1.03166437, + "balance_loss_mlp": 1.01526797, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 1.6380722279918796, + "language_loss": 0.62965655, + "learning_rate": 2.224372736588449e-07, + "loss": 0.65073085, + "num_input_tokens_seen": 305985505, + "step": 14187, + "time_per_iteration": 2.499657154083252 + }, + { + "auxiliary_loss_clip": 0.01059711, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.0336647, + "balance_loss_mlp": 1.01691997, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 1.9459516537144488, + "language_loss": 0.76840281, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.7892977, + "num_input_tokens_seen": 306005220, + "step": 14188, + "time_per_iteration": 2.625091075897217 + }, + { + "auxiliary_loss_clip": 0.01094711, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.03553605, + "balance_loss_mlp": 1.01643729, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 1.7427813822156295, + "language_loss": 0.78513217, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.8063724, + "num_input_tokens_seen": 306023785, + "step": 14189, + "time_per_iteration": 2.522097110748291 + }, + { + "auxiliary_loss_clip": 0.0108069, + "auxiliary_loss_mlp": 0.01031726, + "balance_loss_clip": 1.03322196, + "balance_loss_mlp": 1.01977539, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 1.966226015971542, + "language_loss": 0.79718214, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.81830633, + "num_input_tokens_seen": 306041600, + "step": 14190, + "time_per_iteration": 2.4943337440490723 + }, + { + "auxiliary_loss_clip": 0.01054129, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.03474152, + "balance_loss_mlp": 1.01674485, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 1.8349390249070154, + "language_loss": 0.75963438, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.78046364, + "num_input_tokens_seen": 306060345, + "step": 14191, + "time_per_iteration": 2.5829691886901855 + }, + { + "auxiliary_loss_clip": 0.01093058, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.03618312, + "balance_loss_mlp": 1.01716876, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 1.730621034631871, + "language_loss": 0.68826729, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.70948648, + "num_input_tokens_seen": 306078285, + "step": 14192, + "time_per_iteration": 2.4670867919921875 + }, + { + "auxiliary_loss_clip": 0.01096736, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.0368526, + "balance_loss_mlp": 1.02124858, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 2.2710422246857633, + "language_loss": 0.6274085, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.64872307, + "num_input_tokens_seen": 306093760, + "step": 14193, + "time_per_iteration": 2.5021474361419678 + }, + { + "auxiliary_loss_clip": 0.01083246, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.03396463, + "balance_loss_mlp": 1.01824522, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 1.7280750495027208, + "language_loss": 0.76807207, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78920716, + "num_input_tokens_seen": 306112595, + "step": 14194, + "time_per_iteration": 2.5220136642456055 + }, + { + "auxiliary_loss_clip": 0.01103912, + "auxiliary_loss_mlp": 0.01026245, + "balance_loss_clip": 1.03500044, + "balance_loss_mlp": 1.01441956, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 1.6290172482773155, + "language_loss": 0.69489813, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.71619976, + "num_input_tokens_seen": 306131800, + "step": 14195, + "time_per_iteration": 2.4679746627807617 + }, + { + "auxiliary_loss_clip": 0.01077788, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.03439927, + "balance_loss_mlp": 1.01980543, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 2.115961317791182, + "language_loss": 0.85969949, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.88080293, + "num_input_tokens_seen": 306150590, + "step": 14196, + "time_per_iteration": 2.5139529705047607 + }, + { + "auxiliary_loss_clip": 0.01013619, + "auxiliary_loss_mlp": 0.00999193, + "balance_loss_clip": 1.0109235, + "balance_loss_mlp": 0.99797118, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.7655985569086489, + "language_loss": 0.55062568, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57075387, + "num_input_tokens_seen": 306205850, + "step": 14197, + "time_per_iteration": 3.0742886066436768 + }, + { + "auxiliary_loss_clip": 0.01068231, + "auxiliary_loss_mlp": 0.00782198, + "balance_loss_clip": 1.03330529, + "balance_loss_mlp": 1.00859094, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 1.479172641609595, + "language_loss": 0.81453049, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.83303475, + "num_input_tokens_seen": 306225220, + "step": 14198, + "time_per_iteration": 2.56241512298584 + }, + { + "auxiliary_loss_clip": 0.01103288, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.03519845, + "balance_loss_mlp": 1.01819599, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 1.430285643911617, + "language_loss": 0.68343496, + "learning_rate": 2.203000984963035e-07, + "loss": 0.7047565, + "num_input_tokens_seen": 306249865, + "step": 14199, + "time_per_iteration": 2.7108147144317627 + }, + { + "auxiliary_loss_clip": 0.0106516, + "auxiliary_loss_mlp": 0.01026431, + "balance_loss_clip": 1.03130221, + "balance_loss_mlp": 1.01494575, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 1.5183751627010416, + "language_loss": 0.8619169, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88283277, + "num_input_tokens_seen": 306270215, + "step": 14200, + "time_per_iteration": 2.570683479309082 + }, + { + "auxiliary_loss_clip": 0.01072021, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.03360462, + "balance_loss_mlp": 1.01704431, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 1.8694719930850656, + "language_loss": 0.78267598, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.80367243, + "num_input_tokens_seen": 306288960, + "step": 14201, + "time_per_iteration": 2.551252841949463 + }, + { + "auxiliary_loss_clip": 0.01075997, + "auxiliary_loss_mlp": 0.01028007, + "balance_loss_clip": 1.0346024, + "balance_loss_mlp": 1.01675391, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 1.748905396646379, + "language_loss": 0.6884315, + "learning_rate": 2.19767322694256e-07, + "loss": 0.70947152, + "num_input_tokens_seen": 306308735, + "step": 14202, + "time_per_iteration": 2.5191595554351807 + }, + { + "auxiliary_loss_clip": 0.01093129, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.03502226, + "balance_loss_mlp": 1.01805925, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 1.5069387146338764, + "language_loss": 0.79944849, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82067776, + "num_input_tokens_seen": 306329015, + "step": 14203, + "time_per_iteration": 2.511061668395996 + }, + { + "auxiliary_loss_clip": 0.0108584, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.03394365, + "balance_loss_mlp": 1.01838946, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 1.9318096386198393, + "language_loss": 0.65755635, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.6787262, + "num_input_tokens_seen": 306349085, + "step": 14204, + "time_per_iteration": 2.564866542816162 + }, + { + "auxiliary_loss_clip": 0.01105588, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.03594732, + "balance_loss_mlp": 1.02040303, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 2.0118845295996337, + "language_loss": 0.59230334, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.61368644, + "num_input_tokens_seen": 306365385, + "step": 14205, + "time_per_iteration": 2.4015893936157227 + }, + { + "auxiliary_loss_clip": 0.01081129, + "auxiliary_loss_mlp": 0.01027511, + "balance_loss_clip": 1.03523052, + "balance_loss_mlp": 1.01574504, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 2.0118696105433345, + "language_loss": 0.72009254, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74117887, + "num_input_tokens_seen": 306384585, + "step": 14206, + "time_per_iteration": 2.607109308242798 + }, + { + "auxiliary_loss_clip": 0.0109511, + "auxiliary_loss_mlp": 0.01029399, + "balance_loss_clip": 1.03618598, + "balance_loss_mlp": 1.01748967, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 3.019384669411787, + "language_loss": 0.76145703, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.78270209, + "num_input_tokens_seen": 306401565, + "step": 14207, + "time_per_iteration": 2.4479048252105713 + }, + { + "auxiliary_loss_clip": 0.01105243, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.03614855, + "balance_loss_mlp": 1.01888609, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 1.7481748553845422, + "language_loss": 0.85042918, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87179708, + "num_input_tokens_seen": 306419995, + "step": 14208, + "time_per_iteration": 2.4447214603424072 + }, + { + "auxiliary_loss_clip": 0.01084102, + "auxiliary_loss_mlp": 0.01034331, + "balance_loss_clip": 1.03413177, + "balance_loss_mlp": 1.02287495, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 1.8643412865847393, + "language_loss": 0.6645546, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.68573892, + "num_input_tokens_seen": 306439240, + "step": 14209, + "time_per_iteration": 2.477638006210327 + }, + { + "auxiliary_loss_clip": 0.01059118, + "auxiliary_loss_mlp": 0.01027599, + "balance_loss_clip": 1.03400397, + "balance_loss_mlp": 1.01600027, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 1.8274511630734058, + "language_loss": 0.70592278, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.72678995, + "num_input_tokens_seen": 306458425, + "step": 14210, + "time_per_iteration": 2.6127350330352783 + }, + { + "auxiliary_loss_clip": 0.01079038, + "auxiliary_loss_mlp": 0.01028882, + "balance_loss_clip": 1.03498983, + "balance_loss_mlp": 1.01733088, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 1.3182432568700784, + "language_loss": 0.70362633, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72470552, + "num_input_tokens_seen": 306477210, + "step": 14211, + "time_per_iteration": 2.5296895503997803 + }, + { + "auxiliary_loss_clip": 0.01083924, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.03495336, + "balance_loss_mlp": 1.01838183, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 2.182151183848599, + "language_loss": 0.81616569, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.83730865, + "num_input_tokens_seen": 306495820, + "step": 14212, + "time_per_iteration": 2.5101587772369385 + }, + { + "auxiliary_loss_clip": 0.0107117, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.03285813, + "balance_loss_mlp": 1.02067554, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 1.8215128774697689, + "language_loss": 0.66758728, + "learning_rate": 2.178190108088105e-07, + "loss": 0.68863863, + "num_input_tokens_seen": 306516420, + "step": 14213, + "time_per_iteration": 2.7067058086395264 + }, + { + "auxiliary_loss_clip": 0.01102012, + "auxiliary_loss_mlp": 0.01028387, + "balance_loss_clip": 1.03441882, + "balance_loss_mlp": 1.01685357, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 2.5710208959978527, + "language_loss": 0.78354704, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.804851, + "num_input_tokens_seen": 306534785, + "step": 14214, + "time_per_iteration": 3.8267557621002197 + }, + { + "auxiliary_loss_clip": 0.01084616, + "auxiliary_loss_mlp": 0.0102879, + "balance_loss_clip": 1.03325379, + "balance_loss_mlp": 1.01580238, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 2.341442861165409, + "language_loss": 0.66497749, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.68611157, + "num_input_tokens_seen": 306552440, + "step": 14215, + "time_per_iteration": 3.9914908409118652 + }, + { + "auxiliary_loss_clip": 0.01105187, + "auxiliary_loss_mlp": 0.01026956, + "balance_loss_clip": 1.03667164, + "balance_loss_mlp": 1.01541078, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 2.1908378435195095, + "language_loss": 0.62670398, + "learning_rate": 2.172890718362279e-07, + "loss": 0.64802539, + "num_input_tokens_seen": 306573600, + "step": 14216, + "time_per_iteration": 2.5691967010498047 + }, + { + "auxiliary_loss_clip": 0.01068466, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.03284872, + "balance_loss_mlp": 1.0201323, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 1.6181862235819566, + "language_loss": 0.65354621, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67455047, + "num_input_tokens_seen": 306592840, + "step": 14217, + "time_per_iteration": 2.5615954399108887 + }, + { + "auxiliary_loss_clip": 0.01089684, + "auxiliary_loss_mlp": 0.01029546, + "balance_loss_clip": 1.03622019, + "balance_loss_mlp": 1.01839352, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 1.4659749005987077, + "language_loss": 0.64876676, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.66995907, + "num_input_tokens_seen": 306613210, + "step": 14218, + "time_per_iteration": 3.9467499256134033 + }, + { + "auxiliary_loss_clip": 0.01089866, + "auxiliary_loss_mlp": 0.01033727, + "balance_loss_clip": 1.03288603, + "balance_loss_mlp": 1.02121592, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 1.8758218508031705, + "language_loss": 0.70120585, + "learning_rate": 2.167597412688238e-07, + "loss": 0.72244179, + "num_input_tokens_seen": 306631620, + "step": 14219, + "time_per_iteration": 2.4735360145568848 + }, + { + "auxiliary_loss_clip": 0.01081363, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.0328095, + "balance_loss_mlp": 1.02136731, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 2.4833704075125205, + "language_loss": 0.67178315, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.69292963, + "num_input_tokens_seen": 306646695, + "step": 14220, + "time_per_iteration": 2.47493314743042 + }, + { + "auxiliary_loss_clip": 0.01100037, + "auxiliary_loss_mlp": 0.01026801, + "balance_loss_clip": 1.03446865, + "balance_loss_mlp": 1.01569033, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 1.98314001041562, + "language_loss": 0.71685874, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73812717, + "num_input_tokens_seen": 306665465, + "step": 14221, + "time_per_iteration": 2.432995557785034 + }, + { + "auxiliary_loss_clip": 0.0107148, + "auxiliary_loss_mlp": 0.01036388, + "balance_loss_clip": 1.03306842, + "balance_loss_mlp": 1.02410364, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 2.0025949116107116, + "language_loss": 0.60234261, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.62342131, + "num_input_tokens_seen": 306685950, + "step": 14222, + "time_per_iteration": 2.584054946899414 + }, + { + "auxiliary_loss_clip": 0.01075521, + "auxiliary_loss_mlp": 0.01032011, + "balance_loss_clip": 1.03198457, + "balance_loss_mlp": 1.01998866, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 1.7762996236042858, + "language_loss": 0.83924383, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.86031914, + "num_input_tokens_seen": 306705740, + "step": 14223, + "time_per_iteration": 2.536123037338257 + }, + { + "auxiliary_loss_clip": 0.01090949, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.03513265, + "balance_loss_mlp": 1.02022338, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 1.6147154175218947, + "language_loss": 0.7390101, + "learning_rate": 2.158788761585515e-07, + "loss": 0.760234, + "num_input_tokens_seen": 306725065, + "step": 14224, + "time_per_iteration": 3.9044668674468994 + }, + { + "auxiliary_loss_clip": 0.01076623, + "auxiliary_loss_mlp": 0.0078513, + "balance_loss_clip": 1.03293705, + "balance_loss_mlp": 1.00929618, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 2.26535762279399, + "language_loss": 0.75602531, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77464283, + "num_input_tokens_seen": 306743630, + "step": 14225, + "time_per_iteration": 2.4855306148529053 + }, + { + "auxiliary_loss_clip": 0.01042415, + "auxiliary_loss_mlp": 0.01040181, + "balance_loss_clip": 1.03218937, + "balance_loss_mlp": 1.02865338, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 1.6038249932643707, + "language_loss": 0.7730273, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.79385328, + "num_input_tokens_seen": 306763105, + "step": 14226, + "time_per_iteration": 2.6719436645507812 + }, + { + "auxiliary_loss_clip": 0.01106971, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.03560424, + "balance_loss_mlp": 1.0225265, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 2.0052042539839694, + "language_loss": 0.54224384, + "learning_rate": 2.153511688875702e-07, + "loss": 0.56366611, + "num_input_tokens_seen": 306779875, + "step": 14227, + "time_per_iteration": 2.4174771308898926 + }, + { + "auxiliary_loss_clip": 0.01072389, + "auxiliary_loss_mlp": 0.00781978, + "balance_loss_clip": 1.03412163, + "balance_loss_mlp": 1.00854897, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 2.944814921505782, + "language_loss": 0.65535402, + "learning_rate": 2.151754018031442e-07, + "loss": 0.67389768, + "num_input_tokens_seen": 306800015, + "step": 14228, + "time_per_iteration": 2.5504443645477295 + }, + { + "auxiliary_loss_clip": 0.01072604, + "auxiliary_loss_mlp": 0.01033922, + "balance_loss_clip": 1.03615665, + "balance_loss_mlp": 1.02151275, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 2.0956645385979096, + "language_loss": 0.73854029, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.75960553, + "num_input_tokens_seen": 306814160, + "step": 14229, + "time_per_iteration": 2.518414258956909 + }, + { + "auxiliary_loss_clip": 0.01090552, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.03385735, + "balance_loss_mlp": 1.01593208, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 1.6693394487657833, + "language_loss": 0.72847921, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.74965316, + "num_input_tokens_seen": 306833310, + "step": 14230, + "time_per_iteration": 2.4915170669555664 + }, + { + "auxiliary_loss_clip": 0.01089916, + "auxiliary_loss_mlp": 0.01028043, + "balance_loss_clip": 1.03360891, + "balance_loss_mlp": 1.01619983, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 1.7212542203417283, + "language_loss": 0.82500911, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.84618866, + "num_input_tokens_seen": 306851345, + "step": 14231, + "time_per_iteration": 2.453489303588867 + }, + { + "auxiliary_loss_clip": 0.0109564, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.03570879, + "balance_loss_mlp": 1.02052522, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 1.804602863757462, + "language_loss": 0.6762082, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.69749749, + "num_input_tokens_seen": 306871040, + "step": 14232, + "time_per_iteration": 2.5174789428710938 + }, + { + "auxiliary_loss_clip": 0.01083545, + "auxiliary_loss_mlp": 0.01031022, + "balance_loss_clip": 1.03523064, + "balance_loss_mlp": 1.01868951, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 1.4249017830594095, + "language_loss": 0.67252183, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.69366759, + "num_input_tokens_seen": 306891625, + "step": 14233, + "time_per_iteration": 2.533970355987549 + }, + { + "auxiliary_loss_clip": 0.01090073, + "auxiliary_loss_mlp": 0.01029811, + "balance_loss_clip": 1.03330576, + "balance_loss_mlp": 1.01844454, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 1.6701839511822696, + "language_loss": 0.76428008, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.78547895, + "num_input_tokens_seen": 306910020, + "step": 14234, + "time_per_iteration": 2.4891719818115234 + }, + { + "auxiliary_loss_clip": 0.01004668, + "auxiliary_loss_mlp": 0.01016587, + "balance_loss_clip": 1.01053071, + "balance_loss_mlp": 1.01521015, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.7541465257032627, + "language_loss": 0.58010322, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.60031575, + "num_input_tokens_seen": 306969505, + "step": 14235, + "time_per_iteration": 3.1210572719573975 + }, + { + "auxiliary_loss_clip": 0.01012124, + "auxiliary_loss_mlp": 0.00999553, + "balance_loss_clip": 1.00877607, + "balance_loss_mlp": 0.99835455, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.7874319510846941, + "language_loss": 0.56692219, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58703887, + "num_input_tokens_seen": 307027710, + "step": 14236, + "time_per_iteration": 3.034252166748047 + }, + { + "auxiliary_loss_clip": 0.01081086, + "auxiliary_loss_mlp": 0.01032947, + "balance_loss_clip": 1.03403711, + "balance_loss_mlp": 1.02117562, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 1.767999131162567, + "language_loss": 0.7044909, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72563124, + "num_input_tokens_seen": 307045515, + "step": 14237, + "time_per_iteration": 2.5484769344329834 + }, + { + "auxiliary_loss_clip": 0.01079988, + "auxiliary_loss_mlp": 0.01029127, + "balance_loss_clip": 1.03242695, + "balance_loss_mlp": 1.01788568, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 2.072055720007472, + "language_loss": 0.63201374, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.6531049, + "num_input_tokens_seen": 307064470, + "step": 14238, + "time_per_iteration": 2.531428337097168 + }, + { + "auxiliary_loss_clip": 0.01097405, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.03291106, + "balance_loss_mlp": 1.02288055, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 1.4404118788031197, + "language_loss": 0.69365078, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.71495426, + "num_input_tokens_seen": 307083900, + "step": 14239, + "time_per_iteration": 2.471531629562378 + }, + { + "auxiliary_loss_clip": 0.01106801, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.03567004, + "balance_loss_mlp": 1.02383351, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 1.9442797939117318, + "language_loss": 0.66547358, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68689907, + "num_input_tokens_seen": 307104590, + "step": 14240, + "time_per_iteration": 2.51489520072937 + }, + { + "auxiliary_loss_clip": 0.01063751, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.03252459, + "balance_loss_mlp": 1.02359152, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 1.5014886883222083, + "language_loss": 0.62215006, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64316154, + "num_input_tokens_seen": 307125580, + "step": 14241, + "time_per_iteration": 2.616015911102295 + }, + { + "auxiliary_loss_clip": 0.01107094, + "auxiliary_loss_mlp": 0.01035697, + "balance_loss_clip": 1.03466296, + "balance_loss_mlp": 1.02318025, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 1.6925696368508898, + "language_loss": 0.74634409, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76777196, + "num_input_tokens_seen": 307147625, + "step": 14242, + "time_per_iteration": 2.5192947387695312 + }, + { + "auxiliary_loss_clip": 0.01035143, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.03483367, + "balance_loss_mlp": 1.03340364, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 1.9780897043750816, + "language_loss": 0.76450747, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.78532815, + "num_input_tokens_seen": 307164665, + "step": 14243, + "time_per_iteration": 2.747976541519165 + }, + { + "auxiliary_loss_clip": 0.01081127, + "auxiliary_loss_mlp": 0.00781612, + "balance_loss_clip": 1.03476501, + "balance_loss_mlp": 1.00728667, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 2.06046157347618, + "language_loss": 0.68060833, + "learning_rate": 2.123723375556974e-07, + "loss": 0.69923574, + "num_input_tokens_seen": 307182530, + "step": 14244, + "time_per_iteration": 2.6322810649871826 + }, + { + "auxiliary_loss_clip": 0.01020392, + "auxiliary_loss_mlp": 0.01001587, + "balance_loss_clip": 1.00670207, + "balance_loss_mlp": 1.00050199, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.7548785631762144, + "language_loss": 0.58441043, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60463023, + "num_input_tokens_seen": 307241240, + "step": 14245, + "time_per_iteration": 3.0466201305389404 + }, + { + "auxiliary_loss_clip": 0.01095695, + "auxiliary_loss_mlp": 0.01032081, + "balance_loss_clip": 1.03511477, + "balance_loss_mlp": 1.01981497, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 1.7485144631659006, + "language_loss": 0.77790177, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.79917955, + "num_input_tokens_seen": 307261485, + "step": 14246, + "time_per_iteration": 2.49139142036438 + }, + { + "auxiliary_loss_clip": 0.01077203, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.02971983, + "balance_loss_mlp": 1.01585269, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 2.9165733779687795, + "language_loss": 0.81340325, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.83445382, + "num_input_tokens_seen": 307279160, + "step": 14247, + "time_per_iteration": 2.521484375 + }, + { + "auxiliary_loss_clip": 0.01081041, + "auxiliary_loss_mlp": 0.01030822, + "balance_loss_clip": 1.03401113, + "balance_loss_mlp": 1.01885378, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 1.7115899255142006, + "language_loss": 0.77373379, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.79485238, + "num_input_tokens_seen": 307297920, + "step": 14248, + "time_per_iteration": 2.4825663566589355 + }, + { + "auxiliary_loss_clip": 0.01054106, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_clip": 1.03030658, + "balance_loss_mlp": 1.02904248, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 1.65538887039535, + "language_loss": 0.77483851, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.79581255, + "num_input_tokens_seen": 307318320, + "step": 14249, + "time_per_iteration": 2.5996360778808594 + }, + { + "auxiliary_loss_clip": 0.01077139, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.03209794, + "balance_loss_mlp": 1.02339244, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 2.0090874099187572, + "language_loss": 0.78519136, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80631602, + "num_input_tokens_seen": 307336720, + "step": 14250, + "time_per_iteration": 2.5110654830932617 + }, + { + "auxiliary_loss_clip": 0.01080047, + "auxiliary_loss_mlp": 0.01028554, + "balance_loss_clip": 1.03447545, + "balance_loss_mlp": 1.01764619, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 1.7005950997008479, + "language_loss": 0.79692328, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.81800926, + "num_input_tokens_seen": 307354120, + "step": 14251, + "time_per_iteration": 2.5197596549987793 + }, + { + "auxiliary_loss_clip": 0.01063292, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.03299737, + "balance_loss_mlp": 1.01711941, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 1.8337806562726102, + "language_loss": 0.61841136, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.63932943, + "num_input_tokens_seen": 307373165, + "step": 14252, + "time_per_iteration": 3.9161062240600586 + }, + { + "auxiliary_loss_clip": 0.01081735, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.03789473, + "balance_loss_mlp": 1.01799941, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 1.795150721181122, + "language_loss": 0.69805861, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.71918046, + "num_input_tokens_seen": 307391000, + "step": 14253, + "time_per_iteration": 3.887320041656494 + }, + { + "auxiliary_loss_clip": 0.01014419, + "auxiliary_loss_mlp": 0.0100169, + "balance_loss_clip": 1.01181149, + "balance_loss_mlp": 1.00063539, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.7877343905341255, + "language_loss": 0.59210628, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61226737, + "num_input_tokens_seen": 307452865, + "step": 14254, + "time_per_iteration": 3.1756110191345215 + }, + { + "auxiliary_loss_clip": 0.01077934, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.03265357, + "balance_loss_mlp": 1.0206871, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 1.7684076771483068, + "language_loss": 0.81241775, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83352959, + "num_input_tokens_seen": 307471940, + "step": 14255, + "time_per_iteration": 2.5554826259613037 + }, + { + "auxiliary_loss_clip": 0.01102177, + "auxiliary_loss_mlp": 0.01024557, + "balance_loss_clip": 1.03540564, + "balance_loss_mlp": 1.01315415, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 2.358676612892262, + "language_loss": 0.67082119, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69208848, + "num_input_tokens_seen": 307488745, + "step": 14256, + "time_per_iteration": 3.879448652267456 + }, + { + "auxiliary_loss_clip": 0.01092688, + "auxiliary_loss_mlp": 0.01028799, + "balance_loss_clip": 1.03487706, + "balance_loss_mlp": 1.0175041, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 1.7469183003816589, + "language_loss": 0.69946647, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.72068137, + "num_input_tokens_seen": 307506855, + "step": 14257, + "time_per_iteration": 2.499655246734619 + }, + { + "auxiliary_loss_clip": 0.01068502, + "auxiliary_loss_mlp": 0.01031453, + "balance_loss_clip": 1.0323472, + "balance_loss_mlp": 1.0192759, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 1.7850743949077634, + "language_loss": 0.76754773, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.78854728, + "num_input_tokens_seen": 307526115, + "step": 14258, + "time_per_iteration": 2.6324350833892822 + }, + { + "auxiliary_loss_clip": 0.0109283, + "auxiliary_loss_mlp": 0.00785073, + "balance_loss_clip": 1.0352273, + "balance_loss_mlp": 1.01386023, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 1.6588867108431231, + "language_loss": 0.678249, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.69702804, + "num_input_tokens_seen": 307545230, + "step": 14259, + "time_per_iteration": 2.5161831378936768 + }, + { + "auxiliary_loss_clip": 0.01090422, + "auxiliary_loss_mlp": 0.010294, + "balance_loss_clip": 1.03235483, + "balance_loss_mlp": 1.01697206, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 1.6318643798702532, + "language_loss": 0.76877153, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.78996974, + "num_input_tokens_seen": 307564900, + "step": 14260, + "time_per_iteration": 2.498394012451172 + }, + { + "auxiliary_loss_clip": 0.01079648, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.03406107, + "balance_loss_mlp": 1.01486325, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 1.8538371859827818, + "language_loss": 0.74226093, + "learning_rate": 2.09413096654806e-07, + "loss": 0.7633298, + "num_input_tokens_seen": 307583500, + "step": 14261, + "time_per_iteration": 2.5428829193115234 + }, + { + "auxiliary_loss_clip": 0.01097815, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.03669095, + "balance_loss_mlp": 1.02048671, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 1.8828307950556056, + "language_loss": 0.78455746, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.80586803, + "num_input_tokens_seen": 307601430, + "step": 14262, + "time_per_iteration": 2.4664199352264404 + }, + { + "auxiliary_loss_clip": 0.01067702, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.03408027, + "balance_loss_mlp": 1.02073622, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 1.5852063800594351, + "language_loss": 0.67995787, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.70095241, + "num_input_tokens_seen": 307621495, + "step": 14263, + "time_per_iteration": 3.9632153511047363 + }, + { + "auxiliary_loss_clip": 0.01067727, + "auxiliary_loss_mlp": 0.00783114, + "balance_loss_clip": 1.03517449, + "balance_loss_mlp": 1.00950181, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 1.429847520878928, + "language_loss": 0.79692626, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81543463, + "num_input_tokens_seen": 307640840, + "step": 14264, + "time_per_iteration": 2.5703418254852295 + }, + { + "auxiliary_loss_clip": 0.01068387, + "auxiliary_loss_mlp": 0.01037375, + "balance_loss_clip": 1.03448045, + "balance_loss_mlp": 1.02447665, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 1.3430773218525816, + "language_loss": 0.69331408, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.71437168, + "num_input_tokens_seen": 307663820, + "step": 14265, + "time_per_iteration": 2.6521377563476562 + }, + { + "auxiliary_loss_clip": 0.01098823, + "auxiliary_loss_mlp": 0.01025112, + "balance_loss_clip": 1.03377771, + "balance_loss_mlp": 1.01409125, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 1.686631657684879, + "language_loss": 0.66219926, + "learning_rate": 2.085464646918027e-07, + "loss": 0.68343866, + "num_input_tokens_seen": 307682385, + "step": 14266, + "time_per_iteration": 2.469451427459717 + }, + { + "auxiliary_loss_clip": 0.01082936, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.03606606, + "balance_loss_mlp": 1.02137268, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 1.5763544922898534, + "language_loss": 0.75300914, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77416897, + "num_input_tokens_seen": 307704680, + "step": 14267, + "time_per_iteration": 2.5644521713256836 + }, + { + "auxiliary_loss_clip": 0.01090082, + "auxiliary_loss_mlp": 0.01030303, + "balance_loss_clip": 1.03467512, + "balance_loss_mlp": 1.01922274, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 1.885405341788224, + "language_loss": 0.87863433, + "learning_rate": 2.082002873852946e-07, + "loss": 0.89983821, + "num_input_tokens_seen": 307723245, + "step": 14268, + "time_per_iteration": 2.5018723011016846 + }, + { + "auxiliary_loss_clip": 0.01094005, + "auxiliary_loss_mlp": 0.01036373, + "balance_loss_clip": 1.03609145, + "balance_loss_mlp": 1.02466106, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 1.871110117676521, + "language_loss": 0.72763866, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.74894243, + "num_input_tokens_seen": 307742510, + "step": 14269, + "time_per_iteration": 2.475130081176758 + }, + { + "auxiliary_loss_clip": 0.01094268, + "auxiliary_loss_mlp": 0.01031458, + "balance_loss_clip": 1.03496873, + "balance_loss_mlp": 1.01947165, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 1.5093400776272106, + "language_loss": 0.66560447, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68686175, + "num_input_tokens_seen": 307766030, + "step": 14270, + "time_per_iteration": 2.6249141693115234 + }, + { + "auxiliary_loss_clip": 0.01076669, + "auxiliary_loss_mlp": 0.01025517, + "balance_loss_clip": 1.03083229, + "balance_loss_mlp": 1.01343536, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 1.7457115568552428, + "language_loss": 0.73991978, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.76094162, + "num_input_tokens_seen": 307785800, + "step": 14271, + "time_per_iteration": 2.507211685180664 + }, + { + "auxiliary_loss_clip": 0.00991631, + "auxiliary_loss_mlp": 0.00760857, + "balance_loss_clip": 1.00921059, + "balance_loss_mlp": 0.99931669, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.7996552781615845, + "language_loss": 0.59445429, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.61197913, + "num_input_tokens_seen": 307850995, + "step": 14272, + "time_per_iteration": 3.257549285888672 + }, + { + "auxiliary_loss_clip": 0.01082613, + "auxiliary_loss_mlp": 0.01036437, + "balance_loss_clip": 1.03523469, + "balance_loss_mlp": 1.02300835, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 1.7569647593561282, + "language_loss": 0.75171065, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77290118, + "num_input_tokens_seen": 307868585, + "step": 14273, + "time_per_iteration": 2.476038694381714 + }, + { + "auxiliary_loss_clip": 0.01091017, + "auxiliary_loss_mlp": 0.01030891, + "balance_loss_clip": 1.03343344, + "balance_loss_mlp": 1.01895177, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 1.9948670066156267, + "language_loss": 0.82071626, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.8419354, + "num_input_tokens_seen": 307886820, + "step": 14274, + "time_per_iteration": 2.4740488529205322 + }, + { + "auxiliary_loss_clip": 0.01019513, + "auxiliary_loss_mlp": 0.01000916, + "balance_loss_clip": 1.00645566, + "balance_loss_mlp": 0.99980116, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.7948898245030853, + "language_loss": 0.60849142, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.62869573, + "num_input_tokens_seen": 307944020, + "step": 14275, + "time_per_iteration": 3.1388211250305176 + }, + { + "auxiliary_loss_clip": 0.01088746, + "auxiliary_loss_mlp": 0.01024873, + "balance_loss_clip": 1.03509712, + "balance_loss_mlp": 1.01221895, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 1.954722737147305, + "language_loss": 0.59397674, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.6151129, + "num_input_tokens_seen": 307961055, + "step": 14276, + "time_per_iteration": 2.5775272846221924 + }, + { + "auxiliary_loss_clip": 0.0108091, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.03376591, + "balance_loss_mlp": 1.01884842, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 1.960624972890865, + "language_loss": 0.76271629, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.78382993, + "num_input_tokens_seen": 307978690, + "step": 14277, + "time_per_iteration": 2.456404447555542 + }, + { + "auxiliary_loss_clip": 0.01080615, + "auxiliary_loss_mlp": 0.01028242, + "balance_loss_clip": 1.03358054, + "balance_loss_mlp": 1.01631546, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 1.5219061636297668, + "language_loss": 0.83456302, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.85565162, + "num_input_tokens_seen": 307995870, + "step": 14278, + "time_per_iteration": 2.4925217628479004 + }, + { + "auxiliary_loss_clip": 0.01081358, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.03625965, + "balance_loss_mlp": 1.0192945, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 2.766120064029242, + "language_loss": 0.74424613, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.76537347, + "num_input_tokens_seen": 308013645, + "step": 14279, + "time_per_iteration": 2.4973630905151367 + }, + { + "auxiliary_loss_clip": 0.01102133, + "auxiliary_loss_mlp": 0.01029565, + "balance_loss_clip": 1.03469563, + "balance_loss_mlp": 1.01792431, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 2.39658558557705, + "language_loss": 0.66223925, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.68355626, + "num_input_tokens_seen": 308032490, + "step": 14280, + "time_per_iteration": 2.4938676357269287 + }, + { + "auxiliary_loss_clip": 0.01090233, + "auxiliary_loss_mlp": 0.01029612, + "balance_loss_clip": 1.03424621, + "balance_loss_mlp": 1.01838875, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 1.7848614455723533, + "language_loss": 0.62674105, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64793956, + "num_input_tokens_seen": 308052110, + "step": 14281, + "time_per_iteration": 2.4762024879455566 + }, + { + "auxiliary_loss_clip": 0.01079484, + "auxiliary_loss_mlp": 0.0078411, + "balance_loss_clip": 1.03490758, + "balance_loss_mlp": 1.01191056, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 1.8808242978369565, + "language_loss": 0.73132908, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.74996501, + "num_input_tokens_seen": 308070660, + "step": 14282, + "time_per_iteration": 2.5190725326538086 + }, + { + "auxiliary_loss_clip": 0.01074707, + "auxiliary_loss_mlp": 0.01027549, + "balance_loss_clip": 1.03109586, + "balance_loss_mlp": 1.01635504, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 1.707948690752795, + "language_loss": 0.75689209, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77791464, + "num_input_tokens_seen": 308089520, + "step": 14283, + "time_per_iteration": 2.504492998123169 + }, + { + "auxiliary_loss_clip": 0.01089428, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.03309357, + "balance_loss_mlp": 1.0195719, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 1.8555362639405433, + "language_loss": 0.59566772, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.61687899, + "num_input_tokens_seen": 308111545, + "step": 14284, + "time_per_iteration": 2.5965402126312256 + }, + { + "auxiliary_loss_clip": 0.01076492, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.03537869, + "balance_loss_mlp": 1.01737356, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 1.829371845947838, + "language_loss": 0.7567181, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.77777171, + "num_input_tokens_seen": 308129690, + "step": 14285, + "time_per_iteration": 2.58284068107605 + }, + { + "auxiliary_loss_clip": 0.01090381, + "auxiliary_loss_mlp": 0.01030715, + "balance_loss_clip": 1.03766406, + "balance_loss_mlp": 1.0185616, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 1.8346033553391012, + "language_loss": 0.74366474, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.76487565, + "num_input_tokens_seen": 308147410, + "step": 14286, + "time_per_iteration": 2.489820957183838 + }, + { + "auxiliary_loss_clip": 0.01009068, + "auxiliary_loss_mlp": 0.00761475, + "balance_loss_clip": 1.0074048, + "balance_loss_mlp": 1.00033748, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 0.7862165861254445, + "language_loss": 0.49466842, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51237386, + "num_input_tokens_seen": 308204875, + "step": 14287, + "time_per_iteration": 3.0912904739379883 + }, + { + "auxiliary_loss_clip": 0.01094695, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.03683281, + "balance_loss_mlp": 1.02065051, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 1.83524751104603, + "language_loss": 0.79105794, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81232905, + "num_input_tokens_seen": 308225690, + "step": 14288, + "time_per_iteration": 2.551358461380005 + }, + { + "auxiliary_loss_clip": 0.0106313, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.03534043, + "balance_loss_mlp": 1.01581001, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 2.057116476758319, + "language_loss": 0.81246138, + "learning_rate": 2.045818444528553e-07, + "loss": 0.83337259, + "num_input_tokens_seen": 308245255, + "step": 14289, + "time_per_iteration": 2.585846185684204 + }, + { + "auxiliary_loss_clip": 0.01096718, + "auxiliary_loss_mlp": 0.01026494, + "balance_loss_clip": 1.03780103, + "balance_loss_mlp": 1.01476347, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 2.1805359105371904, + "language_loss": 0.65161681, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.67284894, + "num_input_tokens_seen": 308261755, + "step": 14290, + "time_per_iteration": 2.458813428878784 + }, + { + "auxiliary_loss_clip": 0.01083658, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.03462124, + "balance_loss_mlp": 1.01672745, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 1.885358043451525, + "language_loss": 0.55216897, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.57329458, + "num_input_tokens_seen": 308285145, + "step": 14291, + "time_per_iteration": 4.023146629333496 + }, + { + "auxiliary_loss_clip": 0.01093583, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.03433406, + "balance_loss_mlp": 1.0180676, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 1.9853070736969745, + "language_loss": 0.71534693, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.73658276, + "num_input_tokens_seen": 308304130, + "step": 14292, + "time_per_iteration": 3.877232551574707 + }, + { + "auxiliary_loss_clip": 0.01092822, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.03435445, + "balance_loss_mlp": 1.01945615, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 1.483361975742606, + "language_loss": 0.71222097, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73345912, + "num_input_tokens_seen": 308324670, + "step": 14293, + "time_per_iteration": 2.509713888168335 + }, + { + "auxiliary_loss_clip": 0.01074019, + "auxiliary_loss_mlp": 0.01033609, + "balance_loss_clip": 1.03437448, + "balance_loss_mlp": 1.0218668, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 1.5647116355965973, + "language_loss": 0.68955636, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.71063262, + "num_input_tokens_seen": 308344215, + "step": 14294, + "time_per_iteration": 3.9416236877441406 + }, + { + "auxiliary_loss_clip": 0.01099332, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.03293359, + "balance_loss_mlp": 1.01877236, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 1.860477358135291, + "language_loss": 0.77960026, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.8008948, + "num_input_tokens_seen": 308360520, + "step": 14295, + "time_per_iteration": 2.4700238704681396 + }, + { + "auxiliary_loss_clip": 0.01079904, + "auxiliary_loss_mlp": 0.01039086, + "balance_loss_clip": 1.03316832, + "balance_loss_mlp": 1.02543068, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 3.0833437482347708, + "language_loss": 0.68570113, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.70689106, + "num_input_tokens_seen": 308376865, + "step": 14296, + "time_per_iteration": 2.4878060817718506 + }, + { + "auxiliary_loss_clip": 0.01075798, + "auxiliary_loss_mlp": 0.01028847, + "balance_loss_clip": 1.03224373, + "balance_loss_mlp": 1.01662803, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 1.9864974884780775, + "language_loss": 0.79602861, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.81707507, + "num_input_tokens_seen": 308395870, + "step": 14297, + "time_per_iteration": 2.5417251586914062 + }, + { + "auxiliary_loss_clip": 0.01088604, + "auxiliary_loss_mlp": 0.01027952, + "balance_loss_clip": 1.03294694, + "balance_loss_mlp": 1.01706254, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 1.7530587740965007, + "language_loss": 0.6832937, + "learning_rate": 2.030402708016954e-07, + "loss": 0.70445925, + "num_input_tokens_seen": 308417250, + "step": 14298, + "time_per_iteration": 2.534247636795044 + }, + { + "auxiliary_loss_clip": 0.01078518, + "auxiliary_loss_mlp": 0.01032632, + "balance_loss_clip": 1.03400171, + "balance_loss_mlp": 1.02102065, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 1.9480448322538344, + "language_loss": 0.68880832, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.70991969, + "num_input_tokens_seen": 308434565, + "step": 14299, + "time_per_iteration": 2.4974045753479004 + }, + { + "auxiliary_loss_clip": 0.0107311, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.03514194, + "balance_loss_mlp": 1.02263141, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 2.3068929476639917, + "language_loss": 0.71551311, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.73658794, + "num_input_tokens_seen": 308450040, + "step": 14300, + "time_per_iteration": 2.5992512702941895 + }, + { + "auxiliary_loss_clip": 0.01077215, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.03141785, + "balance_loss_mlp": 1.02026761, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 1.4352327900068327, + "language_loss": 0.6935631, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71466672, + "num_input_tokens_seen": 308470545, + "step": 14301, + "time_per_iteration": 3.9283857345581055 + }, + { + "auxiliary_loss_clip": 0.01052666, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.03405213, + "balance_loss_mlp": 1.0185442, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 2.1894381784055104, + "language_loss": 0.73887759, + "learning_rate": 2.023568983386641e-07, + "loss": 0.75970811, + "num_input_tokens_seen": 308490020, + "step": 14302, + "time_per_iteration": 2.574747085571289 + }, + { + "auxiliary_loss_clip": 0.01087849, + "auxiliary_loss_mlp": 0.01025862, + "balance_loss_clip": 1.03291464, + "balance_loss_mlp": 1.01496005, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 1.8153183580818106, + "language_loss": 0.83705652, + "learning_rate": 2.02186225623733e-07, + "loss": 0.85819364, + "num_input_tokens_seen": 308509065, + "step": 14303, + "time_per_iteration": 2.4991750717163086 + }, + { + "auxiliary_loss_clip": 0.01091105, + "auxiliary_loss_mlp": 0.01034373, + "balance_loss_clip": 1.03346276, + "balance_loss_mlp": 1.02218378, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 5.968230029996946, + "language_loss": 0.77086902, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.79212379, + "num_input_tokens_seen": 308524725, + "step": 14304, + "time_per_iteration": 2.4447391033172607 + }, + { + "auxiliary_loss_clip": 0.01105042, + "auxiliary_loss_mlp": 0.01036463, + "balance_loss_clip": 1.03595459, + "balance_loss_mlp": 1.02365434, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 2.460197556412072, + "language_loss": 0.5337168, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.55513185, + "num_input_tokens_seen": 308543525, + "step": 14305, + "time_per_iteration": 2.437100648880005 + }, + { + "auxiliary_loss_clip": 0.01102216, + "auxiliary_loss_mlp": 0.01025396, + "balance_loss_clip": 1.03509426, + "balance_loss_mlp": 1.01351714, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 2.111510066055082, + "language_loss": 0.84273958, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.8640157, + "num_input_tokens_seen": 308557995, + "step": 14306, + "time_per_iteration": 2.4166693687438965 + }, + { + "auxiliary_loss_clip": 0.01090651, + "auxiliary_loss_mlp": 0.00780487, + "balance_loss_clip": 1.03398955, + "balance_loss_mlp": 1.00538898, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 1.3955048729242354, + "language_loss": 0.71431077, + "learning_rate": 2.01504216561474e-07, + "loss": 0.73302221, + "num_input_tokens_seen": 308582750, + "step": 14307, + "time_per_iteration": 2.5940725803375244 + }, + { + "auxiliary_loss_clip": 0.01092166, + "auxiliary_loss_mlp": 0.00784889, + "balance_loss_clip": 1.03242755, + "balance_loss_mlp": 1.00941682, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 1.6164513888427787, + "language_loss": 0.6353693, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.65413988, + "num_input_tokens_seen": 308603770, + "step": 14308, + "time_per_iteration": 2.520097017288208 + }, + { + "auxiliary_loss_clip": 0.01010494, + "auxiliary_loss_mlp": 0.01006206, + "balance_loss_clip": 1.00744319, + "balance_loss_mlp": 1.00506747, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.6248265596218042, + "language_loss": 0.48486665, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50503367, + "num_input_tokens_seen": 308667735, + "step": 14309, + "time_per_iteration": 3.206472873687744 + }, + { + "auxiliary_loss_clip": 0.01048885, + "auxiliary_loss_mlp": 0.01035911, + "balance_loss_clip": 1.03398836, + "balance_loss_mlp": 1.02276826, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 1.7157077335028457, + "language_loss": 0.67163658, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.6924845, + "num_input_tokens_seen": 308686300, + "step": 14310, + "time_per_iteration": 2.6153416633605957 + }, + { + "auxiliary_loss_clip": 0.01037394, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.03099966, + "balance_loss_mlp": 1.02116013, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 1.83250936546361, + "language_loss": 0.78786892, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80857563, + "num_input_tokens_seen": 308705825, + "step": 14311, + "time_per_iteration": 2.627955913543701 + }, + { + "auxiliary_loss_clip": 0.01090366, + "auxiliary_loss_mlp": 0.01026353, + "balance_loss_clip": 1.03393245, + "balance_loss_mlp": 1.0148139, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 2.0810465927289172, + "language_loss": 0.71358752, + "learning_rate": 2.006532397626639e-07, + "loss": 0.73475474, + "num_input_tokens_seen": 308723340, + "step": 14312, + "time_per_iteration": 2.487879514694214 + }, + { + "auxiliary_loss_clip": 0.01075194, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.03186238, + "balance_loss_mlp": 1.01998591, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 1.965276591745554, + "language_loss": 0.78112292, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.80219758, + "num_input_tokens_seen": 308741280, + "step": 14313, + "time_per_iteration": 2.4891586303710938 + }, + { + "auxiliary_loss_clip": 0.01079302, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.03360784, + "balance_loss_mlp": 1.01952255, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 1.6508441725652045, + "language_loss": 0.73121333, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75232983, + "num_input_tokens_seen": 308762875, + "step": 14314, + "time_per_iteration": 2.602266550064087 + }, + { + "auxiliary_loss_clip": 0.01078745, + "auxiliary_loss_mlp": 0.01025625, + "balance_loss_clip": 1.03281391, + "balance_loss_mlp": 1.01400161, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 1.6673238488592372, + "language_loss": 0.69107068, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71211433, + "num_input_tokens_seen": 308780315, + "step": 14315, + "time_per_iteration": 2.4937996864318848 + }, + { + "auxiliary_loss_clip": 0.01091351, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.03476632, + "balance_loss_mlp": 1.02037859, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 1.810438416747908, + "language_loss": 0.72013074, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74136364, + "num_input_tokens_seen": 308799435, + "step": 14316, + "time_per_iteration": 2.538911819458008 + }, + { + "auxiliary_loss_clip": 0.0108641, + "auxiliary_loss_mlp": 0.01029243, + "balance_loss_clip": 1.03850555, + "balance_loss_mlp": 1.01763809, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 2.4384086151207174, + "language_loss": 0.82578897, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.84694552, + "num_input_tokens_seen": 308817730, + "step": 14317, + "time_per_iteration": 2.5049307346343994 + }, + { + "auxiliary_loss_clip": 0.01082975, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.03458309, + "balance_loss_mlp": 1.01948905, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 1.6615638224618794, + "language_loss": 0.67130899, + "learning_rate": 1.996343193113108e-07, + "loss": 0.6924541, + "num_input_tokens_seen": 308841735, + "step": 14318, + "time_per_iteration": 2.770570993423462 + }, + { + "auxiliary_loss_clip": 0.01088635, + "auxiliary_loss_mlp": 0.0102473, + "balance_loss_clip": 1.03418708, + "balance_loss_mlp": 1.0135895, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 1.549422841203104, + "language_loss": 0.71270728, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73384094, + "num_input_tokens_seen": 308865050, + "step": 14319, + "time_per_iteration": 2.654040575027466 + }, + { + "auxiliary_loss_clip": 0.01085178, + "auxiliary_loss_mlp": 0.00783745, + "balance_loss_clip": 1.03554499, + "balance_loss_mlp": 1.01054478, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 1.821585770455099, + "language_loss": 0.67141396, + "learning_rate": 1.992952252525839e-07, + "loss": 0.69010323, + "num_input_tokens_seen": 308885375, + "step": 14320, + "time_per_iteration": 2.5576584339141846 + }, + { + "auxiliary_loss_clip": 0.01075796, + "auxiliary_loss_mlp": 0.01035305, + "balance_loss_clip": 1.03235412, + "balance_loss_mlp": 1.0214169, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 2.026088616953404, + "language_loss": 0.80412966, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82524067, + "num_input_tokens_seen": 308904700, + "step": 14321, + "time_per_iteration": 2.5101845264434814 + }, + { + "auxiliary_loss_clip": 0.01087167, + "auxiliary_loss_mlp": 0.00783265, + "balance_loss_clip": 1.03275192, + "balance_loss_mlp": 1.00874937, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 1.87582374077272, + "language_loss": 0.71013534, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.72883964, + "num_input_tokens_seen": 308922985, + "step": 14322, + "time_per_iteration": 2.478645086288452 + }, + { + "auxiliary_loss_clip": 0.01084056, + "auxiliary_loss_mlp": 0.01036624, + "balance_loss_clip": 1.03452206, + "balance_loss_mlp": 1.02323127, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 1.8366979998199853, + "language_loss": 0.56087363, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.58208042, + "num_input_tokens_seen": 308940765, + "step": 14323, + "time_per_iteration": 2.5026872158050537 + }, + { + "auxiliary_loss_clip": 0.01068569, + "auxiliary_loss_mlp": 0.01027063, + "balance_loss_clip": 1.03322744, + "balance_loss_mlp": 1.01533866, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 1.9072317652752087, + "language_loss": 0.75941247, + "learning_rate": 1.986178565813801e-07, + "loss": 0.7803688, + "num_input_tokens_seen": 308960110, + "step": 14324, + "time_per_iteration": 2.5698184967041016 + }, + { + "auxiliary_loss_clip": 0.01058843, + "auxiliary_loss_mlp": 0.01037779, + "balance_loss_clip": 1.03448677, + "balance_loss_mlp": 1.0232172, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 1.9922937098769982, + "language_loss": 0.66591293, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.6868791, + "num_input_tokens_seen": 308976665, + "step": 14325, + "time_per_iteration": 2.5475683212280273 + }, + { + "auxiliary_loss_clip": 0.01094759, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.03556681, + "balance_loss_mlp": 1.0166707, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 1.570178396216154, + "language_loss": 0.64752221, + "learning_rate": 1.982795820716472e-07, + "loss": 0.6687603, + "num_input_tokens_seen": 308997015, + "step": 14326, + "time_per_iteration": 2.517493486404419 + }, + { + "auxiliary_loss_clip": 0.01081335, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.03310144, + "balance_loss_mlp": 1.02114034, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 2.112833627668595, + "language_loss": 0.84372538, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.86487281, + "num_input_tokens_seen": 309015250, + "step": 14327, + "time_per_iteration": 2.482632875442505 + }, + { + "auxiliary_loss_clip": 0.01092503, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.03413916, + "balance_loss_mlp": 1.02034569, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 2.312221904073291, + "language_loss": 0.74732393, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.768574, + "num_input_tokens_seen": 309034140, + "step": 14328, + "time_per_iteration": 2.5242748260498047 + }, + { + "auxiliary_loss_clip": 0.01092885, + "auxiliary_loss_mlp": 0.01025423, + "balance_loss_clip": 1.03479242, + "balance_loss_mlp": 1.01364517, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 1.8464338534804243, + "language_loss": 0.79932523, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.8205083, + "num_input_tokens_seen": 309055075, + "step": 14329, + "time_per_iteration": 3.909661054611206 + }, + { + "auxiliary_loss_clip": 0.01076774, + "auxiliary_loss_mlp": 0.01027184, + "balance_loss_clip": 1.03517699, + "balance_loss_mlp": 1.01517963, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 1.863513737111843, + "language_loss": 0.77016842, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.79120803, + "num_input_tokens_seen": 309074650, + "step": 14330, + "time_per_iteration": 3.9895944595336914 + }, + { + "auxiliary_loss_clip": 0.01091264, + "auxiliary_loss_mlp": 0.01027821, + "balance_loss_clip": 1.03358364, + "balance_loss_mlp": 1.01604283, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 1.85382558855581, + "language_loss": 0.64455986, + "learning_rate": 1.974350915342702e-07, + "loss": 0.66575068, + "num_input_tokens_seen": 309094385, + "step": 14331, + "time_per_iteration": 2.508577823638916 + }, + { + "auxiliary_loss_clip": 0.01078181, + "auxiliary_loss_mlp": 0.010288, + "balance_loss_clip": 1.03649521, + "balance_loss_mlp": 1.018327, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 1.683165370703536, + "language_loss": 0.76183689, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.78290665, + "num_input_tokens_seen": 309111815, + "step": 14332, + "time_per_iteration": 3.9387359619140625 + }, + { + "auxiliary_loss_clip": 0.01091403, + "auxiliary_loss_mlp": 0.01030477, + "balance_loss_clip": 1.03475785, + "balance_loss_mlp": 1.01739931, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 1.7162102180074712, + "language_loss": 0.66720891, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.68842769, + "num_input_tokens_seen": 309131385, + "step": 14333, + "time_per_iteration": 2.4972712993621826 + }, + { + "auxiliary_loss_clip": 0.01079603, + "auxiliary_loss_mlp": 0.01035259, + "balance_loss_clip": 1.03443372, + "balance_loss_mlp": 1.02157354, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 1.7573448555395703, + "language_loss": 0.62351131, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64465988, + "num_input_tokens_seen": 309155020, + "step": 14334, + "time_per_iteration": 2.667008638381958 + }, + { + "auxiliary_loss_clip": 0.01074176, + "auxiliary_loss_mlp": 0.01049724, + "balance_loss_clip": 1.03511453, + "balance_loss_mlp": 1.036057, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 1.894903500501414, + "language_loss": 0.69301027, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71424931, + "num_input_tokens_seen": 309172865, + "step": 14335, + "time_per_iteration": 2.519529104232788 + }, + { + "auxiliary_loss_clip": 0.01094576, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.03611839, + "balance_loss_mlp": 1.01975012, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 1.448608804390458, + "language_loss": 0.8315804, + "learning_rate": 1.965923098328135e-07, + "loss": 0.85284019, + "num_input_tokens_seen": 309193575, + "step": 14336, + "time_per_iteration": 2.5016698837280273 + }, + { + "auxiliary_loss_clip": 0.01106742, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.03556263, + "balance_loss_mlp": 1.01821637, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 1.6970126728312245, + "language_loss": 0.6748361, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.69620711, + "num_input_tokens_seen": 309212680, + "step": 14337, + "time_per_iteration": 2.444659948348999 + }, + { + "auxiliary_loss_clip": 0.01065882, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.03069496, + "balance_loss_mlp": 1.01736772, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 1.5307307166339434, + "language_loss": 0.67021108, + "learning_rate": 1.962556758053089e-07, + "loss": 0.69116592, + "num_input_tokens_seen": 309234485, + "step": 14338, + "time_per_iteration": 2.699268102645874 + }, + { + "auxiliary_loss_clip": 0.01082883, + "auxiliary_loss_mlp": 0.01031943, + "balance_loss_clip": 1.0358398, + "balance_loss_mlp": 1.02022457, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 2.0141022283664767, + "language_loss": 0.61777782, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.63892603, + "num_input_tokens_seen": 309253630, + "step": 14339, + "time_per_iteration": 3.893486499786377 + }, + { + "auxiliary_loss_clip": 0.01080691, + "auxiliary_loss_mlp": 0.00782501, + "balance_loss_clip": 1.03234971, + "balance_loss_mlp": 1.00840592, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 1.8359739096786403, + "language_loss": 0.62706578, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64569771, + "num_input_tokens_seen": 309270950, + "step": 14340, + "time_per_iteration": 2.5031309127807617 + }, + { + "auxiliary_loss_clip": 0.01056317, + "auxiliary_loss_mlp": 0.01024086, + "balance_loss_clip": 1.03533232, + "balance_loss_mlp": 1.0131005, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 1.651375475473071, + "language_loss": 0.80155182, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82235587, + "num_input_tokens_seen": 309288780, + "step": 14341, + "time_per_iteration": 2.5625181198120117 + }, + { + "auxiliary_loss_clip": 0.01087698, + "auxiliary_loss_mlp": 0.01030608, + "balance_loss_clip": 1.03503942, + "balance_loss_mlp": 1.01934266, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 1.8162647383247241, + "language_loss": 0.74722457, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.76840758, + "num_input_tokens_seen": 309310875, + "step": 14342, + "time_per_iteration": 2.5396804809570312 + }, + { + "auxiliary_loss_clip": 0.01072705, + "auxiliary_loss_mlp": 0.01028097, + "balance_loss_clip": 1.03487611, + "balance_loss_mlp": 1.01559782, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 2.7582309081837932, + "language_loss": 0.68724787, + "learning_rate": 1.95415287816028e-07, + "loss": 0.70825589, + "num_input_tokens_seen": 309329900, + "step": 14343, + "time_per_iteration": 2.5160303115844727 + }, + { + "auxiliary_loss_clip": 0.01092105, + "auxiliary_loss_mlp": 0.01041669, + "balance_loss_clip": 1.03420448, + "balance_loss_mlp": 1.02881789, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 1.6608740399308295, + "language_loss": 0.68347597, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.70481372, + "num_input_tokens_seen": 309347870, + "step": 14344, + "time_per_iteration": 2.4695301055908203 + }, + { + "auxiliary_loss_clip": 0.01070416, + "auxiliary_loss_mlp": 0.01037512, + "balance_loss_clip": 1.03410816, + "balance_loss_mlp": 1.02430975, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 1.4683342320131074, + "language_loss": 0.81459439, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83567369, + "num_input_tokens_seen": 309371695, + "step": 14345, + "time_per_iteration": 2.606796979904175 + }, + { + "auxiliary_loss_clip": 0.01096209, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.0355382, + "balance_loss_mlp": 1.02121711, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 2.000521816766669, + "language_loss": 0.50633645, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.52763522, + "num_input_tokens_seen": 309394645, + "step": 14346, + "time_per_iteration": 2.632624387741089 + }, + { + "auxiliary_loss_clip": 0.0102562, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.03313208, + "balance_loss_mlp": 1.01908791, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 1.4318276822531815, + "language_loss": 0.7495929, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.7701636, + "num_input_tokens_seen": 309413170, + "step": 14347, + "time_per_iteration": 2.678907632827759 + }, + { + "auxiliary_loss_clip": 0.01078802, + "auxiliary_loss_mlp": 0.01033233, + "balance_loss_clip": 1.03469706, + "balance_loss_mlp": 1.02023304, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 1.956767589993399, + "language_loss": 0.80738819, + "learning_rate": 1.945766105774449e-07, + "loss": 0.82850856, + "num_input_tokens_seen": 309431315, + "step": 14348, + "time_per_iteration": 2.5597450733184814 + }, + { + "auxiliary_loss_clip": 0.01087013, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.03352857, + "balance_loss_mlp": 1.01718259, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 1.8361032489845635, + "language_loss": 0.66073453, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.681889, + "num_input_tokens_seen": 309453020, + "step": 14349, + "time_per_iteration": 2.626281976699829 + }, + { + "auxiliary_loss_clip": 0.01092846, + "auxiliary_loss_mlp": 0.01037783, + "balance_loss_clip": 1.03457177, + "balance_loss_mlp": 1.02550411, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 2.1198271869230716, + "language_loss": 0.69822478, + "learning_rate": 1.942416188703573e-07, + "loss": 0.71953106, + "num_input_tokens_seen": 309469780, + "step": 14350, + "time_per_iteration": 2.453994035720825 + }, + { + "auxiliary_loss_clip": 0.01075058, + "auxiliary_loss_mlp": 0.01035441, + "balance_loss_clip": 1.03272963, + "balance_loss_mlp": 1.02345443, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 1.7639516240439652, + "language_loss": 0.77168167, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.79278666, + "num_input_tokens_seen": 309489610, + "step": 14351, + "time_per_iteration": 2.550434112548828 + }, + { + "auxiliary_loss_clip": 0.01091928, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.03469586, + "balance_loss_mlp": 1.01689959, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 1.7300266489273186, + "language_loss": 0.84721148, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.86841369, + "num_input_tokens_seen": 309508295, + "step": 14352, + "time_per_iteration": 2.5152041912078857 + }, + { + "auxiliary_loss_clip": 0.01020811, + "auxiliary_loss_mlp": 0.01001823, + "balance_loss_clip": 1.00740695, + "balance_loss_mlp": 1.00064325, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.7941488329985426, + "language_loss": 0.61982977, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.64005613, + "num_input_tokens_seen": 309567960, + "step": 14353, + "time_per_iteration": 3.111920118331909 + }, + { + "auxiliary_loss_clip": 0.01103261, + "auxiliary_loss_mlp": 0.01025705, + "balance_loss_clip": 1.03635216, + "balance_loss_mlp": 1.01483262, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 1.6955655197223638, + "language_loss": 0.81897503, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.84026468, + "num_input_tokens_seen": 309586050, + "step": 14354, + "time_per_iteration": 2.474431276321411 + }, + { + "auxiliary_loss_clip": 0.01083193, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.0332973, + "balance_loss_mlp": 1.01702261, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 1.989736609800371, + "language_loss": 0.85536575, + "learning_rate": 1.934053380181031e-07, + "loss": 0.87649333, + "num_input_tokens_seen": 309602910, + "step": 14355, + "time_per_iteration": 2.48223614692688 + }, + { + "auxiliary_loss_clip": 0.01072057, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.03292203, + "balance_loss_mlp": 1.01925969, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 2.0269166586533167, + "language_loss": 0.58799648, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.60903966, + "num_input_tokens_seen": 309621175, + "step": 14356, + "time_per_iteration": 2.5295395851135254 + }, + { + "auxiliary_loss_clip": 0.01059958, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.03278542, + "balance_loss_mlp": 1.01921439, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 1.7952721581564965, + "language_loss": 0.77021933, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79114038, + "num_input_tokens_seen": 309639395, + "step": 14357, + "time_per_iteration": 2.558746337890625 + }, + { + "auxiliary_loss_clip": 0.01094416, + "auxiliary_loss_mlp": 0.01028899, + "balance_loss_clip": 1.03643799, + "balance_loss_mlp": 1.01703155, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 2.117517734745847, + "language_loss": 0.77831018, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.79954326, + "num_input_tokens_seen": 309657265, + "step": 14358, + "time_per_iteration": 2.5504045486450195 + }, + { + "auxiliary_loss_clip": 0.01067716, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.03122473, + "balance_loss_mlp": 1.01926899, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.3536098979566764, + "language_loss": 0.75162554, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.77263296, + "num_input_tokens_seen": 309678610, + "step": 14359, + "time_per_iteration": 2.581483840942383 + }, + { + "auxiliary_loss_clip": 0.01042262, + "auxiliary_loss_mlp": 0.01028256, + "balance_loss_clip": 1.0327512, + "balance_loss_mlp": 1.01625752, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 1.9099360479195766, + "language_loss": 0.70518911, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72589433, + "num_input_tokens_seen": 309697710, + "step": 14360, + "time_per_iteration": 2.5933117866516113 + }, + { + "auxiliary_loss_clip": 0.01072772, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.03513563, + "balance_loss_mlp": 1.01996303, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 1.8664431560653567, + "language_loss": 0.76490766, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.78596544, + "num_input_tokens_seen": 309715985, + "step": 14361, + "time_per_iteration": 2.5299808979034424 + }, + { + "auxiliary_loss_clip": 0.01028156, + "auxiliary_loss_mlp": 0.01003259, + "balance_loss_clip": 1.00536895, + "balance_loss_mlp": 1.00217402, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.9649512638840279, + "language_loss": 0.58846861, + "learning_rate": 1.922374222645329e-07, + "loss": 0.60878271, + "num_input_tokens_seen": 309779930, + "step": 14362, + "time_per_iteration": 3.070331335067749 + }, + { + "auxiliary_loss_clip": 0.01038368, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.03531682, + "balance_loss_mlp": 1.01814187, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 1.7186453780768944, + "language_loss": 0.80659425, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.8272841, + "num_input_tokens_seen": 309800580, + "step": 14363, + "time_per_iteration": 2.694070816040039 + }, + { + "auxiliary_loss_clip": 0.01077513, + "auxiliary_loss_mlp": 0.01039517, + "balance_loss_clip": 1.03163934, + "balance_loss_mlp": 1.02519369, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 3.23297114591796, + "language_loss": 0.72947192, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.75064224, + "num_input_tokens_seen": 309821725, + "step": 14364, + "time_per_iteration": 2.5426387786865234 + }, + { + "auxiliary_loss_clip": 0.01080671, + "auxiliary_loss_mlp": 0.01031693, + "balance_loss_clip": 1.03264809, + "balance_loss_mlp": 1.01888967, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 2.144946792356575, + "language_loss": 0.71518731, + "learning_rate": 1.917379150731755e-07, + "loss": 0.7363109, + "num_input_tokens_seen": 309841565, + "step": 14365, + "time_per_iteration": 2.553417205810547 + }, + { + "auxiliary_loss_clip": 0.01081324, + "auxiliary_loss_mlp": 0.01042303, + "balance_loss_clip": 1.03474712, + "balance_loss_mlp": 1.0273962, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 2.107526216804772, + "language_loss": 0.7121228, + "learning_rate": 1.915715498065993e-07, + "loss": 0.7333591, + "num_input_tokens_seen": 309858635, + "step": 14366, + "time_per_iteration": 2.507657051086426 + }, + { + "auxiliary_loss_clip": 0.01075387, + "auxiliary_loss_mlp": 0.0102728, + "balance_loss_clip": 1.03480375, + "balance_loss_mlp": 1.01637888, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 1.5622021440368534, + "language_loss": 0.81649971, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.83752638, + "num_input_tokens_seen": 309877885, + "step": 14367, + "time_per_iteration": 2.5285778045654297 + }, + { + "auxiliary_loss_clip": 0.01078902, + "auxiliary_loss_mlp": 0.01027228, + "balance_loss_clip": 1.03485274, + "balance_loss_mlp": 1.01437092, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 2.3252829249218325, + "language_loss": 0.61954284, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.6406042, + "num_input_tokens_seen": 309893140, + "step": 14368, + "time_per_iteration": 3.9515724182128906 + }, + { + "auxiliary_loss_clip": 0.01093838, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.03651333, + "balance_loss_mlp": 1.01722121, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 1.9229075993352684, + "language_loss": 0.76262629, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78385615, + "num_input_tokens_seen": 309914175, + "step": 14369, + "time_per_iteration": 4.005048036575317 + }, + { + "auxiliary_loss_clip": 0.01080002, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.03517222, + "balance_loss_mlp": 1.02372432, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 1.6307834614385197, + "language_loss": 0.64065617, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66182053, + "num_input_tokens_seen": 309932395, + "step": 14370, + "time_per_iteration": 2.5659241676330566 + }, + { + "auxiliary_loss_clip": 0.0104499, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.0356369, + "balance_loss_mlp": 1.01872516, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 1.5236329362948935, + "language_loss": 0.66428733, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68504858, + "num_input_tokens_seen": 309951720, + "step": 14371, + "time_per_iteration": 4.12037992477417 + }, + { + "auxiliary_loss_clip": 0.01011906, + "auxiliary_loss_mlp": 0.00999251, + "balance_loss_clip": 1.00835991, + "balance_loss_mlp": 0.99807054, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.8648720129938896, + "language_loss": 0.5696491, + "learning_rate": 1.905747985193107e-07, + "loss": 0.58976066, + "num_input_tokens_seen": 310006120, + "step": 14372, + "time_per_iteration": 3.0109968185424805 + }, + { + "auxiliary_loss_clip": 0.01101632, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.03572536, + "balance_loss_mlp": 1.01992261, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 1.6809363842205356, + "language_loss": 0.79703593, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81837475, + "num_input_tokens_seen": 310026740, + "step": 14373, + "time_per_iteration": 2.511866807937622 + }, + { + "auxiliary_loss_clip": 0.01103569, + "auxiliary_loss_mlp": 0.01026325, + "balance_loss_clip": 1.03484464, + "balance_loss_mlp": 1.0143261, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 2.140796543609451, + "language_loss": 0.64072603, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.66202497, + "num_input_tokens_seen": 310044135, + "step": 14374, + "time_per_iteration": 2.442509412765503 + }, + { + "auxiliary_loss_clip": 0.01071509, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.03320074, + "balance_loss_mlp": 1.02101195, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 2.7212156678102994, + "language_loss": 0.77190328, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79294878, + "num_input_tokens_seen": 310061560, + "step": 14375, + "time_per_iteration": 2.5920231342315674 + }, + { + "auxiliary_loss_clip": 0.01060791, + "auxiliary_loss_mlp": 0.00782888, + "balance_loss_clip": 1.0340364, + "balance_loss_mlp": 1.01010776, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 1.660335461768411, + "language_loss": 0.606264, + "learning_rate": 1.899116698488117e-07, + "loss": 0.62470078, + "num_input_tokens_seen": 310087310, + "step": 14376, + "time_per_iteration": 2.9682211875915527 + }, + { + "auxiliary_loss_clip": 0.01068613, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.03394544, + "balance_loss_mlp": 1.0238831, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 1.4634806683649446, + "language_loss": 0.66316885, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68420762, + "num_input_tokens_seen": 310106260, + "step": 14377, + "time_per_iteration": 2.552854537963867 + }, + { + "auxiliary_loss_clip": 0.01079554, + "auxiliary_loss_mlp": 0.01037167, + "balance_loss_clip": 1.03336227, + "balance_loss_mlp": 1.02353525, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 1.5185510818639527, + "language_loss": 0.70227075, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72343791, + "num_input_tokens_seen": 310125305, + "step": 14378, + "time_per_iteration": 3.91347336769104 + }, + { + "auxiliary_loss_clip": 0.01020441, + "auxiliary_loss_mlp": 0.01002364, + "balance_loss_clip": 1.00831139, + "balance_loss_mlp": 1.00134468, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.805008810325254, + "language_loss": 0.6030696, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62329763, + "num_input_tokens_seen": 310189270, + "step": 14379, + "time_per_iteration": 3.107417345046997 + }, + { + "auxiliary_loss_clip": 0.01080513, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.03432047, + "balance_loss_mlp": 1.01886165, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 1.9557671766676492, + "language_loss": 0.74418366, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76529813, + "num_input_tokens_seen": 310208395, + "step": 14380, + "time_per_iteration": 2.5550310611724854 + }, + { + "auxiliary_loss_clip": 0.01076811, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.03350282, + "balance_loss_mlp": 1.02015209, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 2.3066181854298464, + "language_loss": 0.75574088, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.776838, + "num_input_tokens_seen": 310227415, + "step": 14381, + "time_per_iteration": 2.50007963180542 + }, + { + "auxiliary_loss_clip": 0.01078917, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.03442943, + "balance_loss_mlp": 1.02019453, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 2.542596483785563, + "language_loss": 0.84089983, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.8620013, + "num_input_tokens_seen": 310242625, + "step": 14382, + "time_per_iteration": 2.487907648086548 + }, + { + "auxiliary_loss_clip": 0.01095768, + "auxiliary_loss_mlp": 0.01034164, + "balance_loss_clip": 1.03592396, + "balance_loss_mlp": 1.02171862, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 2.422291245023437, + "language_loss": 0.75641978, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.77771914, + "num_input_tokens_seen": 310260585, + "step": 14383, + "time_per_iteration": 2.4710464477539062 + }, + { + "auxiliary_loss_clip": 0.01083059, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.03694248, + "balance_loss_mlp": 1.01536274, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 1.7605897013623193, + "language_loss": 0.85087413, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87197721, + "num_input_tokens_seen": 310277210, + "step": 14384, + "time_per_iteration": 2.516620635986328 + }, + { + "auxiliary_loss_clip": 0.01089243, + "auxiliary_loss_mlp": 0.01031646, + "balance_loss_clip": 1.0334456, + "balance_loss_mlp": 1.01996899, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 1.7114596218779896, + "language_loss": 0.8087551, + "learning_rate": 1.884236463176072e-07, + "loss": 0.82996398, + "num_input_tokens_seen": 310296610, + "step": 14385, + "time_per_iteration": 2.480952739715576 + }, + { + "auxiliary_loss_clip": 0.01088159, + "auxiliary_loss_mlp": 0.01029558, + "balance_loss_clip": 1.03755355, + "balance_loss_mlp": 1.01721978, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 1.9821178530434644, + "language_loss": 0.7264657, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.74764287, + "num_input_tokens_seen": 310316830, + "step": 14386, + "time_per_iteration": 2.5641837120056152 + }, + { + "auxiliary_loss_clip": 0.01089755, + "auxiliary_loss_mlp": 0.01034655, + "balance_loss_clip": 1.03458083, + "balance_loss_mlp": 1.0221796, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 1.8100123878409637, + "language_loss": 0.8214758, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84271991, + "num_input_tokens_seen": 310334355, + "step": 14387, + "time_per_iteration": 2.4554600715637207 + }, + { + "auxiliary_loss_clip": 0.01101359, + "auxiliary_loss_mlp": 0.01026838, + "balance_loss_clip": 1.03522468, + "balance_loss_mlp": 1.01523256, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 1.8428830786662658, + "language_loss": 0.68590343, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.70718551, + "num_input_tokens_seen": 310352900, + "step": 14388, + "time_per_iteration": 2.4846878051757812 + }, + { + "auxiliary_loss_clip": 0.01069725, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.03550351, + "balance_loss_mlp": 1.02145803, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 1.5004591300114027, + "language_loss": 0.90366977, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92469323, + "num_input_tokens_seen": 310372855, + "step": 14389, + "time_per_iteration": 2.5894806385040283 + }, + { + "auxiliary_loss_clip": 0.01060672, + "auxiliary_loss_mlp": 0.00781095, + "balance_loss_clip": 1.03655303, + "balance_loss_mlp": 1.00806236, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 1.602465787174566, + "language_loss": 0.70683658, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.72525418, + "num_input_tokens_seen": 310391595, + "step": 14390, + "time_per_iteration": 2.6144251823425293 + }, + { + "auxiliary_loss_clip": 0.01104901, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_clip": 1.03546166, + "balance_loss_mlp": 1.02063549, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 1.573645123752967, + "language_loss": 0.82385993, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.8452388, + "num_input_tokens_seen": 310410090, + "step": 14391, + "time_per_iteration": 2.4699625968933105 + }, + { + "auxiliary_loss_clip": 0.01002717, + "auxiliary_loss_mlp": 0.01000466, + "balance_loss_clip": 1.01336384, + "balance_loss_mlp": 0.99918431, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.8012621255877996, + "language_loss": 0.6805166, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.70054841, + "num_input_tokens_seen": 310470055, + "step": 14392, + "time_per_iteration": 3.060218334197998 + }, + { + "auxiliary_loss_clip": 0.01097197, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.035092, + "balance_loss_mlp": 1.01893318, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 2.0360508004548556, + "language_loss": 0.75912416, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.78041327, + "num_input_tokens_seen": 310487665, + "step": 14393, + "time_per_iteration": 2.4742772579193115 + }, + { + "auxiliary_loss_clip": 0.01081491, + "auxiliary_loss_mlp": 0.01033347, + "balance_loss_clip": 1.03193223, + "balance_loss_mlp": 1.02119327, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 2.404715480824046, + "language_loss": 0.73675168, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.75790012, + "num_input_tokens_seen": 310506130, + "step": 14394, + "time_per_iteration": 2.4694535732269287 + }, + { + "auxiliary_loss_clip": 0.01093753, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.03391612, + "balance_loss_mlp": 1.0199796, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 2.483713687961178, + "language_loss": 0.64778, + "learning_rate": 1.867768130747036e-07, + "loss": 0.66904759, + "num_input_tokens_seen": 310532445, + "step": 14395, + "time_per_iteration": 2.7875232696533203 + }, + { + "auxiliary_loss_clip": 0.01086695, + "auxiliary_loss_mlp": 0.01035431, + "balance_loss_clip": 1.03361583, + "balance_loss_mlp": 1.02336121, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 1.5740985234907725, + "language_loss": 0.67918807, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.70040929, + "num_input_tokens_seen": 310552300, + "step": 14396, + "time_per_iteration": 2.4845588207244873 + }, + { + "auxiliary_loss_clip": 0.01095702, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.03603137, + "balance_loss_mlp": 1.02378893, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 4.906784461845033, + "language_loss": 0.69095188, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71226937, + "num_input_tokens_seen": 310572710, + "step": 14397, + "time_per_iteration": 2.5117950439453125 + }, + { + "auxiliary_loss_clip": 0.01084867, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.03384447, + "balance_loss_mlp": 1.01691771, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 1.7726908334626978, + "language_loss": 0.63292503, + "learning_rate": 1.86284103591253e-07, + "loss": 0.65405679, + "num_input_tokens_seen": 310592460, + "step": 14398, + "time_per_iteration": 2.52276349067688 + }, + { + "auxiliary_loss_clip": 0.01067604, + "auxiliary_loss_mlp": 0.0103137, + "balance_loss_clip": 1.03581762, + "balance_loss_mlp": 1.01826334, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 2.2199238416803326, + "language_loss": 0.76109624, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78208596, + "num_input_tokens_seen": 310609375, + "step": 14399, + "time_per_iteration": 2.5639255046844482 + }, + { + "auxiliary_loss_clip": 0.01084803, + "auxiliary_loss_mlp": 0.01026827, + "balance_loss_clip": 1.03411913, + "balance_loss_mlp": 1.01543081, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 1.936478877469833, + "language_loss": 0.93071848, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.9518348, + "num_input_tokens_seen": 310627405, + "step": 14400, + "time_per_iteration": 2.4516475200653076 + }, + { + "auxiliary_loss_clip": 0.01047964, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.03507876, + "balance_loss_mlp": 1.02525997, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 2.031899192966752, + "language_loss": 0.67453265, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69538212, + "num_input_tokens_seen": 310649945, + "step": 14401, + "time_per_iteration": 2.714660406112671 + }, + { + "auxiliary_loss_clip": 0.01093621, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.03379726, + "balance_loss_mlp": 1.01738095, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 1.9808252493382519, + "language_loss": 0.73835278, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.75958216, + "num_input_tokens_seen": 310668285, + "step": 14402, + "time_per_iteration": 2.472064256668091 + }, + { + "auxiliary_loss_clip": 0.01037346, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.03284478, + "balance_loss_mlp": 1.01915371, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 2.222115425870408, + "language_loss": 0.75052655, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.77120864, + "num_input_tokens_seen": 310687015, + "step": 14403, + "time_per_iteration": 2.668152093887329 + }, + { + "auxiliary_loss_clip": 0.01078057, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.0337522, + "balance_loss_mlp": 1.02131212, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 1.822312171553705, + "language_loss": 0.73384976, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75497341, + "num_input_tokens_seen": 310707580, + "step": 14404, + "time_per_iteration": 2.5422472953796387 + }, + { + "auxiliary_loss_clip": 0.0106603, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.03504014, + "balance_loss_mlp": 1.01828837, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 1.7998829825882579, + "language_loss": 0.70683181, + "learning_rate": 1.851368555901447e-07, + "loss": 0.7277981, + "num_input_tokens_seen": 310727300, + "step": 14405, + "time_per_iteration": 2.5714354515075684 + }, + { + "auxiliary_loss_clip": 0.01094097, + "auxiliary_loss_mlp": 0.00784018, + "balance_loss_clip": 1.03429174, + "balance_loss_mlp": 1.01051497, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 1.717676860369089, + "language_loss": 0.66417897, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.68296015, + "num_input_tokens_seen": 310744935, + "step": 14406, + "time_per_iteration": 2.466764450073242 + }, + { + "auxiliary_loss_clip": 0.01082548, + "auxiliary_loss_mlp": 0.01024174, + "balance_loss_clip": 1.03513241, + "balance_loss_mlp": 1.01311779, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 2.044169927104571, + "language_loss": 0.82976329, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.8508305, + "num_input_tokens_seen": 310765085, + "step": 14407, + "time_per_iteration": 5.404285907745361 + }, + { + "auxiliary_loss_clip": 0.01093215, + "auxiliary_loss_mlp": 0.0103508, + "balance_loss_clip": 1.03629041, + "balance_loss_mlp": 1.02371335, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 1.7812882157547132, + "language_loss": 0.69898272, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.72026575, + "num_input_tokens_seen": 310783260, + "step": 14408, + "time_per_iteration": 2.507185935974121 + }, + { + "auxiliary_loss_clip": 0.01082447, + "auxiliary_loss_mlp": 0.01031483, + "balance_loss_clip": 1.03394604, + "balance_loss_mlp": 1.02035511, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 1.8654923238140926, + "language_loss": 0.77419215, + "learning_rate": 1.844827992025304e-07, + "loss": 0.79533148, + "num_input_tokens_seen": 310801970, + "step": 14409, + "time_per_iteration": 2.485217809677124 + }, + { + "auxiliary_loss_clip": 0.01095508, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.0363729, + "balance_loss_mlp": 1.01785684, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 1.9289014296633187, + "language_loss": 0.77215588, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.79341596, + "num_input_tokens_seen": 310822070, + "step": 14410, + "time_per_iteration": 3.892517328262329 + }, + { + "auxiliary_loss_clip": 0.01062061, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.03306997, + "balance_loss_mlp": 1.01929069, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 1.965404520009851, + "language_loss": 0.77610034, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79703486, + "num_input_tokens_seen": 310838355, + "step": 14411, + "time_per_iteration": 2.5273244380950928 + }, + { + "auxiliary_loss_clip": 0.01074831, + "auxiliary_loss_mlp": 0.01031965, + "balance_loss_clip": 1.03307319, + "balance_loss_mlp": 1.02078366, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 2.141652419848165, + "language_loss": 0.73671895, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.75778693, + "num_input_tokens_seen": 310856055, + "step": 14412, + "time_per_iteration": 2.504023551940918 + }, + { + "auxiliary_loss_clip": 0.01086971, + "auxiliary_loss_mlp": 0.0078536, + "balance_loss_clip": 1.03345585, + "balance_loss_mlp": 1.01084685, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 1.8730450840445714, + "language_loss": 0.69458073, + "learning_rate": 1.83829844328371e-07, + "loss": 0.71330404, + "num_input_tokens_seen": 310876695, + "step": 14413, + "time_per_iteration": 2.509875774383545 + }, + { + "auxiliary_loss_clip": 0.01093308, + "auxiliary_loss_mlp": 0.01031886, + "balance_loss_clip": 1.03592503, + "balance_loss_mlp": 1.02008462, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 2.066163244266518, + "language_loss": 0.6326617, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.65391362, + "num_input_tokens_seen": 310893880, + "step": 14414, + "time_per_iteration": 2.4538064002990723 + }, + { + "auxiliary_loss_clip": 0.0107319, + "auxiliary_loss_mlp": 0.0078202, + "balance_loss_clip": 1.0353775, + "balance_loss_mlp": 1.00707245, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 1.5689687316819567, + "language_loss": 0.63706315, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.65561521, + "num_input_tokens_seen": 310914145, + "step": 14415, + "time_per_iteration": 2.575549364089966 + }, + { + "auxiliary_loss_clip": 0.01003033, + "auxiliary_loss_mlp": 0.00999894, + "balance_loss_clip": 1.01099765, + "balance_loss_mlp": 0.99879724, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.7960438584738667, + "language_loss": 0.60397005, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.6239993, + "num_input_tokens_seen": 310972825, + "step": 14416, + "time_per_iteration": 4.601866006851196 + }, + { + "auxiliary_loss_clip": 0.01093988, + "auxiliary_loss_mlp": 0.00784744, + "balance_loss_clip": 1.0338372, + "balance_loss_mlp": 1.01063156, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 1.7660233120590305, + "language_loss": 0.74721551, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76600289, + "num_input_tokens_seen": 310992050, + "step": 14417, + "time_per_iteration": 2.479079008102417 + }, + { + "auxiliary_loss_clip": 0.01079034, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.03375292, + "balance_loss_mlp": 1.02368259, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 2.009781431927632, + "language_loss": 0.75131309, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77245277, + "num_input_tokens_seen": 311011105, + "step": 14418, + "time_per_iteration": 2.523104429244995 + }, + { + "auxiliary_loss_clip": 0.0109068, + "auxiliary_loss_mlp": 0.01033241, + "balance_loss_clip": 1.03390741, + "balance_loss_mlp": 1.02147484, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 1.5096522633567193, + "language_loss": 0.68330371, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70454288, + "num_input_tokens_seen": 311032080, + "step": 14419, + "time_per_iteration": 2.4989635944366455 + }, + { + "auxiliary_loss_clip": 0.01094745, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.03558409, + "balance_loss_mlp": 1.01924729, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 1.678164629918293, + "language_loss": 0.78284943, + "learning_rate": 1.826898250065465e-07, + "loss": 0.8040992, + "num_input_tokens_seen": 311049735, + "step": 14420, + "time_per_iteration": 2.4932966232299805 + }, + { + "auxiliary_loss_clip": 0.01088878, + "auxiliary_loss_mlp": 0.0103092, + "balance_loss_clip": 1.03422034, + "balance_loss_mlp": 1.01876068, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 1.5465729079255368, + "language_loss": 0.8370347, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.85823268, + "num_input_tokens_seen": 311067675, + "step": 14421, + "time_per_iteration": 2.460062265396118 + }, + { + "auxiliary_loss_clip": 0.01009585, + "auxiliary_loss_mlp": 0.00999574, + "balance_loss_clip": 1.01100755, + "balance_loss_mlp": 0.99835247, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.7032300346641079, + "language_loss": 0.49142066, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51151228, + "num_input_tokens_seen": 311126605, + "step": 14422, + "time_per_iteration": 3.1209442615509033 + }, + { + "auxiliary_loss_clip": 0.01080869, + "auxiliary_loss_mlp": 0.0078321, + "balance_loss_clip": 1.03461194, + "balance_loss_mlp": 1.01089549, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 1.5199134324055197, + "language_loss": 0.73690403, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.75554484, + "num_input_tokens_seen": 311147325, + "step": 14423, + "time_per_iteration": 2.553122043609619 + }, + { + "auxiliary_loss_clip": 0.01057291, + "auxiliary_loss_mlp": 0.01028422, + "balance_loss_clip": 1.03032231, + "balance_loss_mlp": 1.01584005, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 1.5072241248761646, + "language_loss": 0.76713389, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.78799105, + "num_input_tokens_seen": 311165385, + "step": 14424, + "time_per_iteration": 2.584876298904419 + }, + { + "auxiliary_loss_clip": 0.01062704, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.03180504, + "balance_loss_mlp": 1.02323532, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 2.844823730131768, + "language_loss": 0.712111, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73309124, + "num_input_tokens_seen": 311185860, + "step": 14425, + "time_per_iteration": 2.6039299964904785 + }, + { + "auxiliary_loss_clip": 0.01095273, + "auxiliary_loss_mlp": 0.01030853, + "balance_loss_clip": 1.03590155, + "balance_loss_mlp": 1.01853871, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 1.785028220009416, + "language_loss": 0.68044257, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70170385, + "num_input_tokens_seen": 311205810, + "step": 14426, + "time_per_iteration": 2.55110239982605 + }, + { + "auxiliary_loss_clip": 0.0106723, + "auxiliary_loss_mlp": 0.01025273, + "balance_loss_clip": 1.03578138, + "balance_loss_mlp": 1.01274967, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 1.7291260375608357, + "language_loss": 0.70591545, + "learning_rate": 1.815531824008234e-07, + "loss": 0.72684044, + "num_input_tokens_seen": 311226080, + "step": 14427, + "time_per_iteration": 2.547909736633301 + }, + { + "auxiliary_loss_clip": 0.01075908, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.03526962, + "balance_loss_mlp": 1.01671243, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 1.6232513705115956, + "language_loss": 0.68088883, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70193434, + "num_input_tokens_seen": 311246380, + "step": 14428, + "time_per_iteration": 2.6174912452697754 + }, + { + "auxiliary_loss_clip": 0.01074579, + "auxiliary_loss_mlp": 0.0102734, + "balance_loss_clip": 1.03597963, + "balance_loss_mlp": 1.01645005, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 2.6067475841888776, + "language_loss": 0.70718443, + "learning_rate": 1.812290478794889e-07, + "loss": 0.72820359, + "num_input_tokens_seen": 311266465, + "step": 14429, + "time_per_iteration": 2.5142343044281006 + }, + { + "auxiliary_loss_clip": 0.0108277, + "auxiliary_loss_mlp": 0.0102777, + "balance_loss_clip": 1.03497553, + "balance_loss_mlp": 1.01598644, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 2.046750130156718, + "language_loss": 0.66992331, + "learning_rate": 1.810670840677151e-07, + "loss": 0.69102871, + "num_input_tokens_seen": 311285075, + "step": 14430, + "time_per_iteration": 2.5104892253875732 + }, + { + "auxiliary_loss_clip": 0.01059568, + "auxiliary_loss_mlp": 0.01033484, + "balance_loss_clip": 1.03500676, + "balance_loss_mlp": 1.02075243, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 2.268956116768792, + "language_loss": 0.68991667, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71084726, + "num_input_tokens_seen": 311303230, + "step": 14431, + "time_per_iteration": 2.5608632564544678 + }, + { + "auxiliary_loss_clip": 0.01091441, + "auxiliary_loss_mlp": 0.01032983, + "balance_loss_clip": 1.03343737, + "balance_loss_mlp": 1.0208118, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 2.7239039204909976, + "language_loss": 0.63489181, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.65613604, + "num_input_tokens_seen": 311318070, + "step": 14432, + "time_per_iteration": 2.501511335372925 + }, + { + "auxiliary_loss_clip": 0.01094395, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.03568792, + "balance_loss_mlp": 1.02501893, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 1.916569742549719, + "language_loss": 0.78306913, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.80437309, + "num_input_tokens_seen": 311334885, + "step": 14433, + "time_per_iteration": 2.4491987228393555 + }, + { + "auxiliary_loss_clip": 0.0101282, + "auxiliary_loss_mlp": 0.01001889, + "balance_loss_clip": 1.01037836, + "balance_loss_mlp": 1.00071478, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 1.196361005493811, + "language_loss": 0.58438575, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60453284, + "num_input_tokens_seen": 311399780, + "step": 14434, + "time_per_iteration": 3.204069137573242 + }, + { + "auxiliary_loss_clip": 0.01078974, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.03241467, + "balance_loss_mlp": 1.02138579, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 1.772296103460185, + "language_loss": 0.80097556, + "learning_rate": 1.802582997433628e-07, + "loss": 0.82209003, + "num_input_tokens_seen": 311419610, + "step": 14435, + "time_per_iteration": 2.595299243927002 + }, + { + "auxiliary_loss_clip": 0.01081378, + "auxiliary_loss_mlp": 0.00782678, + "balance_loss_clip": 1.03216851, + "balance_loss_mlp": 1.00764465, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 1.8524409255509766, + "language_loss": 0.617226, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.63586652, + "num_input_tokens_seen": 311440045, + "step": 14436, + "time_per_iteration": 2.6380438804626465 + }, + { + "auxiliary_loss_clip": 0.01081272, + "auxiliary_loss_mlp": 0.01028358, + "balance_loss_clip": 1.03798223, + "balance_loss_mlp": 1.01539993, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 2.1895988669964876, + "language_loss": 0.70453453, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.72563076, + "num_input_tokens_seen": 311456660, + "step": 14437, + "time_per_iteration": 2.4976179599761963 + }, + { + "auxiliary_loss_clip": 0.01070212, + "auxiliary_loss_mlp": 0.0102767, + "balance_loss_clip": 1.03434575, + "balance_loss_mlp": 1.01578474, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 1.9853595822886019, + "language_loss": 0.80282462, + "learning_rate": 1.797738571571381e-07, + "loss": 0.82380342, + "num_input_tokens_seen": 311475460, + "step": 14438, + "time_per_iteration": 2.6041011810302734 + }, + { + "auxiliary_loss_clip": 0.01084126, + "auxiliary_loss_mlp": 0.01022258, + "balance_loss_clip": 1.03354406, + "balance_loss_mlp": 1.0106051, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 2.53776374697169, + "language_loss": 0.67522347, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69628733, + "num_input_tokens_seen": 311494575, + "step": 14439, + "time_per_iteration": 2.4864957332611084 + }, + { + "auxiliary_loss_clip": 0.0109085, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.03418827, + "balance_loss_mlp": 1.02135706, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 1.5248988350073533, + "language_loss": 0.63853621, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.65976894, + "num_input_tokens_seen": 311515805, + "step": 14440, + "time_per_iteration": 2.6327784061431885 + }, + { + "auxiliary_loss_clip": 0.01091593, + "auxiliary_loss_mlp": 0.01028846, + "balance_loss_clip": 1.03546727, + "balance_loss_mlp": 1.01683593, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 1.6098979563491111, + "language_loss": 0.65765601, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.67886043, + "num_input_tokens_seen": 311536000, + "step": 14441, + "time_per_iteration": 2.5107715129852295 + }, + { + "auxiliary_loss_clip": 0.01085846, + "auxiliary_loss_mlp": 0.01024265, + "balance_loss_clip": 1.03647733, + "balance_loss_mlp": 1.01352477, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 1.56127051517466, + "language_loss": 0.66406894, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68517005, + "num_input_tokens_seen": 311556220, + "step": 14442, + "time_per_iteration": 2.4997498989105225 + }, + { + "auxiliary_loss_clip": 0.01078658, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.03345203, + "balance_loss_mlp": 1.0188899, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 1.7886701385723607, + "language_loss": 0.72135437, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74246782, + "num_input_tokens_seen": 311572530, + "step": 14443, + "time_per_iteration": 2.4866933822631836 + }, + { + "auxiliary_loss_clip": 0.01103796, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.0349853, + "balance_loss_mlp": 1.01674294, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 1.901331424415087, + "language_loss": 0.83502603, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85635424, + "num_input_tokens_seen": 311591105, + "step": 14444, + "time_per_iteration": 2.5147132873535156 + }, + { + "auxiliary_loss_clip": 0.01064798, + "auxiliary_loss_mlp": 0.01027388, + "balance_loss_clip": 1.03559351, + "balance_loss_mlp": 1.01502037, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 1.8192943280136162, + "language_loss": 0.77091241, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79183424, + "num_input_tokens_seen": 311608350, + "step": 14445, + "time_per_iteration": 3.909639596939087 + }, + { + "auxiliary_loss_clip": 0.01094656, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.03608871, + "balance_loss_mlp": 1.0175935, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 1.9177722077796089, + "language_loss": 0.67980933, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.70105481, + "num_input_tokens_seen": 311626380, + "step": 14446, + "time_per_iteration": 3.9154200553894043 + }, + { + "auxiliary_loss_clip": 0.01090778, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.03613257, + "balance_loss_mlp": 1.01945543, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 1.588005544073923, + "language_loss": 0.8299374, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.85116184, + "num_input_tokens_seen": 311644345, + "step": 14447, + "time_per_iteration": 2.515702962875366 + }, + { + "auxiliary_loss_clip": 0.01025224, + "auxiliary_loss_mlp": 0.01025456, + "balance_loss_clip": 1.03283608, + "balance_loss_mlp": 1.01363635, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 1.8544304848122817, + "language_loss": 0.74066603, + "learning_rate": 1.781635359686515e-07, + "loss": 0.76117283, + "num_input_tokens_seen": 311663340, + "step": 14448, + "time_per_iteration": 2.7022008895874023 + }, + { + "auxiliary_loss_clip": 0.01078841, + "auxiliary_loss_mlp": 0.01032774, + "balance_loss_clip": 1.0335288, + "balance_loss_mlp": 1.01959515, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 2.0466642720063155, + "language_loss": 0.80271941, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.82383555, + "num_input_tokens_seen": 311679860, + "step": 14449, + "time_per_iteration": 3.853191614151001 + }, + { + "auxiliary_loss_clip": 0.01000724, + "auxiliary_loss_mlp": 0.01003839, + "balance_loss_clip": 1.00799966, + "balance_loss_mlp": 1.00266504, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.7999466041093845, + "language_loss": 0.6060611, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62610674, + "num_input_tokens_seen": 311738135, + "step": 14450, + "time_per_iteration": 3.082252025604248 + }, + { + "auxiliary_loss_clip": 0.01076951, + "auxiliary_loss_mlp": 0.0102887, + "balance_loss_clip": 1.0331614, + "balance_loss_mlp": 1.0166328, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 2.17160081883365, + "language_loss": 0.76062059, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.7816788, + "num_input_tokens_seen": 311756975, + "step": 14451, + "time_per_iteration": 2.5809717178344727 + }, + { + "auxiliary_loss_clip": 0.01090462, + "auxiliary_loss_mlp": 0.01025263, + "balance_loss_clip": 1.03428805, + "balance_loss_mlp": 1.0134728, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 2.616801230320457, + "language_loss": 0.72009099, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74124825, + "num_input_tokens_seen": 311771830, + "step": 14452, + "time_per_iteration": 2.4371657371520996 + }, + { + "auxiliary_loss_clip": 0.01080744, + "auxiliary_loss_mlp": 0.00786409, + "balance_loss_clip": 1.03462815, + "balance_loss_mlp": 1.01444221, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 1.4916465186526864, + "language_loss": 0.72074056, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.73941207, + "num_input_tokens_seen": 311790130, + "step": 14453, + "time_per_iteration": 2.512530565261841 + }, + { + "auxiliary_loss_clip": 0.01091214, + "auxiliary_loss_mlp": 0.0103269, + "balance_loss_clip": 1.03448749, + "balance_loss_mlp": 1.02064419, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 2.1907338717695257, + "language_loss": 0.73390532, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.75514442, + "num_input_tokens_seen": 311808360, + "step": 14454, + "time_per_iteration": 2.483767032623291 + }, + { + "auxiliary_loss_clip": 0.01103057, + "auxiliary_loss_mlp": 0.01031957, + "balance_loss_clip": 1.03573525, + "balance_loss_mlp": 1.02036417, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 1.6546090859021734, + "language_loss": 0.59430444, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.61565453, + "num_input_tokens_seen": 311831325, + "step": 14455, + "time_per_iteration": 3.937310218811035 + }, + { + "auxiliary_loss_clip": 0.01084577, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.0349021, + "balance_loss_mlp": 1.01670325, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 2.0000630580073895, + "language_loss": 0.79940468, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.82053781, + "num_input_tokens_seen": 311848090, + "step": 14456, + "time_per_iteration": 2.484936237335205 + }, + { + "auxiliary_loss_clip": 0.01049468, + "auxiliary_loss_mlp": 0.01039051, + "balance_loss_clip": 1.03470969, + "balance_loss_mlp": 1.02407849, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 2.8988203474634804, + "language_loss": 0.74809521, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.76898044, + "num_input_tokens_seen": 311867855, + "step": 14457, + "time_per_iteration": 2.6505770683288574 + }, + { + "auxiliary_loss_clip": 0.0104385, + "auxiliary_loss_mlp": 0.01026705, + "balance_loss_clip": 1.03481281, + "balance_loss_mlp": 1.01511228, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 1.5840299630611727, + "language_loss": 0.78437126, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80507684, + "num_input_tokens_seen": 311888675, + "step": 14458, + "time_per_iteration": 2.6167569160461426 + }, + { + "auxiliary_loss_clip": 0.01093402, + "auxiliary_loss_mlp": 0.01034885, + "balance_loss_clip": 1.03428638, + "balance_loss_mlp": 1.02195096, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 1.5852940255091947, + "language_loss": 0.70960331, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73088622, + "num_input_tokens_seen": 311907310, + "step": 14459, + "time_per_iteration": 2.492278575897217 + }, + { + "auxiliary_loss_clip": 0.01078583, + "auxiliary_loss_mlp": 0.01028031, + "balance_loss_clip": 1.0345993, + "balance_loss_mlp": 1.01737928, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 1.4954399400340253, + "language_loss": 0.73957169, + "learning_rate": 1.762402701923398e-07, + "loss": 0.76063782, + "num_input_tokens_seen": 311929635, + "step": 14460, + "time_per_iteration": 2.582257032394409 + }, + { + "auxiliary_loss_clip": 0.01085797, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.03457797, + "balance_loss_mlp": 1.02036858, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 1.884233682778875, + "language_loss": 0.64673662, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.66791999, + "num_input_tokens_seen": 311948800, + "step": 14461, + "time_per_iteration": 2.5485026836395264 + }, + { + "auxiliary_loss_clip": 0.01088723, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.03294492, + "balance_loss_mlp": 1.02224255, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 2.0185285472477954, + "language_loss": 0.82842213, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.84965807, + "num_input_tokens_seen": 311964090, + "step": 14462, + "time_per_iteration": 2.4405837059020996 + }, + { + "auxiliary_loss_clip": 0.01092408, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.03345609, + "balance_loss_mlp": 1.01976204, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 1.8486719102148033, + "language_loss": 0.65381539, + "learning_rate": 1.757610093744335e-07, + "loss": 0.6750589, + "num_input_tokens_seen": 311981460, + "step": 14463, + "time_per_iteration": 2.5104520320892334 + }, + { + "auxiliary_loss_clip": 0.01085306, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.03665257, + "balance_loss_mlp": 1.02454531, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 2.0688561092061626, + "language_loss": 0.66466784, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.68589419, + "num_input_tokens_seen": 312000115, + "step": 14464, + "time_per_iteration": 2.518507957458496 + }, + { + "auxiliary_loss_clip": 0.01083401, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.03325176, + "balance_loss_mlp": 1.02051866, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 1.9476867493837906, + "language_loss": 0.6243614, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.64552099, + "num_input_tokens_seen": 312020770, + "step": 14465, + "time_per_iteration": 2.5843963623046875 + }, + { + "auxiliary_loss_clip": 0.01087497, + "auxiliary_loss_mlp": 0.01036074, + "balance_loss_clip": 1.0340544, + "balance_loss_mlp": 1.02529752, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 1.464881963209008, + "language_loss": 0.84582412, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.86705983, + "num_input_tokens_seen": 312041870, + "step": 14466, + "time_per_iteration": 2.5624406337738037 + }, + { + "auxiliary_loss_clip": 0.01075796, + "auxiliary_loss_mlp": 0.01039592, + "balance_loss_clip": 1.03538465, + "balance_loss_mlp": 1.02548969, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 2.324717994135291, + "language_loss": 0.61475599, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.63590986, + "num_input_tokens_seen": 312058210, + "step": 14467, + "time_per_iteration": 2.56168270111084 + }, + { + "auxiliary_loss_clip": 0.01097371, + "auxiliary_loss_mlp": 0.01027467, + "balance_loss_clip": 1.03236079, + "balance_loss_mlp": 1.01627898, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 1.3430913866157304, + "language_loss": 0.68619466, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.707443, + "num_input_tokens_seen": 312082665, + "step": 14468, + "time_per_iteration": 2.533604621887207 + }, + { + "auxiliary_loss_clip": 0.01078447, + "auxiliary_loss_mlp": 0.01025813, + "balance_loss_clip": 1.03276682, + "balance_loss_mlp": 1.01504827, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 1.6113196312974467, + "language_loss": 0.71139991, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.73244256, + "num_input_tokens_seen": 312101960, + "step": 14469, + "time_per_iteration": 2.562858819961548 + }, + { + "auxiliary_loss_clip": 0.01087561, + "auxiliary_loss_mlp": 0.01027018, + "balance_loss_clip": 1.03471744, + "balance_loss_mlp": 1.01600301, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 1.9449420600766056, + "language_loss": 0.8418014, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86294723, + "num_input_tokens_seen": 312117125, + "step": 14470, + "time_per_iteration": 2.4563589096069336 + }, + { + "auxiliary_loss_clip": 0.01079028, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.03510642, + "balance_loss_mlp": 1.01956391, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 2.265664215985485, + "language_loss": 0.73113024, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.75224626, + "num_input_tokens_seen": 312135775, + "step": 14471, + "time_per_iteration": 2.549468994140625 + }, + { + "auxiliary_loss_clip": 0.01102353, + "auxiliary_loss_mlp": 0.0102652, + "balance_loss_clip": 1.03502154, + "balance_loss_mlp": 1.01549339, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 1.4437363697577246, + "language_loss": 0.79050136, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.81179011, + "num_input_tokens_seen": 312156070, + "step": 14472, + "time_per_iteration": 2.4599857330322266 + }, + { + "auxiliary_loss_clip": 0.01086606, + "auxiliary_loss_mlp": 0.00783539, + "balance_loss_clip": 1.03472519, + "balance_loss_mlp": 1.0090065, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 1.7878599959121197, + "language_loss": 0.72889066, + "learning_rate": 1.741679706279644e-07, + "loss": 0.74759209, + "num_input_tokens_seen": 312174380, + "step": 14473, + "time_per_iteration": 2.4792652130126953 + }, + { + "auxiliary_loss_clip": 0.01106412, + "auxiliary_loss_mlp": 0.01027447, + "balance_loss_clip": 1.03621221, + "balance_loss_mlp": 1.01560974, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 1.4439053796727133, + "language_loss": 0.72405684, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.74539542, + "num_input_tokens_seen": 312195130, + "step": 14474, + "time_per_iteration": 2.4827075004577637 + }, + { + "auxiliary_loss_clip": 0.01080538, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.03345728, + "balance_loss_mlp": 1.0215081, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 1.8559342912896644, + "language_loss": 0.66813564, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.68928182, + "num_input_tokens_seen": 312212300, + "step": 14475, + "time_per_iteration": 2.497982978820801 + }, + { + "auxiliary_loss_clip": 0.01101568, + "auxiliary_loss_mlp": 0.0102344, + "balance_loss_clip": 1.03238702, + "balance_loss_mlp": 1.01142991, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 1.5829894581459116, + "language_loss": 0.77785337, + "learning_rate": 1.736914088262349e-07, + "loss": 0.79910338, + "num_input_tokens_seen": 312231735, + "step": 14476, + "time_per_iteration": 2.468050956726074 + }, + { + "auxiliary_loss_clip": 0.01087118, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.03388596, + "balance_loss_mlp": 1.0223949, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 2.1647822176639706, + "language_loss": 0.72286153, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74408376, + "num_input_tokens_seen": 312253060, + "step": 14477, + "time_per_iteration": 2.5451111793518066 + }, + { + "auxiliary_loss_clip": 0.0109457, + "auxiliary_loss_mlp": 0.01027466, + "balance_loss_clip": 1.0352571, + "balance_loss_mlp": 1.01553953, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 1.803896076857084, + "language_loss": 0.5972842, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61850452, + "num_input_tokens_seen": 312269460, + "step": 14478, + "time_per_iteration": 2.443573474884033 + }, + { + "auxiliary_loss_clip": 0.01093613, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.03817952, + "balance_loss_mlp": 1.0198679, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 1.6267302262456167, + "language_loss": 0.71425426, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73549402, + "num_input_tokens_seen": 312289830, + "step": 14479, + "time_per_iteration": 2.5244760513305664 + }, + { + "auxiliary_loss_clip": 0.01080384, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.0346961, + "balance_loss_mlp": 1.01940703, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 1.4996150642173522, + "language_loss": 0.71125418, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.73237371, + "num_input_tokens_seen": 312311320, + "step": 14480, + "time_per_iteration": 2.579730272293091 + }, + { + "auxiliary_loss_clip": 0.01054677, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.03340805, + "balance_loss_mlp": 1.02139664, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 1.8430625338238198, + "language_loss": 0.70161456, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72249544, + "num_input_tokens_seen": 312332095, + "step": 14481, + "time_per_iteration": 2.671815872192383 + }, + { + "auxiliary_loss_clip": 0.01090367, + "auxiliary_loss_mlp": 0.01027345, + "balance_loss_clip": 1.03411496, + "balance_loss_mlp": 1.01594281, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 1.745740860809939, + "language_loss": 0.76962852, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.79080564, + "num_input_tokens_seen": 312351225, + "step": 14482, + "time_per_iteration": 2.49041748046875 + }, + { + "auxiliary_loss_clip": 0.01080802, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.03429139, + "balance_loss_mlp": 1.01778948, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 1.8584796014162464, + "language_loss": 0.76497602, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.78608578, + "num_input_tokens_seen": 312369730, + "step": 14483, + "time_per_iteration": 3.9839916229248047 + }, + { + "auxiliary_loss_clip": 0.01098055, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.03543746, + "balance_loss_mlp": 1.02151752, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 2.0584274026196554, + "language_loss": 0.623402, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.64472944, + "num_input_tokens_seen": 312386780, + "step": 14484, + "time_per_iteration": 3.851515054702759 + }, + { + "auxiliary_loss_clip": 0.01103252, + "auxiliary_loss_mlp": 0.0103235, + "balance_loss_clip": 1.03564858, + "balance_loss_mlp": 1.02085197, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 1.7480041648271312, + "language_loss": 0.67929196, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.70064795, + "num_input_tokens_seen": 312404875, + "step": 14485, + "time_per_iteration": 2.446337938308716 + }, + { + "auxiliary_loss_clip": 0.01046667, + "auxiliary_loss_mlp": 0.00783896, + "balance_loss_clip": 1.03148186, + "balance_loss_mlp": 1.00909328, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 1.7893398432812508, + "language_loss": 0.63026518, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.6485709, + "num_input_tokens_seen": 312425280, + "step": 14486, + "time_per_iteration": 2.682859182357788 + }, + { + "auxiliary_loss_clip": 0.01105899, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.03433907, + "balance_loss_mlp": 1.01970959, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 2.122709873989983, + "language_loss": 0.62171674, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.64310408, + "num_input_tokens_seen": 312443835, + "step": 14487, + "time_per_iteration": 2.467771053314209 + }, + { + "auxiliary_loss_clip": 0.01076231, + "auxiliary_loss_mlp": 0.0078048, + "balance_loss_clip": 1.03490591, + "balance_loss_mlp": 1.00608921, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 2.3342853863908046, + "language_loss": 0.6784597, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.69702679, + "num_input_tokens_seen": 312460830, + "step": 14488, + "time_per_iteration": 3.8791208267211914 + }, + { + "auxiliary_loss_clip": 0.0108519, + "auxiliary_loss_mlp": 0.00782809, + "balance_loss_clip": 1.03512883, + "balance_loss_mlp": 1.00827861, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 1.8263789023041945, + "language_loss": 0.85678846, + "learning_rate": 1.716335121648338e-07, + "loss": 0.87546849, + "num_input_tokens_seen": 312477575, + "step": 14489, + "time_per_iteration": 2.5048816204071045 + }, + { + "auxiliary_loss_clip": 0.01098328, + "auxiliary_loss_mlp": 0.01029371, + "balance_loss_clip": 1.03648019, + "balance_loss_mlp": 1.01661563, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 2.1089495041904844, + "language_loss": 0.76048535, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.78176236, + "num_input_tokens_seen": 312492140, + "step": 14490, + "time_per_iteration": 2.4342377185821533 + }, + { + "auxiliary_loss_clip": 0.01098459, + "auxiliary_loss_mlp": 0.01029329, + "balance_loss_clip": 1.0363549, + "balance_loss_mlp": 1.01703274, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 2.1175640420006605, + "language_loss": 0.76206827, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.78334612, + "num_input_tokens_seen": 312508400, + "step": 14491, + "time_per_iteration": 2.4756104946136475 + }, + { + "auxiliary_loss_clip": 0.01074867, + "auxiliary_loss_mlp": 0.01025694, + "balance_loss_clip": 1.03833306, + "balance_loss_mlp": 1.0139761, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 1.89405419696423, + "language_loss": 0.66872376, + "learning_rate": 1.711602764198723e-07, + "loss": 0.68972939, + "num_input_tokens_seen": 312525915, + "step": 14492, + "time_per_iteration": 2.543067455291748 + }, + { + "auxiliary_loss_clip": 0.01089663, + "auxiliary_loss_mlp": 0.01030317, + "balance_loss_clip": 1.03426564, + "balance_loss_mlp": 1.01945114, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 2.279462319158682, + "language_loss": 0.69574946, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.71694922, + "num_input_tokens_seen": 312544735, + "step": 14493, + "time_per_iteration": 2.5088891983032227 + }, + { + "auxiliary_loss_clip": 0.01107041, + "auxiliary_loss_mlp": 0.01032471, + "balance_loss_clip": 1.03784251, + "balance_loss_mlp": 1.01971006, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 2.588389629360129, + "language_loss": 0.89128768, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.91268283, + "num_input_tokens_seen": 312557910, + "step": 14494, + "time_per_iteration": 3.9085967540740967 + }, + { + "auxiliary_loss_clip": 0.01071681, + "auxiliary_loss_mlp": 0.01029645, + "balance_loss_clip": 1.03894353, + "balance_loss_mlp": 1.01834369, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 1.5976229463612197, + "language_loss": 0.59336495, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61437821, + "num_input_tokens_seen": 312580360, + "step": 14495, + "time_per_iteration": 2.6967506408691406 + }, + { + "auxiliary_loss_clip": 0.01073221, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.03456044, + "balance_loss_mlp": 1.02278876, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 1.8726672344198838, + "language_loss": 0.80161214, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.82269615, + "num_input_tokens_seen": 312597550, + "step": 14496, + "time_per_iteration": 2.547055959701538 + }, + { + "auxiliary_loss_clip": 0.01083483, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.03532505, + "balance_loss_mlp": 1.02096581, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 1.9728769901333587, + "language_loss": 0.78479195, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.80596519, + "num_input_tokens_seen": 312616435, + "step": 14497, + "time_per_iteration": 2.4969942569732666 + }, + { + "auxiliary_loss_clip": 0.01106333, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.03578782, + "balance_loss_mlp": 1.01603913, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 1.9418777230057958, + "language_loss": 0.67094177, + "learning_rate": 1.70215677535406e-07, + "loss": 0.69229263, + "num_input_tokens_seen": 312632770, + "step": 14498, + "time_per_iteration": 2.470106363296509 + }, + { + "auxiliary_loss_clip": 0.01065321, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.03331375, + "balance_loss_mlp": 1.02036095, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 1.6224232864914374, + "language_loss": 0.56731594, + "learning_rate": 1.700584872028108e-07, + "loss": 0.58828598, + "num_input_tokens_seen": 312651900, + "step": 14499, + "time_per_iteration": 2.6328461170196533 + }, + { + "auxiliary_loss_clip": 0.01062815, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.03364325, + "balance_loss_mlp": 1.0187819, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 1.896133786340536, + "language_loss": 0.79983801, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.82077765, + "num_input_tokens_seen": 312671380, + "step": 14500, + "time_per_iteration": 2.5965576171875 + }, + { + "auxiliary_loss_clip": 0.01087773, + "auxiliary_loss_mlp": 0.01028602, + "balance_loss_clip": 1.03624916, + "balance_loss_mlp": 1.01674628, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 1.9416882176797718, + "language_loss": 0.73199797, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.75316173, + "num_input_tokens_seen": 312689215, + "step": 14501, + "time_per_iteration": 2.468776226043701 + }, + { + "auxiliary_loss_clip": 0.01069316, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.03317809, + "balance_loss_mlp": 1.01636696, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 1.6464814885907133, + "language_loss": 0.64399064, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66497719, + "num_input_tokens_seen": 312706400, + "step": 14502, + "time_per_iteration": 2.553511619567871 + }, + { + "auxiliary_loss_clip": 0.01079648, + "auxiliary_loss_mlp": 0.01036517, + "balance_loss_clip": 1.03360605, + "balance_loss_mlp": 1.02256942, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 1.6306886567538454, + "language_loss": 0.68887353, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.71003515, + "num_input_tokens_seen": 312727985, + "step": 14503, + "time_per_iteration": 2.6167399883270264 + }, + { + "auxiliary_loss_clip": 0.01079611, + "auxiliary_loss_mlp": 0.01027499, + "balance_loss_clip": 1.0359329, + "balance_loss_mlp": 1.01571476, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 3.204719711539022, + "language_loss": 0.69571936, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.71679044, + "num_input_tokens_seen": 312745025, + "step": 14504, + "time_per_iteration": 2.5312280654907227 + }, + { + "auxiliary_loss_clip": 0.01091396, + "auxiliary_loss_mlp": 0.00783609, + "balance_loss_clip": 1.03666914, + "balance_loss_mlp": 1.00974178, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 1.7338585429550317, + "language_loss": 0.70429862, + "learning_rate": 1.691168026385552e-07, + "loss": 0.72304869, + "num_input_tokens_seen": 312764170, + "step": 14505, + "time_per_iteration": 2.535022020339966 + }, + { + "auxiliary_loss_clip": 0.01081895, + "auxiliary_loss_mlp": 0.01025396, + "balance_loss_clip": 1.03513813, + "balance_loss_mlp": 1.01449454, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 1.5012725390387984, + "language_loss": 0.78092557, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.8019985, + "num_input_tokens_seen": 312783830, + "step": 14506, + "time_per_iteration": 2.5358402729034424 + }, + { + "auxiliary_loss_clip": 0.01083919, + "auxiliary_loss_mlp": 0.01030521, + "balance_loss_clip": 1.0345068, + "balance_loss_mlp": 1.01861238, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 2.24164535751118, + "language_loss": 0.73660278, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.75774717, + "num_input_tokens_seen": 312802015, + "step": 14507, + "time_per_iteration": 2.5163071155548096 + }, + { + "auxiliary_loss_clip": 0.01047869, + "auxiliary_loss_mlp": 0.01038987, + "balance_loss_clip": 1.03292751, + "balance_loss_mlp": 1.02429438, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 1.9880686947609014, + "language_loss": 0.71956885, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74043739, + "num_input_tokens_seen": 312820650, + "step": 14508, + "time_per_iteration": 2.625107526779175 + }, + { + "auxiliary_loss_clip": 0.01086963, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.03709435, + "balance_loss_mlp": 1.01840186, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 1.539780432936753, + "language_loss": 0.68794513, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.7091257, + "num_input_tokens_seen": 312841310, + "step": 14509, + "time_per_iteration": 2.570800542831421 + }, + { + "auxiliary_loss_clip": 0.01078098, + "auxiliary_loss_mlp": 0.01028473, + "balance_loss_clip": 1.03546154, + "balance_loss_mlp": 1.01696301, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 1.5793748124876146, + "language_loss": 0.58393681, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60500252, + "num_input_tokens_seen": 312862100, + "step": 14510, + "time_per_iteration": 2.5701496601104736 + }, + { + "auxiliary_loss_clip": 0.01109871, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.03588676, + "balance_loss_mlp": 1.0203259, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 2.3061709623541793, + "language_loss": 0.67629647, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.69772863, + "num_input_tokens_seen": 312880220, + "step": 14511, + "time_per_iteration": 2.4637279510498047 + }, + { + "auxiliary_loss_clip": 0.01058602, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.03424048, + "balance_loss_mlp": 1.01962948, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 1.614337350136534, + "language_loss": 0.81804478, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.83896059, + "num_input_tokens_seen": 312900765, + "step": 14512, + "time_per_iteration": 2.6793863773345947 + }, + { + "auxiliary_loss_clip": 0.01011519, + "auxiliary_loss_mlp": 0.01001223, + "balance_loss_clip": 1.00792527, + "balance_loss_mlp": 0.99994797, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.7920912914676207, + "language_loss": 0.58669311, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.60682058, + "num_input_tokens_seen": 312955840, + "step": 14513, + "time_per_iteration": 3.003101348876953 + }, + { + "auxiliary_loss_clip": 0.01093814, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.03536367, + "balance_loss_mlp": 1.01814914, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 1.7175454395014758, + "language_loss": 0.76314211, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.78438634, + "num_input_tokens_seen": 312973565, + "step": 14514, + "time_per_iteration": 2.5098469257354736 + }, + { + "auxiliary_loss_clip": 0.0109874, + "auxiliary_loss_mlp": 0.01024035, + "balance_loss_clip": 1.03673279, + "balance_loss_mlp": 1.01293635, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 2.06261463205412, + "language_loss": 0.65305054, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67427838, + "num_input_tokens_seen": 312994660, + "step": 14515, + "time_per_iteration": 2.5189123153686523 + }, + { + "auxiliary_loss_clip": 0.01093566, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.03455758, + "balance_loss_mlp": 1.02112317, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 1.9455865745385206, + "language_loss": 0.79243386, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81371337, + "num_input_tokens_seen": 313009860, + "step": 14516, + "time_per_iteration": 2.499708890914917 + }, + { + "auxiliary_loss_clip": 0.01106252, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.0356636, + "balance_loss_mlp": 1.01776767, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 1.9860371991610217, + "language_loss": 0.72492391, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74628633, + "num_input_tokens_seen": 313027025, + "step": 14517, + "time_per_iteration": 2.428126096725464 + }, + { + "auxiliary_loss_clip": 0.01065566, + "auxiliary_loss_mlp": 0.01025043, + "balance_loss_clip": 1.03279614, + "balance_loss_mlp": 1.01403403, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 2.011992841605645, + "language_loss": 0.73029107, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.75119716, + "num_input_tokens_seen": 313046830, + "step": 14518, + "time_per_iteration": 2.5543506145477295 + }, + { + "auxiliary_loss_clip": 0.01077562, + "auxiliary_loss_mlp": 0.01038645, + "balance_loss_clip": 1.03241658, + "balance_loss_mlp": 1.02603889, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 1.3742065841182358, + "language_loss": 0.74090624, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76206839, + "num_input_tokens_seen": 313067715, + "step": 14519, + "time_per_iteration": 2.537863254547119 + }, + { + "auxiliary_loss_clip": 0.01094647, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.03531075, + "balance_loss_mlp": 1.01622725, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 2.6086829848184605, + "language_loss": 0.76449239, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78573084, + "num_input_tokens_seen": 313082305, + "step": 14520, + "time_per_iteration": 2.4626524448394775 + }, + { + "auxiliary_loss_clip": 0.01084002, + "auxiliary_loss_mlp": 0.01033399, + "balance_loss_clip": 1.03582907, + "balance_loss_mlp": 1.02034569, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 1.8422267914002988, + "language_loss": 0.81892812, + "learning_rate": 1.666178664801816e-07, + "loss": 0.8401022, + "num_input_tokens_seen": 313101190, + "step": 14521, + "time_per_iteration": 2.522062063217163 + }, + { + "auxiliary_loss_clip": 0.01092574, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.03660512, + "balance_loss_mlp": 1.01979899, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 2.2988661851321517, + "language_loss": 0.7651605, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78641272, + "num_input_tokens_seen": 313118965, + "step": 14522, + "time_per_iteration": 3.8551037311553955 + }, + { + "auxiliary_loss_clip": 0.01090162, + "auxiliary_loss_mlp": 0.00781302, + "balance_loss_clip": 1.033041, + "balance_loss_mlp": 1.00814891, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 1.6517816119740467, + "language_loss": 0.75431484, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77302945, + "num_input_tokens_seen": 313139280, + "step": 14523, + "time_per_iteration": 3.888089418411255 + }, + { + "auxiliary_loss_clip": 0.01091632, + "auxiliary_loss_mlp": 0.01027447, + "balance_loss_clip": 1.03374219, + "balance_loss_mlp": 1.01591301, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 2.279223315746858, + "language_loss": 0.79009879, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.81128955, + "num_input_tokens_seen": 313156655, + "step": 14524, + "time_per_iteration": 2.480048179626465 + }, + { + "auxiliary_loss_clip": 0.01082378, + "auxiliary_loss_mlp": 0.01028696, + "balance_loss_clip": 1.03425133, + "balance_loss_mlp": 1.0173595, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 2.0538145373900707, + "language_loss": 0.77783436, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.79894507, + "num_input_tokens_seen": 313174050, + "step": 14525, + "time_per_iteration": 2.533050537109375 + }, + { + "auxiliary_loss_clip": 0.01015446, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.03441167, + "balance_loss_mlp": 1.01934552, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 1.6903131111111729, + "language_loss": 0.69249833, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71296239, + "num_input_tokens_seen": 313192765, + "step": 14526, + "time_per_iteration": 4.416518211364746 + }, + { + "auxiliary_loss_clip": 0.0105566, + "auxiliary_loss_mlp": 0.01039509, + "balance_loss_clip": 1.03581607, + "balance_loss_mlp": 1.02621686, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 1.8013766299143161, + "language_loss": 0.60963643, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.63058817, + "num_input_tokens_seen": 313210925, + "step": 14527, + "time_per_iteration": 2.891195058822632 + }, + { + "auxiliary_loss_clip": 0.01095572, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.03780401, + "balance_loss_mlp": 1.01768672, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 2.089471573449631, + "language_loss": 0.65547341, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.67674518, + "num_input_tokens_seen": 313228250, + "step": 14528, + "time_per_iteration": 2.483466625213623 + }, + { + "auxiliary_loss_clip": 0.01074749, + "auxiliary_loss_mlp": 0.01026412, + "balance_loss_clip": 1.03688228, + "balance_loss_mlp": 1.0149684, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 2.2016515872433575, + "language_loss": 0.89648777, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.9174993, + "num_input_tokens_seen": 313247880, + "step": 14529, + "time_per_iteration": 2.566633701324463 + }, + { + "auxiliary_loss_clip": 0.01080284, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.03341198, + "balance_loss_mlp": 1.01622748, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 1.7776334760650847, + "language_loss": 0.84897524, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.87005776, + "num_input_tokens_seen": 313266790, + "step": 14530, + "time_per_iteration": 3.067650556564331 + }, + { + "auxiliary_loss_clip": 0.01080531, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.0332303, + "balance_loss_mlp": 1.02183127, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 1.444286149496322, + "language_loss": 0.74109018, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76222372, + "num_input_tokens_seen": 313286805, + "step": 14531, + "time_per_iteration": 2.5553054809570312 + }, + { + "auxiliary_loss_clip": 0.01088911, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.03199172, + "balance_loss_mlp": 1.01574874, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 2.1380601389213365, + "language_loss": 0.61350036, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.63466346, + "num_input_tokens_seen": 313305415, + "step": 14532, + "time_per_iteration": 2.5060722827911377 + }, + { + "auxiliary_loss_clip": 0.010197, + "auxiliary_loss_mlp": 0.0100524, + "balance_loss_clip": 1.00739288, + "balance_loss_mlp": 1.00410163, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 0.8403163675784777, + "language_loss": 0.58795118, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60820055, + "num_input_tokens_seen": 313369940, + "step": 14533, + "time_per_iteration": 4.579978942871094 + }, + { + "auxiliary_loss_clip": 0.01075978, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.03337288, + "balance_loss_mlp": 1.02115142, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 1.533807282255495, + "language_loss": 0.76642919, + "learning_rate": 1.646005846335954e-07, + "loss": 0.78751314, + "num_input_tokens_seen": 313390965, + "step": 14534, + "time_per_iteration": 2.599895477294922 + }, + { + "auxiliary_loss_clip": 0.01076143, + "auxiliary_loss_mlp": 0.01031801, + "balance_loss_clip": 1.03228128, + "balance_loss_mlp": 1.01992798, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 1.872179470538046, + "language_loss": 0.74987489, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.77095437, + "num_input_tokens_seen": 313409680, + "step": 14535, + "time_per_iteration": 2.5299248695373535 + }, + { + "auxiliary_loss_clip": 0.01103086, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.03373444, + "balance_loss_mlp": 1.01969874, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 1.8672328822855577, + "language_loss": 0.7433362, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76468694, + "num_input_tokens_seen": 313431335, + "step": 14536, + "time_per_iteration": 2.5574238300323486 + }, + { + "auxiliary_loss_clip": 0.01083049, + "auxiliary_loss_mlp": 0.01032319, + "balance_loss_clip": 1.03414083, + "balance_loss_mlp": 1.02110124, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 1.858050998907757, + "language_loss": 0.64153647, + "learning_rate": 1.641367279482304e-07, + "loss": 0.66269022, + "num_input_tokens_seen": 313449225, + "step": 14537, + "time_per_iteration": 2.508390188217163 + }, + { + "auxiliary_loss_clip": 0.01090373, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.03301692, + "balance_loss_mlp": 1.01362383, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 1.828266733433781, + "language_loss": 0.5797267, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.60089368, + "num_input_tokens_seen": 313467715, + "step": 14538, + "time_per_iteration": 2.5161564350128174 + }, + { + "auxiliary_loss_clip": 0.01092891, + "auxiliary_loss_mlp": 0.01026063, + "balance_loss_clip": 1.03654075, + "balance_loss_mlp": 1.01404083, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 2.1153223440588365, + "language_loss": 0.68644047, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70762992, + "num_input_tokens_seen": 313486805, + "step": 14539, + "time_per_iteration": 2.490931510925293 + }, + { + "auxiliary_loss_clip": 0.01095147, + "auxiliary_loss_mlp": 0.01028854, + "balance_loss_clip": 1.03273487, + "balance_loss_mlp": 1.01619995, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 1.9953083080500276, + "language_loss": 0.74322772, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.76446772, + "num_input_tokens_seen": 313504880, + "step": 14540, + "time_per_iteration": 2.469717502593994 + }, + { + "auxiliary_loss_clip": 0.01078347, + "auxiliary_loss_mlp": 0.01039926, + "balance_loss_clip": 1.0332731, + "balance_loss_mlp": 1.02650928, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 2.427944396212384, + "language_loss": 0.79068899, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81187171, + "num_input_tokens_seen": 313524995, + "step": 14541, + "time_per_iteration": 2.558685064315796 + }, + { + "auxiliary_loss_clip": 0.01063901, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.03307247, + "balance_loss_mlp": 1.01760828, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 1.9109300236808564, + "language_loss": 0.66748273, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.68844032, + "num_input_tokens_seen": 313541740, + "step": 14542, + "time_per_iteration": 2.603910446166992 + }, + { + "auxiliary_loss_clip": 0.01027918, + "auxiliary_loss_mlp": 0.01004337, + "balance_loss_clip": 1.00522876, + "balance_loss_mlp": 1.00324631, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.7838761554241469, + "language_loss": 0.54473031, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56505287, + "num_input_tokens_seen": 313593445, + "step": 14543, + "time_per_iteration": 2.89294171333313 + }, + { + "auxiliary_loss_clip": 0.01088586, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.03711152, + "balance_loss_mlp": 1.02116013, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 1.9415013704489137, + "language_loss": 0.69635707, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.71757954, + "num_input_tokens_seen": 313615640, + "step": 14544, + "time_per_iteration": 2.5610697269439697 + }, + { + "auxiliary_loss_clip": 0.01056504, + "auxiliary_loss_mlp": 0.01025469, + "balance_loss_clip": 1.03308678, + "balance_loss_mlp": 1.01497865, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 1.5613934471429989, + "language_loss": 0.75653601, + "learning_rate": 1.62902840325714e-07, + "loss": 0.77735567, + "num_input_tokens_seen": 313635550, + "step": 14545, + "time_per_iteration": 2.6149206161499023 + }, + { + "auxiliary_loss_clip": 0.01092702, + "auxiliary_loss_mlp": 0.00784714, + "balance_loss_clip": 1.03359628, + "balance_loss_mlp": 1.00848711, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 1.5938681983991707, + "language_loss": 0.66152233, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.68029648, + "num_input_tokens_seen": 313659275, + "step": 14546, + "time_per_iteration": 2.697160482406616 + }, + { + "auxiliary_loss_clip": 0.01104465, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.03579926, + "balance_loss_mlp": 1.01975632, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 1.5750694499031774, + "language_loss": 0.7285018, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.74986255, + "num_input_tokens_seen": 313680595, + "step": 14547, + "time_per_iteration": 2.4573020935058594 + }, + { + "auxiliary_loss_clip": 0.01110175, + "auxiliary_loss_mlp": 0.01037876, + "balance_loss_clip": 1.03577089, + "balance_loss_mlp": 1.02476263, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 2.179392472517967, + "language_loss": 0.69048089, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.71196139, + "num_input_tokens_seen": 313699730, + "step": 14548, + "time_per_iteration": 2.6329948902130127 + }, + { + "auxiliary_loss_clip": 0.01086029, + "auxiliary_loss_mlp": 0.01030769, + "balance_loss_clip": 1.03565574, + "balance_loss_mlp": 1.01859772, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 2.04689680578817, + "language_loss": 0.70827186, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.72943985, + "num_input_tokens_seen": 313720090, + "step": 14549, + "time_per_iteration": 2.538029909133911 + }, + { + "auxiliary_loss_clip": 0.01095451, + "auxiliary_loss_mlp": 0.00785861, + "balance_loss_clip": 1.03340721, + "balance_loss_mlp": 1.01052642, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 2.365046695638516, + "language_loss": 0.8380965, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.85690963, + "num_input_tokens_seen": 313736795, + "step": 14550, + "time_per_iteration": 2.514082431793213 + }, + { + "auxiliary_loss_clip": 0.01097787, + "auxiliary_loss_mlp": 0.01038482, + "balance_loss_clip": 1.03674471, + "balance_loss_mlp": 1.02677536, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 1.6144354798968827, + "language_loss": 0.71824229, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.73960495, + "num_input_tokens_seen": 313754820, + "step": 14551, + "time_per_iteration": 2.4599106311798096 + }, + { + "auxiliary_loss_clip": 0.01093604, + "auxiliary_loss_mlp": 0.00786825, + "balance_loss_clip": 1.03553975, + "balance_loss_mlp": 1.01300359, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 2.3933249683528546, + "language_loss": 0.64044458, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.65924883, + "num_input_tokens_seen": 313775830, + "step": 14552, + "time_per_iteration": 2.583329439163208 + }, + { + "auxiliary_loss_clip": 0.01074925, + "auxiliary_loss_mlp": 0.01027371, + "balance_loss_clip": 1.03466797, + "balance_loss_mlp": 1.01347113, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 1.667179063127772, + "language_loss": 0.79350197, + "learning_rate": 1.616734111284479e-07, + "loss": 0.81452489, + "num_input_tokens_seen": 313795745, + "step": 14553, + "time_per_iteration": 2.5570313930511475 + }, + { + "auxiliary_loss_clip": 0.01087925, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.0314734, + "balance_loss_mlp": 1.01825333, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 1.8165751547381694, + "language_loss": 0.70272148, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72390157, + "num_input_tokens_seen": 313813895, + "step": 14554, + "time_per_iteration": 2.478280782699585 + }, + { + "auxiliary_loss_clip": 0.01083191, + "auxiliary_loss_mlp": 0.00782024, + "balance_loss_clip": 1.03508317, + "balance_loss_mlp": 1.00645852, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 1.425732991644998, + "language_loss": 0.83705807, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85571021, + "num_input_tokens_seen": 313834225, + "step": 14555, + "time_per_iteration": 2.5370240211486816 + }, + { + "auxiliary_loss_clip": 0.01092468, + "auxiliary_loss_mlp": 0.01032242, + "balance_loss_clip": 1.03440738, + "balance_loss_mlp": 1.01966584, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 1.4908302467091437, + "language_loss": 0.70787084, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.72911793, + "num_input_tokens_seen": 313854430, + "step": 14556, + "time_per_iteration": 2.531404972076416 + }, + { + "auxiliary_loss_clip": 0.01086212, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.03260314, + "balance_loss_mlp": 1.02210617, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 1.8407825862599383, + "language_loss": 0.76676023, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.78797817, + "num_input_tokens_seen": 313871600, + "step": 14557, + "time_per_iteration": 2.4906272888183594 + }, + { + "auxiliary_loss_clip": 0.01067937, + "auxiliary_loss_mlp": 0.01039891, + "balance_loss_clip": 1.03483462, + "balance_loss_mlp": 1.02662885, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 3.364944176845108, + "language_loss": 0.83096337, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.8520416, + "num_input_tokens_seen": 313891570, + "step": 14558, + "time_per_iteration": 2.596416711807251 + }, + { + "auxiliary_loss_clip": 0.01028506, + "auxiliary_loss_mlp": 0.01003476, + "balance_loss_clip": 1.00585961, + "balance_loss_mlp": 1.00242734, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.8042667483634577, + "language_loss": 0.5615865, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58190632, + "num_input_tokens_seen": 313951290, + "step": 14559, + "time_per_iteration": 3.0683300495147705 + }, + { + "auxiliary_loss_clip": 0.01091594, + "auxiliary_loss_mlp": 0.01034419, + "balance_loss_clip": 1.03444648, + "balance_loss_mlp": 1.02283812, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 2.244946716122349, + "language_loss": 0.65819365, + "learning_rate": 1.606013202286407e-07, + "loss": 0.67945379, + "num_input_tokens_seen": 313968645, + "step": 14560, + "time_per_iteration": 4.096326589584351 + }, + { + "auxiliary_loss_clip": 0.01102807, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.03450477, + "balance_loss_mlp": 1.01811683, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 1.9416890284537542, + "language_loss": 0.78734833, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.80867606, + "num_input_tokens_seen": 313987580, + "step": 14561, + "time_per_iteration": 2.5741875171661377 + }, + { + "auxiliary_loss_clip": 0.0110485, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.03378296, + "balance_loss_mlp": 1.0207119, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 1.9513123189149806, + "language_loss": 0.7754668, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.79685247, + "num_input_tokens_seen": 314004460, + "step": 14562, + "time_per_iteration": 4.323149681091309 + }, + { + "auxiliary_loss_clip": 0.01098753, + "auxiliary_loss_mlp": 0.01026768, + "balance_loss_clip": 1.03347135, + "balance_loss_mlp": 1.01552057, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 1.6983536249512006, + "language_loss": 0.7164464, + "learning_rate": 1.601428988367981e-07, + "loss": 0.73770165, + "num_input_tokens_seen": 314026855, + "step": 14563, + "time_per_iteration": 2.5797016620635986 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01033758, + "balance_loss_clip": 1.03735733, + "balance_loss_mlp": 1.02156901, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 2.096513956613813, + "language_loss": 0.66062307, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.68205154, + "num_input_tokens_seen": 314042830, + "step": 14564, + "time_per_iteration": 2.438516855239868 + }, + { + "auxiliary_loss_clip": 0.0108971, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.0326314, + "balance_loss_mlp": 1.02361977, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 1.5781452656232458, + "language_loss": 0.70419621, + "learning_rate": 1.598376334037408e-07, + "loss": 0.72544348, + "num_input_tokens_seen": 314062225, + "step": 14565, + "time_per_iteration": 3.921264410018921 + }, + { + "auxiliary_loss_clip": 0.01087632, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.03539109, + "balance_loss_mlp": 1.01882529, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 1.592546932430247, + "language_loss": 0.77746964, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79866719, + "num_input_tokens_seen": 314082325, + "step": 14566, + "time_per_iteration": 2.564500093460083 + }, + { + "auxiliary_loss_clip": 0.01083693, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.03803194, + "balance_loss_mlp": 1.02161002, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 1.9800775864986948, + "language_loss": 0.71051723, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.73168933, + "num_input_tokens_seen": 314100310, + "step": 14567, + "time_per_iteration": 2.4667885303497314 + }, + { + "auxiliary_loss_clip": 0.01082827, + "auxiliary_loss_mlp": 0.00783174, + "balance_loss_clip": 1.03322315, + "balance_loss_mlp": 1.0083493, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 1.7891105693143545, + "language_loss": 0.74608535, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.76474535, + "num_input_tokens_seen": 314121330, + "step": 14568, + "time_per_iteration": 2.5596694946289062 + }, + { + "auxiliary_loss_clip": 0.01066874, + "auxiliary_loss_mlp": 0.01028795, + "balance_loss_clip": 1.03398633, + "balance_loss_mlp": 1.01700544, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 2.06179169607315, + "language_loss": 0.86734456, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.88830125, + "num_input_tokens_seen": 314139875, + "step": 14569, + "time_per_iteration": 2.5417699813842773 + }, + { + "auxiliary_loss_clip": 0.01066735, + "auxiliary_loss_mlp": 0.01030843, + "balance_loss_clip": 1.03313971, + "balance_loss_mlp": 1.0193336, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 1.5969689646711007, + "language_loss": 0.73974818, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76072395, + "num_input_tokens_seen": 314157850, + "step": 14570, + "time_per_iteration": 2.5479788780212402 + }, + { + "auxiliary_loss_clip": 0.01095606, + "auxiliary_loss_mlp": 0.00784426, + "balance_loss_clip": 1.035043, + "balance_loss_mlp": 1.01000667, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 1.6852835734327531, + "language_loss": 0.67500937, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.69380975, + "num_input_tokens_seen": 314176720, + "step": 14571, + "time_per_iteration": 3.887359380722046 + }, + { + "auxiliary_loss_clip": 0.01065655, + "auxiliary_loss_mlp": 0.01027045, + "balance_loss_clip": 1.03484082, + "balance_loss_mlp": 1.01566696, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 1.9465823616354196, + "language_loss": 0.62421691, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64514387, + "num_input_tokens_seen": 314196645, + "step": 14572, + "time_per_iteration": 2.5310797691345215 + }, + { + "auxiliary_loss_clip": 0.01090053, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.03448427, + "balance_loss_mlp": 1.01950848, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 1.6354157221063528, + "language_loss": 0.73516154, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.75636214, + "num_input_tokens_seen": 314217430, + "step": 14573, + "time_per_iteration": 2.5523860454559326 + }, + { + "auxiliary_loss_clip": 0.0105487, + "auxiliary_loss_mlp": 0.00780777, + "balance_loss_clip": 1.03399897, + "balance_loss_mlp": 1.00644588, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 1.9072896244341297, + "language_loss": 0.729895, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.7482515, + "num_input_tokens_seen": 314235310, + "step": 14574, + "time_per_iteration": 2.564819574356079 + }, + { + "auxiliary_loss_clip": 0.01091409, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.03436053, + "balance_loss_mlp": 1.02261376, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 1.6874944080069947, + "language_loss": 0.75699472, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.77825546, + "num_input_tokens_seen": 314252355, + "step": 14575, + "time_per_iteration": 2.487624406814575 + }, + { + "auxiliary_loss_clip": 0.01077685, + "auxiliary_loss_mlp": 0.01040615, + "balance_loss_clip": 1.03544807, + "balance_loss_mlp": 1.02886117, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 1.8187111074492424, + "language_loss": 0.66896737, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.69015038, + "num_input_tokens_seen": 314272755, + "step": 14576, + "time_per_iteration": 2.630730152130127 + }, + { + "auxiliary_loss_clip": 0.01073483, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.03060853, + "balance_loss_mlp": 1.01809025, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 1.715414850476694, + "language_loss": 0.66708738, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.68811631, + "num_input_tokens_seen": 314291365, + "step": 14577, + "time_per_iteration": 2.5200281143188477 + }, + { + "auxiliary_loss_clip": 0.01095008, + "auxiliary_loss_mlp": 0.01033422, + "balance_loss_clip": 1.03668463, + "balance_loss_mlp": 1.02102399, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 1.9745455725760268, + "language_loss": 0.71480274, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.73608708, + "num_input_tokens_seen": 314310075, + "step": 14578, + "time_per_iteration": 2.505345344543457 + }, + { + "auxiliary_loss_clip": 0.01105471, + "auxiliary_loss_mlp": 0.01031462, + "balance_loss_clip": 1.03463054, + "balance_loss_mlp": 1.01912951, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 2.2286462006731833, + "language_loss": 0.71503162, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73640096, + "num_input_tokens_seen": 314325695, + "step": 14579, + "time_per_iteration": 2.4751572608947754 + }, + { + "auxiliary_loss_clip": 0.01073374, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.03231525, + "balance_loss_mlp": 1.02158761, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 1.9214771754445348, + "language_loss": 0.70574963, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72682989, + "num_input_tokens_seen": 314343605, + "step": 14580, + "time_per_iteration": 2.4873805046081543 + }, + { + "auxiliary_loss_clip": 0.01101842, + "auxiliary_loss_mlp": 0.00782608, + "balance_loss_clip": 1.03538156, + "balance_loss_mlp": 1.00991607, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 4.222731067858873, + "language_loss": 0.65358198, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67242646, + "num_input_tokens_seen": 314364275, + "step": 14581, + "time_per_iteration": 2.5160391330718994 + }, + { + "auxiliary_loss_clip": 0.01076354, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.03531897, + "balance_loss_mlp": 1.01787829, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 1.5175793336771028, + "language_loss": 0.73717582, + "learning_rate": 1.572541512164416e-07, + "loss": 0.75823033, + "num_input_tokens_seen": 314385140, + "step": 14582, + "time_per_iteration": 2.581895112991333 + }, + { + "auxiliary_loss_clip": 0.01101807, + "auxiliary_loss_mlp": 0.00781887, + "balance_loss_clip": 1.03340447, + "balance_loss_mlp": 1.00821602, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 2.026901181189871, + "language_loss": 0.66488069, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.68371761, + "num_input_tokens_seen": 314403715, + "step": 14583, + "time_per_iteration": 2.449711322784424 + }, + { + "auxiliary_loss_clip": 0.01095608, + "auxiliary_loss_mlp": 0.00783612, + "balance_loss_clip": 1.03504753, + "balance_loss_mlp": 1.00975358, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 1.7368585197408792, + "language_loss": 0.79376423, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.81255651, + "num_input_tokens_seen": 314421880, + "step": 14584, + "time_per_iteration": 2.470453977584839 + }, + { + "auxiliary_loss_clip": 0.0107054, + "auxiliary_loss_mlp": 0.01026652, + "balance_loss_clip": 1.03278589, + "balance_loss_mlp": 1.01493347, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 1.5368498975494587, + "language_loss": 0.72249043, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.74346232, + "num_input_tokens_seen": 314441585, + "step": 14585, + "time_per_iteration": 2.5647361278533936 + }, + { + "auxiliary_loss_clip": 0.01082983, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.03425741, + "balance_loss_mlp": 1.01807868, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 1.8864474063646308, + "language_loss": 0.74550539, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.76664317, + "num_input_tokens_seen": 314459020, + "step": 14586, + "time_per_iteration": 2.5111515522003174 + }, + { + "auxiliary_loss_clip": 0.01101846, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.03293884, + "balance_loss_mlp": 1.01848269, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 1.795107204544321, + "language_loss": 0.78684056, + "learning_rate": 1.564981454895844e-07, + "loss": 0.8081696, + "num_input_tokens_seen": 314478935, + "step": 14587, + "time_per_iteration": 2.472163200378418 + }, + { + "auxiliary_loss_clip": 0.01093825, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.03567529, + "balance_loss_mlp": 1.01601303, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 1.552354094586315, + "language_loss": 0.73803759, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.75927377, + "num_input_tokens_seen": 314497635, + "step": 14588, + "time_per_iteration": 2.481138229370117 + }, + { + "auxiliary_loss_clip": 0.01038633, + "auxiliary_loss_mlp": 0.0078289, + "balance_loss_clip": 1.0342468, + "balance_loss_mlp": 1.00923073, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 1.8019944547569382, + "language_loss": 0.66809684, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68631208, + "num_input_tokens_seen": 314515445, + "step": 14589, + "time_per_iteration": 2.6260859966278076 + }, + { + "auxiliary_loss_clip": 0.01093897, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.03593969, + "balance_loss_mlp": 1.02246296, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 2.683788195986943, + "language_loss": 0.71015286, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.73142982, + "num_input_tokens_seen": 314533040, + "step": 14590, + "time_per_iteration": 2.471261501312256 + }, + { + "auxiliary_loss_clip": 0.01079882, + "auxiliary_loss_mlp": 0.01041977, + "balance_loss_clip": 1.03390408, + "balance_loss_mlp": 1.02844644, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 1.9787066279147494, + "language_loss": 0.74756134, + "learning_rate": 1.558945991776086e-07, + "loss": 0.76877993, + "num_input_tokens_seen": 314548280, + "step": 14591, + "time_per_iteration": 2.4989218711853027 + }, + { + "auxiliary_loss_clip": 0.01099029, + "auxiliary_loss_mlp": 0.01026882, + "balance_loss_clip": 1.03485596, + "balance_loss_mlp": 1.01545024, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 1.6266467508441815, + "language_loss": 0.7981028, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.81936193, + "num_input_tokens_seen": 314565345, + "step": 14592, + "time_per_iteration": 2.438707113265991 + }, + { + "auxiliary_loss_clip": 0.01099086, + "auxiliary_loss_mlp": 0.01029673, + "balance_loss_clip": 1.03406012, + "balance_loss_mlp": 1.01871157, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 1.711805539664191, + "language_loss": 0.82486278, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.8461504, + "num_input_tokens_seen": 314584190, + "step": 14593, + "time_per_iteration": 2.465970277786255 + }, + { + "auxiliary_loss_clip": 0.01086301, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.03342962, + "balance_loss_mlp": 1.01505268, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 1.820524202868089, + "language_loss": 0.76015413, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.78128493, + "num_input_tokens_seen": 314605625, + "step": 14594, + "time_per_iteration": 2.5352697372436523 + }, + { + "auxiliary_loss_clip": 0.01055159, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.03204441, + "balance_loss_mlp": 1.02795291, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 1.9928945141529861, + "language_loss": 0.77985477, + "learning_rate": 1.552921717241651e-07, + "loss": 0.80082846, + "num_input_tokens_seen": 314622630, + "step": 14595, + "time_per_iteration": 2.5600056648254395 + }, + { + "auxiliary_loss_clip": 0.01073209, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.036448, + "balance_loss_mlp": 1.02183294, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 1.7789919688859896, + "language_loss": 0.70664656, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.72772193, + "num_input_tokens_seen": 314642460, + "step": 14596, + "time_per_iteration": 2.5785014629364014 + }, + { + "auxiliary_loss_clip": 0.01071226, + "auxiliary_loss_mlp": 0.01025383, + "balance_loss_clip": 1.03579295, + "balance_loss_mlp": 1.01420724, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 2.0064663399457405, + "language_loss": 0.86263156, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88359761, + "num_input_tokens_seen": 314659875, + "step": 14597, + "time_per_iteration": 2.5707006454467773 + }, + { + "auxiliary_loss_clip": 0.01092296, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.03555465, + "balance_loss_mlp": 1.01653683, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 1.7838088985133032, + "language_loss": 0.72879219, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.7499941, + "num_input_tokens_seen": 314680260, + "step": 14598, + "time_per_iteration": 2.507356882095337 + }, + { + "auxiliary_loss_clip": 0.01088347, + "auxiliary_loss_mlp": 0.00785162, + "balance_loss_clip": 1.0338366, + "balance_loss_mlp": 1.0109961, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 2.1692856652516745, + "language_loss": 0.77458453, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79331964, + "num_input_tokens_seen": 314696260, + "step": 14599, + "time_per_iteration": 3.909712314605713 + }, + { + "auxiliary_loss_clip": 0.01073125, + "auxiliary_loss_mlp": 0.01030035, + "balance_loss_clip": 1.03344822, + "balance_loss_mlp": 1.01804256, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 2.226580816326483, + "language_loss": 0.68367678, + "learning_rate": 1.545407113589332e-07, + "loss": 0.7047084, + "num_input_tokens_seen": 314714215, + "step": 14600, + "time_per_iteration": 2.561004400253296 + }, + { + "auxiliary_loss_clip": 0.0109302, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.03397655, + "balance_loss_mlp": 1.02271295, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 1.8325860328536763, + "language_loss": 0.69647253, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71774805, + "num_input_tokens_seen": 314735700, + "step": 14601, + "time_per_iteration": 4.051278829574585 + }, + { + "auxiliary_loss_clip": 0.01098554, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.03658617, + "balance_loss_mlp": 1.0182004, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 1.8153754974772762, + "language_loss": 0.7329421, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75423253, + "num_input_tokens_seen": 314753335, + "step": 14602, + "time_per_iteration": 2.5323855876922607 + }, + { + "auxiliary_loss_clip": 0.01102541, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.0349673, + "balance_loss_mlp": 1.02159119, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 1.7851284889505383, + "language_loss": 0.71265376, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.7340101, + "num_input_tokens_seen": 314770800, + "step": 14603, + "time_per_iteration": 2.4247536659240723 + }, + { + "auxiliary_loss_clip": 0.01010211, + "auxiliary_loss_mlp": 0.01000679, + "balance_loss_clip": 1.0078826, + "balance_loss_mlp": 0.99949312, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7427893166199683, + "language_loss": 0.5415749, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56168377, + "num_input_tokens_seen": 314837275, + "step": 14604, + "time_per_iteration": 4.518057823181152 + }, + { + "auxiliary_loss_clip": 0.01004615, + "auxiliary_loss_mlp": 0.01019551, + "balance_loss_clip": 1.01213431, + "balance_loss_mlp": 1.01807296, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 0.7080399574980787, + "language_loss": 0.59196031, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61220193, + "num_input_tokens_seen": 314902220, + "step": 14605, + "time_per_iteration": 3.1412038803100586 + }, + { + "auxiliary_loss_clip": 0.01062915, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.03525138, + "balance_loss_mlp": 1.01942921, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 1.5167309964175346, + "language_loss": 0.84886128, + "learning_rate": 1.536412683230912e-07, + "loss": 0.86980736, + "num_input_tokens_seen": 314921645, + "step": 14606, + "time_per_iteration": 2.5748403072357178 + }, + { + "auxiliary_loss_clip": 0.01106073, + "auxiliary_loss_mlp": 0.01028092, + "balance_loss_clip": 1.03638864, + "balance_loss_mlp": 1.015378, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 1.9873772268173087, + "language_loss": 0.70566434, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72700596, + "num_input_tokens_seen": 314939390, + "step": 14607, + "time_per_iteration": 2.4378836154937744 + }, + { + "auxiliary_loss_clip": 0.01082294, + "auxiliary_loss_mlp": 0.01038853, + "balance_loss_clip": 1.03491783, + "balance_loss_mlp": 1.02785611, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 1.7134894075420333, + "language_loss": 0.72059816, + "learning_rate": 1.533420140300785e-07, + "loss": 0.74180961, + "num_input_tokens_seen": 314959205, + "step": 14608, + "time_per_iteration": 2.550614356994629 + }, + { + "auxiliary_loss_clip": 0.01095433, + "auxiliary_loss_mlp": 0.01037858, + "balance_loss_clip": 1.03460586, + "balance_loss_mlp": 1.02551997, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 1.9279408882683684, + "language_loss": 0.87873948, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.9000724, + "num_input_tokens_seen": 314977485, + "step": 14609, + "time_per_iteration": 2.4757397174835205 + }, + { + "auxiliary_loss_clip": 0.01066868, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.03763747, + "balance_loss_mlp": 1.0184077, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 1.5044353162674386, + "language_loss": 0.70069259, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72166073, + "num_input_tokens_seen": 314997830, + "step": 14610, + "time_per_iteration": 3.966487169265747 + }, + { + "auxiliary_loss_clip": 0.01086069, + "auxiliary_loss_mlp": 0.00784036, + "balance_loss_clip": 1.03596401, + "balance_loss_mlp": 1.01020122, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 1.9118802260705674, + "language_loss": 0.80287349, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82157457, + "num_input_tokens_seen": 315016480, + "step": 14611, + "time_per_iteration": 2.4641449451446533 + }, + { + "auxiliary_loss_clip": 0.01104367, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.03538036, + "balance_loss_mlp": 1.01891816, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 1.4809412094184047, + "language_loss": 0.76516145, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78651792, + "num_input_tokens_seen": 315036135, + "step": 14612, + "time_per_iteration": 2.4718329906463623 + }, + { + "auxiliary_loss_clip": 0.01061547, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.03557396, + "balance_loss_mlp": 1.0196774, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 1.3809331738800725, + "language_loss": 0.72162074, + "learning_rate": 1.525951038422002e-07, + "loss": 0.74254918, + "num_input_tokens_seen": 315057995, + "step": 14613, + "time_per_iteration": 2.600696086883545 + }, + { + "auxiliary_loss_clip": 0.01006752, + "auxiliary_loss_mlp": 0.01004465, + "balance_loss_clip": 1.01227903, + "balance_loss_mlp": 1.00344014, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 1.0336525191694408, + "language_loss": 0.64586496, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66597712, + "num_input_tokens_seen": 315104010, + "step": 14614, + "time_per_iteration": 2.914268732070923 + }, + { + "auxiliary_loss_clip": 0.01028271, + "auxiliary_loss_mlp": 0.01001734, + "balance_loss_clip": 1.00559855, + "balance_loss_mlp": 1.00056541, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6574508025729802, + "language_loss": 0.58588648, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60618651, + "num_input_tokens_seen": 315174550, + "step": 14615, + "time_per_iteration": 3.1473212242126465 + }, + { + "auxiliary_loss_clip": 0.01054919, + "auxiliary_loss_mlp": 0.01027215, + "balance_loss_clip": 1.03198671, + "balance_loss_mlp": 1.01526451, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 2.2667091732037647, + "language_loss": 0.72930664, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.75012797, + "num_input_tokens_seen": 315191825, + "step": 14616, + "time_per_iteration": 2.538909673690796 + }, + { + "auxiliary_loss_clip": 0.01027894, + "auxiliary_loss_mlp": 0.0100131, + "balance_loss_clip": 1.00522876, + "balance_loss_mlp": 1.0000701, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.834683340959482, + "language_loss": 0.57955176, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.5998438, + "num_input_tokens_seen": 315255075, + "step": 14617, + "time_per_iteration": 3.1584932804107666 + }, + { + "auxiliary_loss_clip": 0.01077042, + "auxiliary_loss_mlp": 0.01036508, + "balance_loss_clip": 1.03305912, + "balance_loss_mlp": 1.02390718, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 2.082241990241764, + "language_loss": 0.83742487, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.85856038, + "num_input_tokens_seen": 315273995, + "step": 14618, + "time_per_iteration": 2.5396628379821777 + }, + { + "auxiliary_loss_clip": 0.0107741, + "auxiliary_loss_mlp": 0.01027728, + "balance_loss_clip": 1.03488469, + "balance_loss_mlp": 1.016397, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 2.214115332468239, + "language_loss": 0.68980002, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71085143, + "num_input_tokens_seen": 315294485, + "step": 14619, + "time_per_iteration": 2.5766639709472656 + }, + { + "auxiliary_loss_clip": 0.0106109, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.03250003, + "balance_loss_mlp": 1.02638614, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 1.8797418118623481, + "language_loss": 0.77363855, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.79463148, + "num_input_tokens_seen": 315310420, + "step": 14620, + "time_per_iteration": 2.5460996627807617 + }, + { + "auxiliary_loss_clip": 0.01080473, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.03625977, + "balance_loss_mlp": 1.01608872, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 1.8486336807159003, + "language_loss": 0.79174066, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81283921, + "num_input_tokens_seen": 315330110, + "step": 14621, + "time_per_iteration": 2.570830821990967 + }, + { + "auxiliary_loss_clip": 0.01083853, + "auxiliary_loss_mlp": 0.01035455, + "balance_loss_clip": 1.03409767, + "balance_loss_mlp": 1.02384377, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 1.669181962793135, + "language_loss": 0.67025912, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.69145226, + "num_input_tokens_seen": 315350080, + "step": 14622, + "time_per_iteration": 2.538210391998291 + }, + { + "auxiliary_loss_clip": 0.01075707, + "auxiliary_loss_mlp": 0.01035913, + "balance_loss_clip": 1.03514075, + "balance_loss_mlp": 1.02430773, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 1.936371781891554, + "language_loss": 0.73272115, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75383735, + "num_input_tokens_seen": 315366360, + "step": 14623, + "time_per_iteration": 2.5156872272491455 + }, + { + "auxiliary_loss_clip": 0.01052172, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.03256428, + "balance_loss_mlp": 1.02033067, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 1.5709613469484087, + "language_loss": 0.78553683, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80637985, + "num_input_tokens_seen": 315385890, + "step": 14624, + "time_per_iteration": 2.586944580078125 + }, + { + "auxiliary_loss_clip": 0.01093745, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.03369141, + "balance_loss_mlp": 1.02078032, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 2.3012932189067903, + "language_loss": 0.79610598, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.81738472, + "num_input_tokens_seen": 315403400, + "step": 14625, + "time_per_iteration": 2.5191662311553955 + }, + { + "auxiliary_loss_clip": 0.01081683, + "auxiliary_loss_mlp": 0.01038191, + "balance_loss_clip": 1.03566432, + "balance_loss_mlp": 1.02640724, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 1.5013243784376216, + "language_loss": 0.74200475, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.7632035, + "num_input_tokens_seen": 315423670, + "step": 14626, + "time_per_iteration": 2.5541443824768066 + }, + { + "auxiliary_loss_clip": 0.01091911, + "auxiliary_loss_mlp": 0.01032248, + "balance_loss_clip": 1.03332472, + "balance_loss_mlp": 1.02013028, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 1.4973641746632622, + "language_loss": 0.70965087, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73089248, + "num_input_tokens_seen": 315446265, + "step": 14627, + "time_per_iteration": 2.618730068206787 + }, + { + "auxiliary_loss_clip": 0.01071453, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.03635728, + "balance_loss_mlp": 1.0154779, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 1.771703489346919, + "language_loss": 0.72234201, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.74333036, + "num_input_tokens_seen": 315464655, + "step": 14628, + "time_per_iteration": 2.52742075920105 + }, + { + "auxiliary_loss_clip": 0.01072688, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.03300428, + "balance_loss_mlp": 1.01903391, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 2.5972340846474524, + "language_loss": 0.69117981, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.71222568, + "num_input_tokens_seen": 315481090, + "step": 14629, + "time_per_iteration": 2.5217175483703613 + }, + { + "auxiliary_loss_clip": 0.01077037, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.03108597, + "balance_loss_mlp": 1.02027082, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 1.8869240019281852, + "language_loss": 0.68406713, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.70514899, + "num_input_tokens_seen": 315502010, + "step": 14630, + "time_per_iteration": 2.5296783447265625 + }, + { + "auxiliary_loss_clip": 0.01077806, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.03352189, + "balance_loss_mlp": 1.01968825, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 1.547710883112421, + "language_loss": 0.74037564, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76147759, + "num_input_tokens_seen": 315523040, + "step": 14631, + "time_per_iteration": 2.600100040435791 + }, + { + "auxiliary_loss_clip": 0.01079386, + "auxiliary_loss_mlp": 0.00783065, + "balance_loss_clip": 1.0341382, + "balance_loss_mlp": 1.00977015, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 3.4256411302596463, + "language_loss": 0.69272506, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.71134961, + "num_input_tokens_seen": 315541865, + "step": 14632, + "time_per_iteration": 2.5291330814361572 + }, + { + "auxiliary_loss_clip": 0.01081826, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.03478491, + "balance_loss_mlp": 1.01789832, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 1.8881425430915078, + "language_loss": 0.65150779, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.6726141, + "num_input_tokens_seen": 315561470, + "step": 14633, + "time_per_iteration": 2.5611515045166016 + }, + { + "auxiliary_loss_clip": 0.01072131, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.03304267, + "balance_loss_mlp": 1.02104616, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 1.4599655657037083, + "language_loss": 0.8416748, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.86272657, + "num_input_tokens_seen": 315583140, + "step": 14634, + "time_per_iteration": 2.5857927799224854 + }, + { + "auxiliary_loss_clip": 0.0108044, + "auxiliary_loss_mlp": 0.00783492, + "balance_loss_clip": 1.03429651, + "balance_loss_mlp": 1.00959742, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 1.6759555894037406, + "language_loss": 0.79985827, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.8184976, + "num_input_tokens_seen": 315601935, + "step": 14635, + "time_per_iteration": 2.563082695007324 + }, + { + "auxiliary_loss_clip": 0.01083706, + "auxiliary_loss_mlp": 0.01025511, + "balance_loss_clip": 1.03590381, + "balance_loss_mlp": 1.01379299, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 4.079210051306064, + "language_loss": 0.65131652, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.6724087, + "num_input_tokens_seen": 315619995, + "step": 14636, + "time_per_iteration": 2.5261261463165283 + }, + { + "auxiliary_loss_clip": 0.0107866, + "auxiliary_loss_mlp": 0.01033785, + "balance_loss_clip": 1.03358221, + "balance_loss_mlp": 1.0212146, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 1.5199540325047247, + "language_loss": 0.70205718, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72318166, + "num_input_tokens_seen": 315637895, + "step": 14637, + "time_per_iteration": 2.5262961387634277 + }, + { + "auxiliary_loss_clip": 0.01082038, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.03789985, + "balance_loss_mlp": 1.01779723, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 1.853161733216791, + "language_loss": 0.65878826, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.67990041, + "num_input_tokens_seen": 315655520, + "step": 14638, + "time_per_iteration": 3.905273675918579 + }, + { + "auxiliary_loss_clip": 0.01093555, + "auxiliary_loss_mlp": 0.01027582, + "balance_loss_clip": 1.03531361, + "balance_loss_mlp": 1.01530337, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 1.786855018147461, + "language_loss": 0.58091658, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.60212791, + "num_input_tokens_seen": 315678955, + "step": 14639, + "time_per_iteration": 4.027686357498169 + }, + { + "auxiliary_loss_clip": 0.01084353, + "auxiliary_loss_mlp": 0.0103635, + "balance_loss_clip": 1.03490818, + "balance_loss_mlp": 1.0239346, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 1.4941852273312795, + "language_loss": 0.74387896, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.76508605, + "num_input_tokens_seen": 315700360, + "step": 14640, + "time_per_iteration": 2.56791353225708 + }, + { + "auxiliary_loss_clip": 0.01081021, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_clip": 1.03451824, + "balance_loss_mlp": 1.02859473, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 1.946175830057362, + "language_loss": 0.69632304, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.71754646, + "num_input_tokens_seen": 315719270, + "step": 14641, + "time_per_iteration": 2.5268938541412354 + }, + { + "auxiliary_loss_clip": 0.01091141, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.03554153, + "balance_loss_mlp": 1.01895702, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 2.0741513309650483, + "language_loss": 0.84828615, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.86951649, + "num_input_tokens_seen": 315737425, + "step": 14642, + "time_per_iteration": 3.839522123336792 + }, + { + "auxiliary_loss_clip": 0.01064313, + "auxiliary_loss_mlp": 0.01032187, + "balance_loss_clip": 1.03909349, + "balance_loss_mlp": 1.01954496, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 1.7098187124885011, + "language_loss": 0.78632009, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.80728513, + "num_input_tokens_seen": 315755725, + "step": 14643, + "time_per_iteration": 2.610787868499756 + }, + { + "auxiliary_loss_clip": 0.01086934, + "auxiliary_loss_mlp": 0.01025768, + "balance_loss_clip": 1.03188634, + "balance_loss_mlp": 1.01492572, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 1.617858352543953, + "language_loss": 0.73052353, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75165057, + "num_input_tokens_seen": 315773835, + "step": 14644, + "time_per_iteration": 2.4823882579803467 + }, + { + "auxiliary_loss_clip": 0.01108051, + "auxiliary_loss_mlp": 0.00784017, + "balance_loss_clip": 1.03694463, + "balance_loss_mlp": 1.01046312, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 2.5912239350573403, + "language_loss": 0.79616964, + "learning_rate": 1.47856380505911e-07, + "loss": 0.8150903, + "num_input_tokens_seen": 315790615, + "step": 14645, + "time_per_iteration": 2.442098379135132 + }, + { + "auxiliary_loss_clip": 0.0108865, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.03327823, + "balance_loss_mlp": 1.02197969, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 2.5832450540930965, + "language_loss": 0.64299381, + "learning_rate": 1.477094533001364e-07, + "loss": 0.6642186, + "num_input_tokens_seen": 315811010, + "step": 14646, + "time_per_iteration": 2.483525514602661 + }, + { + "auxiliary_loss_clip": 0.0108181, + "auxiliary_loss_mlp": 0.01029655, + "balance_loss_clip": 1.03689814, + "balance_loss_mlp": 1.01623189, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 10.390273405312833, + "language_loss": 0.77481598, + "learning_rate": 1.475625963334055e-07, + "loss": 0.79593062, + "num_input_tokens_seen": 315828130, + "step": 14647, + "time_per_iteration": 2.519935369491577 + }, + { + "auxiliary_loss_clip": 0.01101421, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.03483868, + "balance_loss_mlp": 1.01928806, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 1.9201412971036431, + "language_loss": 0.7492525, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.77057052, + "num_input_tokens_seen": 315844900, + "step": 14648, + "time_per_iteration": 3.785038471221924 + }, + { + "auxiliary_loss_clip": 0.01082186, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.03259921, + "balance_loss_mlp": 1.01973009, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 1.6155078525464013, + "language_loss": 0.65388262, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67501891, + "num_input_tokens_seen": 315863745, + "step": 14649, + "time_per_iteration": 2.5493342876434326 + }, + { + "auxiliary_loss_clip": 0.01067826, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.03565502, + "balance_loss_mlp": 1.01609683, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 1.3403565553693535, + "language_loss": 0.62467444, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.64563626, + "num_input_tokens_seen": 315885765, + "step": 14650, + "time_per_iteration": 2.6117103099823 + }, + { + "auxiliary_loss_clip": 0.01076632, + "auxiliary_loss_mlp": 0.01031937, + "balance_loss_clip": 1.03328896, + "balance_loss_mlp": 1.02026033, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 1.4887103720652275, + "language_loss": 0.72737384, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.74845952, + "num_input_tokens_seen": 315907340, + "step": 14651, + "time_per_iteration": 2.5557219982147217 + }, + { + "auxiliary_loss_clip": 0.010954, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.0344609, + "balance_loss_mlp": 1.01862276, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 1.7706394469700244, + "language_loss": 0.71590579, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.73717475, + "num_input_tokens_seen": 315924935, + "step": 14652, + "time_per_iteration": 2.4707043170928955 + }, + { + "auxiliary_loss_clip": 0.01091074, + "auxiliary_loss_mlp": 0.01029051, + "balance_loss_clip": 1.03380108, + "balance_loss_mlp": 1.01753545, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 1.9643613176033254, + "language_loss": 0.75027466, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.77147591, + "num_input_tokens_seen": 315943165, + "step": 14653, + "time_per_iteration": 2.4665112495422363 + }, + { + "auxiliary_loss_clip": 0.01107304, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.03541207, + "balance_loss_mlp": 1.01580727, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 1.7483420752303584, + "language_loss": 0.70978028, + "learning_rate": 1.465365647269421e-07, + "loss": 0.7311368, + "num_input_tokens_seen": 315961340, + "step": 14654, + "time_per_iteration": 2.4366378784179688 + }, + { + "auxiliary_loss_clip": 0.01057033, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.03376842, + "balance_loss_mlp": 1.02163959, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 1.5743215652977551, + "language_loss": 0.71492249, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73583925, + "num_input_tokens_seen": 315981335, + "step": 14655, + "time_per_iteration": 2.6360890865325928 + }, + { + "auxiliary_loss_clip": 0.01055464, + "auxiliary_loss_mlp": 0.01034065, + "balance_loss_clip": 1.03229356, + "balance_loss_mlp": 1.02126169, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 1.5748557833498082, + "language_loss": 0.81368953, + "learning_rate": 1.462440453077449e-07, + "loss": 0.83458483, + "num_input_tokens_seen": 316001325, + "step": 14656, + "time_per_iteration": 2.592865228652954 + }, + { + "auxiliary_loss_clip": 0.01078218, + "auxiliary_loss_mlp": 0.01033717, + "balance_loss_clip": 1.03558683, + "balance_loss_mlp": 1.02235603, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 1.6543748297349437, + "language_loss": 0.6851685, + "learning_rate": 1.460978910372914e-07, + "loss": 0.70628786, + "num_input_tokens_seen": 316022540, + "step": 14657, + "time_per_iteration": 2.54152512550354 + }, + { + "auxiliary_loss_clip": 0.01079644, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.03610027, + "balance_loss_mlp": 1.02218461, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 2.0700842546487483, + "language_loss": 0.83929151, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.86042428, + "num_input_tokens_seen": 316037735, + "step": 14658, + "time_per_iteration": 2.6052651405334473 + }, + { + "auxiliary_loss_clip": 0.01089295, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.03685451, + "balance_loss_mlp": 1.02133012, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 1.8680963205169296, + "language_loss": 0.77178687, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79302192, + "num_input_tokens_seen": 316058105, + "step": 14659, + "time_per_iteration": 2.5348258018493652 + }, + { + "auxiliary_loss_clip": 0.01080327, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.03407645, + "balance_loss_mlp": 1.01942277, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 2.1239191118891414, + "language_loss": 0.60359359, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62470895, + "num_input_tokens_seen": 316074415, + "step": 14660, + "time_per_iteration": 2.513101577758789 + }, + { + "auxiliary_loss_clip": 0.01086786, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.03552318, + "balance_loss_mlp": 1.02235627, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 2.3045393297484273, + "language_loss": 0.77875531, + "learning_rate": 1.455139770123972e-07, + "loss": 0.799981, + "num_input_tokens_seen": 316094405, + "step": 14661, + "time_per_iteration": 2.535114049911499 + }, + { + "auxiliary_loss_clip": 0.01069581, + "auxiliary_loss_mlp": 0.01040205, + "balance_loss_clip": 1.03667402, + "balance_loss_mlp": 1.02768815, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 1.5827849139445775, + "language_loss": 0.76583159, + "learning_rate": 1.45368174298081e-07, + "loss": 0.78692943, + "num_input_tokens_seen": 316113390, + "step": 14662, + "time_per_iteration": 2.562130928039551 + }, + { + "auxiliary_loss_clip": 0.01053869, + "auxiliary_loss_mlp": 0.01028311, + "balance_loss_clip": 1.03152478, + "balance_loss_mlp": 1.0171175, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 2.1148474378225672, + "language_loss": 0.73833179, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.7591536, + "num_input_tokens_seen": 316131085, + "step": 14663, + "time_per_iteration": 2.5558643341064453 + }, + { + "auxiliary_loss_clip": 0.01093655, + "auxiliary_loss_mlp": 0.00782393, + "balance_loss_clip": 1.03564835, + "balance_loss_mlp": 1.0080992, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 1.6146538127166332, + "language_loss": 0.70134318, + "learning_rate": 1.450767798584489e-07, + "loss": 0.72010368, + "num_input_tokens_seen": 316151440, + "step": 14664, + "time_per_iteration": 2.591186285018921 + }, + { + "auxiliary_loss_clip": 0.01024166, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.03189337, + "balance_loss_mlp": 1.02295065, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 1.5074119089308244, + "language_loss": 0.81304532, + "learning_rate": 1.449311881441828e-07, + "loss": 0.83363533, + "num_input_tokens_seen": 316170750, + "step": 14665, + "time_per_iteration": 2.612718105316162 + }, + { + "auxiliary_loss_clip": 0.01080937, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.03623128, + "balance_loss_mlp": 1.02107, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 2.8075249969635387, + "language_loss": 0.58397007, + "learning_rate": 1.447856667743117e-07, + "loss": 0.60510468, + "num_input_tokens_seen": 316187265, + "step": 14666, + "time_per_iteration": 2.515561103820801 + }, + { + "auxiliary_loss_clip": 0.01096195, + "auxiliary_loss_mlp": 0.01030454, + "balance_loss_clip": 1.03710592, + "balance_loss_mlp": 1.01750779, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 1.9440911163955525, + "language_loss": 0.83874917, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.86001563, + "num_input_tokens_seen": 316206555, + "step": 14667, + "time_per_iteration": 2.455974578857422 + }, + { + "auxiliary_loss_clip": 0.01103368, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.03497219, + "balance_loss_mlp": 1.02215111, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 1.7799638198058874, + "language_loss": 0.62551093, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.64689457, + "num_input_tokens_seen": 316225210, + "step": 14668, + "time_per_iteration": 2.454132318496704 + }, + { + "auxiliary_loss_clip": 0.01089851, + "auxiliary_loss_mlp": 0.01025677, + "balance_loss_clip": 1.03451073, + "balance_loss_mlp": 1.01538396, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 2.587715456975285, + "language_loss": 0.57267392, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.59382915, + "num_input_tokens_seen": 316242685, + "step": 14669, + "time_per_iteration": 2.435689687728882 + }, + { + "auxiliary_loss_clip": 0.01101709, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.03327, + "balance_loss_mlp": 1.01896191, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 1.897807524866727, + "language_loss": 0.71491289, + "learning_rate": 1.442042848491043e-07, + "loss": 0.73623723, + "num_input_tokens_seen": 316260935, + "step": 14670, + "time_per_iteration": 2.4206132888793945 + }, + { + "auxiliary_loss_clip": 0.01085181, + "auxiliary_loss_mlp": 0.01032118, + "balance_loss_clip": 1.03118598, + "balance_loss_mlp": 1.0199883, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 2.369410977798449, + "language_loss": 0.73812664, + "learning_rate": 1.44059115283929e-07, + "loss": 0.75929964, + "num_input_tokens_seen": 316281190, + "step": 14671, + "time_per_iteration": 2.512629508972168 + }, + { + "auxiliary_loss_clip": 0.01082141, + "auxiliary_loss_mlp": 0.01028398, + "balance_loss_clip": 1.03245974, + "balance_loss_mlp": 1.01588655, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 1.9828685639254646, + "language_loss": 0.84677428, + "learning_rate": 1.43914016096218e-07, + "loss": 0.86787969, + "num_input_tokens_seen": 316297115, + "step": 14672, + "time_per_iteration": 2.4755032062530518 + }, + { + "auxiliary_loss_clip": 0.01066182, + "auxiliary_loss_mlp": 0.01028314, + "balance_loss_clip": 1.03240764, + "balance_loss_mlp": 1.01677465, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 1.5758692854779395, + "language_loss": 0.7291432, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.7500881, + "num_input_tokens_seen": 316318235, + "step": 14673, + "time_per_iteration": 2.5563881397247314 + }, + { + "auxiliary_loss_clip": 0.01010085, + "auxiliary_loss_mlp": 0.0100078, + "balance_loss_clip": 1.00747216, + "balance_loss_mlp": 0.9996832, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.9031262024706469, + "language_loss": 0.49440524, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51451385, + "num_input_tokens_seen": 316384705, + "step": 14674, + "time_per_iteration": 3.2296383380889893 + }, + { + "auxiliary_loss_clip": 0.01079033, + "auxiliary_loss_mlp": 0.00783945, + "balance_loss_clip": 1.03297079, + "balance_loss_mlp": 1.00936711, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 2.0786343326397048, + "language_loss": 0.76439524, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.78302503, + "num_input_tokens_seen": 316401165, + "step": 14675, + "time_per_iteration": 2.4851183891296387 + }, + { + "auxiliary_loss_clip": 0.01077382, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.03577757, + "balance_loss_mlp": 1.01994133, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 1.7249630414055, + "language_loss": 0.79319918, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.81428874, + "num_input_tokens_seen": 316418780, + "step": 14676, + "time_per_iteration": 2.5118613243103027 + }, + { + "auxiliary_loss_clip": 0.01008989, + "auxiliary_loss_mlp": 0.01008618, + "balance_loss_clip": 1.0227859, + "balance_loss_mlp": 1.00713396, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.6904998444433798, + "language_loss": 0.54740286, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56757891, + "num_input_tokens_seen": 316482030, + "step": 14677, + "time_per_iteration": 4.6408281326293945 + }, + { + "auxiliary_loss_clip": 0.01100209, + "auxiliary_loss_mlp": 0.01025366, + "balance_loss_clip": 1.0331192, + "balance_loss_mlp": 1.01342738, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 2.202151547236294, + "language_loss": 0.64733076, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.66858649, + "num_input_tokens_seen": 316499175, + "step": 14678, + "time_per_iteration": 3.867016315460205 + }, + { + "auxiliary_loss_clip": 0.01081652, + "auxiliary_loss_mlp": 0.01030126, + "balance_loss_clip": 1.03357279, + "balance_loss_mlp": 1.01797879, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 2.948543293625489, + "language_loss": 0.71026963, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.73138738, + "num_input_tokens_seen": 316519495, + "step": 14679, + "time_per_iteration": 2.6133668422698975 + }, + { + "auxiliary_loss_clip": 0.0107917, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.03453362, + "balance_loss_mlp": 1.01919246, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 1.6092427029812757, + "language_loss": 0.64054632, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.66163635, + "num_input_tokens_seen": 316538180, + "step": 14680, + "time_per_iteration": 2.5834507942199707 + }, + { + "auxiliary_loss_clip": 0.01101543, + "auxiliary_loss_mlp": 0.01035048, + "balance_loss_clip": 1.03419065, + "balance_loss_mlp": 1.02286446, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 2.2507948300580862, + "language_loss": 0.77246958, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79383552, + "num_input_tokens_seen": 316551750, + "step": 14681, + "time_per_iteration": 3.8344619274139404 + }, + { + "auxiliary_loss_clip": 0.01082377, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.03443694, + "balance_loss_mlp": 1.01912498, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 1.6879881391423697, + "language_loss": 0.7277016, + "learning_rate": 1.424668961888047e-07, + "loss": 0.74884617, + "num_input_tokens_seen": 316570680, + "step": 14682, + "time_per_iteration": 2.5244860649108887 + }, + { + "auxiliary_loss_clip": 0.01061083, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.03673244, + "balance_loss_mlp": 1.01651478, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 1.72751017398126, + "language_loss": 0.74535066, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.76625812, + "num_input_tokens_seen": 316588635, + "step": 14683, + "time_per_iteration": 2.5575673580169678 + }, + { + "auxiliary_loss_clip": 0.01069967, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.03317022, + "balance_loss_mlp": 1.02020335, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 2.2131483989048766, + "language_loss": 0.66104287, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.68207109, + "num_input_tokens_seen": 316607550, + "step": 14684, + "time_per_iteration": 2.548283100128174 + }, + { + "auxiliary_loss_clip": 0.01087725, + "auxiliary_loss_mlp": 0.01026464, + "balance_loss_clip": 1.03534544, + "balance_loss_mlp": 1.01544905, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 2.3215052655237813, + "language_loss": 0.69278669, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.71392858, + "num_input_tokens_seen": 316624460, + "step": 14685, + "time_per_iteration": 2.434131383895874 + }, + { + "auxiliary_loss_clip": 0.01051374, + "auxiliary_loss_mlp": 0.01034511, + "balance_loss_clip": 1.03363085, + "balance_loss_mlp": 1.02075982, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 1.891880050579418, + "language_loss": 0.74578744, + "learning_rate": 1.418900201783806e-07, + "loss": 0.76664627, + "num_input_tokens_seen": 316640765, + "step": 14686, + "time_per_iteration": 2.5848681926727295 + }, + { + "auxiliary_loss_clip": 0.01052191, + "auxiliary_loss_mlp": 0.01026832, + "balance_loss_clip": 1.031479, + "balance_loss_mlp": 1.01469111, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 1.7185441602709122, + "language_loss": 0.63053674, + "learning_rate": 1.417459773114007e-07, + "loss": 0.65132701, + "num_input_tokens_seen": 316656120, + "step": 14687, + "time_per_iteration": 3.9287831783294678 + }, + { + "auxiliary_loss_clip": 0.01094763, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.03521419, + "balance_loss_mlp": 1.02355123, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 1.7404872463509278, + "language_loss": 0.68960643, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.71091068, + "num_input_tokens_seen": 316676095, + "step": 14688, + "time_per_iteration": 2.524101495742798 + }, + { + "auxiliary_loss_clip": 0.01089368, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.03433037, + "balance_loss_mlp": 1.01255989, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 2.1786073414679814, + "language_loss": 0.6695354, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69067442, + "num_input_tokens_seen": 316696235, + "step": 14689, + "time_per_iteration": 2.556540012359619 + }, + { + "auxiliary_loss_clip": 0.01083804, + "auxiliary_loss_mlp": 0.01029366, + "balance_loss_clip": 1.03779721, + "balance_loss_mlp": 1.01789832, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 1.3857737616894588, + "language_loss": 0.74552488, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76665658, + "num_input_tokens_seen": 316719680, + "step": 14690, + "time_per_iteration": 2.5918853282928467 + }, + { + "auxiliary_loss_clip": 0.01080115, + "auxiliary_loss_mlp": 0.01036753, + "balance_loss_clip": 1.03350711, + "balance_loss_mlp": 1.02381849, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 1.4312646398270445, + "language_loss": 0.72932351, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.75049222, + "num_input_tokens_seen": 316739830, + "step": 14691, + "time_per_iteration": 2.5568630695343018 + }, + { + "auxiliary_loss_clip": 0.01069511, + "auxiliary_loss_mlp": 0.01031975, + "balance_loss_clip": 1.03466439, + "balance_loss_mlp": 1.01958942, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 2.0239292550251187, + "language_loss": 0.52185607, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.54287088, + "num_input_tokens_seen": 316758105, + "step": 14692, + "time_per_iteration": 2.5213332176208496 + }, + { + "auxiliary_loss_clip": 0.01067591, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.03589511, + "balance_loss_mlp": 1.02032208, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 1.9377649427944639, + "language_loss": 0.60379422, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.62479067, + "num_input_tokens_seen": 316777455, + "step": 14693, + "time_per_iteration": 2.5571722984313965 + }, + { + "auxiliary_loss_clip": 0.01099251, + "auxiliary_loss_mlp": 0.01027234, + "balance_loss_clip": 1.03462863, + "balance_loss_mlp": 1.01637459, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 1.5359638723286422, + "language_loss": 0.75462133, + "learning_rate": 1.407396505730898e-07, + "loss": 0.77588618, + "num_input_tokens_seen": 316796300, + "step": 14694, + "time_per_iteration": 2.436044931411743 + }, + { + "auxiliary_loss_clip": 0.01094032, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.03343821, + "balance_loss_mlp": 1.0175662, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 1.6551153299245214, + "language_loss": 0.72606051, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.74728835, + "num_input_tokens_seen": 316819090, + "step": 14695, + "time_per_iteration": 2.568169355392456 + }, + { + "auxiliary_loss_clip": 0.01087542, + "auxiliary_loss_mlp": 0.0102505, + "balance_loss_clip": 1.03412998, + "balance_loss_mlp": 1.01413083, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 1.5764595092189218, + "language_loss": 0.79756117, + "learning_rate": 1.404527630961998e-07, + "loss": 0.81868708, + "num_input_tokens_seen": 316839250, + "step": 14696, + "time_per_iteration": 2.5145013332366943 + }, + { + "auxiliary_loss_clip": 0.01062526, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.03328013, + "balance_loss_mlp": 1.01860499, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 1.4560514621954532, + "language_loss": 0.74691707, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.76784462, + "num_input_tokens_seen": 316861315, + "step": 14697, + "time_per_iteration": 2.633427381515503 + }, + { + "auxiliary_loss_clip": 0.01081174, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.03431666, + "balance_loss_mlp": 1.02053761, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 2.0537120406881497, + "language_loss": 0.72145909, + "learning_rate": 1.401661576761779e-07, + "loss": 0.74259531, + "num_input_tokens_seen": 316879325, + "step": 14698, + "time_per_iteration": 2.487354278564453 + }, + { + "auxiliary_loss_clip": 0.01019065, + "auxiliary_loss_mlp": 0.01001155, + "balance_loss_clip": 1.00711155, + "balance_loss_mlp": 1.00019538, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.8117746981532594, + "language_loss": 0.53697282, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55717498, + "num_input_tokens_seen": 316936425, + "step": 14699, + "time_per_iteration": 3.10251522064209 + }, + { + "auxiliary_loss_clip": 0.01083333, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.03436518, + "balance_loss_mlp": 1.01805198, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 1.7683874936073016, + "language_loss": 0.76809037, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.78922927, + "num_input_tokens_seen": 316956360, + "step": 14700, + "time_per_iteration": 2.510392189025879 + }, + { + "auxiliary_loss_clip": 0.01070713, + "auxiliary_loss_mlp": 0.01031756, + "balance_loss_clip": 1.03537393, + "balance_loss_mlp": 1.01981187, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 1.8713608697115733, + "language_loss": 0.73475015, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.75577486, + "num_input_tokens_seen": 316975295, + "step": 14701, + "time_per_iteration": 2.560547113418579 + }, + { + "auxiliary_loss_clip": 0.01082183, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.03360057, + "balance_loss_mlp": 1.02280855, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 1.6880839193340886, + "language_loss": 0.7074756, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.72865915, + "num_input_tokens_seen": 316994520, + "step": 14702, + "time_per_iteration": 2.5355043411254883 + }, + { + "auxiliary_loss_clip": 0.01069417, + "auxiliary_loss_mlp": 0.01038347, + "balance_loss_clip": 1.03474832, + "balance_loss_mlp": 1.02406013, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 2.4070004990481486, + "language_loss": 0.71486962, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.73594725, + "num_input_tokens_seen": 317018095, + "step": 14703, + "time_per_iteration": 2.73290753364563 + }, + { + "auxiliary_loss_clip": 0.0105952, + "auxiliary_loss_mlp": 0.01028749, + "balance_loss_clip": 1.03427386, + "balance_loss_mlp": 1.01755488, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 1.9813796405231894, + "language_loss": 0.66497993, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.68586266, + "num_input_tokens_seen": 317035755, + "step": 14704, + "time_per_iteration": 2.5469162464141846 + }, + { + "auxiliary_loss_clip": 0.01088756, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.03319812, + "balance_loss_mlp": 1.01700783, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 1.6530049053100373, + "language_loss": 0.70511425, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.72628272, + "num_input_tokens_seen": 317055765, + "step": 14705, + "time_per_iteration": 2.5158543586730957 + }, + { + "auxiliary_loss_clip": 0.01081677, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.03519666, + "balance_loss_mlp": 1.02203393, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 1.5245456515797116, + "language_loss": 0.70800078, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.72914362, + "num_input_tokens_seen": 317077955, + "step": 14706, + "time_per_iteration": 2.580504894256592 + }, + { + "auxiliary_loss_clip": 0.01090714, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.03302944, + "balance_loss_mlp": 1.01761329, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 1.5751884574546837, + "language_loss": 0.74496198, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.76616168, + "num_input_tokens_seen": 317095825, + "step": 14707, + "time_per_iteration": 2.4810638427734375 + }, + { + "auxiliary_loss_clip": 0.01003581, + "auxiliary_loss_mlp": 0.01002276, + "balance_loss_clip": 1.01050758, + "balance_loss_mlp": 1.00123298, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.7929664546380933, + "language_loss": 0.60385525, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62391376, + "num_input_tokens_seen": 317152875, + "step": 14708, + "time_per_iteration": 3.0063724517822266 + }, + { + "auxiliary_loss_clip": 0.01074449, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.03361762, + "balance_loss_mlp": 1.01904774, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 1.5609034431832263, + "language_loss": 0.67199349, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.69303799, + "num_input_tokens_seen": 317176725, + "step": 14709, + "time_per_iteration": 2.686643123626709 + }, + { + "auxiliary_loss_clip": 0.0108399, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.03337991, + "balance_loss_mlp": 1.02003038, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 1.6047509032564182, + "language_loss": 0.62668788, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64786148, + "num_input_tokens_seen": 317206880, + "step": 14710, + "time_per_iteration": 2.7785916328430176 + }, + { + "auxiliary_loss_clip": 0.01070988, + "auxiliary_loss_mlp": 0.0102527, + "balance_loss_clip": 1.03519857, + "balance_loss_mlp": 1.01426673, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 2.805871844069436, + "language_loss": 0.63856041, + "learning_rate": 1.38310100580431e-07, + "loss": 0.65952301, + "num_input_tokens_seen": 317224135, + "step": 14711, + "time_per_iteration": 2.5117268562316895 + }, + { + "auxiliary_loss_clip": 0.01068056, + "auxiliary_loss_mlp": 0.01029887, + "balance_loss_clip": 1.0343039, + "balance_loss_mlp": 1.01763868, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 3.500120460647278, + "language_loss": 0.75931317, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78029263, + "num_input_tokens_seen": 317244505, + "step": 14712, + "time_per_iteration": 2.571178674697876 + }, + { + "auxiliary_loss_clip": 0.01043568, + "auxiliary_loss_mlp": 0.01025085, + "balance_loss_clip": 1.03068244, + "balance_loss_mlp": 1.01309228, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 4.258699843232303, + "language_loss": 0.8122263, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.83291286, + "num_input_tokens_seen": 317257830, + "step": 14713, + "time_per_iteration": 2.5560410022735596 + }, + { + "auxiliary_loss_clip": 0.01080459, + "auxiliary_loss_mlp": 0.01026986, + "balance_loss_clip": 1.03424251, + "balance_loss_mlp": 1.01516652, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 1.7661143094873863, + "language_loss": 0.55565417, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.57672864, + "num_input_tokens_seen": 317278430, + "step": 14714, + "time_per_iteration": 2.6099441051483154 + }, + { + "auxiliary_loss_clip": 0.01045826, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.03342009, + "balance_loss_mlp": 1.02083516, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 1.7715062804136124, + "language_loss": 0.73766029, + "learning_rate": 1.377414057838755e-07, + "loss": 0.7584511, + "num_input_tokens_seen": 317295970, + "step": 14715, + "time_per_iteration": 2.6590845584869385 + }, + { + "auxiliary_loss_clip": 0.01094881, + "auxiliary_loss_mlp": 0.01028432, + "balance_loss_clip": 1.03508914, + "balance_loss_mlp": 1.01708961, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 1.6074226115223864, + "language_loss": 0.75384289, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77507603, + "num_input_tokens_seen": 317316185, + "step": 14716, + "time_per_iteration": 3.9274466037750244 + }, + { + "auxiliary_loss_clip": 0.01069841, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.03366399, + "balance_loss_mlp": 1.02596283, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 2.430115384069665, + "language_loss": 0.71286523, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.73394072, + "num_input_tokens_seen": 317333275, + "step": 14717, + "time_per_iteration": 3.963810920715332 + }, + { + "auxiliary_loss_clip": 0.01089152, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.0345366, + "balance_loss_mlp": 1.01931596, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 2.5416591630696184, + "language_loss": 0.7456373, + "learning_rate": 1.373156261464208e-07, + "loss": 0.76683319, + "num_input_tokens_seen": 317351245, + "step": 14718, + "time_per_iteration": 2.5589540004730225 + }, + { + "auxiliary_loss_clip": 0.0105134, + "auxiliary_loss_mlp": 0.01028743, + "balance_loss_clip": 1.03538573, + "balance_loss_mlp": 1.01702452, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 1.6203747585430244, + "language_loss": 0.78749609, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80829692, + "num_input_tokens_seen": 317370740, + "step": 14719, + "time_per_iteration": 4.063754558563232 + }, + { + "auxiliary_loss_clip": 0.01104162, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.03464103, + "balance_loss_mlp": 1.01595747, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 1.850391752118759, + "language_loss": 0.72303748, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.74436039, + "num_input_tokens_seen": 317388370, + "step": 14720, + "time_per_iteration": 2.4463114738464355 + }, + { + "auxiliary_loss_clip": 0.01084453, + "auxiliary_loss_mlp": 0.01028696, + "balance_loss_clip": 1.0348649, + "balance_loss_mlp": 1.01597071, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 2.3471835823788223, + "language_loss": 0.82592547, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84705698, + "num_input_tokens_seen": 317407390, + "step": 14721, + "time_per_iteration": 2.5212619304656982 + }, + { + "auxiliary_loss_clip": 0.01081053, + "auxiliary_loss_mlp": 0.01030067, + "balance_loss_clip": 1.03179312, + "balance_loss_mlp": 1.01726413, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 1.6427256260382548, + "language_loss": 0.61951602, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.64062721, + "num_input_tokens_seen": 317430825, + "step": 14722, + "time_per_iteration": 2.7698121070861816 + }, + { + "auxiliary_loss_clip": 0.01093937, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.03440809, + "balance_loss_mlp": 1.01677537, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 2.2704666802211113, + "language_loss": 0.69002271, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.71125567, + "num_input_tokens_seen": 317451905, + "step": 14723, + "time_per_iteration": 2.5940165519714355 + }, + { + "auxiliary_loss_clip": 0.01069902, + "auxiliary_loss_mlp": 0.01034337, + "balance_loss_clip": 1.03331959, + "balance_loss_mlp": 1.02195752, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 1.8800517641738776, + "language_loss": 0.7790029, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.80004525, + "num_input_tokens_seen": 317470030, + "step": 14724, + "time_per_iteration": 2.56684947013855 + }, + { + "auxiliary_loss_clip": 0.01018027, + "auxiliary_loss_mlp": 0.01000593, + "balance_loss_clip": 1.00530434, + "balance_loss_mlp": 0.99947226, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.8003296917487726, + "language_loss": 0.58938527, + "learning_rate": 1.363246127376143e-07, + "loss": 0.60957152, + "num_input_tokens_seen": 317527460, + "step": 14725, + "time_per_iteration": 4.322714328765869 + }, + { + "auxiliary_loss_clip": 0.01082871, + "auxiliary_loss_mlp": 0.00783832, + "balance_loss_clip": 1.03358173, + "balance_loss_mlp": 1.00856876, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 2.1349725419992946, + "language_loss": 0.68712574, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.70579278, + "num_input_tokens_seen": 317544070, + "step": 14726, + "time_per_iteration": 2.5208945274353027 + }, + { + "auxiliary_loss_clip": 0.01090477, + "auxiliary_loss_mlp": 0.00783266, + "balance_loss_clip": 1.03449631, + "balance_loss_mlp": 1.00910902, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 1.2005619166472559, + "language_loss": 0.69271207, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.7114495, + "num_input_tokens_seen": 317570275, + "step": 14727, + "time_per_iteration": 2.692862033843994 + }, + { + "auxiliary_loss_clip": 0.01084471, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.03864884, + "balance_loss_mlp": 1.01828444, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 1.6319564950930607, + "language_loss": 0.70072114, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.72187042, + "num_input_tokens_seen": 317590160, + "step": 14728, + "time_per_iteration": 2.5229618549346924 + }, + { + "auxiliary_loss_clip": 0.0107137, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.03529286, + "balance_loss_mlp": 1.01752687, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 2.0192388038660916, + "language_loss": 0.6661098, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.68710887, + "num_input_tokens_seen": 317608340, + "step": 14729, + "time_per_iteration": 2.5396616458892822 + }, + { + "auxiliary_loss_clip": 0.01084034, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.03839648, + "balance_loss_mlp": 1.01963019, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 1.7309492565671563, + "language_loss": 0.62881577, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.64996058, + "num_input_tokens_seen": 317629910, + "step": 14730, + "time_per_iteration": 2.6425654888153076 + }, + { + "auxiliary_loss_clip": 0.0106568, + "auxiliary_loss_mlp": 0.01028926, + "balance_loss_clip": 1.03361201, + "balance_loss_mlp": 1.01724911, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 1.515073822313099, + "language_loss": 0.7949096, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81585562, + "num_input_tokens_seen": 317650265, + "step": 14731, + "time_per_iteration": 2.5953400135040283 + }, + { + "auxiliary_loss_clip": 0.01068989, + "auxiliary_loss_mlp": 0.01032295, + "balance_loss_clip": 1.03192234, + "balance_loss_mlp": 1.02032614, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 1.9508176320846629, + "language_loss": 0.83139884, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85241175, + "num_input_tokens_seen": 317669045, + "step": 14732, + "time_per_iteration": 2.5527169704437256 + }, + { + "auxiliary_loss_clip": 0.01009973, + "auxiliary_loss_mlp": 0.01001079, + "balance_loss_clip": 1.00782883, + "balance_loss_mlp": 0.9999944, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 0.8989018659668119, + "language_loss": 0.59919125, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.61930174, + "num_input_tokens_seen": 317728065, + "step": 14733, + "time_per_iteration": 3.158027172088623 + }, + { + "auxiliary_loss_clip": 0.01105004, + "auxiliary_loss_mlp": 0.00784363, + "balance_loss_clip": 1.03618312, + "balance_loss_mlp": 1.0097568, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 1.721681024884068, + "language_loss": 0.66242415, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.68131781, + "num_input_tokens_seen": 317746120, + "step": 14734, + "time_per_iteration": 2.452162981033325 + }, + { + "auxiliary_loss_clip": 0.01077546, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.03790116, + "balance_loss_mlp": 1.02183807, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 1.8774719955246029, + "language_loss": 0.75466239, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77576506, + "num_input_tokens_seen": 317762280, + "step": 14735, + "time_per_iteration": 2.514533758163452 + }, + { + "auxiliary_loss_clip": 0.01066789, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.03359962, + "balance_loss_mlp": 1.02015686, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 1.7445060601385467, + "language_loss": 0.70348167, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72447455, + "num_input_tokens_seen": 317780615, + "step": 14736, + "time_per_iteration": 2.519740104675293 + }, + { + "auxiliary_loss_clip": 0.01079927, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.03726709, + "balance_loss_mlp": 1.0157665, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 1.6762464206381906, + "language_loss": 0.84377038, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.86484736, + "num_input_tokens_seen": 317798830, + "step": 14737, + "time_per_iteration": 2.5320472717285156 + }, + { + "auxiliary_loss_clip": 0.01079711, + "auxiliary_loss_mlp": 0.01033277, + "balance_loss_clip": 1.03519726, + "balance_loss_mlp": 1.01992536, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 2.054614254820155, + "language_loss": 0.68360722, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70473707, + "num_input_tokens_seen": 317819235, + "step": 14738, + "time_per_iteration": 2.644205331802368 + }, + { + "auxiliary_loss_clip": 0.0109574, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.03489685, + "balance_loss_mlp": 1.01736069, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 1.582178962390456, + "language_loss": 0.74981225, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77106988, + "num_input_tokens_seen": 317836785, + "step": 14739, + "time_per_iteration": 2.4910991191864014 + }, + { + "auxiliary_loss_clip": 0.0109109, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.03530526, + "balance_loss_mlp": 1.02311277, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 1.8054401356432992, + "language_loss": 0.87166172, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.89291465, + "num_input_tokens_seen": 317854225, + "step": 14740, + "time_per_iteration": 2.4569904804229736 + }, + { + "auxiliary_loss_clip": 0.01057717, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.03548276, + "balance_loss_mlp": 1.02041721, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 1.7017012765090198, + "language_loss": 0.63412178, + "learning_rate": 1.34072445601471e-07, + "loss": 0.65502298, + "num_input_tokens_seen": 317874865, + "step": 14741, + "time_per_iteration": 2.6319611072540283 + }, + { + "auxiliary_loss_clip": 0.01103509, + "auxiliary_loss_mlp": 0.01029948, + "balance_loss_clip": 1.03541684, + "balance_loss_mlp": 1.01817024, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 2.1822745287557805, + "language_loss": 0.72925043, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.75058496, + "num_input_tokens_seen": 317892830, + "step": 14742, + "time_per_iteration": 2.422395944595337 + }, + { + "auxiliary_loss_clip": 0.01090956, + "auxiliary_loss_mlp": 0.00782266, + "balance_loss_clip": 1.03425586, + "balance_loss_mlp": 1.00872898, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 1.7376554270139353, + "language_loss": 0.59227335, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.61100554, + "num_input_tokens_seen": 317911780, + "step": 14743, + "time_per_iteration": 2.5296425819396973 + }, + { + "auxiliary_loss_clip": 0.01074178, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.03521729, + "balance_loss_mlp": 1.01938164, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 1.709770331678158, + "language_loss": 0.60264146, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.6237123, + "num_input_tokens_seen": 317932855, + "step": 14744, + "time_per_iteration": 2.5690457820892334 + }, + { + "auxiliary_loss_clip": 0.01095104, + "auxiliary_loss_mlp": 0.00781752, + "balance_loss_clip": 1.0362339, + "balance_loss_mlp": 1.00569272, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 1.6060460276248727, + "language_loss": 0.76577759, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78454614, + "num_input_tokens_seen": 317952090, + "step": 14745, + "time_per_iteration": 2.5019371509552 + }, + { + "auxiliary_loss_clip": 0.01102908, + "auxiliary_loss_mlp": 0.00782088, + "balance_loss_clip": 1.03538525, + "balance_loss_mlp": 1.00735438, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 1.6545822118548315, + "language_loss": 0.77204478, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.79089475, + "num_input_tokens_seen": 317970370, + "step": 14746, + "time_per_iteration": 2.444553852081299 + }, + { + "auxiliary_loss_clip": 0.01080786, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.03704453, + "balance_loss_mlp": 1.01943469, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 1.8208924385232765, + "language_loss": 0.76624328, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.7873739, + "num_input_tokens_seen": 317989125, + "step": 14747, + "time_per_iteration": 2.5524051189422607 + }, + { + "auxiliary_loss_clip": 0.01076886, + "auxiliary_loss_mlp": 0.00781235, + "balance_loss_clip": 1.03129816, + "balance_loss_mlp": 1.00676525, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 1.7591568711909982, + "language_loss": 0.82579404, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.84437525, + "num_input_tokens_seen": 318007820, + "step": 14748, + "time_per_iteration": 2.5114691257476807 + }, + { + "auxiliary_loss_clip": 0.01092954, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.03504515, + "balance_loss_mlp": 1.02012897, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 1.8081299110442413, + "language_loss": 0.77563143, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.79688674, + "num_input_tokens_seen": 318030435, + "step": 14749, + "time_per_iteration": 2.7221243381500244 + }, + { + "auxiliary_loss_clip": 0.01044386, + "auxiliary_loss_mlp": 0.0078398, + "balance_loss_clip": 1.03371394, + "balance_loss_mlp": 1.00994527, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 2.3841849365546515, + "language_loss": 0.69656599, + "learning_rate": 1.328135602550451e-07, + "loss": 0.71484965, + "num_input_tokens_seen": 318049465, + "step": 14750, + "time_per_iteration": 2.611816167831421 + }, + { + "auxiliary_loss_clip": 0.01092783, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.03492546, + "balance_loss_mlp": 1.01880372, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 1.7794370063328793, + "language_loss": 0.59201682, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61325115, + "num_input_tokens_seen": 318067760, + "step": 14751, + "time_per_iteration": 2.524425983428955 + }, + { + "auxiliary_loss_clip": 0.01104285, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.03621113, + "balance_loss_mlp": 1.01889586, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 2.47668543849538, + "language_loss": 0.81331301, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.83466601, + "num_input_tokens_seen": 318082785, + "step": 14752, + "time_per_iteration": 2.411102533340454 + }, + { + "auxiliary_loss_clip": 0.0108565, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.03646433, + "balance_loss_mlp": 1.01812816, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 2.3816892259347147, + "language_loss": 0.80301303, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.8241818, + "num_input_tokens_seen": 318101925, + "step": 14753, + "time_per_iteration": 2.533090353012085 + }, + { + "auxiliary_loss_clip": 0.01102647, + "auxiliary_loss_mlp": 0.01029452, + "balance_loss_clip": 1.03500199, + "balance_loss_mlp": 1.01782322, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 1.6793872582519507, + "language_loss": 0.65482378, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.67614472, + "num_input_tokens_seen": 318119945, + "step": 14754, + "time_per_iteration": 3.813854932785034 + }, + { + "auxiliary_loss_clip": 0.01106389, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.03682303, + "balance_loss_mlp": 1.01916599, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 1.9030618858297426, + "language_loss": 0.74576867, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.76714706, + "num_input_tokens_seen": 318139685, + "step": 14755, + "time_per_iteration": 3.875464677810669 + }, + { + "auxiliary_loss_clip": 0.0107939, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.03088272, + "balance_loss_mlp": 1.01941156, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 1.474849311144646, + "language_loss": 0.77886963, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.79998958, + "num_input_tokens_seen": 318160375, + "step": 14756, + "time_per_iteration": 2.5308098793029785 + }, + { + "auxiliary_loss_clip": 0.01084752, + "auxiliary_loss_mlp": 0.01032605, + "balance_loss_clip": 1.03604031, + "balance_loss_mlp": 1.02067757, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 1.8798215198700294, + "language_loss": 0.76611865, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78729224, + "num_input_tokens_seen": 318177995, + "step": 14757, + "time_per_iteration": 2.4772582054138184 + }, + { + "auxiliary_loss_clip": 0.01044629, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.03207016, + "balance_loss_mlp": 1.01939201, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 1.728699522006643, + "language_loss": 0.67757869, + "learning_rate": 1.316993656021632e-07, + "loss": 0.6983375, + "num_input_tokens_seen": 318197030, + "step": 14758, + "time_per_iteration": 4.030256032943726 + }, + { + "auxiliary_loss_clip": 0.0110399, + "auxiliary_loss_mlp": 0.01033423, + "balance_loss_clip": 1.03540373, + "balance_loss_mlp": 1.02042913, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 1.5811818165131137, + "language_loss": 0.68864012, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.71001422, + "num_input_tokens_seen": 318221780, + "step": 14759, + "time_per_iteration": 2.690025568008423 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.01029218, + "balance_loss_clip": 1.03390205, + "balance_loss_mlp": 1.0176723, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 2.7193815919078435, + "language_loss": 0.74549448, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76680088, + "num_input_tokens_seen": 318239710, + "step": 14760, + "time_per_iteration": 2.425956964492798 + }, + { + "auxiliary_loss_clip": 0.01083878, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.03521121, + "balance_loss_mlp": 1.01878738, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 2.1119417143918184, + "language_loss": 0.76600879, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.7871536, + "num_input_tokens_seen": 318257425, + "step": 14761, + "time_per_iteration": 2.4882891178131104 + }, + { + "auxiliary_loss_clip": 0.01104126, + "auxiliary_loss_mlp": 0.01034316, + "balance_loss_clip": 1.0340451, + "balance_loss_mlp": 1.02228212, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 1.5545149534964293, + "language_loss": 0.61009598, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.6314804, + "num_input_tokens_seen": 318278485, + "step": 14762, + "time_per_iteration": 2.5407497882843018 + }, + { + "auxiliary_loss_clip": 0.01089279, + "auxiliary_loss_mlp": 0.01031926, + "balance_loss_clip": 1.03346372, + "balance_loss_mlp": 1.01941442, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 1.8292620808658695, + "language_loss": 0.64460361, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.66581559, + "num_input_tokens_seen": 318297560, + "step": 14763, + "time_per_iteration": 2.5466930866241455 + }, + { + "auxiliary_loss_clip": 0.01076362, + "auxiliary_loss_mlp": 0.00783565, + "balance_loss_clip": 1.03450727, + "balance_loss_mlp": 1.00765848, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 2.10215323663168, + "language_loss": 0.71066827, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.72926748, + "num_input_tokens_seen": 318313060, + "step": 14764, + "time_per_iteration": 3.8846302032470703 + }, + { + "auxiliary_loss_clip": 0.01107413, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.03543496, + "balance_loss_mlp": 1.02285266, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 2.121813034613621, + "language_loss": 0.65924615, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68066895, + "num_input_tokens_seen": 318332030, + "step": 14765, + "time_per_iteration": 2.45858097076416 + }, + { + "auxiliary_loss_clip": 0.01063526, + "auxiliary_loss_mlp": 0.01027082, + "balance_loss_clip": 1.03469133, + "balance_loss_mlp": 1.01621604, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 1.5766073722033274, + "language_loss": 0.76549357, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.7863996, + "num_input_tokens_seen": 318351090, + "step": 14766, + "time_per_iteration": 2.605849504470825 + }, + { + "auxiliary_loss_clip": 0.01072024, + "auxiliary_loss_mlp": 0.01029606, + "balance_loss_clip": 1.03188753, + "balance_loss_mlp": 1.01804912, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 2.8834814202177954, + "language_loss": 0.72847444, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.74949074, + "num_input_tokens_seen": 318372000, + "step": 14767, + "time_per_iteration": 2.5258867740631104 + }, + { + "auxiliary_loss_clip": 0.01099604, + "auxiliary_loss_mlp": 0.01027655, + "balance_loss_clip": 1.03418326, + "balance_loss_mlp": 1.01657486, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 1.7783967157743072, + "language_loss": 0.70945758, + "learning_rate": 1.303129987538778e-07, + "loss": 0.73073018, + "num_input_tokens_seen": 318391530, + "step": 14768, + "time_per_iteration": 2.500744581222534 + }, + { + "auxiliary_loss_clip": 0.01090018, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.03513062, + "balance_loss_mlp": 1.02019429, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 1.9982355560255454, + "language_loss": 0.70016217, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.7213794, + "num_input_tokens_seen": 318410690, + "step": 14769, + "time_per_iteration": 2.503359317779541 + }, + { + "auxiliary_loss_clip": 0.01077587, + "auxiliary_loss_mlp": 0.0102958, + "balance_loss_clip": 1.03518736, + "balance_loss_mlp": 1.0182972, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 2.247767717259084, + "language_loss": 0.66987288, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69094455, + "num_input_tokens_seen": 318427380, + "step": 14770, + "time_per_iteration": 2.514626979827881 + }, + { + "auxiliary_loss_clip": 0.01088453, + "auxiliary_loss_mlp": 0.01033329, + "balance_loss_clip": 1.03415215, + "balance_loss_mlp": 1.02109182, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 1.7413124356404244, + "language_loss": 0.65281296, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.67403078, + "num_input_tokens_seen": 318448530, + "step": 14771, + "time_per_iteration": 2.5058789253234863 + }, + { + "auxiliary_loss_clip": 0.0108045, + "auxiliary_loss_mlp": 0.01025, + "balance_loss_clip": 1.03417587, + "balance_loss_mlp": 1.01354432, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 1.4387291899530263, + "language_loss": 0.82478821, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84584278, + "num_input_tokens_seen": 318468655, + "step": 14772, + "time_per_iteration": 2.5801050662994385 + }, + { + "auxiliary_loss_clip": 0.01076356, + "auxiliary_loss_mlp": 0.01024441, + "balance_loss_clip": 1.03207076, + "balance_loss_mlp": 1.01362324, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 1.9418214720670446, + "language_loss": 0.76508504, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78609306, + "num_input_tokens_seen": 318488740, + "step": 14773, + "time_per_iteration": 2.5675265789031982 + }, + { + "auxiliary_loss_clip": 0.01079979, + "auxiliary_loss_mlp": 0.01025352, + "balance_loss_clip": 1.03463447, + "balance_loss_mlp": 1.01413405, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 1.6787941191927467, + "language_loss": 0.74947166, + "learning_rate": 1.294845814469907e-07, + "loss": 0.77052498, + "num_input_tokens_seen": 318508810, + "step": 14774, + "time_per_iteration": 2.569411039352417 + }, + { + "auxiliary_loss_clip": 0.01057911, + "auxiliary_loss_mlp": 0.00782558, + "balance_loss_clip": 1.03535175, + "balance_loss_mlp": 1.00685859, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 2.2262384520349676, + "language_loss": 0.71985286, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.73825753, + "num_input_tokens_seen": 318526860, + "step": 14775, + "time_per_iteration": 2.556896209716797 + }, + { + "auxiliary_loss_clip": 0.01101961, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.03405714, + "balance_loss_mlp": 1.01747251, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 1.536238641645253, + "language_loss": 0.80110335, + "learning_rate": 1.292090097299432e-07, + "loss": 0.8224113, + "num_input_tokens_seen": 318545180, + "step": 14776, + "time_per_iteration": 2.433401584625244 + }, + { + "auxiliary_loss_clip": 0.01098021, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.03468013, + "balance_loss_mlp": 1.01923943, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 2.306179104985734, + "language_loss": 0.69289291, + "learning_rate": 1.290713302796802e-07, + "loss": 0.71418941, + "num_input_tokens_seen": 318564350, + "step": 14777, + "time_per_iteration": 2.5101089477539062 + }, + { + "auxiliary_loss_clip": 0.01087489, + "auxiliary_loss_mlp": 0.01035777, + "balance_loss_clip": 1.03151846, + "balance_loss_mlp": 1.02376699, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 2.150791245760462, + "language_loss": 0.70917165, + "learning_rate": 1.2893372177522e-07, + "loss": 0.73040426, + "num_input_tokens_seen": 318582275, + "step": 14778, + "time_per_iteration": 2.4723336696624756 + }, + { + "auxiliary_loss_clip": 0.0110359, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.03485346, + "balance_loss_mlp": 1.01846266, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 1.6362460193717518, + "language_loss": 0.77413034, + "learning_rate": 1.287961842217804e-07, + "loss": 0.79546136, + "num_input_tokens_seen": 318601230, + "step": 14779, + "time_per_iteration": 2.428987741470337 + }, + { + "auxiliary_loss_clip": 0.01008808, + "auxiliary_loss_mlp": 0.01003652, + "balance_loss_clip": 1.01062799, + "balance_loss_mlp": 1.00254309, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.8719873376003487, + "language_loss": 0.56821805, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58834267, + "num_input_tokens_seen": 318645595, + "step": 14780, + "time_per_iteration": 2.9266011714935303 + }, + { + "auxiliary_loss_clip": 0.01027864, + "auxiliary_loss_mlp": 0.01001232, + "balance_loss_clip": 1.00507498, + "balance_loss_mlp": 1.00023103, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.7916369161470943, + "language_loss": 0.62447226, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64476323, + "num_input_tokens_seen": 318707850, + "step": 14781, + "time_per_iteration": 3.1335248947143555 + }, + { + "auxiliary_loss_clip": 0.00963276, + "auxiliary_loss_mlp": 0.01001523, + "balance_loss_clip": 1.01312089, + "balance_loss_mlp": 1.00039089, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.7966364611463298, + "language_loss": 0.58177191, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60141987, + "num_input_tokens_seen": 318764915, + "step": 14782, + "time_per_iteration": 3.4004065990448 + }, + { + "auxiliary_loss_clip": 0.0110225, + "auxiliary_loss_mlp": 0.01028507, + "balance_loss_clip": 1.03579354, + "balance_loss_mlp": 1.01746237, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 1.6360913313098524, + "language_loss": 0.65877289, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.68008041, + "num_input_tokens_seen": 318785660, + "step": 14783, + "time_per_iteration": 2.7113358974456787 + }, + { + "auxiliary_loss_clip": 0.01104368, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.03363192, + "balance_loss_mlp": 1.01822233, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 1.5746138761182862, + "language_loss": 0.77722031, + "learning_rate": 1.281095609023415e-07, + "loss": 0.79857194, + "num_input_tokens_seen": 318806080, + "step": 14784, + "time_per_iteration": 2.4481658935546875 + }, + { + "auxiliary_loss_clip": 0.0107759, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.03361952, + "balance_loss_mlp": 1.02073908, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 2.126451106868472, + "language_loss": 0.60150492, + "learning_rate": 1.279724491644565e-07, + "loss": 0.62261117, + "num_input_tokens_seen": 318826445, + "step": 14785, + "time_per_iteration": 2.5773379802703857 + }, + { + "auxiliary_loss_clip": 0.01070565, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.03501058, + "balance_loss_mlp": 1.01762974, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 1.7443448220577387, + "language_loss": 0.65019405, + "learning_rate": 1.278354084140445e-07, + "loss": 0.67119813, + "num_input_tokens_seen": 318843915, + "step": 14786, + "time_per_iteration": 2.5079758167266846 + }, + { + "auxiliary_loss_clip": 0.01072641, + "auxiliary_loss_mlp": 0.00784171, + "balance_loss_clip": 1.03492987, + "balance_loss_mlp": 1.00780559, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 2.165595730300719, + "language_loss": 0.85514522, + "learning_rate": 1.276984386563009e-07, + "loss": 0.87371331, + "num_input_tokens_seen": 318859670, + "step": 14787, + "time_per_iteration": 2.5575172901153564 + }, + { + "auxiliary_loss_clip": 0.01078191, + "auxiliary_loss_mlp": 0.01027707, + "balance_loss_clip": 1.03557718, + "balance_loss_mlp": 1.01578569, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 2.501196251835476, + "language_loss": 0.70919478, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.73025376, + "num_input_tokens_seen": 318877855, + "step": 14788, + "time_per_iteration": 2.5095324516296387 + }, + { + "auxiliary_loss_clip": 0.01101192, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.03529084, + "balance_loss_mlp": 1.01784229, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 1.6118745288161302, + "language_loss": 0.70277691, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72408056, + "num_input_tokens_seen": 318896045, + "step": 14789, + "time_per_iteration": 2.4720683097839355 + }, + { + "auxiliary_loss_clip": 0.01094914, + "auxiliary_loss_mlp": 0.01025508, + "balance_loss_clip": 1.03599107, + "balance_loss_mlp": 1.01346159, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 1.6351552431997953, + "language_loss": 0.70204329, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72324753, + "num_input_tokens_seen": 318915515, + "step": 14790, + "time_per_iteration": 2.4887616634368896 + }, + { + "auxiliary_loss_clip": 0.01082054, + "auxiliary_loss_mlp": 0.01027027, + "balance_loss_clip": 1.0345062, + "balance_loss_mlp": 1.01565421, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 1.7059943107379352, + "language_loss": 0.72621655, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.74730736, + "num_input_tokens_seen": 318934305, + "step": 14791, + "time_per_iteration": 2.5478217601776123 + }, + { + "auxiliary_loss_clip": 0.01070836, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.03348541, + "balance_loss_mlp": 1.02060747, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 1.4587296950427202, + "language_loss": 0.7415024, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.76253653, + "num_input_tokens_seen": 318953880, + "step": 14792, + "time_per_iteration": 2.5476179122924805 + }, + { + "auxiliary_loss_clip": 0.01035638, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.03098845, + "balance_loss_mlp": 1.02042818, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 1.8456321843614887, + "language_loss": 0.66111988, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.68181145, + "num_input_tokens_seen": 318971395, + "step": 14793, + "time_per_iteration": 4.343828916549683 + }, + { + "auxiliary_loss_clip": 0.01075472, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.03482568, + "balance_loss_mlp": 1.01687682, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 1.642017553759774, + "language_loss": 0.71513343, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.73618484, + "num_input_tokens_seen": 318990580, + "step": 14794, + "time_per_iteration": 4.152606964111328 + }, + { + "auxiliary_loss_clip": 0.01093972, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.03665054, + "balance_loss_mlp": 1.01910436, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 1.5902845514723203, + "language_loss": 0.75336468, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77462566, + "num_input_tokens_seen": 319010040, + "step": 14795, + "time_per_iteration": 2.479736328125 + }, + { + "auxiliary_loss_clip": 0.01002272, + "auxiliary_loss_mlp": 0.00999726, + "balance_loss_clip": 1.00910187, + "balance_loss_mlp": 0.99849814, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.7717996439197335, + "language_loss": 0.56109905, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58111894, + "num_input_tokens_seen": 319063860, + "step": 14796, + "time_per_iteration": 3.0396525859832764 + }, + { + "auxiliary_loss_clip": 0.01106425, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.03614402, + "balance_loss_mlp": 1.01885295, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 1.8855348564433667, + "language_loss": 0.70507228, + "learning_rate": 1.263326468169843e-07, + "loss": 0.72646523, + "num_input_tokens_seen": 319082335, + "step": 14797, + "time_per_iteration": 3.870093584060669 + }, + { + "auxiliary_loss_clip": 0.01016569, + "auxiliary_loss_mlp": 0.01005536, + "balance_loss_clip": 1.01219368, + "balance_loss_mlp": 1.00450468, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 2.0555559270329384, + "language_loss": 0.58038336, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60060441, + "num_input_tokens_seen": 319147075, + "step": 14798, + "time_per_iteration": 3.1666107177734375 + }, + { + "auxiliary_loss_clip": 0.01092677, + "auxiliary_loss_mlp": 0.01026627, + "balance_loss_clip": 1.0339886, + "balance_loss_mlp": 1.01448512, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 1.50829272113656, + "language_loss": 0.7957083, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.81690139, + "num_input_tokens_seen": 319166630, + "step": 14799, + "time_per_iteration": 2.469029426574707 + }, + { + "auxiliary_loss_clip": 0.01018846, + "auxiliary_loss_mlp": 0.00999343, + "balance_loss_clip": 1.00569963, + "balance_loss_mlp": 0.99832362, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.8805503724292374, + "language_loss": 0.58097434, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60115623, + "num_input_tokens_seen": 319221865, + "step": 14800, + "time_per_iteration": 3.023193120956421 + }, + { + "auxiliary_loss_clip": 0.01093032, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.03571105, + "balance_loss_mlp": 1.01762688, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 1.4722163058480664, + "language_loss": 0.66260612, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.6838243, + "num_input_tokens_seen": 319240710, + "step": 14801, + "time_per_iteration": 2.4837498664855957 + }, + { + "auxiliary_loss_clip": 0.01077825, + "auxiliary_loss_mlp": 0.01035108, + "balance_loss_clip": 1.03658533, + "balance_loss_mlp": 1.02154231, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 13.143894500658542, + "language_loss": 0.75125837, + "learning_rate": 1.256524149358682e-07, + "loss": 0.77238768, + "num_input_tokens_seen": 319256495, + "step": 14802, + "time_per_iteration": 2.514037847518921 + }, + { + "auxiliary_loss_clip": 0.01086707, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.03476036, + "balance_loss_mlp": 1.02001595, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 1.8427096132597989, + "language_loss": 0.73094642, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.75212669, + "num_input_tokens_seen": 319273620, + "step": 14803, + "time_per_iteration": 3.8748252391815186 + }, + { + "auxiliary_loss_clip": 0.01082782, + "auxiliary_loss_mlp": 0.0103305, + "balance_loss_clip": 1.03505158, + "balance_loss_mlp": 1.02118921, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 1.821593265169082, + "language_loss": 0.72203696, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.7431953, + "num_input_tokens_seen": 319291720, + "step": 14804, + "time_per_iteration": 2.5248665809631348 + }, + { + "auxiliary_loss_clip": 0.01089002, + "auxiliary_loss_mlp": 0.01032815, + "balance_loss_clip": 1.03425944, + "balance_loss_mlp": 1.02065003, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 1.762159693306876, + "language_loss": 0.81022114, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83143938, + "num_input_tokens_seen": 319310380, + "step": 14805, + "time_per_iteration": 2.4990882873535156 + }, + { + "auxiliary_loss_clip": 0.01094921, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.03525376, + "balance_loss_mlp": 1.01700234, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 2.0060838836161037, + "language_loss": 0.6752671, + "learning_rate": 1.251095087580505e-07, + "loss": 0.69651121, + "num_input_tokens_seen": 319331765, + "step": 14806, + "time_per_iteration": 2.556997776031494 + }, + { + "auxiliary_loss_clip": 0.01080562, + "auxiliary_loss_mlp": 0.0103268, + "balance_loss_clip": 1.03347707, + "balance_loss_mlp": 1.02044868, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 1.9131407424449522, + "language_loss": 0.67100191, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.69213432, + "num_input_tokens_seen": 319349135, + "step": 14807, + "time_per_iteration": 2.490407705307007 + }, + { + "auxiliary_loss_clip": 0.0107875, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.03333783, + "balance_loss_mlp": 1.01767778, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 1.7911575303453495, + "language_loss": 0.75134987, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77242684, + "num_input_tokens_seen": 319368410, + "step": 14808, + "time_per_iteration": 2.531984329223633 + }, + { + "auxiliary_loss_clip": 0.01071015, + "auxiliary_loss_mlp": 0.0102974, + "balance_loss_clip": 1.03402948, + "balance_loss_mlp": 1.01822484, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 1.803196553119202, + "language_loss": 0.81342244, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83442998, + "num_input_tokens_seen": 319387535, + "step": 14809, + "time_per_iteration": 2.535243511199951 + }, + { + "auxiliary_loss_clip": 0.01093863, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.03502798, + "balance_loss_mlp": 1.01547766, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 1.8322106166513674, + "language_loss": 0.68327099, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.70448047, + "num_input_tokens_seen": 319407210, + "step": 14810, + "time_per_iteration": 2.5826032161712646 + }, + { + "auxiliary_loss_clip": 0.01070197, + "auxiliary_loss_mlp": 0.01028299, + "balance_loss_clip": 1.0324614, + "balance_loss_mlp": 1.01540041, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 2.0638228425460916, + "language_loss": 0.7016722, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.72265714, + "num_input_tokens_seen": 319425340, + "step": 14811, + "time_per_iteration": 2.5280051231384277 + }, + { + "auxiliary_loss_clip": 0.01071359, + "auxiliary_loss_mlp": 0.00783259, + "balance_loss_clip": 1.03418541, + "balance_loss_mlp": 1.00868833, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 1.983922041570675, + "language_loss": 0.65255433, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67110056, + "num_input_tokens_seen": 319448150, + "step": 14812, + "time_per_iteration": 2.801257610321045 + }, + { + "auxiliary_loss_clip": 0.01056287, + "auxiliary_loss_mlp": 0.01029348, + "balance_loss_clip": 1.0345844, + "balance_loss_mlp": 1.01802874, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 1.7444793365529172, + "language_loss": 0.68781304, + "learning_rate": 1.24162160341861e-07, + "loss": 0.70866942, + "num_input_tokens_seen": 319466115, + "step": 14813, + "time_per_iteration": 2.5375816822052 + }, + { + "auxiliary_loss_clip": 0.01080457, + "auxiliary_loss_mlp": 0.01038862, + "balance_loss_clip": 1.03207254, + "balance_loss_mlp": 1.02266169, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 1.726444062831347, + "language_loss": 0.75219786, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77339107, + "num_input_tokens_seen": 319485255, + "step": 14814, + "time_per_iteration": 2.50970196723938 + }, + { + "auxiliary_loss_clip": 0.01094823, + "auxiliary_loss_mlp": 0.01025946, + "balance_loss_clip": 1.03426194, + "balance_loss_mlp": 1.01357794, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 1.9579099469166052, + "language_loss": 0.74138284, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76259053, + "num_input_tokens_seen": 319501800, + "step": 14815, + "time_per_iteration": 2.456357002258301 + }, + { + "auxiliary_loss_clip": 0.01065298, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.03272557, + "balance_loss_mlp": 1.01843369, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 1.715445428657627, + "language_loss": 0.74947739, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77043843, + "num_input_tokens_seen": 319520415, + "step": 14816, + "time_per_iteration": 2.547678232192993 + }, + { + "auxiliary_loss_clip": 0.0108274, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.03338337, + "balance_loss_mlp": 1.01657009, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 1.7277091325077811, + "language_loss": 0.77749598, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.79860651, + "num_input_tokens_seen": 319538410, + "step": 14817, + "time_per_iteration": 2.534184694290161 + }, + { + "auxiliary_loss_clip": 0.01002516, + "auxiliary_loss_mlp": 0.01003476, + "balance_loss_clip": 1.00792098, + "balance_loss_mlp": 1.00229573, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.7506755006812162, + "language_loss": 0.56473804, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58479798, + "num_input_tokens_seen": 319602565, + "step": 14818, + "time_per_iteration": 3.1919074058532715 + }, + { + "auxiliary_loss_clip": 0.01055291, + "auxiliary_loss_mlp": 0.01032213, + "balance_loss_clip": 1.0367713, + "balance_loss_mlp": 1.02029812, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 1.7680304357569707, + "language_loss": 0.64618576, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66706079, + "num_input_tokens_seen": 319624645, + "step": 14819, + "time_per_iteration": 2.6521623134613037 + }, + { + "auxiliary_loss_clip": 0.01093104, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.03435123, + "balance_loss_mlp": 1.01943517, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 2.983978733856615, + "language_loss": 0.7871325, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80838716, + "num_input_tokens_seen": 319644040, + "step": 14820, + "time_per_iteration": 2.5172977447509766 + }, + { + "auxiliary_loss_clip": 0.01071152, + "auxiliary_loss_mlp": 0.00783284, + "balance_loss_clip": 1.03392363, + "balance_loss_mlp": 1.00963938, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 1.796542552394135, + "language_loss": 0.76725578, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78580022, + "num_input_tokens_seen": 319663930, + "step": 14821, + "time_per_iteration": 2.5661685466766357 + }, + { + "auxiliary_loss_clip": 0.01018598, + "auxiliary_loss_mlp": 0.00762632, + "balance_loss_clip": 1.00964868, + "balance_loss_mlp": 1.00024307, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 0.7951907419902979, + "language_loss": 0.59265792, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61047018, + "num_input_tokens_seen": 319721245, + "step": 14822, + "time_per_iteration": 2.9911487102508545 + }, + { + "auxiliary_loss_clip": 0.01086627, + "auxiliary_loss_mlp": 0.01030192, + "balance_loss_clip": 1.03359771, + "balance_loss_mlp": 1.018116, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 1.7370656772097506, + "language_loss": 0.69278973, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.71395791, + "num_input_tokens_seen": 319741200, + "step": 14823, + "time_per_iteration": 2.5412843227386475 + }, + { + "auxiliary_loss_clip": 0.01084445, + "auxiliary_loss_mlp": 0.01030732, + "balance_loss_clip": 1.03174222, + "balance_loss_mlp": 1.01826835, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 1.667301244128644, + "language_loss": 0.69171327, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.712865, + "num_input_tokens_seen": 319759265, + "step": 14824, + "time_per_iteration": 2.4994192123413086 + }, + { + "auxiliary_loss_clip": 0.0105467, + "auxiliary_loss_mlp": 0.01040823, + "balance_loss_clip": 1.03438485, + "balance_loss_mlp": 1.02657104, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 1.7268978086662, + "language_loss": 0.70666462, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72761953, + "num_input_tokens_seen": 319777560, + "step": 14825, + "time_per_iteration": 2.6672799587249756 + }, + { + "auxiliary_loss_clip": 0.01080936, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.03293896, + "balance_loss_mlp": 1.01839721, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 2.1196632409506795, + "language_loss": 0.70680189, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.72792256, + "num_input_tokens_seen": 319794125, + "step": 14826, + "time_per_iteration": 2.5015711784362793 + }, + { + "auxiliary_loss_clip": 0.0109254, + "auxiliary_loss_mlp": 0.01023551, + "balance_loss_clip": 1.03604841, + "balance_loss_mlp": 1.01235127, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 2.1277681917413473, + "language_loss": 0.74603981, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.76720077, + "num_input_tokens_seen": 319810310, + "step": 14827, + "time_per_iteration": 2.525562047958374 + }, + { + "auxiliary_loss_clip": 0.01095199, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.03514242, + "balance_loss_mlp": 1.01782823, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 1.6888287470090129, + "language_loss": 0.78134853, + "learning_rate": 1.221438670423336e-07, + "loss": 0.8026005, + "num_input_tokens_seen": 319828505, + "step": 14828, + "time_per_iteration": 2.5199453830718994 + }, + { + "auxiliary_loss_clip": 0.01070947, + "auxiliary_loss_mlp": 0.01030759, + "balance_loss_clip": 1.03514731, + "balance_loss_mlp": 1.01863551, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 1.63639096198059, + "language_loss": 0.75383073, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77484781, + "num_input_tokens_seen": 319848680, + "step": 14829, + "time_per_iteration": 2.593977928161621 + }, + { + "auxiliary_loss_clip": 0.01103455, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.03417838, + "balance_loss_mlp": 1.02326345, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 1.5534190104338388, + "language_loss": 0.84244943, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.86382854, + "num_input_tokens_seen": 319868835, + "step": 14830, + "time_per_iteration": 2.5071849822998047 + }, + { + "auxiliary_loss_clip": 0.0108883, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.03409457, + "balance_loss_mlp": 1.01748729, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 1.3563058533125143, + "language_loss": 0.75068378, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.77185506, + "num_input_tokens_seen": 319891585, + "step": 14831, + "time_per_iteration": 3.9934234619140625 + }, + { + "auxiliary_loss_clip": 0.01095038, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.03379619, + "balance_loss_mlp": 1.0163765, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 1.9951289776671632, + "language_loss": 0.72890419, + "learning_rate": 1.216083607088847e-07, + "loss": 0.7501421, + "num_input_tokens_seen": 319910315, + "step": 14832, + "time_per_iteration": 3.893512487411499 + }, + { + "auxiliary_loss_clip": 0.01049075, + "auxiliary_loss_mlp": 0.00781434, + "balance_loss_clip": 1.03263867, + "balance_loss_mlp": 1.00638616, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 1.7330483966043138, + "language_loss": 0.67078936, + "learning_rate": 1.214746621848355e-07, + "loss": 0.68909442, + "num_input_tokens_seen": 319932275, + "step": 14833, + "time_per_iteration": 2.6621882915496826 + }, + { + "auxiliary_loss_clip": 0.01099627, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.03659379, + "balance_loss_mlp": 1.0197202, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 2.901661324015054, + "language_loss": 0.73560345, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.75692868, + "num_input_tokens_seen": 319955335, + "step": 14834, + "time_per_iteration": 2.605165958404541 + }, + { + "auxiliary_loss_clip": 0.01067344, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.03352952, + "balance_loss_mlp": 1.020684, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 2.044618747781443, + "language_loss": 0.78971827, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.81072032, + "num_input_tokens_seen": 319973990, + "step": 14835, + "time_per_iteration": 3.984410524368286 + }, + { + "auxiliary_loss_clip": 0.01098856, + "auxiliary_loss_mlp": 0.01028436, + "balance_loss_clip": 1.03276169, + "balance_loss_mlp": 1.01736784, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 1.4952576564187399, + "language_loss": 0.73750126, + "learning_rate": 1.210739940361689e-07, + "loss": 0.75877422, + "num_input_tokens_seen": 319995555, + "step": 14836, + "time_per_iteration": 2.5052273273468018 + }, + { + "auxiliary_loss_clip": 0.01082194, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.03291345, + "balance_loss_mlp": 1.01902103, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 3.3135604027457166, + "language_loss": 0.68894231, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.71007311, + "num_input_tokens_seen": 320012385, + "step": 14837, + "time_per_iteration": 2.487098217010498 + }, + { + "auxiliary_loss_clip": 0.0105612, + "auxiliary_loss_mlp": 0.01031705, + "balance_loss_clip": 1.03514636, + "balance_loss_mlp": 1.01869941, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 1.9180944781174813, + "language_loss": 0.67527866, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.69615692, + "num_input_tokens_seen": 320032390, + "step": 14838, + "time_per_iteration": 2.5784826278686523 + }, + { + "auxiliary_loss_clip": 0.01092098, + "auxiliary_loss_mlp": 0.01031229, + "balance_loss_clip": 1.03270912, + "balance_loss_mlp": 1.01799059, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 2.09505321382696, + "language_loss": 0.76307875, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78431201, + "num_input_tokens_seen": 320052885, + "step": 14839, + "time_per_iteration": 2.485930919647217 + }, + { + "auxiliary_loss_clip": 0.01001056, + "auxiliary_loss_mlp": 0.00761769, + "balance_loss_clip": 1.00795984, + "balance_loss_mlp": 1.00041723, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.6989223465462696, + "language_loss": 0.49376684, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51139516, + "num_input_tokens_seen": 320113685, + "step": 14840, + "time_per_iteration": 3.1210856437683105 + }, + { + "auxiliary_loss_clip": 0.01108701, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.03476715, + "balance_loss_mlp": 1.0192126, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 2.558231128969223, + "language_loss": 0.6379419, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.65935487, + "num_input_tokens_seen": 320130810, + "step": 14841, + "time_per_iteration": 2.4866116046905518 + }, + { + "auxiliary_loss_clip": 0.01069059, + "auxiliary_loss_mlp": 0.00782464, + "balance_loss_clip": 1.03446352, + "balance_loss_mlp": 1.01076293, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 1.3989295485906519, + "language_loss": 0.6815697, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.70008492, + "num_input_tokens_seen": 320152170, + "step": 14842, + "time_per_iteration": 3.9559473991394043 + }, + { + "auxiliary_loss_clip": 0.0110259, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.03556252, + "balance_loss_mlp": 1.02068448, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 2.01498606779671, + "language_loss": 0.79842663, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.81977248, + "num_input_tokens_seen": 320172360, + "step": 14843, + "time_per_iteration": 2.481513023376465 + }, + { + "auxiliary_loss_clip": 0.01080374, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.03354299, + "balance_loss_mlp": 1.02257967, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 1.7121616816048129, + "language_loss": 0.6806978, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.70185226, + "num_input_tokens_seen": 320192130, + "step": 14844, + "time_per_iteration": 2.544724464416504 + }, + { + "auxiliary_loss_clip": 0.01059353, + "auxiliary_loss_mlp": 0.01033595, + "balance_loss_clip": 1.03332937, + "balance_loss_mlp": 1.02171564, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 1.984629803593817, + "language_loss": 0.91460389, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.9355334, + "num_input_tokens_seen": 320207760, + "step": 14845, + "time_per_iteration": 2.536334991455078 + }, + { + "auxiliary_loss_clip": 0.0108508, + "auxiliary_loss_mlp": 0.0102826, + "balance_loss_clip": 1.03306246, + "balance_loss_mlp": 1.01652968, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 1.9060547500087672, + "language_loss": 0.72370416, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.74483758, + "num_input_tokens_seen": 320225325, + "step": 14846, + "time_per_iteration": 2.5018866062164307 + }, + { + "auxiliary_loss_clip": 0.01075678, + "auxiliary_loss_mlp": 0.01034353, + "balance_loss_clip": 1.03419757, + "balance_loss_mlp": 1.02231872, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 1.6872753150711388, + "language_loss": 0.57176518, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.59286547, + "num_input_tokens_seen": 320247645, + "step": 14847, + "time_per_iteration": 2.7422313690185547 + }, + { + "auxiliary_loss_clip": 0.01073082, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.03418136, + "balance_loss_mlp": 1.0193572, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 1.889045887475548, + "language_loss": 0.76863843, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78967369, + "num_input_tokens_seen": 320266005, + "step": 14848, + "time_per_iteration": 2.5706076622009277 + }, + { + "auxiliary_loss_clip": 0.01040097, + "auxiliary_loss_mlp": 0.01042249, + "balance_loss_clip": 1.03178644, + "balance_loss_mlp": 1.02851057, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 1.7164142989798647, + "language_loss": 0.68733108, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.70815456, + "num_input_tokens_seen": 320285555, + "step": 14849, + "time_per_iteration": 2.688546895980835 + }, + { + "auxiliary_loss_clip": 0.01096332, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.03783417, + "balance_loss_mlp": 1.02172351, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 1.7257529116425367, + "language_loss": 0.8053782, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.82667291, + "num_input_tokens_seen": 320305395, + "step": 14850, + "time_per_iteration": 2.5390236377716064 + }, + { + "auxiliary_loss_clip": 0.01082991, + "auxiliary_loss_mlp": 0.01036406, + "balance_loss_clip": 1.03431749, + "balance_loss_mlp": 1.02451491, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 1.5519111866586504, + "language_loss": 0.74881804, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.77001202, + "num_input_tokens_seen": 320324220, + "step": 14851, + "time_per_iteration": 2.5222365856170654 + }, + { + "auxiliary_loss_clip": 0.01079661, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.03378797, + "balance_loss_mlp": 1.01798081, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 1.6703688292997558, + "language_loss": 0.78455478, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.80565423, + "num_input_tokens_seen": 320347195, + "step": 14852, + "time_per_iteration": 2.5879154205322266 + }, + { + "auxiliary_loss_clip": 0.01085743, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.03635454, + "balance_loss_mlp": 1.01793957, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 1.3746544291533502, + "language_loss": 0.69433975, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71549201, + "num_input_tokens_seen": 320366850, + "step": 14853, + "time_per_iteration": 2.4991390705108643 + }, + { + "auxiliary_loss_clip": 0.01057754, + "auxiliary_loss_mlp": 0.01031749, + "balance_loss_clip": 1.03648472, + "balance_loss_mlp": 1.01964307, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 1.8124723245572947, + "language_loss": 0.67398167, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69487667, + "num_input_tokens_seen": 320388895, + "step": 14854, + "time_per_iteration": 2.6998136043548584 + }, + { + "auxiliary_loss_clip": 0.01074376, + "auxiliary_loss_mlp": 0.01037634, + "balance_loss_clip": 1.03111184, + "balance_loss_mlp": 1.02526045, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 1.458941569900575, + "language_loss": 0.7493661, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.77048624, + "num_input_tokens_seen": 320408520, + "step": 14855, + "time_per_iteration": 2.5093588829040527 + }, + { + "auxiliary_loss_clip": 0.01080396, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.03362727, + "balance_loss_mlp": 1.02145255, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 1.936283504185197, + "language_loss": 0.64547491, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.66661072, + "num_input_tokens_seen": 320427400, + "step": 14856, + "time_per_iteration": 2.558302402496338 + }, + { + "auxiliary_loss_clip": 0.01102405, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.03430128, + "balance_loss_mlp": 1.01703298, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 1.6761835533049678, + "language_loss": 0.66657883, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68788755, + "num_input_tokens_seen": 320447570, + "step": 14857, + "time_per_iteration": 2.4735491275787354 + }, + { + "auxiliary_loss_clip": 0.0106041, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.03466344, + "balance_loss_mlp": 1.02222896, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 3.4618661539101416, + "language_loss": 0.75820887, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.77916026, + "num_input_tokens_seen": 320464405, + "step": 14858, + "time_per_iteration": 2.6381285190582275 + }, + { + "auxiliary_loss_clip": 0.01090864, + "auxiliary_loss_mlp": 0.01030287, + "balance_loss_clip": 1.03339136, + "balance_loss_mlp": 1.01793647, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 1.4752533386606812, + "language_loss": 0.69017041, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71138191, + "num_input_tokens_seen": 320485525, + "step": 14859, + "time_per_iteration": 2.526698589324951 + }, + { + "auxiliary_loss_clip": 0.01051402, + "auxiliary_loss_mlp": 0.01027025, + "balance_loss_clip": 1.03227973, + "balance_loss_mlp": 1.01581335, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 1.5023634035192732, + "language_loss": 0.7562325, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.77701676, + "num_input_tokens_seen": 320506725, + "step": 14860, + "time_per_iteration": 2.5832436084747314 + }, + { + "auxiliary_loss_clip": 0.01082608, + "auxiliary_loss_mlp": 0.0103235, + "balance_loss_clip": 1.03555751, + "balance_loss_mlp": 1.01893902, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 1.675799786382552, + "language_loss": 0.5753504, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.59649998, + "num_input_tokens_seen": 320525425, + "step": 14861, + "time_per_iteration": 2.5200815200805664 + }, + { + "auxiliary_loss_clip": 0.01072615, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.03154159, + "balance_loss_mlp": 1.01730347, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 1.7750318806667615, + "language_loss": 0.6301989, + "learning_rate": 1.176284122190685e-07, + "loss": 0.65121442, + "num_input_tokens_seen": 320543010, + "step": 14862, + "time_per_iteration": 2.5323781967163086 + }, + { + "auxiliary_loss_clip": 0.01089225, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.03266788, + "balance_loss_mlp": 1.01821625, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 1.728849940634581, + "language_loss": 0.78066665, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.80185771, + "num_input_tokens_seen": 320562180, + "step": 14863, + "time_per_iteration": 2.498753309249878 + }, + { + "auxiliary_loss_clip": 0.0107515, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.03164506, + "balance_loss_mlp": 1.02190685, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 2.746426770015608, + "language_loss": 0.70994467, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.73103541, + "num_input_tokens_seen": 320580395, + "step": 14864, + "time_per_iteration": 2.5100224018096924 + }, + { + "auxiliary_loss_clip": 0.01097325, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.03590667, + "balance_loss_mlp": 1.02341628, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 1.9027977616912422, + "language_loss": 0.75569171, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.77702427, + "num_input_tokens_seen": 320599505, + "step": 14865, + "time_per_iteration": 2.452277898788452 + }, + { + "auxiliary_loss_clip": 0.01063322, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.03276753, + "balance_loss_mlp": 1.02010965, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 1.6240393723412574, + "language_loss": 0.7187292, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.73967493, + "num_input_tokens_seen": 320619825, + "step": 14866, + "time_per_iteration": 2.6329727172851562 + }, + { + "auxiliary_loss_clip": 0.01094559, + "auxiliary_loss_mlp": 0.01030552, + "balance_loss_clip": 1.03526616, + "balance_loss_mlp": 1.01760602, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 1.6146363515488438, + "language_loss": 0.84189999, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.86315107, + "num_input_tokens_seen": 320638515, + "step": 14867, + "time_per_iteration": 2.5000522136688232 + }, + { + "auxiliary_loss_clip": 0.01091298, + "auxiliary_loss_mlp": 0.01026136, + "balance_loss_clip": 1.03365493, + "balance_loss_mlp": 1.0155859, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 1.7575772364006854, + "language_loss": 0.80623162, + "learning_rate": 1.168401272009567e-07, + "loss": 0.82740599, + "num_input_tokens_seen": 320659430, + "step": 14868, + "time_per_iteration": 2.542694091796875 + }, + { + "auxiliary_loss_clip": 0.01085431, + "auxiliary_loss_mlp": 0.01032583, + "balance_loss_clip": 1.0357635, + "balance_loss_mlp": 1.02065003, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 2.0494903338438837, + "language_loss": 0.77438748, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79556757, + "num_input_tokens_seen": 320679295, + "step": 14869, + "time_per_iteration": 2.564875602722168 + }, + { + "auxiliary_loss_clip": 0.01091952, + "auxiliary_loss_mlp": 0.00783204, + "balance_loss_clip": 1.03558803, + "balance_loss_mlp": 1.01139808, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 2.3760820938832006, + "language_loss": 0.65318108, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67193264, + "num_input_tokens_seen": 320697535, + "step": 14870, + "time_per_iteration": 3.8619303703308105 + }, + { + "auxiliary_loss_clip": 0.00994632, + "auxiliary_loss_mlp": 0.01017681, + "balance_loss_clip": 1.00991201, + "balance_loss_mlp": 1.01629257, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.7921533576190782, + "language_loss": 0.55951607, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.57963932, + "num_input_tokens_seen": 320758635, + "step": 14871, + "time_per_iteration": 4.599390506744385 + }, + { + "auxiliary_loss_clip": 0.01087089, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.03538799, + "balance_loss_mlp": 1.02077687, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 1.9545089171385395, + "language_loss": 0.76684487, + "learning_rate": 1.16316031981331e-07, + "loss": 0.78803515, + "num_input_tokens_seen": 320777175, + "step": 14872, + "time_per_iteration": 2.4769339561462402 + }, + { + "auxiliary_loss_clip": 0.01088594, + "auxiliary_loss_mlp": 0.01026053, + "balance_loss_clip": 1.03450215, + "balance_loss_mlp": 1.01517522, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 1.5420986092420024, + "language_loss": 0.6726414, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.69378787, + "num_input_tokens_seen": 320797670, + "step": 14873, + "time_per_iteration": 3.932002544403076 + }, + { + "auxiliary_loss_clip": 0.01101003, + "auxiliary_loss_mlp": 0.01033557, + "balance_loss_clip": 1.03478074, + "balance_loss_mlp": 1.02188027, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 1.5747460232468395, + "language_loss": 0.59560752, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.61695313, + "num_input_tokens_seen": 320817410, + "step": 14874, + "time_per_iteration": 2.469344139099121 + }, + { + "auxiliary_loss_clip": 0.01074261, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.03692389, + "balance_loss_mlp": 1.01636839, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 1.9742061437336478, + "language_loss": 0.75722289, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.77825344, + "num_input_tokens_seen": 320836745, + "step": 14875, + "time_per_iteration": 2.569679021835327 + }, + { + "auxiliary_loss_clip": 0.01073631, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.03467774, + "balance_loss_mlp": 1.0173769, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 3.272810959356292, + "language_loss": 0.77168202, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79272932, + "num_input_tokens_seen": 320853305, + "step": 14876, + "time_per_iteration": 2.564401149749756 + }, + { + "auxiliary_loss_clip": 0.01091658, + "auxiliary_loss_mlp": 0.01025427, + "balance_loss_clip": 1.03508711, + "balance_loss_mlp": 1.01473379, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 2.466172652412905, + "language_loss": 0.78542805, + "learning_rate": 1.156625201573287e-07, + "loss": 0.80659896, + "num_input_tokens_seen": 320872885, + "step": 14877, + "time_per_iteration": 2.4794716835021973 + }, + { + "auxiliary_loss_clip": 0.01056599, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.03301215, + "balance_loss_mlp": 1.02276278, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 2.7527099771481884, + "language_loss": 0.75371665, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77465016, + "num_input_tokens_seen": 320889755, + "step": 14878, + "time_per_iteration": 2.561741828918457 + }, + { + "auxiliary_loss_clip": 0.01088527, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.03282487, + "balance_loss_mlp": 1.01599383, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 1.5637058313232388, + "language_loss": 0.75500488, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.7761904, + "num_input_tokens_seen": 320907860, + "step": 14879, + "time_per_iteration": 2.4788007736206055 + }, + { + "auxiliary_loss_clip": 0.01073842, + "auxiliary_loss_mlp": 0.01031229, + "balance_loss_clip": 1.03611088, + "balance_loss_mlp": 1.01889658, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 1.8742163262391849, + "language_loss": 0.74288291, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.7639336, + "num_input_tokens_seen": 320925825, + "step": 14880, + "time_per_iteration": 3.9662015438079834 + }, + { + "auxiliary_loss_clip": 0.010874, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.03326786, + "balance_loss_mlp": 1.01794553, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 1.6338641725103074, + "language_loss": 0.82851315, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.84969604, + "num_input_tokens_seen": 320946165, + "step": 14881, + "time_per_iteration": 2.5516934394836426 + }, + { + "auxiliary_loss_clip": 0.01063454, + "auxiliary_loss_mlp": 0.00782329, + "balance_loss_clip": 1.03386724, + "balance_loss_mlp": 1.00796986, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 1.7129333419580968, + "language_loss": 0.67673606, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69519389, + "num_input_tokens_seen": 320969330, + "step": 14882, + "time_per_iteration": 2.6245100498199463 + }, + { + "auxiliary_loss_clip": 0.01081226, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.03311849, + "balance_loss_mlp": 1.02017713, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 2.156205644802734, + "language_loss": 0.75254667, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.7737031, + "num_input_tokens_seen": 320985055, + "step": 14883, + "time_per_iteration": 2.505072832107544 + }, + { + "auxiliary_loss_clip": 0.01077905, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.03374958, + "balance_loss_mlp": 1.02268553, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 1.5848443854668068, + "language_loss": 0.72291565, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74403834, + "num_input_tokens_seen": 321004720, + "step": 14884, + "time_per_iteration": 2.5580997467041016 + }, + { + "auxiliary_loss_clip": 0.01072305, + "auxiliary_loss_mlp": 0.01026911, + "balance_loss_clip": 1.0305723, + "balance_loss_mlp": 1.01550913, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 1.7026129818959745, + "language_loss": 0.75601816, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.77701032, + "num_input_tokens_seen": 321022350, + "step": 14885, + "time_per_iteration": 2.524350881576538 + }, + { + "auxiliary_loss_clip": 0.01075501, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.03342247, + "balance_loss_mlp": 1.01745999, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 1.901712751641727, + "language_loss": 0.81681651, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.83787179, + "num_input_tokens_seen": 321040450, + "step": 14886, + "time_per_iteration": 2.506679058074951 + }, + { + "auxiliary_loss_clip": 0.01043575, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.03392661, + "balance_loss_mlp": 1.01772916, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 2.164505619426018, + "language_loss": 0.63979816, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.66052651, + "num_input_tokens_seen": 321063970, + "step": 14887, + "time_per_iteration": 2.9183855056762695 + }, + { + "auxiliary_loss_clip": 0.0108385, + "auxiliary_loss_mlp": 0.01036163, + "balance_loss_clip": 1.03371668, + "balance_loss_mlp": 1.02417064, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 1.6342875615338222, + "language_loss": 0.60700512, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.6282053, + "num_input_tokens_seen": 321083840, + "step": 14888, + "time_per_iteration": 2.508223295211792 + }, + { + "auxiliary_loss_clip": 0.01104608, + "auxiliary_loss_mlp": 0.01026475, + "balance_loss_clip": 1.0345968, + "balance_loss_mlp": 1.01510274, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 2.1892152822891404, + "language_loss": 0.70059121, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.72190201, + "num_input_tokens_seen": 321104165, + "step": 14889, + "time_per_iteration": 2.514225482940674 + }, + { + "auxiliary_loss_clip": 0.01091018, + "auxiliary_loss_mlp": 0.00782889, + "balance_loss_clip": 1.03550076, + "balance_loss_mlp": 1.00715363, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 2.514500359700935, + "language_loss": 0.71577203, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.73451102, + "num_input_tokens_seen": 321117290, + "step": 14890, + "time_per_iteration": 2.4307191371917725 + }, + { + "auxiliary_loss_clip": 0.01020698, + "auxiliary_loss_mlp": 0.00785237, + "balance_loss_clip": 1.02995813, + "balance_loss_mlp": 1.00816631, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 1.5560712752273933, + "language_loss": 0.75710559, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.77516496, + "num_input_tokens_seen": 321137115, + "step": 14891, + "time_per_iteration": 2.858551263809204 + }, + { + "auxiliary_loss_clip": 0.01045893, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.03211904, + "balance_loss_mlp": 1.01981425, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 1.7846618635341365, + "language_loss": 0.76554042, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.78631699, + "num_input_tokens_seen": 321154490, + "step": 14892, + "time_per_iteration": 3.0761544704437256 + }, + { + "auxiliary_loss_clip": 0.01088456, + "auxiliary_loss_mlp": 0.01028481, + "balance_loss_clip": 1.03678679, + "balance_loss_mlp": 1.01704252, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 1.4420218133113842, + "language_loss": 0.81827474, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.83944416, + "num_input_tokens_seen": 321175625, + "step": 14893, + "time_per_iteration": 2.5501554012298584 + }, + { + "auxiliary_loss_clip": 0.01064913, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.03345382, + "balance_loss_mlp": 1.01879191, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 1.7022940111614189, + "language_loss": 0.74747407, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.7684288, + "num_input_tokens_seen": 321193895, + "step": 14894, + "time_per_iteration": 2.537607431411743 + }, + { + "auxiliary_loss_clip": 0.01092899, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.03778112, + "balance_loss_mlp": 1.02008569, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 1.8798652602460064, + "language_loss": 0.6634686, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.68471986, + "num_input_tokens_seen": 321211610, + "step": 14895, + "time_per_iteration": 2.453655958175659 + }, + { + "auxiliary_loss_clip": 0.01093571, + "auxiliary_loss_mlp": 0.010277, + "balance_loss_clip": 1.03477991, + "balance_loss_mlp": 1.01511121, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 1.516262571728655, + "language_loss": 0.67231232, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.69352502, + "num_input_tokens_seen": 321229805, + "step": 14896, + "time_per_iteration": 2.4464073181152344 + }, + { + "auxiliary_loss_clip": 0.01093815, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.03583741, + "balance_loss_mlp": 1.01858401, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 1.82499769966295, + "language_loss": 0.75978434, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.78102493, + "num_input_tokens_seen": 321247165, + "step": 14897, + "time_per_iteration": 2.4658544063568115 + }, + { + "auxiliary_loss_clip": 0.00995537, + "auxiliary_loss_mlp": 0.00765352, + "balance_loss_clip": 1.01072085, + "balance_loss_mlp": 1.00093174, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.7584300234045035, + "language_loss": 0.55354077, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57114965, + "num_input_tokens_seen": 321308425, + "step": 14898, + "time_per_iteration": 3.226452350616455 + }, + { + "auxiliary_loss_clip": 0.01104551, + "auxiliary_loss_mlp": 0.00782805, + "balance_loss_clip": 1.03543925, + "balance_loss_mlp": 1.00856161, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 1.6207109124741392, + "language_loss": 0.70129156, + "learning_rate": 1.12808298352008e-07, + "loss": 0.72016501, + "num_input_tokens_seen": 321329295, + "step": 14899, + "time_per_iteration": 2.4902055263519287 + }, + { + "auxiliary_loss_clip": 0.01050222, + "auxiliary_loss_mlp": 0.01036999, + "balance_loss_clip": 1.03672993, + "balance_loss_mlp": 1.02302742, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 1.6303206770393293, + "language_loss": 0.73855937, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.7594316, + "num_input_tokens_seen": 321347580, + "step": 14900, + "time_per_iteration": 2.6083016395568848 + }, + { + "auxiliary_loss_clip": 0.00998396, + "auxiliary_loss_mlp": 0.01000605, + "balance_loss_clip": 1.01396787, + "balance_loss_mlp": 0.99947804, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.7915035142093432, + "language_loss": 0.61805767, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.6380477, + "num_input_tokens_seen": 321407820, + "step": 14901, + "time_per_iteration": 3.171571731567383 + }, + { + "auxiliary_loss_clip": 0.01092202, + "auxiliary_loss_mlp": 0.01027978, + "balance_loss_clip": 1.03333306, + "balance_loss_mlp": 1.01638472, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 1.6659568878874844, + "language_loss": 0.70828998, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72949177, + "num_input_tokens_seen": 321426745, + "step": 14902, + "time_per_iteration": 2.543672800064087 + }, + { + "auxiliary_loss_clip": 0.01074814, + "auxiliary_loss_mlp": 0.01026438, + "balance_loss_clip": 1.03274465, + "balance_loss_mlp": 1.01546502, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 1.7219344244828183, + "language_loss": 0.78291976, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.80393225, + "num_input_tokens_seen": 321446165, + "step": 14903, + "time_per_iteration": 2.544766902923584 + }, + { + "auxiliary_loss_clip": 0.01080285, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.03468609, + "balance_loss_mlp": 1.01923037, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 1.7657251633702402, + "language_loss": 0.72845972, + "learning_rate": 1.121644401702877e-07, + "loss": 0.74958336, + "num_input_tokens_seen": 321465285, + "step": 14904, + "time_per_iteration": 2.541011095046997 + }, + { + "auxiliary_loss_clip": 0.01093459, + "auxiliary_loss_mlp": 0.0102508, + "balance_loss_clip": 1.03450251, + "balance_loss_mlp": 1.01184201, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 1.9840795432214966, + "language_loss": 0.751073, + "learning_rate": 1.12035883275166e-07, + "loss": 0.7722584, + "num_input_tokens_seen": 321483670, + "step": 14905, + "time_per_iteration": 2.4886460304260254 + }, + { + "auxiliary_loss_clip": 0.01091372, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.03393078, + "balance_loss_mlp": 1.01927137, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 1.6339552311754262, + "language_loss": 0.76265287, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78387952, + "num_input_tokens_seen": 321501190, + "step": 14906, + "time_per_iteration": 2.556986093521118 + }, + { + "auxiliary_loss_clip": 0.01093791, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.03634739, + "balance_loss_mlp": 1.01963842, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 1.5985911353572508, + "language_loss": 0.74397635, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76523232, + "num_input_tokens_seen": 321518540, + "step": 14907, + "time_per_iteration": 2.438049554824829 + }, + { + "auxiliary_loss_clip": 0.01093961, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.03586841, + "balance_loss_mlp": 1.02220416, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 1.631770494967735, + "language_loss": 0.82798779, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.84926593, + "num_input_tokens_seen": 321536555, + "step": 14908, + "time_per_iteration": 2.4582738876342773 + }, + { + "auxiliary_loss_clip": 0.01081062, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.03565669, + "balance_loss_mlp": 1.01654768, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 1.914796051122056, + "language_loss": 0.70565665, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72675979, + "num_input_tokens_seen": 321557655, + "step": 14909, + "time_per_iteration": 5.589207649230957 + }, + { + "auxiliary_loss_clip": 0.0106832, + "auxiliary_loss_mlp": 0.01033156, + "balance_loss_clip": 1.03434527, + "balance_loss_mlp": 1.02120519, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 1.8030729693883132, + "language_loss": 0.7251488, + "learning_rate": 1.113941727737877e-07, + "loss": 0.74616355, + "num_input_tokens_seen": 321576160, + "step": 14910, + "time_per_iteration": 2.5329909324645996 + }, + { + "auxiliary_loss_clip": 0.01089517, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.03300393, + "balance_loss_mlp": 1.01710773, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 2.033826570978709, + "language_loss": 0.63619328, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65737236, + "num_input_tokens_seen": 321596205, + "step": 14911, + "time_per_iteration": 3.89689040184021 + }, + { + "auxiliary_loss_clip": 0.01085069, + "auxiliary_loss_mlp": 0.00783252, + "balance_loss_clip": 1.03608847, + "balance_loss_mlp": 1.00849164, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 1.6563999525290118, + "language_loss": 0.75196517, + "learning_rate": 1.111379898520437e-07, + "loss": 0.77064836, + "num_input_tokens_seen": 321614800, + "step": 14912, + "time_per_iteration": 2.5052449703216553 + }, + { + "auxiliary_loss_clip": 0.01077159, + "auxiliary_loss_mlp": 0.01035689, + "balance_loss_clip": 1.03153586, + "balance_loss_mlp": 1.0228982, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 1.6996771890102855, + "language_loss": 0.8228026, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.84393108, + "num_input_tokens_seen": 321633445, + "step": 14913, + "time_per_iteration": 2.5224111080169678 + }, + { + "auxiliary_loss_clip": 0.01094705, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.03521538, + "balance_loss_mlp": 1.01931858, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 2.20976832711564, + "language_loss": 0.61292732, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.63419944, + "num_input_tokens_seen": 321650890, + "step": 14914, + "time_per_iteration": 2.463059663772583 + }, + { + "auxiliary_loss_clip": 0.01011036, + "auxiliary_loss_mlp": 0.00999116, + "balance_loss_clip": 1.01130724, + "balance_loss_mlp": 0.99789423, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 1.6962184132780993, + "language_loss": 0.55087483, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57097638, + "num_input_tokens_seen": 321710960, + "step": 14915, + "time_per_iteration": 3.1209776401519775 + }, + { + "auxiliary_loss_clip": 0.01064333, + "auxiliary_loss_mlp": 0.01026234, + "balance_loss_clip": 1.03251982, + "balance_loss_mlp": 1.01523733, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 1.4651704287147678, + "language_loss": 0.7154392, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.73634493, + "num_input_tokens_seen": 321733290, + "step": 14916, + "time_per_iteration": 2.613931655883789 + }, + { + "auxiliary_loss_clip": 0.01081589, + "auxiliary_loss_mlp": 0.01031557, + "balance_loss_clip": 1.03379321, + "balance_loss_mlp": 1.02026796, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 1.697507788081029, + "language_loss": 0.77776289, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.79889429, + "num_input_tokens_seen": 321753120, + "step": 14917, + "time_per_iteration": 2.537705421447754 + }, + { + "auxiliary_loss_clip": 0.0109831, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.03735709, + "balance_loss_mlp": 1.02565813, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 2.670073130790352, + "language_loss": 0.68283349, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70420045, + "num_input_tokens_seen": 321772840, + "step": 14918, + "time_per_iteration": 2.5576884746551514 + }, + { + "auxiliary_loss_clip": 0.0105432, + "auxiliary_loss_mlp": 0.01029037, + "balance_loss_clip": 1.03531253, + "balance_loss_mlp": 1.01813507, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 1.8800785229508608, + "language_loss": 0.83759397, + "learning_rate": 1.102436060943881e-07, + "loss": 0.85842752, + "num_input_tokens_seen": 321791020, + "step": 14919, + "time_per_iteration": 4.090195178985596 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.00784298, + "balance_loss_clip": 1.03444541, + "balance_loss_mlp": 1.01003861, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 2.5473216085534935, + "language_loss": 0.72403157, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.7429232, + "num_input_tokens_seen": 321810075, + "step": 14920, + "time_per_iteration": 2.4460973739624023 + }, + { + "auxiliary_loss_clip": 0.01090896, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.03350854, + "balance_loss_mlp": 1.01777375, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 2.30480315781291, + "language_loss": 0.90991235, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.9311285, + "num_input_tokens_seen": 321822635, + "step": 14921, + "time_per_iteration": 2.442185401916504 + }, + { + "auxiliary_loss_clip": 0.01046336, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.03136134, + "balance_loss_mlp": 1.02061391, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 1.9069253135195177, + "language_loss": 0.7356658, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.75647211, + "num_input_tokens_seen": 321841130, + "step": 14922, + "time_per_iteration": 2.5863051414489746 + }, + { + "auxiliary_loss_clip": 0.01056112, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.03213239, + "balance_loss_mlp": 1.02125692, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 1.906300080851892, + "language_loss": 0.70293438, + "learning_rate": 1.097341060694219e-07, + "loss": 0.72383988, + "num_input_tokens_seen": 321859855, + "step": 14923, + "time_per_iteration": 2.5943427085876465 + }, + { + "auxiliary_loss_clip": 0.01082272, + "auxiliary_loss_mlp": 0.01028885, + "balance_loss_clip": 1.03391576, + "balance_loss_mlp": 1.01573026, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 2.1491517896911616, + "language_loss": 0.70808691, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.72919852, + "num_input_tokens_seen": 321877990, + "step": 14924, + "time_per_iteration": 2.5203096866607666 + }, + { + "auxiliary_loss_clip": 0.0109004, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.03274882, + "balance_loss_mlp": 1.01889849, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 1.3663670941970738, + "language_loss": 0.72162521, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74282622, + "num_input_tokens_seen": 321898120, + "step": 14925, + "time_per_iteration": 2.537475824356079 + }, + { + "auxiliary_loss_clip": 0.01082877, + "auxiliary_loss_mlp": 0.00787082, + "balance_loss_clip": 1.03559148, + "balance_loss_mlp": 1.01054478, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 1.6139766999250178, + "language_loss": 0.82468092, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.84338051, + "num_input_tokens_seen": 321918140, + "step": 14926, + "time_per_iteration": 2.538933038711548 + }, + { + "auxiliary_loss_clip": 0.01059416, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.03274238, + "balance_loss_mlp": 1.02053785, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 1.4567631743618752, + "language_loss": 0.7926842, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81360418, + "num_input_tokens_seen": 321938580, + "step": 14927, + "time_per_iteration": 2.6261584758758545 + }, + { + "auxiliary_loss_clip": 0.01076432, + "auxiliary_loss_mlp": 0.0102765, + "balance_loss_clip": 1.03272343, + "balance_loss_mlp": 1.01626599, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 1.5910959763461041, + "language_loss": 0.66273493, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68377578, + "num_input_tokens_seen": 321961135, + "step": 14928, + "time_per_iteration": 2.6442508697509766 + }, + { + "auxiliary_loss_clip": 0.0108472, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.03516281, + "balance_loss_mlp": 1.02474284, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 3.1343671130295525, + "language_loss": 0.71413296, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.73537713, + "num_input_tokens_seen": 321980945, + "step": 14929, + "time_per_iteration": 2.5720274448394775 + }, + { + "auxiliary_loss_clip": 0.01083978, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.0360589, + "balance_loss_mlp": 1.01984477, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 1.8459890686610967, + "language_loss": 0.67959613, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.70074606, + "num_input_tokens_seen": 322000350, + "step": 14930, + "time_per_iteration": 2.509366035461426 + }, + { + "auxiliary_loss_clip": 0.0107366, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.0323112, + "balance_loss_mlp": 1.01927507, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 1.6305659523365066, + "language_loss": 0.75104696, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.77209604, + "num_input_tokens_seen": 322018980, + "step": 14931, + "time_per_iteration": 2.491755485534668 + }, + { + "auxiliary_loss_clip": 0.0109376, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.03647423, + "balance_loss_mlp": 1.01788878, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 1.5583191989167866, + "language_loss": 0.62759066, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.64881611, + "num_input_tokens_seen": 322037675, + "step": 14932, + "time_per_iteration": 2.4594523906707764 + }, + { + "auxiliary_loss_clip": 0.01088208, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.03449523, + "balance_loss_mlp": 1.01623428, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 1.771447357176084, + "language_loss": 0.71839607, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.73954564, + "num_input_tokens_seen": 322055130, + "step": 14933, + "time_per_iteration": 2.4924161434173584 + }, + { + "auxiliary_loss_clip": 0.01060476, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.03369033, + "balance_loss_mlp": 1.01836038, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 1.4388812692074606, + "language_loss": 0.74556267, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76648033, + "num_input_tokens_seen": 322074850, + "step": 14934, + "time_per_iteration": 2.5743865966796875 + }, + { + "auxiliary_loss_clip": 0.01061464, + "auxiliary_loss_mlp": 0.01037725, + "balance_loss_clip": 1.0319649, + "balance_loss_mlp": 1.02424216, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 1.7351300654218393, + "language_loss": 0.60785824, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.6288501, + "num_input_tokens_seen": 322093315, + "step": 14935, + "time_per_iteration": 2.5479352474212646 + }, + { + "auxiliary_loss_clip": 0.010646, + "auxiliary_loss_mlp": 0.0102592, + "balance_loss_clip": 1.03442073, + "balance_loss_mlp": 1.01407671, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 1.993737154480922, + "language_loss": 0.77016199, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.79106724, + "num_input_tokens_seen": 322112555, + "step": 14936, + "time_per_iteration": 2.560037851333618 + }, + { + "auxiliary_loss_clip": 0.01079445, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.03326845, + "balance_loss_mlp": 1.0208267, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 1.6688293397530796, + "language_loss": 0.7408216, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.76193976, + "num_input_tokens_seen": 322130440, + "step": 14937, + "time_per_iteration": 2.5327670574188232 + }, + { + "auxiliary_loss_clip": 0.01005227, + "auxiliary_loss_mlp": 0.01001828, + "balance_loss_clip": 1.01023281, + "balance_loss_mlp": 1.00080299, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.8539839562118778, + "language_loss": 0.63568944, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65575999, + "num_input_tokens_seen": 322187295, + "step": 14938, + "time_per_iteration": 3.0436127185821533 + }, + { + "auxiliary_loss_clip": 0.01080629, + "auxiliary_loss_mlp": 0.01027908, + "balance_loss_clip": 1.03483593, + "balance_loss_mlp": 1.01606417, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 1.923749728224475, + "language_loss": 0.80356073, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.82464617, + "num_input_tokens_seen": 322202965, + "step": 14939, + "time_per_iteration": 2.50113582611084 + }, + { + "auxiliary_loss_clip": 0.01001165, + "auxiliary_loss_mlp": 0.01001694, + "balance_loss_clip": 1.00956273, + "balance_loss_mlp": 1.00062704, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.719408221191893, + "language_loss": 0.52825207, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.54828066, + "num_input_tokens_seen": 322269490, + "step": 14940, + "time_per_iteration": 3.283578634262085 + }, + { + "auxiliary_loss_clip": 0.01102899, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.03382087, + "balance_loss_mlp": 1.01736557, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 1.9238929766138169, + "language_loss": 0.77889073, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.80022049, + "num_input_tokens_seen": 322288060, + "step": 14941, + "time_per_iteration": 2.482966899871826 + }, + { + "auxiliary_loss_clip": 0.01093032, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.0338279, + "balance_loss_mlp": 1.02291179, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 2.376185222073388, + "language_loss": 0.7362498, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75753748, + "num_input_tokens_seen": 322307930, + "step": 14942, + "time_per_iteration": 2.542780637741089 + }, + { + "auxiliary_loss_clip": 0.01083662, + "auxiliary_loss_mlp": 0.01036916, + "balance_loss_clip": 1.0326612, + "balance_loss_mlp": 1.02386832, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 2.6579819956239903, + "language_loss": 0.80111909, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.82232487, + "num_input_tokens_seen": 322326155, + "step": 14943, + "time_per_iteration": 2.4719245433807373 + }, + { + "auxiliary_loss_clip": 0.01082783, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.03602338, + "balance_loss_mlp": 1.02181923, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 1.429578109278118, + "language_loss": 0.71511465, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.73629463, + "num_input_tokens_seen": 322345850, + "step": 14944, + "time_per_iteration": 2.547766923904419 + }, + { + "auxiliary_loss_clip": 0.0106769, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.03421068, + "balance_loss_mlp": 1.02189946, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 2.252209843710455, + "language_loss": 0.75746202, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.77849156, + "num_input_tokens_seen": 322364715, + "step": 14945, + "time_per_iteration": 2.5218148231506348 + }, + { + "auxiliary_loss_clip": 0.01108496, + "auxiliary_loss_mlp": 0.0103243, + "balance_loss_clip": 1.03478122, + "balance_loss_mlp": 1.01974034, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 2.613970692297457, + "language_loss": 0.73869663, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.76010591, + "num_input_tokens_seen": 322383570, + "step": 14946, + "time_per_iteration": 2.451948881149292 + }, + { + "auxiliary_loss_clip": 0.01052908, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.03377545, + "balance_loss_mlp": 1.01642346, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 1.959780006292004, + "language_loss": 0.64632785, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66714823, + "num_input_tokens_seen": 322401375, + "step": 14947, + "time_per_iteration": 3.9442718029022217 + }, + { + "auxiliary_loss_clip": 0.0108139, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.03435838, + "balance_loss_mlp": 1.02533555, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 2.0174792806356625, + "language_loss": 0.69700742, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.71819282, + "num_input_tokens_seen": 322421890, + "step": 14948, + "time_per_iteration": 3.9402267932891846 + }, + { + "auxiliary_loss_clip": 0.01070242, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.03298712, + "balance_loss_mlp": 1.01718533, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 1.7803875514183096, + "language_loss": 0.74554932, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76654232, + "num_input_tokens_seen": 322445730, + "step": 14949, + "time_per_iteration": 2.733386516571045 + }, + { + "auxiliary_loss_clip": 0.0106401, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.03517151, + "balance_loss_mlp": 1.02204871, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 1.532324669199321, + "language_loss": 0.75772929, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.77872217, + "num_input_tokens_seen": 322464595, + "step": 14950, + "time_per_iteration": 4.13750147819519 + }, + { + "auxiliary_loss_clip": 0.01077406, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.03344655, + "balance_loss_mlp": 1.01837564, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 1.9651300637571811, + "language_loss": 0.66680968, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.68788606, + "num_input_tokens_seen": 322483305, + "step": 14951, + "time_per_iteration": 2.5109846591949463 + }, + { + "auxiliary_loss_clip": 0.01105313, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.03302169, + "balance_loss_mlp": 1.01719975, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 2.1543693946100233, + "language_loss": 0.73883271, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.7601704, + "num_input_tokens_seen": 322501905, + "step": 14952, + "time_per_iteration": 2.4638993740081787 + }, + { + "auxiliary_loss_clip": 0.01102806, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.03495514, + "balance_loss_mlp": 1.0206275, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 2.467943186680906, + "language_loss": 0.56707597, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.58842599, + "num_input_tokens_seen": 322518135, + "step": 14953, + "time_per_iteration": 2.42580246925354 + }, + { + "auxiliary_loss_clip": 0.0108041, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.03317332, + "balance_loss_mlp": 1.02278447, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 1.947596952155307, + "language_loss": 0.81806362, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.83921635, + "num_input_tokens_seen": 322537905, + "step": 14954, + "time_per_iteration": 2.5226314067840576 + }, + { + "auxiliary_loss_clip": 0.01102391, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.03538823, + "balance_loss_mlp": 1.01877201, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 1.9429487248765867, + "language_loss": 0.60099703, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.6223278, + "num_input_tokens_seen": 322557945, + "step": 14955, + "time_per_iteration": 2.497976541519165 + }, + { + "auxiliary_loss_clip": 0.01092847, + "auxiliary_loss_mlp": 0.01028641, + "balance_loss_clip": 1.03521633, + "balance_loss_mlp": 1.0172689, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 2.0073130222692623, + "language_loss": 0.55252624, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.5737412, + "num_input_tokens_seen": 322575765, + "step": 14956, + "time_per_iteration": 2.5001754760742188 + }, + { + "auxiliary_loss_clip": 0.01060876, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.03460693, + "balance_loss_mlp": 1.02014828, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 1.7205160464646085, + "language_loss": 0.79650295, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.81743479, + "num_input_tokens_seen": 322595665, + "step": 14957, + "time_per_iteration": 4.0569212436676025 + }, + { + "auxiliary_loss_clip": 0.0110795, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.03687179, + "balance_loss_mlp": 1.02122092, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 1.7390404049693857, + "language_loss": 0.78550589, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.80692232, + "num_input_tokens_seen": 322614755, + "step": 14958, + "time_per_iteration": 2.4653921127319336 + }, + { + "auxiliary_loss_clip": 0.01039157, + "auxiliary_loss_mlp": 0.01026579, + "balance_loss_clip": 1.03619671, + "balance_loss_mlp": 1.01530194, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 1.6275820209494452, + "language_loss": 0.74576288, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.76642025, + "num_input_tokens_seen": 322633425, + "step": 14959, + "time_per_iteration": 2.6748387813568115 + }, + { + "auxiliary_loss_clip": 0.01099155, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.0327847, + "balance_loss_mlp": 1.01766527, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 1.7907181167381108, + "language_loss": 0.68326211, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.70454842, + "num_input_tokens_seen": 322652065, + "step": 14960, + "time_per_iteration": 2.72328519821167 + }, + { + "auxiliary_loss_clip": 0.01080993, + "auxiliary_loss_mlp": 0.01029827, + "balance_loss_clip": 1.03572166, + "balance_loss_mlp": 1.0185318, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 1.6154286530425233, + "language_loss": 0.65652037, + "learning_rate": 1.049510991294591e-07, + "loss": 0.67762864, + "num_input_tokens_seen": 322673275, + "step": 14961, + "time_per_iteration": 2.5502817630767822 + }, + { + "auxiliary_loss_clip": 0.01079096, + "auxiliary_loss_mlp": 0.01031225, + "balance_loss_clip": 1.03273809, + "balance_loss_mlp": 1.01968563, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 1.6639687922594502, + "language_loss": 0.83223307, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85333622, + "num_input_tokens_seen": 322693375, + "step": 14962, + "time_per_iteration": 2.53401780128479 + }, + { + "auxiliary_loss_clip": 0.01085, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.03606164, + "balance_loss_mlp": 1.01802862, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 1.9906901612936543, + "language_loss": 0.76090288, + "learning_rate": 1.047022340612298e-07, + "loss": 0.78206253, + "num_input_tokens_seen": 322712615, + "step": 14963, + "time_per_iteration": 2.513127088546753 + }, + { + "auxiliary_loss_clip": 0.00991854, + "auxiliary_loss_mlp": 0.01000185, + "balance_loss_clip": 1.01617217, + "balance_loss_mlp": 0.9990471, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.7758841278985987, + "language_loss": 0.57562411, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59554446, + "num_input_tokens_seen": 322766855, + "step": 14964, + "time_per_iteration": 3.0742242336273193 + }, + { + "auxiliary_loss_clip": 0.01099216, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.03735971, + "balance_loss_mlp": 1.01949978, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 2.281564391586615, + "language_loss": 0.67691088, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.69822818, + "num_input_tokens_seen": 322781130, + "step": 14965, + "time_per_iteration": 2.488295078277588 + }, + { + "auxiliary_loss_clip": 0.01105229, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.03511882, + "balance_loss_mlp": 1.0214746, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 2.0757920433689376, + "language_loss": 0.72225636, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.74363923, + "num_input_tokens_seen": 322800310, + "step": 14966, + "time_per_iteration": 2.431565761566162 + }, + { + "auxiliary_loss_clip": 0.01072676, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.03459895, + "balance_loss_mlp": 1.01683688, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 1.6794697825729414, + "language_loss": 0.73042428, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.75145006, + "num_input_tokens_seen": 322820955, + "step": 14967, + "time_per_iteration": 2.6131317615509033 + }, + { + "auxiliary_loss_clip": 0.01059741, + "auxiliary_loss_mlp": 0.00781575, + "balance_loss_clip": 1.03474379, + "balance_loss_mlp": 1.00467777, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 1.8236487703229451, + "language_loss": 0.71852326, + "learning_rate": 1.040813291960323e-07, + "loss": 0.73693639, + "num_input_tokens_seen": 322838780, + "step": 14968, + "time_per_iteration": 2.543549060821533 + }, + { + "auxiliary_loss_clip": 0.01089172, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.03510094, + "balance_loss_mlp": 1.01646662, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 1.868854096000121, + "language_loss": 0.71279967, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.73397517, + "num_input_tokens_seen": 322856710, + "step": 14969, + "time_per_iteration": 2.4861767292022705 + }, + { + "auxiliary_loss_clip": 0.0110643, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.03672636, + "balance_loss_mlp": 1.01769912, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 2.644233945459289, + "language_loss": 0.75986695, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78122866, + "num_input_tokens_seen": 322876070, + "step": 14970, + "time_per_iteration": 2.4307961463928223 + }, + { + "auxiliary_loss_clip": 0.01096204, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.03541732, + "balance_loss_mlp": 1.01857436, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 1.7102600389926637, + "language_loss": 0.72920632, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.75046873, + "num_input_tokens_seen": 322895095, + "step": 14971, + "time_per_iteration": 2.4674694538116455 + }, + { + "auxiliary_loss_clip": 0.01066397, + "auxiliary_loss_mlp": 0.01027925, + "balance_loss_clip": 1.03285575, + "balance_loss_mlp": 1.01545, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 2.172001290211485, + "language_loss": 0.81601655, + "learning_rate": 1.035858993572476e-07, + "loss": 0.83695978, + "num_input_tokens_seen": 322911845, + "step": 14972, + "time_per_iteration": 2.5129406452178955 + }, + { + "auxiliary_loss_clip": 0.01084699, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.03419209, + "balance_loss_mlp": 1.01714468, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 2.222537229280011, + "language_loss": 0.81608516, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.83722603, + "num_input_tokens_seen": 322928170, + "step": 14973, + "time_per_iteration": 2.481732130050659 + }, + { + "auxiliary_loss_clip": 0.01103034, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.03496277, + "balance_loss_mlp": 1.02167249, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 2.458377975583296, + "language_loss": 0.58122873, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.60260201, + "num_input_tokens_seen": 322948165, + "step": 14974, + "time_per_iteration": 2.4812731742858887 + }, + { + "auxiliary_loss_clip": 0.01108659, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.03925872, + "balance_loss_mlp": 1.0216403, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 1.7965509849716224, + "language_loss": 0.63541055, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65683365, + "num_input_tokens_seen": 322968880, + "step": 14975, + "time_per_iteration": 2.4845187664031982 + }, + { + "auxiliary_loss_clip": 0.0109339, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.03465056, + "balance_loss_mlp": 1.01814353, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 1.5336930359591952, + "language_loss": 0.73015338, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.7513907, + "num_input_tokens_seen": 322989395, + "step": 14976, + "time_per_iteration": 2.5107274055480957 + }, + { + "auxiliary_loss_clip": 0.01096353, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.03757596, + "balance_loss_mlp": 1.01815295, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 1.6973404121381008, + "language_loss": 0.69506651, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.71633166, + "num_input_tokens_seen": 323009060, + "step": 14977, + "time_per_iteration": 2.5322084426879883 + }, + { + "auxiliary_loss_clip": 0.01078332, + "auxiliary_loss_mlp": 0.00785889, + "balance_loss_clip": 1.03469038, + "balance_loss_mlp": 1.01223612, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 2.4123549722284015, + "language_loss": 0.65445167, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.67309386, + "num_input_tokens_seen": 323027530, + "step": 14978, + "time_per_iteration": 2.520141124725342 + }, + { + "auxiliary_loss_clip": 0.01074452, + "auxiliary_loss_mlp": 0.01036448, + "balance_loss_clip": 1.0364573, + "balance_loss_mlp": 1.02315652, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 1.793645346301852, + "language_loss": 0.78891718, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.81002617, + "num_input_tokens_seen": 323045370, + "step": 14979, + "time_per_iteration": 2.5562963485717773 + }, + { + "auxiliary_loss_clip": 0.01009997, + "auxiliary_loss_mlp": 0.009995, + "balance_loss_clip": 1.00735617, + "balance_loss_mlp": 0.99849224, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.7248824597139121, + "language_loss": 0.53680253, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.55689746, + "num_input_tokens_seen": 323105660, + "step": 14980, + "time_per_iteration": 3.1366372108459473 + }, + { + "auxiliary_loss_clip": 0.01096872, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.03635406, + "balance_loss_mlp": 1.02359581, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 2.199777135619201, + "language_loss": 0.82206267, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.84339207, + "num_input_tokens_seen": 323126365, + "step": 14981, + "time_per_iteration": 2.5475268363952637 + }, + { + "auxiliary_loss_clip": 0.01060937, + "auxiliary_loss_mlp": 0.01028958, + "balance_loss_clip": 1.03417754, + "balance_loss_mlp": 1.01732302, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 1.480465695983654, + "language_loss": 0.81760031, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83849931, + "num_input_tokens_seen": 323145655, + "step": 14982, + "time_per_iteration": 2.585956573486328 + }, + { + "auxiliary_loss_clip": 0.01072208, + "auxiliary_loss_mlp": 0.01037919, + "balance_loss_clip": 1.03284812, + "balance_loss_mlp": 1.02518129, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 1.6999847733657323, + "language_loss": 0.71830392, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.73940516, + "num_input_tokens_seen": 323164540, + "step": 14983, + "time_per_iteration": 2.579643964767456 + }, + { + "auxiliary_loss_clip": 0.01092481, + "auxiliary_loss_mlp": 0.01025599, + "balance_loss_clip": 1.03590822, + "balance_loss_mlp": 1.01489401, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 1.33424693163311, + "language_loss": 0.7494784, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77065915, + "num_input_tokens_seen": 323186960, + "step": 14984, + "time_per_iteration": 2.50393009185791 + }, + { + "auxiliary_loss_clip": 0.0109925, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.03304243, + "balance_loss_mlp": 1.01967049, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 1.500043315133158, + "language_loss": 0.7043283, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.7256366, + "num_input_tokens_seen": 323206135, + "step": 14985, + "time_per_iteration": 4.090293645858765 + }, + { + "auxiliary_loss_clip": 0.01081799, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.03357983, + "balance_loss_mlp": 1.01740766, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 1.89821728942643, + "language_loss": 0.70479637, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.72591424, + "num_input_tokens_seen": 323225980, + "step": 14986, + "time_per_iteration": 2.5960068702697754 + }, + { + "auxiliary_loss_clip": 0.01091999, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.0327158, + "balance_loss_mlp": 1.01744688, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 1.695721735177916, + "language_loss": 0.76819319, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.78941178, + "num_input_tokens_seen": 323243700, + "step": 14987, + "time_per_iteration": 3.846806287765503 + }, + { + "auxiliary_loss_clip": 0.01088122, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.0351249, + "balance_loss_mlp": 1.02307463, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 1.6328688334891839, + "language_loss": 0.73460424, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.75583458, + "num_input_tokens_seen": 323261535, + "step": 14988, + "time_per_iteration": 2.4645888805389404 + }, + { + "auxiliary_loss_clip": 0.01080072, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.03633022, + "balance_loss_mlp": 1.0149554, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 1.7587905743799241, + "language_loss": 0.69598752, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.71706504, + "num_input_tokens_seen": 323281855, + "step": 14989, + "time_per_iteration": 3.9902853965759277 + }, + { + "auxiliary_loss_clip": 0.01104405, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.03530157, + "balance_loss_mlp": 1.01755881, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 1.7710943487787882, + "language_loss": 0.80075085, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.82209086, + "num_input_tokens_seen": 323299505, + "step": 14990, + "time_per_iteration": 2.4039034843444824 + }, + { + "auxiliary_loss_clip": 0.01068583, + "auxiliary_loss_mlp": 0.01030277, + "balance_loss_clip": 1.03583121, + "balance_loss_mlp": 1.01784372, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 1.7277013480012444, + "language_loss": 0.78029394, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.80128253, + "num_input_tokens_seen": 323318365, + "step": 14991, + "time_per_iteration": 2.5664615631103516 + }, + { + "auxiliary_loss_clip": 0.00996908, + "auxiliary_loss_mlp": 0.00761575, + "balance_loss_clip": 1.00829697, + "balance_loss_mlp": 1.00005627, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 0.7776720257691508, + "language_loss": 0.60184216, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.61942697, + "num_input_tokens_seen": 323371835, + "step": 14992, + "time_per_iteration": 3.0569722652435303 + }, + { + "auxiliary_loss_clip": 0.01088672, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.03490078, + "balance_loss_mlp": 1.01500237, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 1.9991484180476893, + "language_loss": 0.82671291, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.84787083, + "num_input_tokens_seen": 323388495, + "step": 14993, + "time_per_iteration": 2.49094557762146 + }, + { + "auxiliary_loss_clip": 0.01103212, + "auxiliary_loss_mlp": 0.01032868, + "balance_loss_clip": 1.03449702, + "balance_loss_mlp": 1.02041698, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 2.028747185581525, + "language_loss": 0.72836822, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.74972904, + "num_input_tokens_seen": 323405280, + "step": 14994, + "time_per_iteration": 2.3895137310028076 + }, + { + "auxiliary_loss_clip": 0.01089799, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.03422999, + "balance_loss_mlp": 1.017923, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 1.7642293859611897, + "language_loss": 0.6511519, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.67234027, + "num_input_tokens_seen": 323425310, + "step": 14995, + "time_per_iteration": 2.5550334453582764 + }, + { + "auxiliary_loss_clip": 0.01065501, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.03027868, + "balance_loss_mlp": 1.02036846, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 3.248226901141404, + "language_loss": 0.66696167, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68795276, + "num_input_tokens_seen": 323447805, + "step": 14996, + "time_per_iteration": 4.0160136222839355 + }, + { + "auxiliary_loss_clip": 0.01091168, + "auxiliary_loss_mlp": 0.01027008, + "balance_loss_clip": 1.03343678, + "balance_loss_mlp": 1.0157783, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 1.930619534370469, + "language_loss": 0.65859067, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.67977238, + "num_input_tokens_seen": 323467150, + "step": 14997, + "time_per_iteration": 2.509484052658081 + }, + { + "auxiliary_loss_clip": 0.01075032, + "auxiliary_loss_mlp": 0.01034918, + "balance_loss_clip": 1.03078723, + "balance_loss_mlp": 1.02268159, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 1.7881838187175252, + "language_loss": 0.7753762, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.79647577, + "num_input_tokens_seen": 323484250, + "step": 14998, + "time_per_iteration": 2.5007588863372803 + }, + { + "auxiliary_loss_clip": 0.011041, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.03491712, + "balance_loss_mlp": 1.01923132, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 1.7854655769870653, + "language_loss": 0.75534004, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77669555, + "num_input_tokens_seen": 323502910, + "step": 14999, + "time_per_iteration": 2.4415955543518066 + }, + { + "auxiliary_loss_clip": 0.01048146, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.03661621, + "balance_loss_mlp": 1.01972413, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 2.241486362802402, + "language_loss": 0.75825673, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.77906013, + "num_input_tokens_seen": 323521820, + "step": 15000, + "time_per_iteration": 2.62198805809021 + }, + { + "auxiliary_loss_clip": 0.01084347, + "auxiliary_loss_mlp": 0.01025814, + "balance_loss_clip": 1.03372884, + "balance_loss_mlp": 1.01478148, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 2.4988023321351465, + "language_loss": 0.80832511, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.82942665, + "num_input_tokens_seen": 323543200, + "step": 15001, + "time_per_iteration": 2.7611000537872314 + }, + { + "auxiliary_loss_clip": 0.01071867, + "auxiliary_loss_mlp": 0.01025617, + "balance_loss_clip": 1.03600621, + "balance_loss_mlp": 1.01442885, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 1.4064053731782085, + "language_loss": 0.78469294, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80566782, + "num_input_tokens_seen": 323563075, + "step": 15002, + "time_per_iteration": 2.583815574645996 + }, + { + "auxiliary_loss_clip": 0.01077109, + "auxiliary_loss_mlp": 0.01037314, + "balance_loss_clip": 1.03579617, + "balance_loss_mlp": 1.02321792, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 3.22610396632346, + "language_loss": 0.68143904, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70258331, + "num_input_tokens_seen": 323579065, + "step": 15003, + "time_per_iteration": 2.5153868198394775 + }, + { + "auxiliary_loss_clip": 0.0108092, + "auxiliary_loss_mlp": 0.01033775, + "balance_loss_clip": 1.0342598, + "balance_loss_mlp": 1.02113307, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 3.298622541647494, + "language_loss": 0.85697478, + "learning_rate": 9.9663907182292e-08, + "loss": 0.87812173, + "num_input_tokens_seen": 323594835, + "step": 15004, + "time_per_iteration": 2.4852654933929443 + }, + { + "auxiliary_loss_clip": 0.01070779, + "auxiliary_loss_mlp": 0.01035012, + "balance_loss_clip": 1.0348624, + "balance_loss_mlp": 1.02287042, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 2.062139521995895, + "language_loss": 0.72007138, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74112928, + "num_input_tokens_seen": 323611475, + "step": 15005, + "time_per_iteration": 2.549708604812622 + }, + { + "auxiliary_loss_clip": 0.01094043, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.03172398, + "balance_loss_mlp": 1.01832891, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 2.023646247586754, + "language_loss": 0.70992249, + "learning_rate": 9.942123117037748e-08, + "loss": 0.73117387, + "num_input_tokens_seen": 323629730, + "step": 15006, + "time_per_iteration": 2.483577013015747 + }, + { + "auxiliary_loss_clip": 0.01084246, + "auxiliary_loss_mlp": 0.01027959, + "balance_loss_clip": 1.03442192, + "balance_loss_mlp": 1.01578164, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 1.74324725869261, + "language_loss": 0.84149039, + "learning_rate": 9.930000126732618e-08, + "loss": 0.86261249, + "num_input_tokens_seen": 323646000, + "step": 15007, + "time_per_iteration": 2.471668004989624 + }, + { + "auxiliary_loss_clip": 0.01074932, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.03212762, + "balance_loss_mlp": 1.01696813, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 1.3963727150052647, + "language_loss": 0.78559959, + "learning_rate": 9.917884343900928e-08, + "loss": 0.80663806, + "num_input_tokens_seen": 323667250, + "step": 15008, + "time_per_iteration": 2.5911448001861572 + }, + { + "auxiliary_loss_clip": 0.0105913, + "auxiliary_loss_mlp": 0.0102927, + "balance_loss_clip": 1.03484261, + "balance_loss_mlp": 1.01780236, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 1.6959332158851823, + "language_loss": 0.73696303, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75784707, + "num_input_tokens_seen": 323687150, + "step": 15009, + "time_per_iteration": 2.5318822860717773 + }, + { + "auxiliary_loss_clip": 0.01101475, + "auxiliary_loss_mlp": 0.0103269, + "balance_loss_clip": 1.03434896, + "balance_loss_mlp": 1.0207988, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 1.7756639426897396, + "language_loss": 0.72960496, + "learning_rate": 9.893674402495399e-08, + "loss": 0.75094652, + "num_input_tokens_seen": 323703660, + "step": 15010, + "time_per_iteration": 2.440182685852051 + }, + { + "auxiliary_loss_clip": 0.01077686, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.03548479, + "balance_loss_mlp": 1.01641953, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 1.7514192444461092, + "language_loss": 0.7422719, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76333731, + "num_input_tokens_seen": 323722060, + "step": 15011, + "time_per_iteration": 2.500464677810669 + }, + { + "auxiliary_loss_clip": 0.01095556, + "auxiliary_loss_mlp": 0.01028482, + "balance_loss_clip": 1.0338397, + "balance_loss_mlp": 1.01597106, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 1.8696595309099802, + "language_loss": 0.72739804, + "learning_rate": 9.869493296493204e-08, + "loss": 0.74863839, + "num_input_tokens_seen": 323740645, + "step": 15012, + "time_per_iteration": 2.499265432357788 + }, + { + "auxiliary_loss_clip": 0.01071788, + "auxiliary_loss_mlp": 0.01038385, + "balance_loss_clip": 1.03548861, + "balance_loss_mlp": 1.02678585, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 1.535254288822677, + "language_loss": 0.6921519, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71325362, + "num_input_tokens_seen": 323758905, + "step": 15013, + "time_per_iteration": 2.5283010005950928 + }, + { + "auxiliary_loss_clip": 0.01087013, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.03216839, + "balance_loss_mlp": 1.02029383, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 1.3860982737231504, + "language_loss": 0.73191655, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75310141, + "num_input_tokens_seen": 323780595, + "step": 15014, + "time_per_iteration": 2.5347697734832764 + }, + { + "auxiliary_loss_clip": 0.01102232, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.03373504, + "balance_loss_mlp": 1.01728916, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 2.624407360546308, + "language_loss": 0.72189379, + "learning_rate": 9.833275711893474e-08, + "loss": 0.74321282, + "num_input_tokens_seen": 323798160, + "step": 15015, + "time_per_iteration": 2.4398386478424072 + }, + { + "auxiliary_loss_clip": 0.01079597, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.03496766, + "balance_loss_mlp": 1.01838911, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 2.142106974688478, + "language_loss": 0.69052136, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71161604, + "num_input_tokens_seen": 323816810, + "step": 15016, + "time_per_iteration": 2.5307672023773193 + }, + { + "auxiliary_loss_clip": 0.01101128, + "auxiliary_loss_mlp": 0.01026773, + "balance_loss_clip": 1.03372669, + "balance_loss_mlp": 1.01593697, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 1.924845654412354, + "language_loss": 0.70414418, + "learning_rate": 9.809166710436855e-08, + "loss": 0.72542322, + "num_input_tokens_seen": 323836900, + "step": 15017, + "time_per_iteration": 2.469637870788574 + }, + { + "auxiliary_loss_clip": 0.01081435, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.03884792, + "balance_loss_mlp": 1.02123439, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 1.572574864794066, + "language_loss": 0.69420946, + "learning_rate": 9.797123027563237e-08, + "loss": 0.71534866, + "num_input_tokens_seen": 323855325, + "step": 15018, + "time_per_iteration": 2.5313761234283447 + }, + { + "auxiliary_loss_clip": 0.01093639, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.03555465, + "balance_loss_mlp": 1.01959682, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 1.680465253385712, + "language_loss": 0.69103873, + "learning_rate": 9.785086557201782e-08, + "loss": 0.71229219, + "num_input_tokens_seen": 323875650, + "step": 15019, + "time_per_iteration": 2.5163917541503906 + }, + { + "auxiliary_loss_clip": 0.01099997, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.03365946, + "balance_loss_mlp": 1.0211587, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 1.8335991973161119, + "language_loss": 0.72092903, + "learning_rate": 9.773057299808951e-08, + "loss": 0.74225414, + "num_input_tokens_seen": 323892920, + "step": 15020, + "time_per_iteration": 2.4221320152282715 + }, + { + "auxiliary_loss_clip": 0.01089997, + "auxiliary_loss_mlp": 0.01032114, + "balance_loss_clip": 1.03349948, + "balance_loss_mlp": 1.01946592, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 1.5829502399334423, + "language_loss": 0.74386585, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76508695, + "num_input_tokens_seen": 323913835, + "step": 15021, + "time_per_iteration": 2.5051286220550537 + }, + { + "auxiliary_loss_clip": 0.01107551, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.03610253, + "balance_loss_mlp": 1.01653326, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 2.5182885859104953, + "language_loss": 0.72234541, + "learning_rate": 9.749020425753251e-08, + "loss": 0.74371392, + "num_input_tokens_seen": 323933440, + "step": 15022, + "time_per_iteration": 2.4579663276672363 + }, + { + "auxiliary_loss_clip": 0.01063076, + "auxiliary_loss_mlp": 0.0102844, + "balance_loss_clip": 1.03426242, + "balance_loss_mlp": 1.01724052, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 1.9237241098375688, + "language_loss": 0.72523808, + "learning_rate": 9.737012810001943e-08, + "loss": 0.74615324, + "num_input_tokens_seen": 323954090, + "step": 15023, + "time_per_iteration": 2.5879459381103516 + }, + { + "auxiliary_loss_clip": 0.01090361, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.03730261, + "balance_loss_mlp": 1.02142644, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 1.7240815570681225, + "language_loss": 0.82646823, + "learning_rate": 9.725012409042155e-08, + "loss": 0.84770072, + "num_input_tokens_seen": 323974040, + "step": 15024, + "time_per_iteration": 3.92330265045166 + }, + { + "auxiliary_loss_clip": 0.01093299, + "auxiliary_loss_mlp": 0.01028375, + "balance_loss_clip": 1.03417051, + "balance_loss_mlp": 1.01668096, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 1.470205203000881, + "language_loss": 0.69601285, + "learning_rate": 9.713019223328966e-08, + "loss": 0.7172296, + "num_input_tokens_seen": 323996125, + "step": 15025, + "time_per_iteration": 2.487698554992676 + }, + { + "auxiliary_loss_clip": 0.01067972, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.03258204, + "balance_loss_mlp": 1.02229321, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 1.6304710183700022, + "language_loss": 0.77046514, + "learning_rate": 9.70103325331717e-08, + "loss": 0.79148322, + "num_input_tokens_seen": 324017645, + "step": 15026, + "time_per_iteration": 4.1952149868011475 + }, + { + "auxiliary_loss_clip": 0.01091589, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.03503132, + "balance_loss_mlp": 1.01755214, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 1.6534228903927537, + "language_loss": 0.68348634, + "learning_rate": 9.68905449946129e-08, + "loss": 0.70468777, + "num_input_tokens_seen": 324036875, + "step": 15027, + "time_per_iteration": 3.937288284301758 + }, + { + "auxiliary_loss_clip": 0.0104884, + "auxiliary_loss_mlp": 0.01037013, + "balance_loss_clip": 1.0312705, + "balance_loss_mlp": 1.02392399, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 2.186293985256084, + "language_loss": 0.76043975, + "learning_rate": 9.677082962215477e-08, + "loss": 0.78129828, + "num_input_tokens_seen": 324057045, + "step": 15028, + "time_per_iteration": 2.608381509780884 + }, + { + "auxiliary_loss_clip": 0.0105582, + "auxiliary_loss_mlp": 0.01037005, + "balance_loss_clip": 1.03410697, + "balance_loss_mlp": 1.02508974, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 1.6804459229643494, + "language_loss": 0.68991888, + "learning_rate": 9.665118642033765e-08, + "loss": 0.71084708, + "num_input_tokens_seen": 324079735, + "step": 15029, + "time_per_iteration": 2.6137635707855225 + }, + { + "auxiliary_loss_clip": 0.01089717, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.03570282, + "balance_loss_mlp": 1.0162251, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 3.0870234954046194, + "language_loss": 0.74051976, + "learning_rate": 9.653161539369858e-08, + "loss": 0.76170433, + "num_input_tokens_seen": 324097785, + "step": 15030, + "time_per_iteration": 2.478355884552002 + }, + { + "auxiliary_loss_clip": 0.01096792, + "auxiliary_loss_mlp": 0.01030187, + "balance_loss_clip": 1.03532219, + "balance_loss_mlp": 1.01765203, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 1.9260782456417946, + "language_loss": 0.68133241, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70260221, + "num_input_tokens_seen": 324121625, + "step": 15031, + "time_per_iteration": 2.617034673690796 + }, + { + "auxiliary_loss_clip": 0.01078742, + "auxiliary_loss_mlp": 0.01027469, + "balance_loss_clip": 1.03326607, + "balance_loss_mlp": 1.01652539, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 1.499087600739369, + "language_loss": 0.76006413, + "learning_rate": 9.629268988408723e-08, + "loss": 0.78112626, + "num_input_tokens_seen": 324142535, + "step": 15032, + "time_per_iteration": 2.554337501525879 + }, + { + "auxiliary_loss_clip": 0.01103806, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.03510523, + "balance_loss_mlp": 1.01855898, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 1.9293270192148513, + "language_loss": 0.7544691, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77581054, + "num_input_tokens_seen": 324159610, + "step": 15033, + "time_per_iteration": 2.4161760807037354 + }, + { + "auxiliary_loss_clip": 0.01071144, + "auxiliary_loss_mlp": 0.01035557, + "balance_loss_clip": 1.03208697, + "balance_loss_mlp": 1.02245617, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 1.773986569993747, + "language_loss": 0.74001634, + "learning_rate": 9.605405312956105e-08, + "loss": 0.7610833, + "num_input_tokens_seen": 324182510, + "step": 15034, + "time_per_iteration": 3.993263006210327 + }, + { + "auxiliary_loss_clip": 0.01070383, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.03448212, + "balance_loss_mlp": 1.01996052, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 1.619473195423546, + "language_loss": 0.63374865, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65477455, + "num_input_tokens_seen": 324200555, + "step": 15035, + "time_per_iteration": 2.5127596855163574 + }, + { + "auxiliary_loss_clip": 0.01103536, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.03569305, + "balance_loss_mlp": 1.01812732, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 2.2150075734538976, + "language_loss": 0.62309092, + "learning_rate": 9.581570516631643e-08, + "loss": 0.64443773, + "num_input_tokens_seen": 324220255, + "step": 15036, + "time_per_iteration": 2.4935667514801025 + }, + { + "auxiliary_loss_clip": 0.0105863, + "auxiliary_loss_mlp": 0.0102629, + "balance_loss_clip": 1.03526807, + "balance_loss_mlp": 1.01545346, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 1.59045381824056, + "language_loss": 0.82341969, + "learning_rate": 9.569663949272455e-08, + "loss": 0.84426886, + "num_input_tokens_seen": 324237855, + "step": 15037, + "time_per_iteration": 2.5857129096984863 + }, + { + "auxiliary_loss_clip": 0.01104917, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.0347476, + "balance_loss_mlp": 1.0150466, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 1.6290154862734403, + "language_loss": 0.67387116, + "learning_rate": 9.557764603050667e-08, + "loss": 0.69518954, + "num_input_tokens_seen": 324257050, + "step": 15038, + "time_per_iteration": 2.4516005516052246 + }, + { + "auxiliary_loss_clip": 0.01082489, + "auxiliary_loss_mlp": 0.0103592, + "balance_loss_clip": 1.03398037, + "balance_loss_mlp": 1.0237186, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 1.951111109326481, + "language_loss": 0.75186771, + "learning_rate": 9.545872478417494e-08, + "loss": 0.7730518, + "num_input_tokens_seen": 324275510, + "step": 15039, + "time_per_iteration": 2.5058467388153076 + }, + { + "auxiliary_loss_clip": 0.0107722, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.03555012, + "balance_loss_mlp": 1.01702738, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 1.4816399640926188, + "language_loss": 0.70090377, + "learning_rate": 9.533987575823977e-08, + "loss": 0.72195661, + "num_input_tokens_seen": 324295150, + "step": 15040, + "time_per_iteration": 2.5752105712890625 + }, + { + "auxiliary_loss_clip": 0.01069659, + "auxiliary_loss_mlp": 0.01029765, + "balance_loss_clip": 1.03251767, + "balance_loss_mlp": 1.01807654, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 1.7227035817064555, + "language_loss": 0.67748952, + "learning_rate": 9.522109895720709e-08, + "loss": 0.69848371, + "num_input_tokens_seen": 324313855, + "step": 15041, + "time_per_iteration": 2.527285099029541 + }, + { + "auxiliary_loss_clip": 0.01091591, + "auxiliary_loss_mlp": 0.01032057, + "balance_loss_clip": 1.03410196, + "balance_loss_mlp": 1.01961148, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 2.272083105051769, + "language_loss": 0.57533634, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59657288, + "num_input_tokens_seen": 324338465, + "step": 15042, + "time_per_iteration": 2.6001853942871094 + }, + { + "auxiliary_loss_clip": 0.01010626, + "auxiliary_loss_mlp": 0.0076171, + "balance_loss_clip": 1.00724387, + "balance_loss_mlp": 1.00088727, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.7806458370604512, + "language_loss": 0.57033837, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58806175, + "num_input_tokens_seen": 324398740, + "step": 15043, + "time_per_iteration": 3.10719633102417 + }, + { + "auxiliary_loss_clip": 0.0108184, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.03378999, + "balance_loss_mlp": 1.01595783, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 1.786827895398811, + "language_loss": 0.69985735, + "learning_rate": 9.486520194855274e-08, + "loss": 0.720963, + "num_input_tokens_seen": 324417335, + "step": 15044, + "time_per_iteration": 2.492211103439331 + }, + { + "auxiliary_loss_clip": 0.01084205, + "auxiliary_loss_mlp": 0.01035825, + "balance_loss_clip": 1.03490877, + "balance_loss_mlp": 1.02323091, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 2.7065267870330634, + "language_loss": 0.69902307, + "learning_rate": 9.474671409214407e-08, + "loss": 0.72022337, + "num_input_tokens_seen": 324433240, + "step": 15045, + "time_per_iteration": 2.475430488586426 + }, + { + "auxiliary_loss_clip": 0.01072991, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.03447223, + "balance_loss_mlp": 1.02109838, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 1.9289700273953294, + "language_loss": 0.6545347, + "learning_rate": 9.462829848313081e-08, + "loss": 0.67560434, + "num_input_tokens_seen": 324452675, + "step": 15046, + "time_per_iteration": 2.542355537414551 + }, + { + "auxiliary_loss_clip": 0.01071682, + "auxiliary_loss_mlp": 0.01034361, + "balance_loss_clip": 1.03351641, + "balance_loss_mlp": 1.02208853, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 2.077150935453088, + "language_loss": 0.61442852, + "learning_rate": 9.450995512600379e-08, + "loss": 0.63548893, + "num_input_tokens_seen": 324467865, + "step": 15047, + "time_per_iteration": 2.5078887939453125 + }, + { + "auxiliary_loss_clip": 0.01102977, + "auxiliary_loss_mlp": 0.00783459, + "balance_loss_clip": 1.03555298, + "balance_loss_mlp": 1.01086712, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 1.4028605874399873, + "language_loss": 0.71231496, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73117936, + "num_input_tokens_seen": 324490430, + "step": 15048, + "time_per_iteration": 2.4914350509643555 + }, + { + "auxiliary_loss_clip": 0.01091807, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.03156567, + "balance_loss_mlp": 1.01764607, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 2.0727322676737416, + "language_loss": 0.74960464, + "learning_rate": 9.427348518535483e-08, + "loss": 0.77082831, + "num_input_tokens_seen": 324506620, + "step": 15049, + "time_per_iteration": 2.4512054920196533 + }, + { + "auxiliary_loss_clip": 0.01090447, + "auxiliary_loss_mlp": 0.01028004, + "balance_loss_clip": 1.03573692, + "balance_loss_mlp": 1.01588082, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 3.221606222288242, + "language_loss": 0.75882655, + "learning_rate": 9.415535861079993e-08, + "loss": 0.78001106, + "num_input_tokens_seen": 324525505, + "step": 15050, + "time_per_iteration": 2.5065665245056152 + }, + { + "auxiliary_loss_clip": 0.01103173, + "auxiliary_loss_mlp": 0.00781593, + "balance_loss_clip": 1.03451729, + "balance_loss_mlp": 1.0072372, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 1.7938632188043035, + "language_loss": 0.81831747, + "learning_rate": 9.403730430606472e-08, + "loss": 0.83716518, + "num_input_tokens_seen": 324544415, + "step": 15051, + "time_per_iteration": 2.47902774810791 + }, + { + "auxiliary_loss_clip": 0.01091979, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.03450561, + "balance_loss_mlp": 1.0177263, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 1.8805353912317617, + "language_loss": 0.88817197, + "learning_rate": 9.391932227562582e-08, + "loss": 0.90937996, + "num_input_tokens_seen": 324562555, + "step": 15052, + "time_per_iteration": 2.462597608566284 + }, + { + "auxiliary_loss_clip": 0.01095697, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.03476238, + "balance_loss_mlp": 1.02096796, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 1.9344454239975266, + "language_loss": 0.77426064, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79554963, + "num_input_tokens_seen": 324580865, + "step": 15053, + "time_per_iteration": 2.456810235977173 + }, + { + "auxiliary_loss_clip": 0.01089759, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.03343654, + "balance_loss_mlp": 1.01983905, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 1.5013478296720073, + "language_loss": 0.72882193, + "learning_rate": 9.368357505553049e-08, + "loss": 0.75003976, + "num_input_tokens_seen": 324600665, + "step": 15054, + "time_per_iteration": 2.5316455364227295 + }, + { + "auxiliary_loss_clip": 0.01049681, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.03045678, + "balance_loss_mlp": 1.02258992, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 4.735088325413634, + "language_loss": 0.83647203, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85731435, + "num_input_tokens_seen": 324618145, + "step": 15055, + "time_per_iteration": 2.5937860012054443 + }, + { + "auxiliary_loss_clip": 0.01091802, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.03554881, + "balance_loss_mlp": 1.02234507, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 1.5891597673157531, + "language_loss": 0.84981298, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87107426, + "num_input_tokens_seen": 324638165, + "step": 15056, + "time_per_iteration": 2.4831764698028564 + }, + { + "auxiliary_loss_clip": 0.01082884, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.03484511, + "balance_loss_mlp": 1.01943922, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 1.7869035788437304, + "language_loss": 0.71785337, + "learning_rate": 9.333049639436863e-08, + "loss": 0.7389881, + "num_input_tokens_seen": 324658560, + "step": 15057, + "time_per_iteration": 2.560811996459961 + }, + { + "auxiliary_loss_clip": 0.01088872, + "auxiliary_loss_mlp": 0.01026862, + "balance_loss_clip": 1.03250527, + "balance_loss_mlp": 1.01519692, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 2.158906675298929, + "language_loss": 0.80519331, + "learning_rate": 9.321294810356418e-08, + "loss": 0.82635069, + "num_input_tokens_seen": 324679185, + "step": 15058, + "time_per_iteration": 2.4753379821777344 + }, + { + "auxiliary_loss_clip": 0.0101759, + "auxiliary_loss_mlp": 0.01004343, + "balance_loss_clip": 1.00664926, + "balance_loss_mlp": 1.00322294, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.676763934972753, + "language_loss": 0.51426655, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53448594, + "num_input_tokens_seen": 324744830, + "step": 15059, + "time_per_iteration": 3.203374147415161 + }, + { + "auxiliary_loss_clip": 0.01062846, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.03382051, + "balance_loss_mlp": 1.0170815, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 2.617153859034066, + "language_loss": 0.6722666, + "learning_rate": 9.297806844307831e-08, + "loss": 0.69319057, + "num_input_tokens_seen": 324762905, + "step": 15060, + "time_per_iteration": 2.554060459136963 + }, + { + "auxiliary_loss_clip": 0.01085644, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.03593612, + "balance_loss_mlp": 1.01810634, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 1.9562611463312884, + "language_loss": 0.63859111, + "learning_rate": 9.286073708230357e-08, + "loss": 0.65974557, + "num_input_tokens_seen": 324781905, + "step": 15061, + "time_per_iteration": 2.482518196105957 + }, + { + "auxiliary_loss_clip": 0.01079168, + "auxiliary_loss_mlp": 0.01033456, + "balance_loss_clip": 1.03603935, + "balance_loss_mlp": 1.02096295, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 1.7820977848288684, + "language_loss": 0.7149508, + "learning_rate": 9.274347804044058e-08, + "loss": 0.73607707, + "num_input_tokens_seen": 324799260, + "step": 15062, + "time_per_iteration": 2.4710333347320557 + }, + { + "auxiliary_loss_clip": 0.01101376, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.03392673, + "balance_loss_mlp": 1.0187484, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 1.64841174642194, + "language_loss": 0.70816362, + "learning_rate": 9.2626291321936e-08, + "loss": 0.72947562, + "num_input_tokens_seen": 324817800, + "step": 15063, + "time_per_iteration": 4.092405557632446 + }, + { + "auxiliary_loss_clip": 0.01068869, + "auxiliary_loss_mlp": 0.01033491, + "balance_loss_clip": 1.03412974, + "balance_loss_mlp": 1.02200484, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 1.7651684457338732, + "language_loss": 0.72311521, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74413878, + "num_input_tokens_seen": 324838445, + "step": 15064, + "time_per_iteration": 3.9789211750030518 + }, + { + "auxiliary_loss_clip": 0.01092734, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.03223085, + "balance_loss_mlp": 1.01891661, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 2.0647451205952985, + "language_loss": 0.6902169, + "learning_rate": 9.23921348727752e-08, + "loss": 0.7114526, + "num_input_tokens_seen": 324859895, + "step": 15065, + "time_per_iteration": 2.525019645690918 + }, + { + "auxiliary_loss_clip": 0.01081444, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.03614235, + "balance_loss_mlp": 1.02436018, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 1.908518059028003, + "language_loss": 0.63082129, + "learning_rate": 9.227516515099743e-08, + "loss": 0.65199578, + "num_input_tokens_seen": 324879580, + "step": 15066, + "time_per_iteration": 3.9265286922454834 + }, + { + "auxiliary_loss_clip": 0.010396, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.03037691, + "balance_loss_mlp": 1.01796412, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 2.095067942768182, + "language_loss": 0.80078787, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82150024, + "num_input_tokens_seen": 324898950, + "step": 15067, + "time_per_iteration": 2.652109384536743 + }, + { + "auxiliary_loss_clip": 0.01083687, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.03520823, + "balance_loss_mlp": 1.02001548, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 1.5922264320007362, + "language_loss": 0.69898462, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72014785, + "num_input_tokens_seen": 324917455, + "step": 15068, + "time_per_iteration": 2.479933261871338 + }, + { + "auxiliary_loss_clip": 0.01099481, + "auxiliary_loss_mlp": 0.01028299, + "balance_loss_clip": 1.03304005, + "balance_loss_mlp": 1.0165031, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 1.8178745689983329, + "language_loss": 0.85405886, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87533665, + "num_input_tokens_seen": 324934495, + "step": 15069, + "time_per_iteration": 2.435887575149536 + }, + { + "auxiliary_loss_clip": 0.01095388, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.03367972, + "balance_loss_mlp": 1.01844335, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 1.7751685361287097, + "language_loss": 0.59157318, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61283904, + "num_input_tokens_seen": 324953230, + "step": 15070, + "time_per_iteration": 2.475350856781006 + }, + { + "auxiliary_loss_clip": 0.01068876, + "auxiliary_loss_mlp": 0.01023721, + "balance_loss_clip": 1.0341804, + "balance_loss_mlp": 1.01130569, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 12.235446626168141, + "language_loss": 0.81792533, + "learning_rate": 9.169140174747724e-08, + "loss": 0.83885133, + "num_input_tokens_seen": 324969880, + "step": 15071, + "time_per_iteration": 2.5167810916900635 + }, + { + "auxiliary_loss_clip": 0.0110668, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.03514934, + "balance_loss_mlp": 1.02268696, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 1.8650424708269293, + "language_loss": 0.61757195, + "learning_rate": 9.157486613883758e-08, + "loss": 0.63899553, + "num_input_tokens_seen": 324987005, + "step": 15072, + "time_per_iteration": 2.409898519515991 + }, + { + "auxiliary_loss_clip": 0.01082303, + "auxiliary_loss_mlp": 0.0103294, + "balance_loss_clip": 1.03405619, + "balance_loss_mlp": 1.02101874, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 1.8024412736506676, + "language_loss": 0.729568, + "learning_rate": 9.145840289787021e-08, + "loss": 0.75072044, + "num_input_tokens_seen": 325010700, + "step": 15073, + "time_per_iteration": 4.093415260314941 + }, + { + "auxiliary_loss_clip": 0.01090339, + "auxiliary_loss_mlp": 0.01026041, + "balance_loss_clip": 1.03496206, + "balance_loss_mlp": 1.01479387, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 2.0512498727794526, + "language_loss": 0.81263709, + "learning_rate": 9.134201202899161e-08, + "loss": 0.83380091, + "num_input_tokens_seen": 325028760, + "step": 15074, + "time_per_iteration": 2.445389747619629 + }, + { + "auxiliary_loss_clip": 0.00990761, + "auxiliary_loss_mlp": 0.00764915, + "balance_loss_clip": 1.01735806, + "balance_loss_mlp": 1.00293279, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.7489242692336314, + "language_loss": 0.52399814, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54155487, + "num_input_tokens_seen": 325093545, + "step": 15075, + "time_per_iteration": 3.2996654510498047 + }, + { + "auxiliary_loss_clip": 0.01001062, + "auxiliary_loss_mlp": 0.01000562, + "balance_loss_clip": 1.01252973, + "balance_loss_mlp": 0.99938202, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 0.7289157916781073, + "language_loss": 0.62103361, + "learning_rate": 9.11094474251517e-08, + "loss": 0.6410498, + "num_input_tokens_seen": 325152295, + "step": 15076, + "time_per_iteration": 3.0579447746276855 + }, + { + "auxiliary_loss_clip": 0.01090242, + "auxiliary_loss_mlp": 0.0103179, + "balance_loss_clip": 1.0334177, + "balance_loss_mlp": 1.02006531, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 1.8460076961226015, + "language_loss": 0.82084668, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84206694, + "num_input_tokens_seen": 325169705, + "step": 15077, + "time_per_iteration": 2.489321231842041 + }, + { + "auxiliary_loss_clip": 0.01078156, + "auxiliary_loss_mlp": 0.00783042, + "balance_loss_clip": 1.03162384, + "balance_loss_mlp": 1.00917709, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 1.5132035498580798, + "language_loss": 0.83971798, + "learning_rate": 9.08771723625934e-08, + "loss": 0.85833001, + "num_input_tokens_seen": 325189175, + "step": 15078, + "time_per_iteration": 2.5343127250671387 + }, + { + "auxiliary_loss_clip": 0.01088178, + "auxiliary_loss_mlp": 0.00781744, + "balance_loss_clip": 1.03434205, + "balance_loss_mlp": 1.00869894, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 1.9898008106676155, + "language_loss": 0.65509307, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67379224, + "num_input_tokens_seen": 325211020, + "step": 15079, + "time_per_iteration": 2.61872935295105 + }, + { + "auxiliary_loss_clip": 0.01033235, + "auxiliary_loss_mlp": 0.01025018, + "balance_loss_clip": 1.03311741, + "balance_loss_mlp": 1.01337719, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 1.6765017908556235, + "language_loss": 0.71346736, + "learning_rate": 9.064518687654765e-08, + "loss": 0.73404992, + "num_input_tokens_seen": 325236970, + "step": 15080, + "time_per_iteration": 2.8443312644958496 + }, + { + "auxiliary_loss_clip": 0.01090769, + "auxiliary_loss_mlp": 0.0102926, + "balance_loss_clip": 1.03670502, + "balance_loss_mlp": 1.01677322, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 3.7614865771123345, + "language_loss": 0.711263, + "learning_rate": 9.052930273571547e-08, + "loss": 0.73246324, + "num_input_tokens_seen": 325252670, + "step": 15081, + "time_per_iteration": 2.443546772003174 + }, + { + "auxiliary_loss_clip": 0.01077768, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.03570175, + "balance_loss_mlp": 1.01820445, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 1.9556746266016236, + "language_loss": 0.74421239, + "learning_rate": 9.04134910022032e-08, + "loss": 0.76529133, + "num_input_tokens_seen": 325273860, + "step": 15082, + "time_per_iteration": 2.5417847633361816 + }, + { + "auxiliary_loss_clip": 0.01068684, + "auxiliary_loss_mlp": 0.01034737, + "balance_loss_clip": 1.03410578, + "balance_loss_mlp": 1.02348328, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 1.816967728608277, + "language_loss": 0.78529322, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80632734, + "num_input_tokens_seen": 325294140, + "step": 15083, + "time_per_iteration": 2.5715065002441406 + }, + { + "auxiliary_loss_clip": 0.01076655, + "auxiliary_loss_mlp": 0.00783461, + "balance_loss_clip": 1.03443754, + "balance_loss_mlp": 1.01090932, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 1.5417492316763635, + "language_loss": 0.69057846, + "learning_rate": 9.01820847747028e-08, + "loss": 0.70917958, + "num_input_tokens_seen": 325313130, + "step": 15084, + "time_per_iteration": 2.529724359512329 + }, + { + "auxiliary_loss_clip": 0.01104457, + "auxiliary_loss_mlp": 0.01032664, + "balance_loss_clip": 1.03646016, + "balance_loss_mlp": 1.02095795, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 3.6950651086591013, + "language_loss": 0.66648692, + "learning_rate": 9.006649028948965e-08, + "loss": 0.6878581, + "num_input_tokens_seen": 325334880, + "step": 15085, + "time_per_iteration": 2.4781494140625 + }, + { + "auxiliary_loss_clip": 0.01008207, + "auxiliary_loss_mlp": 0.01008854, + "balance_loss_clip": 1.02234626, + "balance_loss_mlp": 1.00763798, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.7718682185926731, + "language_loss": 0.61332178, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63349235, + "num_input_tokens_seen": 325394175, + "step": 15086, + "time_per_iteration": 3.166022777557373 + }, + { + "auxiliary_loss_clip": 0.01088518, + "auxiliary_loss_mlp": 0.01037917, + "balance_loss_clip": 1.03383231, + "balance_loss_mlp": 1.02492929, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 1.4139750692085502, + "language_loss": 0.72171879, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74298316, + "num_input_tokens_seen": 325415020, + "step": 15087, + "time_per_iteration": 2.4947776794433594 + }, + { + "auxiliary_loss_clip": 0.01081333, + "auxiliary_loss_mlp": 0.01026697, + "balance_loss_clip": 1.03409314, + "balance_loss_mlp": 1.01536596, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 2.1122979950565584, + "language_loss": 0.76682776, + "learning_rate": 8.972014140059058e-08, + "loss": 0.78790808, + "num_input_tokens_seen": 325433595, + "step": 15088, + "time_per_iteration": 2.52335262298584 + }, + { + "auxiliary_loss_clip": 0.01069204, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.03267193, + "balance_loss_mlp": 1.01884389, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 1.8646967162922683, + "language_loss": 0.72960436, + "learning_rate": 8.960483664113038e-08, + "loss": 0.75059831, + "num_input_tokens_seen": 325451605, + "step": 15089, + "time_per_iteration": 2.521923065185547 + }, + { + "auxiliary_loss_clip": 0.01097901, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.033921, + "balance_loss_mlp": 1.01947737, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 1.8579275902803754, + "language_loss": 0.758048, + "learning_rate": 8.948960432404628e-08, + "loss": 0.7793262, + "num_input_tokens_seen": 325470645, + "step": 15090, + "time_per_iteration": 2.4765565395355225 + }, + { + "auxiliary_loss_clip": 0.01084103, + "auxiliary_loss_mlp": 0.01025261, + "balance_loss_clip": 1.03510523, + "balance_loss_mlp": 1.01254177, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 2.657341481218281, + "language_loss": 0.77736521, + "learning_rate": 8.93744444537079e-08, + "loss": 0.79845881, + "num_input_tokens_seen": 325488070, + "step": 15091, + "time_per_iteration": 2.5018060207366943 + }, + { + "auxiliary_loss_clip": 0.0107547, + "auxiliary_loss_mlp": 0.01025236, + "balance_loss_clip": 1.03180051, + "balance_loss_mlp": 1.01450145, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 1.5357102802532638, + "language_loss": 0.85931081, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88031787, + "num_input_tokens_seen": 325509285, + "step": 15092, + "time_per_iteration": 2.5397744178771973 + }, + { + "auxiliary_loss_clip": 0.01081646, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.03746223, + "balance_loss_mlp": 1.02133608, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 1.7970565238517713, + "language_loss": 0.78541827, + "learning_rate": 8.914434207073296e-08, + "loss": 0.80656624, + "num_input_tokens_seen": 325529360, + "step": 15093, + "time_per_iteration": 2.5473592281341553 + }, + { + "auxiliary_loss_clip": 0.01019497, + "auxiliary_loss_mlp": 0.01003853, + "balance_loss_clip": 1.00590062, + "balance_loss_mlp": 1.00275636, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 2.2412402706118058, + "language_loss": 0.57067239, + "learning_rate": 8.902939956682188e-08, + "loss": 0.5909059, + "num_input_tokens_seen": 325583565, + "step": 15094, + "time_per_iteration": 3.0337257385253906 + }, + { + "auxiliary_loss_clip": 0.01094639, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.0344398, + "balance_loss_mlp": 1.0196383, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 1.7832399136651822, + "language_loss": 0.71219671, + "learning_rate": 8.891452952710742e-08, + "loss": 0.73346603, + "num_input_tokens_seen": 325603690, + "step": 15095, + "time_per_iteration": 2.481020450592041 + }, + { + "auxiliary_loss_clip": 0.01061513, + "auxiliary_loss_mlp": 0.01034688, + "balance_loss_clip": 1.03288043, + "balance_loss_mlp": 1.02255225, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 1.744702286742437, + "language_loss": 0.74176717, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76272917, + "num_input_tokens_seen": 325622255, + "step": 15096, + "time_per_iteration": 2.5210654735565186 + }, + { + "auxiliary_loss_clip": 0.01104291, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.03474426, + "balance_loss_mlp": 1.02185369, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 2.466185966409207, + "language_loss": 0.56878674, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59017915, + "num_input_tokens_seen": 325640165, + "step": 15097, + "time_per_iteration": 2.529118061065674 + }, + { + "auxiliary_loss_clip": 0.01089101, + "auxiliary_loss_mlp": 0.01024686, + "balance_loss_clip": 1.0328269, + "balance_loss_mlp": 1.01360571, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 1.6585799042303067, + "language_loss": 0.7966404, + "learning_rate": 8.857035423668935e-08, + "loss": 0.81777829, + "num_input_tokens_seen": 325659455, + "step": 15098, + "time_per_iteration": 2.4470677375793457 + }, + { + "auxiliary_loss_clip": 0.01061223, + "auxiliary_loss_mlp": 0.00783456, + "balance_loss_clip": 1.03310108, + "balance_loss_mlp": 1.01052928, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 2.2553730754018964, + "language_loss": 0.65867376, + "learning_rate": 8.845577409729266e-08, + "loss": 0.67712057, + "num_input_tokens_seen": 325678095, + "step": 15099, + "time_per_iteration": 2.6100351810455322 + }, + { + "auxiliary_loss_clip": 0.01084051, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.03470302, + "balance_loss_mlp": 1.0211519, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 5.619238103681348, + "language_loss": 0.70391822, + "learning_rate": 8.834126644384477e-08, + "loss": 0.72509509, + "num_input_tokens_seen": 325695825, + "step": 15100, + "time_per_iteration": 2.504565954208374 + }, + { + "auxiliary_loss_clip": 0.01018598, + "auxiliary_loss_mlp": 0.01000251, + "balance_loss_clip": 1.0055865, + "balance_loss_mlp": 0.99929124, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.6222657049378155, + "language_loss": 0.53412986, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55431831, + "num_input_tokens_seen": 325764515, + "step": 15101, + "time_per_iteration": 4.539974689483643 + }, + { + "auxiliary_loss_clip": 0.01068302, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.03285706, + "balance_loss_mlp": 1.01834977, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 1.6772100063946953, + "language_loss": 0.68603516, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70702624, + "num_input_tokens_seen": 325783235, + "step": 15102, + "time_per_iteration": 2.571366786956787 + }, + { + "auxiliary_loss_clip": 0.01088958, + "auxiliary_loss_mlp": 0.01030424, + "balance_loss_clip": 1.03465748, + "balance_loss_mlp": 1.01871789, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 1.75097999620067, + "language_loss": 0.79399902, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81519288, + "num_input_tokens_seen": 325800195, + "step": 15103, + "time_per_iteration": 3.851489782333374 + }, + { + "auxiliary_loss_clip": 0.010767, + "auxiliary_loss_mlp": 0.01028896, + "balance_loss_clip": 1.0326333, + "balance_loss_mlp": 1.01670074, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 1.6359337481462783, + "language_loss": 0.71447378, + "learning_rate": 8.78839607763413e-08, + "loss": 0.73552978, + "num_input_tokens_seen": 325820215, + "step": 15104, + "time_per_iteration": 3.996955394744873 + }, + { + "auxiliary_loss_clip": 0.01079119, + "auxiliary_loss_mlp": 0.01023734, + "balance_loss_clip": 1.03309488, + "balance_loss_mlp": 1.01277876, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 1.724763069653292, + "language_loss": 0.77333128, + "learning_rate": 8.77698156177138e-08, + "loss": 0.7943598, + "num_input_tokens_seen": 325838415, + "step": 15105, + "time_per_iteration": 2.5533816814422607 + }, + { + "auxiliary_loss_clip": 0.01102056, + "auxiliary_loss_mlp": 0.007841, + "balance_loss_clip": 1.03343272, + "balance_loss_mlp": 1.00872827, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 2.029565226740009, + "language_loss": 0.73792452, + "learning_rate": 8.765574297104628e-08, + "loss": 0.75678605, + "num_input_tokens_seen": 325855580, + "step": 15106, + "time_per_iteration": 2.480668783187866 + }, + { + "auxiliary_loss_clip": 0.0105441, + "auxiliary_loss_mlp": 0.01036532, + "balance_loss_clip": 1.02976036, + "balance_loss_mlp": 1.02361608, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 1.649148060120071, + "language_loss": 0.80815518, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82906461, + "num_input_tokens_seen": 325874890, + "step": 15107, + "time_per_iteration": 2.6020498275756836 + }, + { + "auxiliary_loss_clip": 0.01009491, + "auxiliary_loss_mlp": 0.01000746, + "balance_loss_clip": 1.01115322, + "balance_loss_mlp": 0.99954224, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8157479423913526, + "language_loss": 0.59753883, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61764115, + "num_input_tokens_seen": 325935835, + "step": 15108, + "time_per_iteration": 3.0848288536071777 + }, + { + "auxiliary_loss_clip": 0.01080809, + "auxiliary_loss_mlp": 0.01025019, + "balance_loss_clip": 1.03317761, + "balance_loss_mlp": 1.01324117, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 1.6736156408161018, + "language_loss": 0.73613268, + "learning_rate": 8.73139601460482e-08, + "loss": 0.75719094, + "num_input_tokens_seen": 325958035, + "step": 15109, + "time_per_iteration": 2.6182143688201904 + }, + { + "auxiliary_loss_clip": 0.01068586, + "auxiliary_loss_mlp": 0.01027044, + "balance_loss_clip": 1.03318846, + "balance_loss_mlp": 1.01596928, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 1.8677951231272971, + "language_loss": 0.71138507, + "learning_rate": 8.720017759045073e-08, + "loss": 0.73234141, + "num_input_tokens_seen": 325979870, + "step": 15110, + "time_per_iteration": 2.573913812637329 + }, + { + "auxiliary_loss_clip": 0.01073554, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.0303967, + "balance_loss_mlp": 1.02009189, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 2.038602390331913, + "language_loss": 0.68802583, + "learning_rate": 8.708646756841421e-08, + "loss": 0.70909023, + "num_input_tokens_seen": 325998245, + "step": 15111, + "time_per_iteration": 4.00778341293335 + }, + { + "auxiliary_loss_clip": 0.01006004, + "auxiliary_loss_mlp": 0.01006885, + "balance_loss_clip": 1.00628424, + "balance_loss_mlp": 1.00579977, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.6884383534786601, + "language_loss": 0.51723611, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53736496, + "num_input_tokens_seen": 326061770, + "step": 15112, + "time_per_iteration": 3.186412811279297 + }, + { + "auxiliary_loss_clip": 0.01091914, + "auxiliary_loss_mlp": 0.01032619, + "balance_loss_clip": 1.03230214, + "balance_loss_mlp": 1.02054334, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 9.767465113500998, + "language_loss": 0.70166671, + "learning_rate": 8.685926514226837e-08, + "loss": 0.72291207, + "num_input_tokens_seen": 326080945, + "step": 15113, + "time_per_iteration": 2.4627254009246826 + }, + { + "auxiliary_loss_clip": 0.01091499, + "auxiliary_loss_mlp": 0.01030208, + "balance_loss_clip": 1.03477097, + "balance_loss_mlp": 1.01832867, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 2.1242727700146475, + "language_loss": 0.79041469, + "learning_rate": 8.674577274677508e-08, + "loss": 0.8116318, + "num_input_tokens_seen": 326100630, + "step": 15114, + "time_per_iteration": 2.5931005477905273 + }, + { + "auxiliary_loss_clip": 0.01064346, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.03481007, + "balance_loss_mlp": 1.02160263, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 2.0580373948365507, + "language_loss": 0.70347321, + "learning_rate": 8.663235290207405e-08, + "loss": 0.72446263, + "num_input_tokens_seen": 326120145, + "step": 15115, + "time_per_iteration": 2.5566980838775635 + }, + { + "auxiliary_loss_clip": 0.01080968, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.03712893, + "balance_loss_mlp": 1.01973581, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 3.627793528067355, + "language_loss": 0.65902805, + "learning_rate": 8.651900561246561e-08, + "loss": 0.68016648, + "num_input_tokens_seen": 326140715, + "step": 15116, + "time_per_iteration": 2.5683786869049072 + }, + { + "auxiliary_loss_clip": 0.01101651, + "auxiliary_loss_mlp": 0.01035015, + "balance_loss_clip": 1.03554952, + "balance_loss_mlp": 1.02292705, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 1.562272263255014, + "language_loss": 0.6928491, + "learning_rate": 8.640573088224812e-08, + "loss": 0.71421576, + "num_input_tokens_seen": 326159130, + "step": 15117, + "time_per_iteration": 2.447754144668579 + }, + { + "auxiliary_loss_clip": 0.0106322, + "auxiliary_loss_mlp": 0.01026243, + "balance_loss_clip": 1.03549612, + "balance_loss_mlp": 1.01520419, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 1.5826588044783139, + "language_loss": 0.74354196, + "learning_rate": 8.629252871571745e-08, + "loss": 0.7644366, + "num_input_tokens_seen": 326181375, + "step": 15118, + "time_per_iteration": 2.610609292984009 + }, + { + "auxiliary_loss_clip": 0.01078749, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.03317881, + "balance_loss_mlp": 1.01863945, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 2.007914693603318, + "language_loss": 0.73014665, + "learning_rate": 8.617939911716554e-08, + "loss": 0.75125468, + "num_input_tokens_seen": 326199740, + "step": 15119, + "time_per_iteration": 2.481048583984375 + }, + { + "auxiliary_loss_clip": 0.01067701, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.03368413, + "balance_loss_mlp": 1.01680875, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 2.697227891742029, + "language_loss": 0.71587789, + "learning_rate": 8.60663420908827e-08, + "loss": 0.73686373, + "num_input_tokens_seen": 326214350, + "step": 15120, + "time_per_iteration": 2.515573263168335 + }, + { + "auxiliary_loss_clip": 0.0110395, + "auxiliary_loss_mlp": 0.00782394, + "balance_loss_clip": 1.03485262, + "balance_loss_mlp": 1.00761461, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 2.1942164743956525, + "language_loss": 0.6586715, + "learning_rate": 8.595335764115596e-08, + "loss": 0.67753494, + "num_input_tokens_seen": 326234580, + "step": 15121, + "time_per_iteration": 2.473665952682495 + }, + { + "auxiliary_loss_clip": 0.01092707, + "auxiliary_loss_mlp": 0.01035138, + "balance_loss_clip": 1.03364062, + "balance_loss_mlp": 1.02275777, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 1.7861399681198902, + "language_loss": 0.70278525, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72406369, + "num_input_tokens_seen": 326259080, + "step": 15122, + "time_per_iteration": 2.7621140480041504 + }, + { + "auxiliary_loss_clip": 0.01048569, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.0309819, + "balance_loss_mlp": 1.01899111, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 1.2743930714933893, + "language_loss": 0.74576569, + "learning_rate": 8.572760648850575e-08, + "loss": 0.7665596, + "num_input_tokens_seen": 326280175, + "step": 15123, + "time_per_iteration": 2.5868513584136963 + }, + { + "auxiliary_loss_clip": 0.01089351, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.03369999, + "balance_loss_mlp": 1.02170181, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 2.204410803344162, + "language_loss": 0.75456792, + "learning_rate": 8.561483979414253e-08, + "loss": 0.77579153, + "num_input_tokens_seen": 326297990, + "step": 15124, + "time_per_iteration": 2.5146567821502686 + }, + { + "auxiliary_loss_clip": 0.01086514, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.0334034, + "balance_loss_mlp": 1.01957774, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 2.029068599140535, + "language_loss": 0.72301865, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74420124, + "num_input_tokens_seen": 326316735, + "step": 15125, + "time_per_iteration": 2.4909214973449707 + }, + { + "auxiliary_loss_clip": 0.01062545, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.03553414, + "balance_loss_mlp": 1.02117991, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 1.6175448067906075, + "language_loss": 0.7901479, + "learning_rate": 8.538952419072143e-08, + "loss": 0.81110251, + "num_input_tokens_seen": 326334370, + "step": 15126, + "time_per_iteration": 2.5535123348236084 + }, + { + "auxiliary_loss_clip": 0.01065379, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.03478539, + "balance_loss_mlp": 1.01938868, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 1.7147891721318076, + "language_loss": 0.75339663, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77436471, + "num_input_tokens_seen": 326353435, + "step": 15127, + "time_per_iteration": 2.567479133605957 + }, + { + "auxiliary_loss_clip": 0.01031239, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.03054667, + "balance_loss_mlp": 1.02147448, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 1.8501096247579176, + "language_loss": 0.62789106, + "learning_rate": 8.516449899618173e-08, + "loss": 0.64853919, + "num_input_tokens_seen": 326371810, + "step": 15128, + "time_per_iteration": 2.65617299079895 + }, + { + "auxiliary_loss_clip": 0.01068889, + "auxiliary_loss_mlp": 0.01027197, + "balance_loss_clip": 1.03334105, + "balance_loss_mlp": 1.01565135, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 1.6704405515928746, + "language_loss": 0.76270425, + "learning_rate": 8.505209531291013e-08, + "loss": 0.78366506, + "num_input_tokens_seen": 326391380, + "step": 15129, + "time_per_iteration": 2.5499823093414307 + }, + { + "auxiliary_loss_clip": 0.01091158, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.03406358, + "balance_loss_mlp": 1.01796699, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 1.8499632899204919, + "language_loss": 0.832551, + "learning_rate": 8.49397642446552e-08, + "loss": 0.85376, + "num_input_tokens_seen": 326408800, + "step": 15130, + "time_per_iteration": 2.5018978118896484 + }, + { + "auxiliary_loss_clip": 0.01082286, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.03501141, + "balance_loss_mlp": 1.02225828, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 1.747561250765313, + "language_loss": 0.75302672, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77419651, + "num_input_tokens_seen": 326431565, + "step": 15131, + "time_per_iteration": 2.6672606468200684 + }, + { + "auxiliary_loss_clip": 0.01077087, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.03368163, + "balance_loss_mlp": 1.01867342, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 1.800575730294283, + "language_loss": 0.59496212, + "learning_rate": 8.471531997023085e-08, + "loss": 0.61604118, + "num_input_tokens_seen": 326451715, + "step": 15132, + "time_per_iteration": 2.6267762184143066 + }, + { + "auxiliary_loss_clip": 0.01070752, + "auxiliary_loss_mlp": 0.01028844, + "balance_loss_clip": 1.03813553, + "balance_loss_mlp": 1.01775742, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 1.391779471457768, + "language_loss": 0.82482231, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84581828, + "num_input_tokens_seen": 326470855, + "step": 15133, + "time_per_iteration": 2.5484538078308105 + }, + { + "auxiliary_loss_clip": 0.01080251, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.03248644, + "balance_loss_mlp": 1.01858449, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 1.8392812666159402, + "language_loss": 0.73559695, + "learning_rate": 8.449116620695118e-08, + "loss": 0.7567057, + "num_input_tokens_seen": 326490480, + "step": 15134, + "time_per_iteration": 2.6072652339935303 + }, + { + "auxiliary_loss_clip": 0.01075768, + "auxiliary_loss_mlp": 0.01036547, + "balance_loss_clip": 1.03649926, + "balance_loss_mlp": 1.02408409, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 1.599315664024651, + "language_loss": 0.72764778, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74877095, + "num_input_tokens_seen": 326509445, + "step": 15135, + "time_per_iteration": 2.585002899169922 + }, + { + "auxiliary_loss_clip": 0.01091161, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.03495646, + "balance_loss_mlp": 1.01922059, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 1.6236614517384516, + "language_loss": 0.69682944, + "learning_rate": 8.426730298881702e-08, + "loss": 0.71804833, + "num_input_tokens_seen": 326528380, + "step": 15136, + "time_per_iteration": 2.4719502925872803 + }, + { + "auxiliary_loss_clip": 0.00997516, + "auxiliary_loss_mlp": 0.01005145, + "balance_loss_clip": 1.00681674, + "balance_loss_mlp": 1.0041554, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.8256453028880957, + "language_loss": 0.59348655, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61351311, + "num_input_tokens_seen": 326576940, + "step": 15137, + "time_per_iteration": 2.9014508724212646 + }, + { + "auxiliary_loss_clip": 0.01093651, + "auxiliary_loss_mlp": 0.01036751, + "balance_loss_clip": 1.0349195, + "balance_loss_mlp": 1.0250026, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 1.6483095579675058, + "language_loss": 0.82583177, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84713578, + "num_input_tokens_seen": 326596100, + "step": 15138, + "time_per_iteration": 2.477790594100952 + }, + { + "auxiliary_loss_clip": 0.01089231, + "auxiliary_loss_mlp": 0.01025845, + "balance_loss_clip": 1.03542757, + "balance_loss_mlp": 1.01465178, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 1.428094393876634, + "language_loss": 0.81053406, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83168483, + "num_input_tokens_seen": 326615700, + "step": 15139, + "time_per_iteration": 2.541222095489502 + }, + { + "auxiliary_loss_clip": 0.01069719, + "auxiliary_loss_mlp": 0.01034786, + "balance_loss_clip": 1.03379095, + "balance_loss_mlp": 1.02340126, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 1.8002744746847454, + "language_loss": 0.77578467, + "learning_rate": 8.382044832376167e-08, + "loss": 0.7968297, + "num_input_tokens_seen": 326635905, + "step": 15140, + "time_per_iteration": 3.91365909576416 + }, + { + "auxiliary_loss_clip": 0.01102766, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.03490579, + "balance_loss_mlp": 1.01506853, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 2.862114362268329, + "language_loss": 0.66864586, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68994105, + "num_input_tokens_seen": 326661855, + "step": 15141, + "time_per_iteration": 2.599681854248047 + }, + { + "auxiliary_loss_clip": 0.01093797, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.03476024, + "balance_loss_mlp": 1.02316809, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 1.677236535636131, + "language_loss": 0.75029504, + "learning_rate": 8.359745694462005e-08, + "loss": 0.77157879, + "num_input_tokens_seen": 326679320, + "step": 15142, + "time_per_iteration": 5.28966760635376 + }, + { + "auxiliary_loss_clip": 0.01066438, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.03194654, + "balance_loss_mlp": 1.02685905, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 1.657665248418674, + "language_loss": 0.64374077, + "learning_rate": 8.348607025820076e-08, + "loss": 0.66479599, + "num_input_tokens_seen": 326698110, + "step": 15143, + "time_per_iteration": 2.5161027908325195 + }, + { + "auxiliary_loss_clip": 0.01103388, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.03353333, + "balance_loss_mlp": 1.02122581, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 1.8663189342602347, + "language_loss": 0.61063099, + "learning_rate": 8.337475624618152e-08, + "loss": 0.63200396, + "num_input_tokens_seen": 326718370, + "step": 15144, + "time_per_iteration": 2.53768253326416 + }, + { + "auxiliary_loss_clip": 0.01063569, + "auxiliary_loss_mlp": 0.01025234, + "balance_loss_clip": 1.03245962, + "balance_loss_mlp": 1.01327157, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 1.6390736756841775, + "language_loss": 0.71032459, + "learning_rate": 8.326351491278382e-08, + "loss": 0.73121262, + "num_input_tokens_seen": 326738445, + "step": 15145, + "time_per_iteration": 2.5526223182678223 + }, + { + "auxiliary_loss_clip": 0.01051169, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.03341794, + "balance_loss_mlp": 1.02104902, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 1.536228728431362, + "language_loss": 0.70821095, + "learning_rate": 8.315234626222545e-08, + "loss": 0.72904849, + "num_input_tokens_seen": 326758855, + "step": 15146, + "time_per_iteration": 2.6608612537384033 + }, + { + "auxiliary_loss_clip": 0.01080919, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.03255987, + "balance_loss_mlp": 1.01796055, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 1.912540595815008, + "language_loss": 0.72928202, + "learning_rate": 8.304125029872233e-08, + "loss": 0.75038254, + "num_input_tokens_seen": 326777140, + "step": 15147, + "time_per_iteration": 2.5565829277038574 + }, + { + "auxiliary_loss_clip": 0.01074786, + "auxiliary_loss_mlp": 0.01027563, + "balance_loss_clip": 1.03389096, + "balance_loss_mlp": 1.01560068, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 1.8252376938892632, + "language_loss": 0.80517668, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82620019, + "num_input_tokens_seen": 326794070, + "step": 15148, + "time_per_iteration": 2.5213913917541504 + }, + { + "auxiliary_loss_clip": 0.01070528, + "auxiliary_loss_mlp": 0.01034807, + "balance_loss_clip": 1.0331763, + "balance_loss_mlp": 1.02284479, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 2.4584743096862516, + "language_loss": 0.67599833, + "learning_rate": 8.281927644972996e-08, + "loss": 0.69705176, + "num_input_tokens_seen": 326814695, + "step": 15149, + "time_per_iteration": 2.5717239379882812 + }, + { + "auxiliary_loss_clip": 0.01102433, + "auxiliary_loss_mlp": 0.01027684, + "balance_loss_clip": 1.03453052, + "balance_loss_mlp": 1.01523304, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 2.422024116677399, + "language_loss": 0.63487017, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65617138, + "num_input_tokens_seen": 326835295, + "step": 15150, + "time_per_iteration": 3.854471206665039 + }, + { + "auxiliary_loss_clip": 0.0106817, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.03401017, + "balance_loss_mlp": 1.02098751, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 1.7268600322223937, + "language_loss": 0.72377455, + "learning_rate": 8.259759339947514e-08, + "loss": 0.74478543, + "num_input_tokens_seen": 326853350, + "step": 15151, + "time_per_iteration": 2.5510141849517822 + }, + { + "auxiliary_loss_clip": 0.01090456, + "auxiliary_loss_mlp": 0.01026462, + "balance_loss_clip": 1.03387249, + "balance_loss_mlp": 1.01489866, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1.6021136905513762, + "language_loss": 0.64389014, + "learning_rate": 8.248686093438429e-08, + "loss": 0.66505933, + "num_input_tokens_seen": 326873425, + "step": 15152, + "time_per_iteration": 2.543565034866333 + }, + { + "auxiliary_loss_clip": 0.01083152, + "auxiliary_loss_mlp": 0.00782588, + "balance_loss_clip": 1.03503609, + "balance_loss_mlp": 1.00894105, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 2.4409726260036853, + "language_loss": 0.73054779, + "learning_rate": 8.23762011815834e-08, + "loss": 0.74920523, + "num_input_tokens_seen": 326893455, + "step": 15153, + "time_per_iteration": 2.5526320934295654 + }, + { + "auxiliary_loss_clip": 0.01063334, + "auxiliary_loss_mlp": 0.01043237, + "balance_loss_clip": 1.03178906, + "balance_loss_mlp": 1.02872348, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 1.8690995841369131, + "language_loss": 0.72269499, + "learning_rate": 8.226561414526956e-08, + "loss": 0.7437607, + "num_input_tokens_seen": 326910210, + "step": 15154, + "time_per_iteration": 2.505903482437134 + }, + { + "auxiliary_loss_clip": 0.01081029, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.03682518, + "balance_loss_mlp": 1.01987433, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 1.7614167318219531, + "language_loss": 0.81597781, + "learning_rate": 8.215509982963564e-08, + "loss": 0.83710063, + "num_input_tokens_seen": 326929350, + "step": 15155, + "time_per_iteration": 2.521428108215332 + }, + { + "auxiliary_loss_clip": 0.01090023, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.03670216, + "balance_loss_mlp": 1.01714087, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 1.4088192653187965, + "language_loss": 0.59803849, + "learning_rate": 8.204465823887252e-08, + "loss": 0.6192286, + "num_input_tokens_seen": 326949060, + "step": 15156, + "time_per_iteration": 2.4693007469177246 + }, + { + "auxiliary_loss_clip": 0.01093334, + "auxiliary_loss_mlp": 0.01029376, + "balance_loss_clip": 1.03232396, + "balance_loss_mlp": 1.01639414, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 1.7449301427551074, + "language_loss": 0.73997718, + "learning_rate": 8.193428937716796e-08, + "loss": 0.7612043, + "num_input_tokens_seen": 326968950, + "step": 15157, + "time_per_iteration": 2.5131356716156006 + }, + { + "auxiliary_loss_clip": 0.01059676, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.03225136, + "balance_loss_mlp": 1.02020955, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 1.5867178827115824, + "language_loss": 0.59214401, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61305869, + "num_input_tokens_seen": 326989455, + "step": 15158, + "time_per_iteration": 2.656273365020752 + }, + { + "auxiliary_loss_clip": 0.01044519, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.03401721, + "balance_loss_mlp": 1.01932716, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 1.5205388243803153, + "language_loss": 0.677935, + "learning_rate": 8.171376985767375e-08, + "loss": 0.69868279, + "num_input_tokens_seen": 327009640, + "step": 15159, + "time_per_iteration": 2.621690273284912 + }, + { + "auxiliary_loss_clip": 0.01082969, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.03413177, + "balance_loss_mlp": 1.01856029, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 2.108185650842786, + "language_loss": 0.78538024, + "learning_rate": 8.160361920824588e-08, + "loss": 0.80651224, + "num_input_tokens_seen": 327027690, + "step": 15160, + "time_per_iteration": 2.5487401485443115 + }, + { + "auxiliary_loss_clip": 0.01105515, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.03634834, + "balance_loss_mlp": 1.01484466, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 1.6818077181670759, + "language_loss": 0.6919049, + "learning_rate": 8.149354130460073e-08, + "loss": 0.71323764, + "num_input_tokens_seen": 327045915, + "step": 15161, + "time_per_iteration": 2.4342257976531982 + }, + { + "auxiliary_loss_clip": 0.01059331, + "auxiliary_loss_mlp": 0.01035883, + "balance_loss_clip": 1.03448546, + "balance_loss_mlp": 1.02217984, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 1.7959963483469137, + "language_loss": 0.76191264, + "learning_rate": 8.138353615091321e-08, + "loss": 0.78286481, + "num_input_tokens_seen": 327066355, + "step": 15162, + "time_per_iteration": 2.575934410095215 + }, + { + "auxiliary_loss_clip": 0.01085865, + "auxiliary_loss_mlp": 0.01035348, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.02314091, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 1.844987188877883, + "language_loss": 0.66687107, + "learning_rate": 8.127360375135395e-08, + "loss": 0.68808317, + "num_input_tokens_seen": 327086735, + "step": 15163, + "time_per_iteration": 2.539062023162842 + }, + { + "auxiliary_loss_clip": 0.01061531, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.03390074, + "balance_loss_mlp": 1.02007008, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 2.347910428817761, + "language_loss": 0.70756608, + "learning_rate": 8.116374411009186e-08, + "loss": 0.72850609, + "num_input_tokens_seen": 327104035, + "step": 15164, + "time_per_iteration": 2.5566039085388184 + }, + { + "auxiliary_loss_clip": 0.01103031, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.03769982, + "balance_loss_mlp": 1.02252269, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 1.847118481264255, + "language_loss": 0.75997305, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78134549, + "num_input_tokens_seen": 327124370, + "step": 15165, + "time_per_iteration": 2.4734737873077393 + }, + { + "auxiliary_loss_clip": 0.01088092, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.0330714, + "balance_loss_mlp": 1.02165174, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 2.244263561887781, + "language_loss": 0.71766567, + "learning_rate": 8.094424311912074e-08, + "loss": 0.73888755, + "num_input_tokens_seen": 327140915, + "step": 15166, + "time_per_iteration": 2.493412733078003 + }, + { + "auxiliary_loss_clip": 0.01063059, + "auxiliary_loss_mlp": 0.01034988, + "balance_loss_clip": 1.03325582, + "balance_loss_mlp": 1.02189279, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 1.795201541151311, + "language_loss": 0.72947091, + "learning_rate": 8.083460177773482e-08, + "loss": 0.75045133, + "num_input_tokens_seen": 327158940, + "step": 15167, + "time_per_iteration": 2.576633930206299 + }, + { + "auxiliary_loss_clip": 0.01016228, + "auxiliary_loss_mlp": 0.01000736, + "balance_loss_clip": 1.01231146, + "balance_loss_mlp": 0.9996509, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.7703537655122893, + "language_loss": 0.65596569, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67613536, + "num_input_tokens_seen": 327217450, + "step": 15168, + "time_per_iteration": 3.1078052520751953 + }, + { + "auxiliary_loss_clip": 0.01072494, + "auxiliary_loss_mlp": 0.01027145, + "balance_loss_clip": 1.03497112, + "balance_loss_mlp": 1.01594496, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 3.193006047904742, + "language_loss": 0.78030074, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80129719, + "num_input_tokens_seen": 327233905, + "step": 15169, + "time_per_iteration": 2.488603353500366 + }, + { + "auxiliary_loss_clip": 0.01091745, + "auxiliary_loss_mlp": 0.01027541, + "balance_loss_clip": 1.03409445, + "balance_loss_mlp": 1.01588237, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 1.7121976464685618, + "language_loss": 0.82143992, + "learning_rate": 8.05061144198591e-08, + "loss": 0.84263277, + "num_input_tokens_seen": 327252430, + "step": 15170, + "time_per_iteration": 2.4807143211364746 + }, + { + "auxiliary_loss_clip": 0.01095271, + "auxiliary_loss_mlp": 0.0102926, + "balance_loss_clip": 1.03615928, + "balance_loss_mlp": 1.01717257, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 1.9761934699210726, + "language_loss": 0.77158678, + "learning_rate": 8.039676420316799e-08, + "loss": 0.79283214, + "num_input_tokens_seen": 327269215, + "step": 15171, + "time_per_iteration": 2.4437942504882812 + }, + { + "auxiliary_loss_clip": 0.01027968, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.03072512, + "balance_loss_mlp": 1.02438164, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 1.4028798080328464, + "language_loss": 0.67115891, + "learning_rate": 8.02874867780241e-08, + "loss": 0.69181228, + "num_input_tokens_seen": 327290320, + "step": 15172, + "time_per_iteration": 2.6797919273376465 + }, + { + "auxiliary_loss_clip": 0.01075486, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.03506625, + "balance_loss_mlp": 1.02062821, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 1.5788247703294447, + "language_loss": 0.74518114, + "learning_rate": 8.017828214857103e-08, + "loss": 0.7662614, + "num_input_tokens_seen": 327310150, + "step": 15173, + "time_per_iteration": 2.532552719116211 + }, + { + "auxiliary_loss_clip": 0.01087266, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.03569365, + "balance_loss_mlp": 1.01677513, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 2.221659098123988, + "language_loss": 0.66076577, + "learning_rate": 8.00691503189499e-08, + "loss": 0.68194652, + "num_input_tokens_seen": 327326660, + "step": 15174, + "time_per_iteration": 2.4776573181152344 + }, + { + "auxiliary_loss_clip": 0.01091913, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.03490925, + "balance_loss_mlp": 1.01621461, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 1.6220826741564516, + "language_loss": 0.74784768, + "learning_rate": 7.996009129329894e-08, + "loss": 0.76906174, + "num_input_tokens_seen": 327346700, + "step": 15175, + "time_per_iteration": 2.4994077682495117 + }, + { + "auxiliary_loss_clip": 0.01019078, + "auxiliary_loss_mlp": 0.01000308, + "balance_loss_clip": 1.00687671, + "balance_loss_mlp": 0.9993183, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 0.9696584009871491, + "language_loss": 0.58397275, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60416663, + "num_input_tokens_seen": 327403050, + "step": 15176, + "time_per_iteration": 3.1411426067352295 + }, + { + "auxiliary_loss_clip": 0.01080234, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.03386402, + "balance_loss_mlp": 1.02427363, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 1.8382190823554097, + "language_loss": 0.65431964, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67548382, + "num_input_tokens_seen": 327422225, + "step": 15177, + "time_per_iteration": 2.4917290210723877 + }, + { + "auxiliary_loss_clip": 0.01073075, + "auxiliary_loss_mlp": 0.01028824, + "balance_loss_clip": 1.03448272, + "balance_loss_mlp": 1.0172255, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 1.8047162626306705, + "language_loss": 0.81074136, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83176029, + "num_input_tokens_seen": 327437025, + "step": 15178, + "time_per_iteration": 2.476388692855835 + }, + { + "auxiliary_loss_clip": 0.01045116, + "auxiliary_loss_mlp": 0.01034116, + "balance_loss_clip": 1.03111315, + "balance_loss_mlp": 1.02079463, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 1.9768309127783001, + "language_loss": 0.78558117, + "learning_rate": 7.952458331306711e-08, + "loss": 0.80637348, + "num_input_tokens_seen": 327453915, + "step": 15179, + "time_per_iteration": 3.9671101570129395 + }, + { + "auxiliary_loss_clip": 0.01080072, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.03499341, + "balance_loss_mlp": 1.01956165, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 2.6756077093910493, + "language_loss": 0.68238622, + "learning_rate": 7.941588836924507e-08, + "loss": 0.70349514, + "num_input_tokens_seen": 327474415, + "step": 15180, + "time_per_iteration": 3.973003387451172 + }, + { + "auxiliary_loss_clip": 0.0109035, + "auxiliary_loss_mlp": 0.01028379, + "balance_loss_clip": 1.03345764, + "balance_loss_mlp": 1.01732206, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 1.7818160170422686, + "language_loss": 0.75396931, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77515662, + "num_input_tokens_seen": 327492750, + "step": 15181, + "time_per_iteration": 3.866295337677002 + }, + { + "auxiliary_loss_clip": 0.0110832, + "auxiliary_loss_mlp": 0.01031221, + "balance_loss_clip": 1.03667021, + "balance_loss_mlp": 1.01946712, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 1.6213925091167896, + "language_loss": 0.74515462, + "learning_rate": 7.919871697194614e-08, + "loss": 0.76655006, + "num_input_tokens_seen": 327509470, + "step": 15182, + "time_per_iteration": 2.4581503868103027 + }, + { + "auxiliary_loss_clip": 0.0110567, + "auxiliary_loss_mlp": 0.01031066, + "balance_loss_clip": 1.03498054, + "balance_loss_mlp": 1.01898992, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 1.4249246270273057, + "language_loss": 0.76172614, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78309351, + "num_input_tokens_seen": 327530520, + "step": 15183, + "time_per_iteration": 2.4724552631378174 + }, + { + "auxiliary_loss_clip": 0.01092117, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.03620696, + "balance_loss_mlp": 1.01790452, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 2.3791410384019076, + "language_loss": 0.76571149, + "learning_rate": 7.898183692255256e-08, + "loss": 0.78693366, + "num_input_tokens_seen": 327546960, + "step": 15184, + "time_per_iteration": 2.4713759422302246 + }, + { + "auxiliary_loss_clip": 0.01095092, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.03646791, + "balance_loss_mlp": 1.02139401, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 2.5169228750993935, + "language_loss": 0.74430025, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76558089, + "num_input_tokens_seen": 327564830, + "step": 15185, + "time_per_iteration": 2.4522485733032227 + }, + { + "auxiliary_loss_clip": 0.01078801, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.03557575, + "balance_loss_mlp": 1.01714396, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 2.8940181622671033, + "language_loss": 0.685534, + "learning_rate": 7.876524825396158e-08, + "loss": 0.70660925, + "num_input_tokens_seen": 327583675, + "step": 15186, + "time_per_iteration": 2.5118305683135986 + }, + { + "auxiliary_loss_clip": 0.01085336, + "auxiliary_loss_mlp": 0.01040961, + "balance_loss_clip": 1.03480864, + "balance_loss_mlp": 1.02652478, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 1.8318240751515744, + "language_loss": 0.76988304, + "learning_rate": 7.865706319773502e-08, + "loss": 0.79114604, + "num_input_tokens_seen": 327602280, + "step": 15187, + "time_per_iteration": 2.4929161071777344 + }, + { + "auxiliary_loss_clip": 0.01103691, + "auxiliary_loss_mlp": 0.00783387, + "balance_loss_clip": 1.03455687, + "balance_loss_mlp": 1.00981593, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 1.7353367447662638, + "language_loss": 0.6571964, + "learning_rate": 7.854895099902515e-08, + "loss": 0.67606717, + "num_input_tokens_seen": 327623515, + "step": 15188, + "time_per_iteration": 3.905729293823242 + }, + { + "auxiliary_loss_clip": 0.01031391, + "auxiliary_loss_mlp": 0.01039265, + "balance_loss_clip": 1.03114414, + "balance_loss_mlp": 1.02667058, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 1.710035710200227, + "language_loss": 0.76360309, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78430969, + "num_input_tokens_seen": 327642875, + "step": 15189, + "time_per_iteration": 2.610316276550293 + }, + { + "auxiliary_loss_clip": 0.01091991, + "auxiliary_loss_mlp": 0.01027283, + "balance_loss_clip": 1.03427374, + "balance_loss_mlp": 1.01642907, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 2.025078922126337, + "language_loss": 0.75336313, + "learning_rate": 7.8332945190551e-08, + "loss": 0.77455592, + "num_input_tokens_seen": 327662450, + "step": 15190, + "time_per_iteration": 2.502211332321167 + }, + { + "auxiliary_loss_clip": 0.01019262, + "auxiliary_loss_mlp": 0.01001654, + "balance_loss_clip": 1.00663352, + "balance_loss_mlp": 1.00062871, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.7056753138125434, + "language_loss": 0.57360995, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59381914, + "num_input_tokens_seen": 327723845, + "step": 15191, + "time_per_iteration": 3.1207058429718018 + }, + { + "auxiliary_loss_clip": 0.01106582, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.03582692, + "balance_loss_mlp": 1.023525, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 1.7631765485024358, + "language_loss": 0.74288034, + "learning_rate": 7.81172308613034e-08, + "loss": 0.76430655, + "num_input_tokens_seen": 327742590, + "step": 15192, + "time_per_iteration": 2.5117266178131104 + }, + { + "auxiliary_loss_clip": 0.01091218, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.03589416, + "balance_loss_mlp": 1.01790714, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 1.892299827009024, + "language_loss": 0.69448847, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71569699, + "num_input_tokens_seen": 327764350, + "step": 15193, + "time_per_iteration": 2.64632511138916 + }, + { + "auxiliary_loss_clip": 0.01089118, + "auxiliary_loss_mlp": 0.01037445, + "balance_loss_clip": 1.0340941, + "balance_loss_mlp": 1.02640605, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 1.7980069471936975, + "language_loss": 0.72914159, + "learning_rate": 7.790180804400215e-08, + "loss": 0.75040728, + "num_input_tokens_seen": 327783120, + "step": 15194, + "time_per_iteration": 2.5153708457946777 + }, + { + "auxiliary_loss_clip": 0.01058349, + "auxiliary_loss_mlp": 0.01038246, + "balance_loss_clip": 1.03189826, + "balance_loss_mlp": 1.02306485, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 1.887059621456418, + "language_loss": 0.61571169, + "learning_rate": 7.779420596254383e-08, + "loss": 0.63667762, + "num_input_tokens_seen": 327801960, + "step": 15195, + "time_per_iteration": 2.583388328552246 + }, + { + "auxiliary_loss_clip": 0.01093711, + "auxiliary_loss_mlp": 0.01031083, + "balance_loss_clip": 1.03417325, + "balance_loss_mlp": 1.01936495, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 1.7255560454772492, + "language_loss": 0.71543157, + "learning_rate": 7.768667677132201e-08, + "loss": 0.73667955, + "num_input_tokens_seen": 327823795, + "step": 15196, + "time_per_iteration": 2.54146409034729 + }, + { + "auxiliary_loss_clip": 0.01081704, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.03430557, + "balance_loss_mlp": 1.01862705, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 1.593450803067473, + "language_loss": 0.71213359, + "learning_rate": 7.757922047441411e-08, + "loss": 0.73324925, + "num_input_tokens_seen": 327845175, + "step": 15197, + "time_per_iteration": 2.5599172115325928 + }, + { + "auxiliary_loss_clip": 0.01083912, + "auxiliary_loss_mlp": 0.010283, + "balance_loss_clip": 1.03256869, + "balance_loss_mlp": 1.01581311, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 1.6953227771696529, + "language_loss": 0.77668774, + "learning_rate": 7.747183707589489e-08, + "loss": 0.7978099, + "num_input_tokens_seen": 327863150, + "step": 15198, + "time_per_iteration": 2.508389711380005 + }, + { + "auxiliary_loss_clip": 0.01085314, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.03377008, + "balance_loss_mlp": 1.0219748, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 1.3716105891603294, + "language_loss": 0.67839092, + "learning_rate": 7.736452657983616e-08, + "loss": 0.6995821, + "num_input_tokens_seen": 327883445, + "step": 15199, + "time_per_iteration": 2.5115697383880615 + }, + { + "auxiliary_loss_clip": 0.01093319, + "auxiliary_loss_mlp": 0.0078168, + "balance_loss_clip": 1.03373456, + "balance_loss_mlp": 1.00690222, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 1.5814305302636624, + "language_loss": 0.67475533, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69350535, + "num_input_tokens_seen": 327905745, + "step": 15200, + "time_per_iteration": 2.5348875522613525 + }, + { + "auxiliary_loss_clip": 0.01092147, + "auxiliary_loss_mlp": 0.01033132, + "balance_loss_clip": 1.03650963, + "balance_loss_mlp": 1.02237296, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 1.4927866903032523, + "language_loss": 0.70963591, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73088872, + "num_input_tokens_seen": 327925435, + "step": 15201, + "time_per_iteration": 2.5108840465545654 + }, + { + "auxiliary_loss_clip": 0.01090683, + "auxiliary_loss_mlp": 0.01025238, + "balance_loss_clip": 1.03297162, + "balance_loss_mlp": 1.01450336, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 1.7052579123532463, + "language_loss": 0.70887625, + "learning_rate": 7.704303254710165e-08, + "loss": 0.73003542, + "num_input_tokens_seen": 327944145, + "step": 15202, + "time_per_iteration": 2.4544639587402344 + }, + { + "auxiliary_loss_clip": 0.01101823, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.03381991, + "balance_loss_mlp": 1.01709151, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 1.889604016206173, + "language_loss": 0.66631591, + "learning_rate": 7.693601370155001e-08, + "loss": 0.68762767, + "num_input_tokens_seen": 327960565, + "step": 15203, + "time_per_iteration": 2.427597999572754 + }, + { + "auxiliary_loss_clip": 0.01093887, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.03534365, + "balance_loss_mlp": 1.01727974, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 2.602458322707884, + "language_loss": 0.68763161, + "learning_rate": 7.682906777877751e-08, + "loss": 0.7088716, + "num_input_tokens_seen": 327981180, + "step": 15204, + "time_per_iteration": 2.494999647140503 + }, + { + "auxiliary_loss_clip": 0.01092242, + "auxiliary_loss_mlp": 0.01025854, + "balance_loss_clip": 1.03198791, + "balance_loss_mlp": 1.01334918, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 1.9676829716553483, + "language_loss": 0.59464562, + "learning_rate": 7.672219478283915e-08, + "loss": 0.61582661, + "num_input_tokens_seen": 328001500, + "step": 15205, + "time_per_iteration": 2.4988577365875244 + }, + { + "auxiliary_loss_clip": 0.01070037, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.03439879, + "balance_loss_mlp": 1.02064431, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 1.6490646816237196, + "language_loss": 0.81209642, + "learning_rate": 7.661539471778811e-08, + "loss": 0.8331238, + "num_input_tokens_seen": 328023025, + "step": 15206, + "time_per_iteration": 2.5976977348327637 + }, + { + "auxiliary_loss_clip": 0.01058266, + "auxiliary_loss_mlp": 0.01025342, + "balance_loss_clip": 1.03140426, + "balance_loss_mlp": 1.01290214, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 2.607041190318014, + "language_loss": 0.74203533, + "learning_rate": 7.650866758767382e-08, + "loss": 0.76287138, + "num_input_tokens_seen": 328041410, + "step": 15207, + "time_per_iteration": 2.569094181060791 + }, + { + "auxiliary_loss_clip": 0.01059657, + "auxiliary_loss_mlp": 0.01035091, + "balance_loss_clip": 1.03666973, + "balance_loss_mlp": 1.02262783, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 1.5981545898539251, + "language_loss": 0.72683394, + "learning_rate": 7.640201339654373e-08, + "loss": 0.7477814, + "num_input_tokens_seen": 328060495, + "step": 15208, + "time_per_iteration": 2.5695676803588867 + }, + { + "auxiliary_loss_clip": 0.01092577, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.03584576, + "balance_loss_mlp": 1.01792264, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 2.469356857014516, + "language_loss": 0.86421204, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88543081, + "num_input_tokens_seen": 328076905, + "step": 15209, + "time_per_iteration": 2.457808494567871 + }, + { + "auxiliary_loss_clip": 0.01083327, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.0354315, + "balance_loss_mlp": 1.02206969, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 5.850777385733918, + "language_loss": 0.74835336, + "learning_rate": 7.618892384741093e-08, + "loss": 0.76951927, + "num_input_tokens_seen": 328096960, + "step": 15210, + "time_per_iteration": 2.556257963180542 + }, + { + "auxiliary_loss_clip": 0.01078972, + "auxiliary_loss_mlp": 0.01032984, + "balance_loss_clip": 1.03025222, + "balance_loss_mlp": 1.02060962, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 1.8703677993087247, + "language_loss": 0.78117633, + "learning_rate": 7.6082488497488e-08, + "loss": 0.80229592, + "num_input_tokens_seen": 328115445, + "step": 15211, + "time_per_iteration": 2.550583600997925 + }, + { + "auxiliary_loss_clip": 0.01094732, + "auxiliary_loss_mlp": 0.010274, + "balance_loss_clip": 1.03520751, + "balance_loss_mlp": 1.0158366, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 1.6127854192556228, + "language_loss": 0.82749951, + "learning_rate": 7.597612610270986e-08, + "loss": 0.84872091, + "num_input_tokens_seen": 328133965, + "step": 15212, + "time_per_iteration": 2.470095634460449 + }, + { + "auxiliary_loss_clip": 0.01089738, + "auxiliary_loss_mlp": 0.01025991, + "balance_loss_clip": 1.03419268, + "balance_loss_mlp": 1.01501203, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 1.6964770833881595, + "language_loss": 0.84057951, + "learning_rate": 7.586983666711022e-08, + "loss": 0.86173677, + "num_input_tokens_seen": 328151520, + "step": 15213, + "time_per_iteration": 2.576432943344116 + }, + { + "auxiliary_loss_clip": 0.0109535, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.0368619, + "balance_loss_mlp": 1.0162127, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 1.6403408187422437, + "language_loss": 0.71157622, + "learning_rate": 7.576362019471894e-08, + "loss": 0.73280644, + "num_input_tokens_seen": 328171275, + "step": 15214, + "time_per_iteration": 2.4651708602905273 + }, + { + "auxiliary_loss_clip": 0.01096369, + "auxiliary_loss_mlp": 0.01038037, + "balance_loss_clip": 1.03661227, + "balance_loss_mlp": 1.02515626, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 1.4786344595923366, + "language_loss": 0.62475967, + "learning_rate": 7.565747668956413e-08, + "loss": 0.64610374, + "num_input_tokens_seen": 328192115, + "step": 15215, + "time_per_iteration": 2.5024194717407227 + }, + { + "auxiliary_loss_clip": 0.01075644, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.03626633, + "balance_loss_mlp": 1.01673567, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 2.3673246554120637, + "language_loss": 0.76168454, + "learning_rate": 7.555140615567058e-08, + "loss": 0.78273523, + "num_input_tokens_seen": 328208990, + "step": 15216, + "time_per_iteration": 2.5032505989074707 + }, + { + "auxiliary_loss_clip": 0.01079846, + "auxiliary_loss_mlp": 0.01040021, + "balance_loss_clip": 1.03550208, + "balance_loss_mlp": 1.02578175, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 2.227401699096528, + "language_loss": 0.67912281, + "learning_rate": 7.544540859706062e-08, + "loss": 0.7003215, + "num_input_tokens_seen": 328227840, + "step": 15217, + "time_per_iteration": 3.9342377185821533 + }, + { + "auxiliary_loss_clip": 0.01090905, + "auxiliary_loss_mlp": 0.01031997, + "balance_loss_clip": 1.03505516, + "balance_loss_mlp": 1.02046323, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 1.8868874354915137, + "language_loss": 0.79695582, + "learning_rate": 7.533948401775347e-08, + "loss": 0.81818485, + "num_input_tokens_seen": 328246250, + "step": 15218, + "time_per_iteration": 2.4437167644500732 + }, + { + "auxiliary_loss_clip": 0.00999198, + "auxiliary_loss_mlp": 0.00998888, + "balance_loss_clip": 1.01008701, + "balance_loss_mlp": 0.99784464, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.8501912802934998, + "language_loss": 0.59253043, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61251128, + "num_input_tokens_seen": 328303625, + "step": 15219, + "time_per_iteration": 4.493297100067139 + }, + { + "auxiliary_loss_clip": 0.01090547, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.03388655, + "balance_loss_mlp": 1.02098846, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 2.0470272546149606, + "language_loss": 0.78511107, + "learning_rate": 7.512785381311216e-08, + "loss": 0.80634683, + "num_input_tokens_seen": 328322135, + "step": 15220, + "time_per_iteration": 4.135571241378784 + }, + { + "auxiliary_loss_clip": 0.01052013, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.03169465, + "balance_loss_mlp": 1.02433264, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 1.835562285105488, + "language_loss": 0.65225261, + "learning_rate": 7.50221481958031e-08, + "loss": 0.67315859, + "num_input_tokens_seen": 328340750, + "step": 15221, + "time_per_iteration": 2.586873769760132 + }, + { + "auxiliary_loss_clip": 0.01081535, + "auxiliary_loss_mlp": 0.01027328, + "balance_loss_clip": 1.03426218, + "balance_loss_mlp": 1.01621211, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 1.696118308215638, + "language_loss": 0.84125322, + "learning_rate": 7.491651557384692e-08, + "loss": 0.86234188, + "num_input_tokens_seen": 328359995, + "step": 15222, + "time_per_iteration": 2.4941346645355225 + }, + { + "auxiliary_loss_clip": 0.01016509, + "auxiliary_loss_mlp": 0.00999777, + "balance_loss_clip": 1.01327157, + "balance_loss_mlp": 0.99866802, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.7209813156122196, + "language_loss": 0.49587774, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51604062, + "num_input_tokens_seen": 328426865, + "step": 15223, + "time_per_iteration": 3.17991042137146 + }, + { + "auxiliary_loss_clip": 0.01072559, + "auxiliary_loss_mlp": 0.0103777, + "balance_loss_clip": 1.03468978, + "balance_loss_mlp": 1.02547991, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 2.064037999580253, + "language_loss": 0.72484177, + "learning_rate": 7.470546933201349e-08, + "loss": 0.7459451, + "num_input_tokens_seen": 328445970, + "step": 15224, + "time_per_iteration": 2.549142837524414 + }, + { + "auxiliary_loss_clip": 0.01087815, + "auxiliary_loss_mlp": 0.01026843, + "balance_loss_clip": 1.03498626, + "balance_loss_mlp": 1.01522064, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 1.7811175316814336, + "language_loss": 0.80972528, + "learning_rate": 7.460005572013895e-08, + "loss": 0.83087182, + "num_input_tokens_seen": 328464585, + "step": 15225, + "time_per_iteration": 2.5327799320220947 + }, + { + "auxiliary_loss_clip": 0.01101844, + "auxiliary_loss_mlp": 0.01025215, + "balance_loss_clip": 1.03353608, + "balance_loss_mlp": 1.01385391, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 1.4265109308930939, + "language_loss": 0.71363032, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73490089, + "num_input_tokens_seen": 328490155, + "step": 15226, + "time_per_iteration": 2.571580648422241 + }, + { + "auxiliary_loss_clip": 0.01041381, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.03219151, + "balance_loss_mlp": 1.01999545, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 1.9719859550191061, + "language_loss": 0.74764729, + "learning_rate": 7.43894475344613e-08, + "loss": 0.76838779, + "num_input_tokens_seen": 328508275, + "step": 15227, + "time_per_iteration": 4.0892333984375 + }, + { + "auxiliary_loss_clip": 0.01080331, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.03388619, + "balance_loss_mlp": 1.02097845, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 1.4051392683342, + "language_loss": 0.73711002, + "learning_rate": 7.428425296864404e-08, + "loss": 0.75823915, + "num_input_tokens_seen": 328529425, + "step": 15228, + "time_per_iteration": 2.5695133209228516 + }, + { + "auxiliary_loss_clip": 0.01066173, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.0337007, + "balance_loss_mlp": 1.01856732, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 1.5777902556819563, + "language_loss": 0.71672487, + "learning_rate": 7.417913142616106e-08, + "loss": 0.73768663, + "num_input_tokens_seen": 328550200, + "step": 15229, + "time_per_iteration": 2.551664113998413 + }, + { + "auxiliary_loss_clip": 0.0110655, + "auxiliary_loss_mlp": 0.01033515, + "balance_loss_clip": 1.03718281, + "balance_loss_mlp": 1.02044368, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 1.5545659667759513, + "language_loss": 0.82961726, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85101783, + "num_input_tokens_seen": 328568540, + "step": 15230, + "time_per_iteration": 2.468846082687378 + }, + { + "auxiliary_loss_clip": 0.01060894, + "auxiliary_loss_mlp": 0.010268, + "balance_loss_clip": 1.03248703, + "balance_loss_mlp": 1.01560664, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 1.6928724830962776, + "language_loss": 0.83439898, + "learning_rate": 7.396910742713957e-08, + "loss": 0.85527593, + "num_input_tokens_seen": 328587300, + "step": 15231, + "time_per_iteration": 2.6174890995025635 + }, + { + "auxiliary_loss_clip": 0.01087135, + "auxiliary_loss_mlp": 0.01026482, + "balance_loss_clip": 1.03102124, + "balance_loss_mlp": 1.01504397, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.4820462532289163, + "language_loss": 0.7237764, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74491256, + "num_input_tokens_seen": 328610055, + "step": 15232, + "time_per_iteration": 2.549255132675171 + }, + { + "auxiliary_loss_clip": 0.01104925, + "auxiliary_loss_mlp": 0.01032471, + "balance_loss_clip": 1.0342319, + "balance_loss_mlp": 1.02050257, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 2.038306779246488, + "language_loss": 0.6792531, + "learning_rate": 7.375937556925338e-08, + "loss": 0.70062709, + "num_input_tokens_seen": 328626815, + "step": 15233, + "time_per_iteration": 2.420616865158081 + }, + { + "auxiliary_loss_clip": 0.01078956, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.03651333, + "balance_loss_mlp": 1.01997721, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 1.837964273708223, + "language_loss": 0.69844639, + "learning_rate": 7.365461920317861e-08, + "loss": 0.71955574, + "num_input_tokens_seen": 328643995, + "step": 15234, + "time_per_iteration": 2.5360162258148193 + }, + { + "auxiliary_loss_clip": 0.01081106, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.03512216, + "balance_loss_mlp": 1.0216763, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 2.230784813926407, + "language_loss": 0.88141835, + "learning_rate": 7.354993588431391e-08, + "loss": 0.90257144, + "num_input_tokens_seen": 328659565, + "step": 15235, + "time_per_iteration": 2.5258424282073975 + }, + { + "auxiliary_loss_clip": 0.01038347, + "auxiliary_loss_mlp": 0.010364, + "balance_loss_clip": 1.03282487, + "balance_loss_mlp": 1.0222913, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 1.5531146068696027, + "language_loss": 0.77151394, + "learning_rate": 7.344532561662853e-08, + "loss": 0.79226148, + "num_input_tokens_seen": 328679045, + "step": 15236, + "time_per_iteration": 2.658754825592041 + }, + { + "auxiliary_loss_clip": 0.00992623, + "auxiliary_loss_mlp": 0.01001342, + "balance_loss_clip": 1.01884961, + "balance_loss_mlp": 1.00003099, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.7183720198758208, + "language_loss": 0.62191588, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64185554, + "num_input_tokens_seen": 328744565, + "step": 15237, + "time_per_iteration": 3.17425537109375 + }, + { + "auxiliary_loss_clip": 0.0110625, + "auxiliary_loss_mlp": 0.00781803, + "balance_loss_clip": 1.03590441, + "balance_loss_mlp": 1.00623024, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 2.102584270530641, + "language_loss": 0.75152349, + "learning_rate": 7.323632425066151e-08, + "loss": 0.77040398, + "num_input_tokens_seen": 328762455, + "step": 15238, + "time_per_iteration": 2.4460582733154297 + }, + { + "auxiliary_loss_clip": 0.01103812, + "auxiliary_loss_mlp": 0.01027487, + "balance_loss_clip": 1.03473496, + "balance_loss_mlp": 1.01558352, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 1.876414020843958, + "language_loss": 0.74720335, + "learning_rate": 7.313193316030464e-08, + "loss": 0.76851642, + "num_input_tokens_seen": 328780320, + "step": 15239, + "time_per_iteration": 2.43398118019104 + }, + { + "auxiliary_loss_clip": 0.01071352, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.03182328, + "balance_loss_mlp": 1.01963973, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 2.006811316938341, + "language_loss": 0.63400024, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65503162, + "num_input_tokens_seen": 328797570, + "step": 15240, + "time_per_iteration": 2.5364534854888916 + }, + { + "auxiliary_loss_clip": 0.01081705, + "auxiliary_loss_mlp": 0.00781585, + "balance_loss_clip": 1.03578615, + "balance_loss_mlp": 1.00710511, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 1.7211203690631538, + "language_loss": 0.76286769, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78150058, + "num_input_tokens_seen": 328814075, + "step": 15241, + "time_per_iteration": 2.5032010078430176 + }, + { + "auxiliary_loss_clip": 0.01101918, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.03742731, + "balance_loss_mlp": 1.0176425, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 2.535748664589755, + "language_loss": 0.67583704, + "learning_rate": 7.281919830723549e-08, + "loss": 0.69717169, + "num_input_tokens_seen": 328831990, + "step": 15242, + "time_per_iteration": 2.4912006855010986 + }, + { + "auxiliary_loss_clip": 0.01092916, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.0334332, + "balance_loss_mlp": 1.01602507, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 1.6595480007643708, + "language_loss": 0.80747145, + "learning_rate": 7.271509950872334e-08, + "loss": 0.82868767, + "num_input_tokens_seen": 328849105, + "step": 15243, + "time_per_iteration": 2.472428798675537 + }, + { + "auxiliary_loss_clip": 0.01080303, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.03242671, + "balance_loss_mlp": 1.01694679, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 1.7487630874842564, + "language_loss": 0.81763732, + "learning_rate": 7.261107379304721e-08, + "loss": 0.83873367, + "num_input_tokens_seen": 328866810, + "step": 15244, + "time_per_iteration": 2.5188372135162354 + }, + { + "auxiliary_loss_clip": 0.01107429, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.03513527, + "balance_loss_mlp": 1.0230732, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 2.1961454678597603, + "language_loss": 0.72134328, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74277329, + "num_input_tokens_seen": 328885325, + "step": 15245, + "time_per_iteration": 2.4213449954986572 + }, + { + "auxiliary_loss_clip": 0.01081892, + "auxiliary_loss_mlp": 0.01029524, + "balance_loss_clip": 1.03476739, + "balance_loss_mlp": 1.01821065, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 1.7349269856512282, + "language_loss": 0.74802566, + "learning_rate": 7.240324162598033e-08, + "loss": 0.76913977, + "num_input_tokens_seen": 328902655, + "step": 15246, + "time_per_iteration": 2.520372152328491 + }, + { + "auxiliary_loss_clip": 0.01076739, + "auxiliary_loss_mlp": 0.01029541, + "balance_loss_clip": 1.03312612, + "balance_loss_mlp": 1.01710129, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 1.9353821328661256, + "language_loss": 0.7510137, + "learning_rate": 7.229943518247106e-08, + "loss": 0.77207649, + "num_input_tokens_seen": 328918440, + "step": 15247, + "time_per_iteration": 2.4697623252868652 + }, + { + "auxiliary_loss_clip": 0.01095697, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.03693771, + "balance_loss_mlp": 1.01391077, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 1.7266812862451686, + "language_loss": 0.75769687, + "learning_rate": 7.219570183756052e-08, + "loss": 0.77891237, + "num_input_tokens_seen": 328938055, + "step": 15248, + "time_per_iteration": 2.5058579444885254 + }, + { + "auxiliary_loss_clip": 0.01092292, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.03285241, + "balance_loss_mlp": 1.02122283, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 2.360683796000608, + "language_loss": 0.73069501, + "learning_rate": 7.209204159518178e-08, + "loss": 0.75196552, + "num_input_tokens_seen": 328957895, + "step": 15249, + "time_per_iteration": 2.532177448272705 + }, + { + "auxiliary_loss_clip": 0.01057662, + "auxiliary_loss_mlp": 0.01028724, + "balance_loss_clip": 1.03627968, + "balance_loss_mlp": 1.01581991, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 1.8571366727173375, + "language_loss": 0.75844491, + "learning_rate": 7.198845445926616e-08, + "loss": 0.7793088, + "num_input_tokens_seen": 328971365, + "step": 15250, + "time_per_iteration": 2.5601084232330322 + }, + { + "auxiliary_loss_clip": 0.01061044, + "auxiliary_loss_mlp": 0.01028609, + "balance_loss_clip": 1.03302288, + "balance_loss_mlp": 1.0160321, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 1.667860971248614, + "language_loss": 0.75714946, + "learning_rate": 7.188494043374138e-08, + "loss": 0.77804595, + "num_input_tokens_seen": 328990830, + "step": 15251, + "time_per_iteration": 2.585933208465576 + }, + { + "auxiliary_loss_clip": 0.01080239, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.03561401, + "balance_loss_mlp": 1.01958585, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 2.074915807143545, + "language_loss": 0.79774928, + "learning_rate": 7.178149952253298e-08, + "loss": 0.81888258, + "num_input_tokens_seen": 329008345, + "step": 15252, + "time_per_iteration": 2.530998468399048 + }, + { + "auxiliary_loss_clip": 0.01103746, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.03486514, + "balance_loss_mlp": 1.02318347, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 1.613539381378801, + "language_loss": 0.77434814, + "learning_rate": 7.167813172956316e-08, + "loss": 0.7957356, + "num_input_tokens_seen": 329027440, + "step": 15253, + "time_per_iteration": 2.4275195598602295 + }, + { + "auxiliary_loss_clip": 0.01094885, + "auxiliary_loss_mlp": 0.01025343, + "balance_loss_clip": 1.03594327, + "balance_loss_mlp": 1.01348805, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 2.675882925128281, + "language_loss": 0.72909105, + "learning_rate": 7.157483705875256e-08, + "loss": 0.75029325, + "num_input_tokens_seen": 329046445, + "step": 15254, + "time_per_iteration": 2.4996325969696045 + }, + { + "auxiliary_loss_clip": 0.01063445, + "auxiliary_loss_mlp": 0.01024731, + "balance_loss_clip": 1.03294301, + "balance_loss_mlp": 1.01396692, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 1.4555948610035223, + "language_loss": 0.78925395, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81013572, + "num_input_tokens_seen": 329065555, + "step": 15255, + "time_per_iteration": 2.5907673835754395 + }, + { + "auxiliary_loss_clip": 0.0109415, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.03357112, + "balance_loss_mlp": 1.01904559, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 1.8895268612431264, + "language_loss": 0.68457997, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70583552, + "num_input_tokens_seen": 329087515, + "step": 15256, + "time_per_iteration": 4.0132622718811035 + }, + { + "auxiliary_loss_clip": 0.01087699, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.03337526, + "balance_loss_mlp": 1.02491438, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 1.618248391327963, + "language_loss": 0.83784485, + "learning_rate": 7.126539181842561e-08, + "loss": 0.85909581, + "num_input_tokens_seen": 329106820, + "step": 15257, + "time_per_iteration": 2.488647699356079 + }, + { + "auxiliary_loss_clip": 0.01078539, + "auxiliary_loss_mlp": 0.01028316, + "balance_loss_clip": 1.03220773, + "balance_loss_mlp": 1.01696765, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 1.4859371711908207, + "language_loss": 0.77455193, + "learning_rate": 7.116238967539012e-08, + "loss": 0.7956205, + "num_input_tokens_seen": 329126515, + "step": 15258, + "time_per_iteration": 5.2878477573394775 + }, + { + "auxiliary_loss_clip": 0.01094916, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.03727102, + "balance_loss_mlp": 1.02260089, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 1.874388109919324, + "language_loss": 0.7841959, + "learning_rate": 7.105946067406999e-08, + "loss": 0.80548614, + "num_input_tokens_seen": 329142660, + "step": 15259, + "time_per_iteration": 2.455075979232788 + }, + { + "auxiliary_loss_clip": 0.01056861, + "auxiliary_loss_mlp": 0.01029045, + "balance_loss_clip": 1.03289497, + "balance_loss_mlp": 1.01813745, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 1.6036806104906023, + "language_loss": 0.76523364, + "learning_rate": 7.095660481836895e-08, + "loss": 0.78609264, + "num_input_tokens_seen": 329162575, + "step": 15260, + "time_per_iteration": 2.6203041076660156 + }, + { + "auxiliary_loss_clip": 0.01061231, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.03211474, + "balance_loss_mlp": 1.01856482, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 1.4476256824830447, + "language_loss": 0.60843754, + "learning_rate": 7.085382211218637e-08, + "loss": 0.62935513, + "num_input_tokens_seen": 329182090, + "step": 15261, + "time_per_iteration": 2.585662364959717 + }, + { + "auxiliary_loss_clip": 0.01078139, + "auxiliary_loss_mlp": 0.01027308, + "balance_loss_clip": 1.03241932, + "balance_loss_mlp": 1.01551187, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 1.7452075721543565, + "language_loss": 0.73532271, + "learning_rate": 7.075111255942002e-08, + "loss": 0.75637716, + "num_input_tokens_seen": 329196535, + "step": 15262, + "time_per_iteration": 2.4800074100494385 + }, + { + "auxiliary_loss_clip": 0.01106101, + "auxiliary_loss_mlp": 0.01036308, + "balance_loss_clip": 1.03349483, + "balance_loss_mlp": 1.023857, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 1.7188727249258504, + "language_loss": 0.77662951, + "learning_rate": 7.064847616396496e-08, + "loss": 0.79805362, + "num_input_tokens_seen": 329215135, + "step": 15263, + "time_per_iteration": 2.410594940185547 + }, + { + "auxiliary_loss_clip": 0.01106357, + "auxiliary_loss_mlp": 0.01032389, + "balance_loss_clip": 1.03468633, + "balance_loss_mlp": 1.02039027, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 5.333294595714343, + "language_loss": 0.75585568, + "learning_rate": 7.054591292971324e-08, + "loss": 0.77724314, + "num_input_tokens_seen": 329235150, + "step": 15264, + "time_per_iteration": 2.4581072330474854 + }, + { + "auxiliary_loss_clip": 0.01081743, + "auxiliary_loss_mlp": 0.01034803, + "balance_loss_clip": 1.03501415, + "balance_loss_mlp": 1.02314472, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 1.6217170953731737, + "language_loss": 0.831788, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85295343, + "num_input_tokens_seen": 329254365, + "step": 15265, + "time_per_iteration": 2.501666307449341 + }, + { + "auxiliary_loss_clip": 0.01108894, + "auxiliary_loss_mlp": 0.01040215, + "balance_loss_clip": 1.03657985, + "balance_loss_mlp": 1.02735829, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 1.4823346822896535, + "language_loss": 0.73310244, + "learning_rate": 7.034100596037306e-08, + "loss": 0.75459349, + "num_input_tokens_seen": 329274385, + "step": 15266, + "time_per_iteration": 3.8682918548583984 + }, + { + "auxiliary_loss_clip": 0.01103106, + "auxiliary_loss_mlp": 0.01029655, + "balance_loss_clip": 1.0338676, + "balance_loss_mlp": 1.01804376, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 2.0731302175442354, + "language_loss": 0.77554882, + "learning_rate": 7.023866223305486e-08, + "loss": 0.79687643, + "num_input_tokens_seen": 329292160, + "step": 15267, + "time_per_iteration": 2.4430463314056396 + }, + { + "auxiliary_loss_clip": 0.01017485, + "auxiliary_loss_mlp": 0.00759778, + "balance_loss_clip": 1.00811625, + "balance_loss_mlp": 0.99648422, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7450347574364345, + "language_loss": 0.56284928, + "learning_rate": 7.013639168247975e-08, + "loss": 0.58062196, + "num_input_tokens_seen": 329351870, + "step": 15268, + "time_per_iteration": 3.1606030464172363 + }, + { + "auxiliary_loss_clip": 0.01105669, + "auxiliary_loss_mlp": 0.00782837, + "balance_loss_clip": 1.03518808, + "balance_loss_mlp": 1.00790453, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 2.617577116794001, + "language_loss": 0.76603502, + "learning_rate": 7.0034194312526e-08, + "loss": 0.78492004, + "num_input_tokens_seen": 329370930, + "step": 15269, + "time_per_iteration": 2.4621238708496094 + }, + { + "auxiliary_loss_clip": 0.01070762, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.0328275, + "balance_loss_mlp": 1.01891375, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 2.5322800980161837, + "language_loss": 0.72966707, + "learning_rate": 6.993207012706936e-08, + "loss": 0.75068533, + "num_input_tokens_seen": 329391275, + "step": 15270, + "time_per_iteration": 2.692742347717285 + }, + { + "auxiliary_loss_clip": 0.0110072, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.03300989, + "balance_loss_mlp": 1.01995492, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 1.8348591954033457, + "language_loss": 0.79700661, + "learning_rate": 6.98300191299821e-08, + "loss": 0.81833512, + "num_input_tokens_seen": 329412775, + "step": 15271, + "time_per_iteration": 2.4951019287109375 + }, + { + "auxiliary_loss_clip": 0.01060737, + "auxiliary_loss_mlp": 0.01032484, + "balance_loss_clip": 1.03045571, + "balance_loss_mlp": 1.01982427, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 1.812661594833814, + "language_loss": 0.7316317, + "learning_rate": 6.972804132513355e-08, + "loss": 0.75256389, + "num_input_tokens_seen": 329432440, + "step": 15272, + "time_per_iteration": 2.586599588394165 + }, + { + "auxiliary_loss_clip": 0.01081937, + "auxiliary_loss_mlp": 0.01033229, + "balance_loss_clip": 1.03404188, + "balance_loss_mlp": 1.02158284, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 1.911081517326716, + "language_loss": 0.73003781, + "learning_rate": 6.962613671639105e-08, + "loss": 0.75118947, + "num_input_tokens_seen": 329450605, + "step": 15273, + "time_per_iteration": 2.5415749549865723 + }, + { + "auxiliary_loss_clip": 0.01063352, + "auxiliary_loss_mlp": 0.01026906, + "balance_loss_clip": 1.0325489, + "balance_loss_mlp": 1.01617682, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 1.444559617236309, + "language_loss": 0.74577284, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76667541, + "num_input_tokens_seen": 329470550, + "step": 15274, + "time_per_iteration": 2.544069766998291 + }, + { + "auxiliary_loss_clip": 0.01090916, + "auxiliary_loss_mlp": 0.01037069, + "balance_loss_clip": 1.03264952, + "balance_loss_mlp": 1.02564299, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 1.580122238622461, + "language_loss": 0.68746543, + "learning_rate": 6.942254710267902e-08, + "loss": 0.70874536, + "num_input_tokens_seen": 329489765, + "step": 15275, + "time_per_iteration": 2.4759600162506104 + }, + { + "auxiliary_loss_clip": 0.01088427, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.0332737, + "balance_loss_mlp": 1.01859534, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 2.02688879942051, + "language_loss": 0.72539729, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74658144, + "num_input_tokens_seen": 329507040, + "step": 15276, + "time_per_iteration": 2.450692653656006 + }, + { + "auxiliary_loss_clip": 0.01082549, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.0352788, + "balance_loss_mlp": 1.01736856, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 1.5981875232090554, + "language_loss": 0.73266518, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75377822, + "num_input_tokens_seen": 329525540, + "step": 15277, + "time_per_iteration": 2.522148847579956 + }, + { + "auxiliary_loss_clip": 0.01000794, + "auxiliary_loss_mlp": 0.01003898, + "balance_loss_clip": 1.00816453, + "balance_loss_mlp": 1.00279546, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7225453953315476, + "language_loss": 0.59284592, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61289287, + "num_input_tokens_seen": 329592905, + "step": 15278, + "time_per_iteration": 3.2287709712982178 + }, + { + "auxiliary_loss_clip": 0.01064822, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.03040421, + "balance_loss_mlp": 1.01746333, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 1.6314586732280334, + "language_loss": 0.64514351, + "learning_rate": 6.901624639836879e-08, + "loss": 0.666076, + "num_input_tokens_seen": 329610150, + "step": 15279, + "time_per_iteration": 2.525012254714966 + }, + { + "auxiliary_loss_clip": 0.01027525, + "auxiliary_loss_mlp": 0.00762252, + "balance_loss_clip": 1.00484347, + "balance_loss_mlp": 1.00263727, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 0.8683992644926448, + "language_loss": 0.60176259, + "learning_rate": 6.891485427041211e-08, + "loss": 0.61966044, + "num_input_tokens_seen": 329673650, + "step": 15280, + "time_per_iteration": 3.053539514541626 + }, + { + "auxiliary_loss_clip": 0.01083686, + "auxiliary_loss_mlp": 0.01034756, + "balance_loss_clip": 1.0342871, + "balance_loss_mlp": 1.02272749, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 1.6647436715860733, + "language_loss": 0.69244909, + "learning_rate": 6.881353536939815e-08, + "loss": 0.71363348, + "num_input_tokens_seen": 329692520, + "step": 15281, + "time_per_iteration": 2.5225236415863037 + }, + { + "auxiliary_loss_clip": 0.01081803, + "auxiliary_loss_mlp": 0.01028198, + "balance_loss_clip": 1.03404593, + "balance_loss_mlp": 1.01541829, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 1.6576770631886806, + "language_loss": 0.84463882, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86573881, + "num_input_tokens_seen": 329713750, + "step": 15282, + "time_per_iteration": 2.539827585220337 + }, + { + "auxiliary_loss_clip": 0.01078946, + "auxiliary_loss_mlp": 0.0103404, + "balance_loss_clip": 1.0333972, + "balance_loss_mlp": 1.02146316, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 1.7660970167337773, + "language_loss": 0.60050285, + "learning_rate": 6.861111726356194e-08, + "loss": 0.6216327, + "num_input_tokens_seen": 329730960, + "step": 15283, + "time_per_iteration": 2.5012104511260986 + }, + { + "auxiliary_loss_clip": 0.01096872, + "auxiliary_loss_mlp": 0.00782588, + "balance_loss_clip": 1.03554714, + "balance_loss_mlp": 1.00654852, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 1.4879887064921404, + "language_loss": 0.65303743, + "learning_rate": 6.851001806641554e-08, + "loss": 0.67183197, + "num_input_tokens_seen": 329750975, + "step": 15284, + "time_per_iteration": 2.491382598876953 + }, + { + "auxiliary_loss_clip": 0.01101197, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.03340101, + "balance_loss_mlp": 1.01938331, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 1.954042861660427, + "language_loss": 0.73543853, + "learning_rate": 6.840899211156292e-08, + "loss": 0.75676662, + "num_input_tokens_seen": 329769645, + "step": 15285, + "time_per_iteration": 2.4662909507751465 + }, + { + "auxiliary_loss_clip": 0.01102222, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.03393245, + "balance_loss_mlp": 1.02272439, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 1.827394442029598, + "language_loss": 0.71625847, + "learning_rate": 6.830803940283458e-08, + "loss": 0.73762918, + "num_input_tokens_seen": 329788185, + "step": 15286, + "time_per_iteration": 2.4104690551757812 + }, + { + "auxiliary_loss_clip": 0.01104576, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.03528214, + "balance_loss_mlp": 1.01622748, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 1.8707567241793748, + "language_loss": 0.73202288, + "learning_rate": 6.820715994405945e-08, + "loss": 0.75335169, + "num_input_tokens_seen": 329806780, + "step": 15287, + "time_per_iteration": 2.4733715057373047 + }, + { + "auxiliary_loss_clip": 0.01105902, + "auxiliary_loss_mlp": 0.01029315, + "balance_loss_clip": 1.03685379, + "balance_loss_mlp": 1.01633954, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 2.1696161093244113, + "language_loss": 0.65475225, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67610443, + "num_input_tokens_seen": 329826350, + "step": 15288, + "time_per_iteration": 2.4431943893432617 + }, + { + "auxiliary_loss_clip": 0.01108262, + "auxiliary_loss_mlp": 0.01036185, + "balance_loss_clip": 1.03936529, + "balance_loss_mlp": 1.02373981, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 2.0263648803717156, + "language_loss": 0.71392167, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73536611, + "num_input_tokens_seen": 329846160, + "step": 15289, + "time_per_iteration": 2.538158893585205 + }, + { + "auxiliary_loss_clip": 0.01065242, + "auxiliary_loss_mlp": 0.0103753, + "balance_loss_clip": 1.03369594, + "balance_loss_mlp": 1.02440524, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 1.896818980703734, + "language_loss": 0.74321741, + "learning_rate": 6.790496110568921e-08, + "loss": 0.76424515, + "num_input_tokens_seen": 329862020, + "step": 15290, + "time_per_iteration": 2.5002431869506836 + }, + { + "auxiliary_loss_clip": 0.01060777, + "auxiliary_loss_mlp": 0.01027813, + "balance_loss_clip": 1.03403604, + "balance_loss_mlp": 1.01658952, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 1.8721355423319446, + "language_loss": 0.7229799, + "learning_rate": 6.78043746849506e-08, + "loss": 0.74386585, + "num_input_tokens_seen": 329880185, + "step": 15291, + "time_per_iteration": 2.604869842529297 + }, + { + "auxiliary_loss_clip": 0.01080018, + "auxiliary_loss_mlp": 0.01027966, + "balance_loss_clip": 1.03414011, + "balance_loss_mlp": 1.01629531, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 1.8756319941532056, + "language_loss": 0.70968866, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73076844, + "num_input_tokens_seen": 329900255, + "step": 15292, + "time_per_iteration": 2.5193614959716797 + }, + { + "auxiliary_loss_clip": 0.01084532, + "auxiliary_loss_mlp": 0.01027484, + "balance_loss_clip": 1.0354085, + "balance_loss_mlp": 1.01496148, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 1.6659608274742854, + "language_loss": 0.73355043, + "learning_rate": 6.760342165443988e-08, + "loss": 0.75467056, + "num_input_tokens_seen": 329919095, + "step": 15293, + "time_per_iteration": 2.5391714572906494 + }, + { + "auxiliary_loss_clip": 0.01101747, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.03459489, + "balance_loss_mlp": 1.01535022, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 1.9122250238217922, + "language_loss": 0.77777666, + "learning_rate": 6.750305505228837e-08, + "loss": 0.79906875, + "num_input_tokens_seen": 329936505, + "step": 15294, + "time_per_iteration": 2.412062168121338 + }, + { + "auxiliary_loss_clip": 0.01083801, + "auxiliary_loss_mlp": 0.01031513, + "balance_loss_clip": 1.03432286, + "balance_loss_mlp": 1.0182091, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 3.7164919765360684, + "language_loss": 0.77180743, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79296064, + "num_input_tokens_seen": 329956795, + "step": 15295, + "time_per_iteration": 3.9314160346984863 + }, + { + "auxiliary_loss_clip": 0.01100947, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.03528368, + "balance_loss_mlp": 1.01676059, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 2.0026291092200563, + "language_loss": 0.71451771, + "learning_rate": 6.730254169322114e-08, + "loss": 0.73580372, + "num_input_tokens_seen": 329977195, + "step": 15296, + "time_per_iteration": 3.9286043643951416 + }, + { + "auxiliary_loss_clip": 0.01104164, + "auxiliary_loss_mlp": 0.01035314, + "balance_loss_clip": 1.03567696, + "balance_loss_mlp": 1.02391148, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 2.0236307254163006, + "language_loss": 0.75362557, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77502036, + "num_input_tokens_seen": 329992095, + "step": 15297, + "time_per_iteration": 3.834601879119873 + }, + { + "auxiliary_loss_clip": 0.01087401, + "auxiliary_loss_mlp": 0.00782054, + "balance_loss_clip": 1.0342555, + "balance_loss_mlp": 1.00845051, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 1.606279363063911, + "language_loss": 0.73662364, + "learning_rate": 6.710232148647676e-08, + "loss": 0.75531822, + "num_input_tokens_seen": 330011490, + "step": 15298, + "time_per_iteration": 2.5185587406158447 + }, + { + "auxiliary_loss_clip": 0.01077056, + "auxiliary_loss_mlp": 0.01032006, + "balance_loss_clip": 1.03549218, + "balance_loss_mlp": 1.01988816, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 2.16245322592682, + "language_loss": 0.79403865, + "learning_rate": 6.70023213247175e-08, + "loss": 0.81512928, + "num_input_tokens_seen": 330027885, + "step": 15299, + "time_per_iteration": 2.5126662254333496 + }, + { + "auxiliary_loss_clip": 0.01073879, + "auxiliary_loss_mlp": 0.01027078, + "balance_loss_clip": 1.03508615, + "balance_loss_mlp": 1.01548529, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 1.9780485312503362, + "language_loss": 0.63890815, + "learning_rate": 6.690239446242385e-08, + "loss": 0.65991771, + "num_input_tokens_seen": 330046230, + "step": 15300, + "time_per_iteration": 2.512488603591919 + }, + { + "auxiliary_loss_clip": 0.01076475, + "auxiliary_loss_mlp": 0.00780134, + "balance_loss_clip": 1.03428197, + "balance_loss_mlp": 1.00825059, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 1.709746402449741, + "language_loss": 0.69660699, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71517301, + "num_input_tokens_seen": 330065535, + "step": 15301, + "time_per_iteration": 2.533690929412842 + }, + { + "auxiliary_loss_clip": 0.01093711, + "auxiliary_loss_mlp": 0.0103862, + "balance_loss_clip": 1.03551292, + "balance_loss_mlp": 1.02432656, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 1.9184653229089332, + "language_loss": 0.71280134, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73412466, + "num_input_tokens_seen": 330082920, + "step": 15302, + "time_per_iteration": 2.4526774883270264 + }, + { + "auxiliary_loss_clip": 0.01104546, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.03477883, + "balance_loss_mlp": 1.01900339, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 1.7022099778602027, + "language_loss": 0.76536238, + "learning_rate": 6.660305371021579e-08, + "loss": 0.78671432, + "num_input_tokens_seen": 330101165, + "step": 15303, + "time_per_iteration": 2.5090975761413574 + }, + { + "auxiliary_loss_clip": 0.01083751, + "auxiliary_loss_mlp": 0.01031696, + "balance_loss_clip": 1.03569388, + "balance_loss_mlp": 1.0201087, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 2.950933670800871, + "language_loss": 0.88290632, + "learning_rate": 6.650342008365006e-08, + "loss": 0.90406084, + "num_input_tokens_seen": 330118775, + "step": 15304, + "time_per_iteration": 2.4791369438171387 + }, + { + "auxiliary_loss_clip": 0.01042162, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.03281498, + "balance_loss_mlp": 1.02005994, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 1.8900297248507547, + "language_loss": 0.77569425, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79646754, + "num_input_tokens_seen": 330135570, + "step": 15305, + "time_per_iteration": 4.0152506828308105 + }, + { + "auxiliary_loss_clip": 0.01094046, + "auxiliary_loss_mlp": 0.01036823, + "balance_loss_clip": 1.03497005, + "balance_loss_mlp": 1.02438354, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 2.3512156380013023, + "language_loss": 0.81224144, + "learning_rate": 6.630437278944501e-08, + "loss": 0.83355004, + "num_input_tokens_seen": 330152840, + "step": 15306, + "time_per_iteration": 2.543663740158081 + }, + { + "auxiliary_loss_clip": 0.01063692, + "auxiliary_loss_mlp": 0.01029445, + "balance_loss_clip": 1.03237402, + "balance_loss_mlp": 1.01862717, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 2.069422734852628, + "language_loss": 0.72011024, + "learning_rate": 6.62049591293541e-08, + "loss": 0.74104166, + "num_input_tokens_seen": 330168605, + "step": 15307, + "time_per_iteration": 2.539537191390991 + }, + { + "auxiliary_loss_clip": 0.01094724, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.03431106, + "balance_loss_mlp": 1.0173291, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 1.9444741708023532, + "language_loss": 0.78754854, + "learning_rate": 6.610561879896526e-08, + "loss": 0.80878961, + "num_input_tokens_seen": 330186160, + "step": 15308, + "time_per_iteration": 2.466181516647339 + }, + { + "auxiliary_loss_clip": 0.01081328, + "auxiliary_loss_mlp": 0.01033924, + "balance_loss_clip": 1.03256583, + "balance_loss_mlp": 1.02119827, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 1.9423492330194896, + "language_loss": 0.78033197, + "learning_rate": 6.600635180204484e-08, + "loss": 0.80148453, + "num_input_tokens_seen": 330201780, + "step": 15309, + "time_per_iteration": 2.4562456607818604 + }, + { + "auxiliary_loss_clip": 0.01058518, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.03330255, + "balance_loss_mlp": 1.01936471, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 1.908813401510127, + "language_loss": 0.6636861, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68459165, + "num_input_tokens_seen": 330219165, + "step": 15310, + "time_per_iteration": 2.5644047260284424 + }, + { + "auxiliary_loss_clip": 0.01042118, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.03401983, + "balance_loss_mlp": 1.01697719, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 1.6748614394535124, + "language_loss": 0.66263419, + "learning_rate": 6.580803782366495e-08, + "loss": 0.68334931, + "num_input_tokens_seen": 330238975, + "step": 15311, + "time_per_iteration": 2.6151740550994873 + }, + { + "auxiliary_loss_clip": 0.01092206, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.03338945, + "balance_loss_mlp": 1.02129281, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 1.5813462715624276, + "language_loss": 0.75833321, + "learning_rate": 6.570899084972503e-08, + "loss": 0.7795859, + "num_input_tokens_seen": 330259755, + "step": 15312, + "time_per_iteration": 2.5255610942840576 + }, + { + "auxiliary_loss_clip": 0.01089806, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.03459263, + "balance_loss_mlp": 1.02382493, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 1.6671071058549505, + "language_loss": 0.79009008, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81134284, + "num_input_tokens_seen": 330277660, + "step": 15313, + "time_per_iteration": 2.4712235927581787 + }, + { + "auxiliary_loss_clip": 0.0109514, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.03409648, + "balance_loss_mlp": 1.01789987, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 1.7547122738282617, + "language_loss": 0.78118634, + "learning_rate": 6.55111169511251e-08, + "loss": 0.80243623, + "num_input_tokens_seen": 330295455, + "step": 15314, + "time_per_iteration": 2.4814462661743164 + }, + { + "auxiliary_loss_clip": 0.01087179, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.03453732, + "balance_loss_mlp": 1.01992273, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 1.7454536002851013, + "language_loss": 0.79418576, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81538999, + "num_input_tokens_seen": 330315310, + "step": 15315, + "time_per_iteration": 2.511749505996704 + }, + { + "auxiliary_loss_clip": 0.01085864, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.0354178, + "balance_loss_mlp": 1.01656079, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 1.8648499700847674, + "language_loss": 0.76362801, + "learning_rate": 6.531353647657156e-08, + "loss": 0.78477371, + "num_input_tokens_seen": 330333260, + "step": 15316, + "time_per_iteration": 2.4958906173706055 + }, + { + "auxiliary_loss_clip": 0.01103238, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.03299415, + "balance_loss_mlp": 1.02191341, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 1.570968245456868, + "language_loss": 0.69556904, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71694672, + "num_input_tokens_seen": 330352465, + "step": 15317, + "time_per_iteration": 2.455543279647827 + }, + { + "auxiliary_loss_clip": 0.01092714, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.03508806, + "balance_loss_mlp": 1.02086461, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 1.6603512074571751, + "language_loss": 0.83340019, + "learning_rate": 6.511624945603378e-08, + "loss": 0.85465622, + "num_input_tokens_seen": 330372685, + "step": 15318, + "time_per_iteration": 2.513838768005371 + }, + { + "auxiliary_loss_clip": 0.0108276, + "auxiliary_loss_mlp": 0.01031175, + "balance_loss_clip": 1.03586173, + "balance_loss_mlp": 1.01922464, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 1.9708649585423381, + "language_loss": 0.85778832, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87892771, + "num_input_tokens_seen": 330388860, + "step": 15319, + "time_per_iteration": 2.490665912628174 + }, + { + "auxiliary_loss_clip": 0.01027109, + "auxiliary_loss_mlp": 0.01002849, + "balance_loss_clip": 1.00437868, + "balance_loss_mlp": 1.00178194, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.7697389946901002, + "language_loss": 0.5616408, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58194041, + "num_input_tokens_seen": 330448735, + "step": 15320, + "time_per_iteration": 3.102128028869629 + }, + { + "auxiliary_loss_clip": 0.01054398, + "auxiliary_loss_mlp": 0.01045986, + "balance_loss_clip": 1.03440881, + "balance_loss_mlp": 1.03134704, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 2.0675602197678122, + "language_loss": 0.63923061, + "learning_rate": 6.482086921695384e-08, + "loss": 0.66023445, + "num_input_tokens_seen": 330465600, + "step": 15321, + "time_per_iteration": 2.5387611389160156 + }, + { + "auxiliary_loss_clip": 0.01062904, + "auxiliary_loss_mlp": 0.01027026, + "balance_loss_clip": 1.03370619, + "balance_loss_mlp": 1.01577878, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 1.4578470743154641, + "language_loss": 0.71527946, + "learning_rate": 6.47225558966582e-08, + "loss": 0.73617876, + "num_input_tokens_seen": 330485770, + "step": 15322, + "time_per_iteration": 2.582439422607422 + }, + { + "auxiliary_loss_clip": 0.01053485, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.03409791, + "balance_loss_mlp": 1.02004921, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 2.5801631647751933, + "language_loss": 0.7003448, + "learning_rate": 6.462431596227725e-08, + "loss": 0.72119117, + "num_input_tokens_seen": 330504255, + "step": 15323, + "time_per_iteration": 2.5533764362335205 + }, + { + "auxiliary_loss_clip": 0.0107725, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.031299, + "balance_loss_mlp": 1.02301157, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 2.002888392729793, + "language_loss": 0.7482717, + "learning_rate": 6.452614941753597e-08, + "loss": 0.76941168, + "num_input_tokens_seen": 330520705, + "step": 15324, + "time_per_iteration": 2.5419020652770996 + }, + { + "auxiliary_loss_clip": 0.01093649, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.03534532, + "balance_loss_mlp": 1.02801704, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 4.392890117614824, + "language_loss": 0.71226996, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73360348, + "num_input_tokens_seen": 330539245, + "step": 15325, + "time_per_iteration": 2.485720157623291 + }, + { + "auxiliary_loss_clip": 0.01078369, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.0361563, + "balance_loss_mlp": 1.02098036, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 1.5136370994165007, + "language_loss": 0.78510725, + "learning_rate": 6.433003651186109e-08, + "loss": 0.80622238, + "num_input_tokens_seen": 330561815, + "step": 15326, + "time_per_iteration": 2.5969414710998535 + }, + { + "auxiliary_loss_clip": 0.01097148, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.03682137, + "balance_loss_mlp": 1.02114093, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 1.9000367324488865, + "language_loss": 0.71017134, + "learning_rate": 6.42320901583635e-08, + "loss": 0.73147869, + "num_input_tokens_seen": 330579760, + "step": 15327, + "time_per_iteration": 2.475288152694702 + }, + { + "auxiliary_loss_clip": 0.01098295, + "auxiliary_loss_mlp": 0.01039789, + "balance_loss_clip": 1.0376333, + "balance_loss_mlp": 1.02704537, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 2.1551751044103864, + "language_loss": 0.77588189, + "learning_rate": 6.413421720937906e-08, + "loss": 0.79726267, + "num_input_tokens_seen": 330598545, + "step": 15328, + "time_per_iteration": 2.5258400440216064 + }, + { + "auxiliary_loss_clip": 0.01082236, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.03419197, + "balance_loss_mlp": 1.01859248, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 2.093393932371499, + "language_loss": 0.71456385, + "learning_rate": 6.4036417668619e-08, + "loss": 0.73568904, + "num_input_tokens_seen": 330616700, + "step": 15329, + "time_per_iteration": 2.5281929969787598 + }, + { + "auxiliary_loss_clip": 0.01090148, + "auxiliary_loss_mlp": 0.01025434, + "balance_loss_clip": 1.03327501, + "balance_loss_mlp": 1.01460385, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 1.7362849360005825, + "language_loss": 0.86587024, + "learning_rate": 6.393869153979192e-08, + "loss": 0.88702607, + "num_input_tokens_seen": 330633355, + "step": 15330, + "time_per_iteration": 2.444657325744629 + }, + { + "auxiliary_loss_clip": 0.01070256, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.03217876, + "balance_loss_mlp": 1.0212822, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 2.5529340885180196, + "language_loss": 0.76028538, + "learning_rate": 6.384103882660397e-08, + "loss": 0.78132701, + "num_input_tokens_seen": 330651470, + "step": 15331, + "time_per_iteration": 2.512305498123169 + }, + { + "auxiliary_loss_clip": 0.01091791, + "auxiliary_loss_mlp": 0.01027684, + "balance_loss_clip": 1.03368807, + "balance_loss_mlp": 1.01618052, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 1.991425615363305, + "language_loss": 0.75559378, + "learning_rate": 6.374345953275794e-08, + "loss": 0.77678853, + "num_input_tokens_seen": 330669170, + "step": 15332, + "time_per_iteration": 2.49147629737854 + }, + { + "auxiliary_loss_clip": 0.01059733, + "auxiliary_loss_mlp": 0.01028362, + "balance_loss_clip": 1.03257132, + "balance_loss_mlp": 1.01719785, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 1.7706018550983702, + "language_loss": 0.74942291, + "learning_rate": 6.364595366195358e-08, + "loss": 0.77030385, + "num_input_tokens_seen": 330686635, + "step": 15333, + "time_per_iteration": 2.537027597427368 + }, + { + "auxiliary_loss_clip": 0.01017431, + "auxiliary_loss_mlp": 0.00999931, + "balance_loss_clip": 1.00806785, + "balance_loss_mlp": 0.99891776, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.8130602688427413, + "language_loss": 0.52921045, + "learning_rate": 6.354852121788879e-08, + "loss": 0.54938412, + "num_input_tokens_seen": 330749160, + "step": 15334, + "time_per_iteration": 4.445763111114502 + }, + { + "auxiliary_loss_clip": 0.01074017, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.03325486, + "balance_loss_mlp": 1.02305424, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 1.9475931920857337, + "language_loss": 0.62048411, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64157104, + "num_input_tokens_seen": 330766840, + "step": 15335, + "time_per_iteration": 3.869744062423706 + }, + { + "auxiliary_loss_clip": 0.01059778, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.03233945, + "balance_loss_mlp": 1.01824677, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 1.7388218049749053, + "language_loss": 0.71816695, + "learning_rate": 6.335387662475366e-08, + "loss": 0.73907381, + "num_input_tokens_seen": 330785585, + "step": 15336, + "time_per_iteration": 3.99977970123291 + }, + { + "auxiliary_loss_clip": 0.01076634, + "auxiliary_loss_mlp": 0.01028603, + "balance_loss_clip": 1.03186929, + "balance_loss_mlp": 1.0186727, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 1.8873864867308798, + "language_loss": 0.71855891, + "learning_rate": 6.325666448306433e-08, + "loss": 0.73961121, + "num_input_tokens_seen": 330800750, + "step": 15337, + "time_per_iteration": 2.4621264934539795 + }, + { + "auxiliary_loss_clip": 0.01021663, + "auxiliary_loss_mlp": 0.01002931, + "balance_loss_clip": 1.00884008, + "balance_loss_mlp": 1.00190568, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 0.8874000157194111, + "language_loss": 0.65348989, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67373586, + "num_input_tokens_seen": 330863640, + "step": 15338, + "time_per_iteration": 3.0703814029693604 + }, + { + "auxiliary_loss_clip": 0.01095781, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.0359211, + "balance_loss_mlp": 1.01992488, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 1.659588688091902, + "language_loss": 0.6708104, + "learning_rate": 6.306246052787289e-08, + "loss": 0.6920889, + "num_input_tokens_seen": 330884675, + "step": 15339, + "time_per_iteration": 2.5369646549224854 + }, + { + "auxiliary_loss_clip": 0.01104123, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.03528821, + "balance_loss_mlp": 1.0179987, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 1.9222084383787847, + "language_loss": 0.71990204, + "learning_rate": 6.296546872173513e-08, + "loss": 0.74124253, + "num_input_tokens_seen": 330904125, + "step": 15340, + "time_per_iteration": 2.49969744682312 + }, + { + "auxiliary_loss_clip": 0.0106914, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.03315377, + "balance_loss_mlp": 1.02096438, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 1.6310514687895812, + "language_loss": 0.70102286, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72203845, + "num_input_tokens_seen": 330925140, + "step": 15341, + "time_per_iteration": 2.595306396484375 + }, + { + "auxiliary_loss_clip": 0.01050268, + "auxiliary_loss_mlp": 0.01025795, + "balance_loss_clip": 1.03383625, + "balance_loss_mlp": 1.01514339, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 1.598531829517011, + "language_loss": 0.67233884, + "learning_rate": 6.277170547076571e-08, + "loss": 0.6930995, + "num_input_tokens_seen": 330946625, + "step": 15342, + "time_per_iteration": 2.6309854984283447 + }, + { + "auxiliary_loss_clip": 0.01054448, + "auxiliary_loss_mlp": 0.01035266, + "balance_loss_clip": 1.03568721, + "balance_loss_mlp": 1.02434635, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 2.0147657986180696, + "language_loss": 0.7011528, + "learning_rate": 6.26749340332815e-08, + "loss": 0.72204989, + "num_input_tokens_seen": 330967795, + "step": 15343, + "time_per_iteration": 2.820791482925415 + }, + { + "auxiliary_loss_clip": 0.01010143, + "auxiliary_loss_mlp": 0.01001698, + "balance_loss_clip": 1.00678563, + "balance_loss_mlp": 1.00063097, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.7186402232174223, + "language_loss": 0.51975429, + "learning_rate": 6.257823605935786e-08, + "loss": 0.53987277, + "num_input_tokens_seen": 331040850, + "step": 15344, + "time_per_iteration": 4.775454044342041 + }, + { + "auxiliary_loss_clip": 0.01099677, + "auxiliary_loss_mlp": 0.01032922, + "balance_loss_clip": 1.03531122, + "balance_loss_mlp": 1.02197278, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 1.7131598865236317, + "language_loss": 0.70304465, + "learning_rate": 6.248161155266162e-08, + "loss": 0.72437072, + "num_input_tokens_seen": 331060595, + "step": 15345, + "time_per_iteration": 2.4816977977752686 + }, + { + "auxiliary_loss_clip": 0.01081891, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.03434682, + "balance_loss_mlp": 1.02146661, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 1.6738188702628254, + "language_loss": 0.77442127, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79557359, + "num_input_tokens_seen": 331080195, + "step": 15346, + "time_per_iteration": 2.512470006942749 + }, + { + "auxiliary_loss_clip": 0.01085972, + "auxiliary_loss_mlp": 0.01036422, + "balance_loss_clip": 1.03895926, + "balance_loss_mlp": 1.02403581, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 2.6023050587893075, + "language_loss": 0.76312846, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78435242, + "num_input_tokens_seen": 331097645, + "step": 15347, + "time_per_iteration": 2.4999592304229736 + }, + { + "auxiliary_loss_clip": 0.01089971, + "auxiliary_loss_mlp": 0.01028094, + "balance_loss_clip": 1.03590965, + "balance_loss_mlp": 1.01727581, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 1.4588192998855583, + "language_loss": 0.76932937, + "learning_rate": 6.219217887256367e-08, + "loss": 0.79051006, + "num_input_tokens_seen": 331116830, + "step": 15348, + "time_per_iteration": 2.471672296524048 + }, + { + "auxiliary_loss_clip": 0.0108087, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.03266358, + "balance_loss_mlp": 1.01995671, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 1.8391556812880392, + "language_loss": 0.68387282, + "learning_rate": 6.209584827138959e-08, + "loss": 0.70499897, + "num_input_tokens_seen": 331137235, + "step": 15349, + "time_per_iteration": 2.543954610824585 + }, + { + "auxiliary_loss_clip": 0.01071503, + "auxiliary_loss_mlp": 0.01027692, + "balance_loss_clip": 1.03407478, + "balance_loss_mlp": 1.01559854, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 2.84435317012987, + "language_loss": 0.86658359, + "learning_rate": 6.199959115573495e-08, + "loss": 0.88757557, + "num_input_tokens_seen": 331153155, + "step": 15350, + "time_per_iteration": 2.499349594116211 + }, + { + "auxiliary_loss_clip": 0.01012558, + "auxiliary_loss_mlp": 0.01003494, + "balance_loss_clip": 1.00917482, + "balance_loss_mlp": 1.00246894, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.7985847904083166, + "language_loss": 0.60372984, + "learning_rate": 6.190340752924994e-08, + "loss": 0.6238904, + "num_input_tokens_seen": 331214895, + "step": 15351, + "time_per_iteration": 3.093109130859375 + }, + { + "auxiliary_loss_clip": 0.01079596, + "auxiliary_loss_mlp": 0.01024963, + "balance_loss_clip": 1.03454947, + "balance_loss_mlp": 1.0136863, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 2.0112581682705293, + "language_loss": 0.77872443, + "learning_rate": 6.180729739558233e-08, + "loss": 0.79977, + "num_input_tokens_seen": 331232185, + "step": 15352, + "time_per_iteration": 2.5113823413848877 + }, + { + "auxiliary_loss_clip": 0.01069173, + "auxiliary_loss_mlp": 0.01043202, + "balance_loss_clip": 1.03213787, + "balance_loss_mlp": 1.02883768, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 2.071111305729824, + "language_loss": 0.59742606, + "learning_rate": 6.171126075837585e-08, + "loss": 0.61854982, + "num_input_tokens_seen": 331251065, + "step": 15353, + "time_per_iteration": 2.537743330001831 + }, + { + "auxiliary_loss_clip": 0.01078838, + "auxiliary_loss_mlp": 0.01027795, + "balance_loss_clip": 1.03419232, + "balance_loss_mlp": 1.01667261, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 1.6042350169385307, + "language_loss": 0.74958223, + "learning_rate": 6.161529762127293e-08, + "loss": 0.7706486, + "num_input_tokens_seen": 331269110, + "step": 15354, + "time_per_iteration": 2.4981327056884766 + }, + { + "auxiliary_loss_clip": 0.01106284, + "auxiliary_loss_mlp": 0.01033678, + "balance_loss_clip": 1.0348866, + "balance_loss_mlp": 1.02090514, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 2.0209552971458873, + "language_loss": 0.64738023, + "learning_rate": 6.1519407987912e-08, + "loss": 0.66877985, + "num_input_tokens_seen": 331286555, + "step": 15355, + "time_per_iteration": 2.446593761444092 + }, + { + "auxiliary_loss_clip": 0.01077428, + "auxiliary_loss_mlp": 0.01040719, + "balance_loss_clip": 1.03367972, + "balance_loss_mlp": 1.02794015, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 1.438156126411136, + "language_loss": 0.73994273, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76112419, + "num_input_tokens_seen": 331307660, + "step": 15356, + "time_per_iteration": 2.5697686672210693 + }, + { + "auxiliary_loss_clip": 0.01080641, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.03502989, + "balance_loss_mlp": 1.0179888, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 2.1867095262514646, + "language_loss": 0.61056757, + "learning_rate": 6.132784924695844e-08, + "loss": 0.63167524, + "num_input_tokens_seen": 331324885, + "step": 15357, + "time_per_iteration": 2.480957269668579 + }, + { + "auxiliary_loss_clip": 0.0107519, + "auxiliary_loss_mlp": 0.01029792, + "balance_loss_clip": 1.03422666, + "balance_loss_mlp": 1.01739979, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 1.3975935172321932, + "language_loss": 0.6999867, + "learning_rate": 6.123218014662956e-08, + "loss": 0.72103655, + "num_input_tokens_seen": 331345885, + "step": 15358, + "time_per_iteration": 2.5956897735595703 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.03467941, + "balance_loss_mlp": 1.01694, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 1.9131240476351619, + "language_loss": 0.73143566, + "learning_rate": 6.113658456457104e-08, + "loss": 0.75275493, + "num_input_tokens_seen": 331364320, + "step": 15359, + "time_per_iteration": 2.489328145980835 + }, + { + "auxiliary_loss_clip": 0.01043036, + "auxiliary_loss_mlp": 0.01031287, + "balance_loss_clip": 1.03418708, + "balance_loss_mlp": 1.01950955, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 1.8504481653968372, + "language_loss": 0.6458236, + "learning_rate": 6.104106250440732e-08, + "loss": 0.66656679, + "num_input_tokens_seen": 331384135, + "step": 15360, + "time_per_iteration": 2.6618993282318115 + }, + { + "auxiliary_loss_clip": 0.01018348, + "auxiliary_loss_mlp": 0.00761268, + "balance_loss_clip": 1.00531626, + "balance_loss_mlp": 1.00086486, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.7684964126390791, + "language_loss": 0.55178714, + "learning_rate": 6.094561396976083e-08, + "loss": 0.5695833, + "num_input_tokens_seen": 331440645, + "step": 15361, + "time_per_iteration": 3.036098003387451 + }, + { + "auxiliary_loss_clip": 0.01070997, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.03143299, + "balance_loss_mlp": 1.01765418, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 1.7250312797749559, + "language_loss": 0.69568539, + "learning_rate": 6.085023896425112e-08, + "loss": 0.71669912, + "num_input_tokens_seen": 331459580, + "step": 15362, + "time_per_iteration": 2.5458106994628906 + }, + { + "auxiliary_loss_clip": 0.01089719, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.03297615, + "balance_loss_mlp": 1.01815784, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 1.3781407957199678, + "language_loss": 0.75834489, + "learning_rate": 6.075493749149463e-08, + "loss": 0.77956688, + "num_input_tokens_seen": 331481560, + "step": 15363, + "time_per_iteration": 2.5190842151641846 + }, + { + "auxiliary_loss_clip": 0.0110392, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.03550911, + "balance_loss_mlp": 1.01906526, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 1.9264326552740696, + "language_loss": 0.83196497, + "learning_rate": 6.065970955510514e-08, + "loss": 0.85330951, + "num_input_tokens_seen": 331499090, + "step": 15364, + "time_per_iteration": 2.519832134246826 + }, + { + "auxiliary_loss_clip": 0.01067195, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.03186655, + "balance_loss_mlp": 1.01485384, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 1.52794681436461, + "language_loss": 0.68020368, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70113385, + "num_input_tokens_seen": 331519420, + "step": 15365, + "time_per_iteration": 2.5810720920562744 + }, + { + "auxiliary_loss_clip": 0.0110471, + "auxiliary_loss_mlp": 0.01029011, + "balance_loss_clip": 1.03618717, + "balance_loss_mlp": 1.0168221, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 1.8089633106304939, + "language_loss": 0.63045675, + "learning_rate": 6.046947430586913e-08, + "loss": 0.65179396, + "num_input_tokens_seen": 331538720, + "step": 15366, + "time_per_iteration": 2.506680727005005 + }, + { + "auxiliary_loss_clip": 0.01072426, + "auxiliary_loss_mlp": 0.01028498, + "balance_loss_clip": 1.03723681, + "balance_loss_mlp": 1.01602829, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 1.5381631441492354, + "language_loss": 0.74581707, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76682627, + "num_input_tokens_seen": 331558505, + "step": 15367, + "time_per_iteration": 2.5417091846466064 + }, + { + "auxiliary_loss_clip": 0.01078722, + "auxiliary_loss_mlp": 0.00781531, + "balance_loss_clip": 1.03402984, + "balance_loss_mlp": 1.00892639, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 1.7433315357287282, + "language_loss": 0.64546108, + "learning_rate": 6.027953324539759e-08, + "loss": 0.66406369, + "num_input_tokens_seen": 331578440, + "step": 15368, + "time_per_iteration": 2.5368478298187256 + }, + { + "auxiliary_loss_clip": 0.01096622, + "auxiliary_loss_mlp": 0.01033848, + "balance_loss_clip": 1.03514671, + "balance_loss_mlp": 1.02125931, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 1.7404078847425666, + "language_loss": 0.74458885, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76589358, + "num_input_tokens_seen": 331598945, + "step": 15369, + "time_per_iteration": 2.511030435562134 + }, + { + "auxiliary_loss_clip": 0.01098175, + "auxiliary_loss_mlp": 0.01037098, + "balance_loss_clip": 1.0367372, + "balance_loss_mlp": 1.02352571, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 2.4459533163843394, + "language_loss": 0.76723063, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78858334, + "num_input_tokens_seen": 331616700, + "step": 15370, + "time_per_iteration": 2.482431650161743 + }, + { + "auxiliary_loss_clip": 0.01103976, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.03450656, + "balance_loss_mlp": 1.02298963, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 2.45623226821063, + "language_loss": 0.66422796, + "learning_rate": 5.999517332163528e-08, + "loss": 0.68561554, + "num_input_tokens_seen": 331635625, + "step": 15371, + "time_per_iteration": 2.4696807861328125 + }, + { + "auxiliary_loss_clip": 0.01014214, + "auxiliary_loss_mlp": 0.00999797, + "balance_loss_clip": 1.0101943, + "balance_loss_mlp": 0.99874175, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.7227713739762819, + "language_loss": 0.57732928, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59746939, + "num_input_tokens_seen": 331698595, + "step": 15372, + "time_per_iteration": 4.471449375152588 + }, + { + "auxiliary_loss_clip": 0.01104338, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.03743029, + "balance_loss_mlp": 1.02169633, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 2.008672491639872, + "language_loss": 0.69711292, + "learning_rate": 5.98059678590237e-08, + "loss": 0.71848357, + "num_input_tokens_seen": 331717975, + "step": 15373, + "time_per_iteration": 2.4489212036132812 + }, + { + "auxiliary_loss_clip": 0.01089984, + "auxiliary_loss_mlp": 0.01037548, + "balance_loss_clip": 1.03452051, + "balance_loss_mlp": 1.02531755, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 2.394687020661308, + "language_loss": 0.75264949, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77392483, + "num_input_tokens_seen": 331737220, + "step": 15374, + "time_per_iteration": 3.851379632949829 + }, + { + "auxiliary_loss_clip": 0.01066825, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.03256285, + "balance_loss_mlp": 1.02412224, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 1.7190983868924739, + "language_loss": 0.64867377, + "learning_rate": 5.961705668581784e-08, + "loss": 0.66970217, + "num_input_tokens_seen": 331757300, + "step": 15375, + "time_per_iteration": 3.96248722076416 + }, + { + "auxiliary_loss_clip": 0.01076089, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.03615224, + "balance_loss_mlp": 1.02125204, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 1.7223623955491305, + "language_loss": 0.66198659, + "learning_rate": 5.952271146669829e-08, + "loss": 0.68308085, + "num_input_tokens_seen": 331776995, + "step": 15376, + "time_per_iteration": 2.5903499126434326 + }, + { + "auxiliary_loss_clip": 0.01027326, + "auxiliary_loss_mlp": 0.01002635, + "balance_loss_clip": 1.00462818, + "balance_loss_mlp": 1.00162721, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.6609377435346611, + "language_loss": 0.61162627, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63192582, + "num_input_tokens_seen": 331845015, + "step": 15377, + "time_per_iteration": 3.115239381790161 + }, + { + "auxiliary_loss_clip": 0.01064293, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.03236473, + "balance_loss_mlp": 1.02278185, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 2.0252668275046677, + "language_loss": 0.73811466, + "learning_rate": 5.933424178131341e-08, + "loss": 0.75910854, + "num_input_tokens_seen": 331862795, + "step": 15378, + "time_per_iteration": 2.529099941253662 + }, + { + "auxiliary_loss_clip": 0.01105473, + "auxiliary_loss_mlp": 0.01032464, + "balance_loss_clip": 1.03586149, + "balance_loss_mlp": 1.01966691, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 2.1979902947766026, + "language_loss": 0.62540984, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64678925, + "num_input_tokens_seen": 331882535, + "step": 15379, + "time_per_iteration": 2.553590774536133 + }, + { + "auxiliary_loss_clip": 0.01032486, + "auxiliary_loss_mlp": 0.01033532, + "balance_loss_clip": 1.03467786, + "balance_loss_mlp": 1.02023983, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 1.9987642341465934, + "language_loss": 0.83460689, + "learning_rate": 5.914606645688591e-08, + "loss": 0.85526705, + "num_input_tokens_seen": 331899335, + "step": 15380, + "time_per_iteration": 2.6260673999786377 + }, + { + "auxiliary_loss_clip": 0.01105107, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.03420901, + "balance_loss_mlp": 1.02144551, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 1.4996767121899437, + "language_loss": 0.73561931, + "learning_rate": 5.905208918895233e-08, + "loss": 0.757016, + "num_input_tokens_seen": 331919030, + "step": 15381, + "time_per_iteration": 2.4527904987335205 + }, + { + "auxiliary_loss_clip": 0.0108734, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.03580976, + "balance_loss_mlp": 1.0175823, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 1.9854741355843133, + "language_loss": 0.78451777, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80568242, + "num_input_tokens_seen": 331936465, + "step": 15382, + "time_per_iteration": 3.8872482776641846 + }, + { + "auxiliary_loss_clip": 0.01081141, + "auxiliary_loss_mlp": 0.01034504, + "balance_loss_clip": 1.03367949, + "balance_loss_mlp": 1.02198708, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 1.7429046008149593, + "language_loss": 0.74856293, + "learning_rate": 5.886435545946455e-08, + "loss": 0.76971942, + "num_input_tokens_seen": 331954625, + "step": 15383, + "time_per_iteration": 2.540475368499756 + }, + { + "auxiliary_loss_clip": 0.01080365, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.03276014, + "balance_loss_mlp": 1.0173341, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 1.539443433164708, + "language_loss": 0.75704229, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.77813399, + "num_input_tokens_seen": 331975865, + "step": 15384, + "time_per_iteration": 2.5503287315368652 + }, + { + "auxiliary_loss_clip": 0.01068656, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.03338909, + "balance_loss_mlp": 1.01733804, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 1.9489085690661576, + "language_loss": 0.66393483, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.6849128, + "num_input_tokens_seen": 331992760, + "step": 15385, + "time_per_iteration": 2.4888851642608643 + }, + { + "auxiliary_loss_clip": 0.01101545, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.03393102, + "balance_loss_mlp": 1.02093387, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 2.1098883434117752, + "language_loss": 0.80576068, + "learning_rate": 5.85833069345496e-08, + "loss": 0.82710314, + "num_input_tokens_seen": 332011890, + "step": 15386, + "time_per_iteration": 2.450291395187378 + }, + { + "auxiliary_loss_clip": 0.01090815, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.03427041, + "balance_loss_mlp": 1.02583802, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 1.83224161575174, + "language_loss": 0.75367975, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.77496767, + "num_input_tokens_seen": 332029485, + "step": 15387, + "time_per_iteration": 2.4652528762817383 + }, + { + "auxiliary_loss_clip": 0.01087145, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.03497815, + "balance_loss_mlp": 1.01970494, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 1.893968863020349, + "language_loss": 0.70011342, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72129107, + "num_input_tokens_seen": 332052970, + "step": 15388, + "time_per_iteration": 2.5983080863952637 + }, + { + "auxiliary_loss_clip": 0.0109391, + "auxiliary_loss_mlp": 0.01028416, + "balance_loss_clip": 1.03446317, + "balance_loss_mlp": 1.01644778, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 1.7681515297029704, + "language_loss": 0.82138914, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84261245, + "num_input_tokens_seen": 332070395, + "step": 15389, + "time_per_iteration": 2.5031449794769287 + }, + { + "auxiliary_loss_clip": 0.01100885, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.03660607, + "balance_loss_mlp": 1.02200437, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 1.5986838320859396, + "language_loss": 0.79102898, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81239676, + "num_input_tokens_seen": 332090185, + "step": 15390, + "time_per_iteration": 2.4665005207061768 + }, + { + "auxiliary_loss_clip": 0.01071374, + "auxiliary_loss_mlp": 0.01038677, + "balance_loss_clip": 1.03422856, + "balance_loss_mlp": 1.02604103, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 1.976268903143008, + "language_loss": 0.75472897, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77582949, + "num_input_tokens_seen": 332109050, + "step": 15391, + "time_per_iteration": 2.538593292236328 + }, + { + "auxiliary_loss_clip": 0.01082204, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.03262806, + "balance_loss_mlp": 1.01685965, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 2.235252734866771, + "language_loss": 0.51913512, + "learning_rate": 5.80231976856802e-08, + "loss": 0.54026026, + "num_input_tokens_seen": 332131180, + "step": 15392, + "time_per_iteration": 2.6134872436523438 + }, + { + "auxiliary_loss_clip": 0.01101894, + "auxiliary_loss_mlp": 0.01028434, + "balance_loss_clip": 1.03298056, + "balance_loss_mlp": 1.01721668, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 1.7592401618455358, + "language_loss": 0.77158427, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79288757, + "num_input_tokens_seen": 332149555, + "step": 15393, + "time_per_iteration": 2.5247156620025635 + }, + { + "auxiliary_loss_clip": 0.01073371, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.03342104, + "balance_loss_mlp": 1.01828599, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 1.8397738076963843, + "language_loss": 0.69376028, + "learning_rate": 5.783708368464357e-08, + "loss": 0.71479762, + "num_input_tokens_seen": 332165830, + "step": 15394, + "time_per_iteration": 2.483447790145874 + }, + { + "auxiliary_loss_clip": 0.01104476, + "auxiliary_loss_mlp": 0.01025532, + "balance_loss_clip": 1.03545594, + "balance_loss_mlp": 1.01377165, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 1.7707871513192834, + "language_loss": 0.73049772, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.7517978, + "num_input_tokens_seen": 332185130, + "step": 15395, + "time_per_iteration": 2.453838348388672 + }, + { + "auxiliary_loss_clip": 0.01051122, + "auxiliary_loss_mlp": 0.01026007, + "balance_loss_clip": 1.0308435, + "balance_loss_mlp": 1.01518321, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 2.374298747447846, + "language_loss": 0.72086126, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.74163258, + "num_input_tokens_seen": 332203695, + "step": 15396, + "time_per_iteration": 2.5711939334869385 + }, + { + "auxiliary_loss_clip": 0.01102628, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.0347724, + "balance_loss_mlp": 1.01890922, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 2.2248145382381153, + "language_loss": 0.87380248, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89514303, + "num_input_tokens_seen": 332224850, + "step": 15397, + "time_per_iteration": 2.478447914123535 + }, + { + "auxiliary_loss_clip": 0.01027335, + "auxiliary_loss_mlp": 0.00999258, + "balance_loss_clip": 1.00449681, + "balance_loss_mlp": 0.99815524, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.79799632912146, + "language_loss": 0.55143243, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57169837, + "num_input_tokens_seen": 332278085, + "step": 15398, + "time_per_iteration": 2.9308431148529053 + }, + { + "auxiliary_loss_clip": 0.01083912, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.03382778, + "balance_loss_mlp": 1.01469195, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 1.7544365851555825, + "language_loss": 0.75809175, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.77921736, + "num_input_tokens_seen": 332297875, + "step": 15399, + "time_per_iteration": 2.5535495281219482 + }, + { + "auxiliary_loss_clip": 0.01073188, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.03206468, + "balance_loss_mlp": 1.01866865, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 1.4648693416881415, + "language_loss": 0.78264618, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80367315, + "num_input_tokens_seen": 332318500, + "step": 15400, + "time_per_iteration": 2.5804760456085205 + }, + { + "auxiliary_loss_clip": 0.01018848, + "auxiliary_loss_mlp": 0.01000165, + "balance_loss_clip": 1.00484598, + "balance_loss_mlp": 0.99909174, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.7268442966069679, + "language_loss": 0.5132215, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53341162, + "num_input_tokens_seen": 332381980, + "step": 15401, + "time_per_iteration": 3.058939218521118 + }, + { + "auxiliary_loss_clip": 0.01089096, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.03460217, + "balance_loss_mlp": 1.02101719, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 1.748864649105151, + "language_loss": 0.82514119, + "learning_rate": 5.709557384259378e-08, + "loss": 0.84635323, + "num_input_tokens_seen": 332399510, + "step": 15402, + "time_per_iteration": 2.5010056495666504 + }, + { + "auxiliary_loss_clip": 0.01026931, + "auxiliary_loss_mlp": 0.01001034, + "balance_loss_clip": 1.00418878, + "balance_loss_mlp": 1.00005078, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 1.1426773891896398, + "language_loss": 0.51137328, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53165293, + "num_input_tokens_seen": 332459130, + "step": 15403, + "time_per_iteration": 3.1635212898254395 + }, + { + "auxiliary_loss_clip": 0.0100939, + "auxiliary_loss_mlp": 0.01003052, + "balance_loss_clip": 1.00763011, + "balance_loss_mlp": 1.00194311, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.6877295273819564, + "language_loss": 0.58755225, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60767668, + "num_input_tokens_seen": 332526555, + "step": 15404, + "time_per_iteration": 3.148221254348755 + }, + { + "auxiliary_loss_clip": 0.01077239, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.03444159, + "balance_loss_mlp": 1.01851916, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 2.2248226271810987, + "language_loss": 0.7163853, + "learning_rate": 5.681872319494596e-08, + "loss": 0.73747134, + "num_input_tokens_seen": 332544005, + "step": 15405, + "time_per_iteration": 2.4999518394470215 + }, + { + "auxiliary_loss_clip": 0.01057918, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.03448701, + "balance_loss_mlp": 1.02208591, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 1.6671306240265698, + "language_loss": 0.6889565, + "learning_rate": 5.672658701232458e-08, + "loss": 0.70988625, + "num_input_tokens_seen": 332563070, + "step": 15406, + "time_per_iteration": 2.587157964706421 + }, + { + "auxiliary_loss_clip": 0.01053677, + "auxiliary_loss_mlp": 0.01041128, + "balance_loss_clip": 1.03316557, + "balance_loss_mlp": 1.02650714, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 2.642773167199833, + "language_loss": 0.76514912, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78609723, + "num_input_tokens_seen": 332579620, + "step": 15407, + "time_per_iteration": 2.555332660675049 + }, + { + "auxiliary_loss_clip": 0.01072536, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.03156233, + "balance_loss_mlp": 1.0267365, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 2.46075476997338, + "language_loss": 0.72483408, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.74595845, + "num_input_tokens_seen": 332597795, + "step": 15408, + "time_per_iteration": 2.5495641231536865 + }, + { + "auxiliary_loss_clip": 0.01081093, + "auxiliary_loss_mlp": 0.01027668, + "balance_loss_clip": 1.0345583, + "balance_loss_mlp": 1.01729131, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 1.8021170457418827, + "language_loss": 0.68668216, + "learning_rate": 5.645062061315675e-08, + "loss": 0.70776975, + "num_input_tokens_seen": 332620375, + "step": 15409, + "time_per_iteration": 2.734372138977051 + }, + { + "auxiliary_loss_clip": 0.01072199, + "auxiliary_loss_mlp": 0.01030414, + "balance_loss_clip": 1.03674781, + "balance_loss_mlp": 1.01752162, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 1.9638171079634597, + "language_loss": 0.75648332, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.77750945, + "num_input_tokens_seen": 332639510, + "step": 15410, + "time_per_iteration": 2.594776153564453 + }, + { + "auxiliary_loss_clip": 0.01050374, + "auxiliary_loss_mlp": 0.0102558, + "balance_loss_clip": 1.0338167, + "balance_loss_mlp": 1.01430857, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 1.5062688481332813, + "language_loss": 0.82047796, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84123755, + "num_input_tokens_seen": 332658350, + "step": 15411, + "time_per_iteration": 3.979329824447632 + }, + { + "auxiliary_loss_clip": 0.01080009, + "auxiliary_loss_mlp": 0.01034563, + "balance_loss_clip": 1.03565896, + "balance_loss_mlp": 1.02299953, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 1.8455763248760013, + "language_loss": 0.75187069, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77301645, + "num_input_tokens_seen": 332676715, + "step": 15412, + "time_per_iteration": 3.872934103012085 + }, + { + "auxiliary_loss_clip": 0.01100643, + "auxiliary_loss_mlp": 0.01026743, + "balance_loss_clip": 1.03175187, + "balance_loss_mlp": 1.01492977, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 1.667102953710611, + "language_loss": 0.66883147, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.69010532, + "num_input_tokens_seen": 332701470, + "step": 15413, + "time_per_iteration": 2.550652265548706 + }, + { + "auxiliary_loss_clip": 0.01044918, + "auxiliary_loss_mlp": 0.01035167, + "balance_loss_clip": 1.03218114, + "balance_loss_mlp": 1.02269769, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 1.9459438780915324, + "language_loss": 0.75675893, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.77755982, + "num_input_tokens_seen": 332719060, + "step": 15414, + "time_per_iteration": 3.9760141372680664 + }, + { + "auxiliary_loss_clip": 0.01087427, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.03467679, + "balance_loss_mlp": 1.01741934, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 2.2415205927947772, + "language_loss": 0.81663793, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83780134, + "num_input_tokens_seen": 332736345, + "step": 15415, + "time_per_iteration": 2.459085702896118 + }, + { + "auxiliary_loss_clip": 0.01083257, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.03324342, + "balance_loss_mlp": 1.01966345, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 1.3402473203695986, + "language_loss": 0.54140246, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56255949, + "num_input_tokens_seen": 332756270, + "step": 15416, + "time_per_iteration": 2.5530052185058594 + }, + { + "auxiliary_loss_clip": 0.01061331, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.03114855, + "balance_loss_mlp": 1.02297163, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 1.7892325904070032, + "language_loss": 0.72047389, + "learning_rate": 5.571795325221807e-08, + "loss": 0.7414344, + "num_input_tokens_seen": 332775185, + "step": 15417, + "time_per_iteration": 2.5179495811462402 + }, + { + "auxiliary_loss_clip": 0.0108911, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.03589368, + "balance_loss_mlp": 1.01731396, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 2.3370336434155217, + "language_loss": 0.75573248, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.77691853, + "num_input_tokens_seen": 332794320, + "step": 15418, + "time_per_iteration": 2.5134074687957764 + }, + { + "auxiliary_loss_clip": 0.01090543, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.03313124, + "balance_loss_mlp": 1.01880026, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 1.4784675252558879, + "language_loss": 0.76028329, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78149807, + "num_input_tokens_seen": 332818095, + "step": 15419, + "time_per_iteration": 2.556217670440674 + }, + { + "auxiliary_loss_clip": 0.01098142, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.03289294, + "balance_loss_mlp": 1.02098346, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 1.5954255109916482, + "language_loss": 0.75871468, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.78001344, + "num_input_tokens_seen": 332839860, + "step": 15420, + "time_per_iteration": 2.508079767227173 + }, + { + "auxiliary_loss_clip": 0.01094105, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.03391421, + "balance_loss_mlp": 1.0175215, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 1.5176411891437707, + "language_loss": 0.76578414, + "learning_rate": 5.535338891759389e-08, + "loss": 0.78702319, + "num_input_tokens_seen": 332861155, + "step": 15421, + "time_per_iteration": 2.5219476222991943 + }, + { + "auxiliary_loss_clip": 0.01077934, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.03536201, + "balance_loss_mlp": 1.01789117, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 1.9398939344111723, + "language_loss": 0.72424084, + "learning_rate": 5.526243217829041e-08, + "loss": 0.74531823, + "num_input_tokens_seen": 332881110, + "step": 15422, + "time_per_iteration": 3.9531912803649902 + }, + { + "auxiliary_loss_clip": 0.01095497, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_clip": 1.03453827, + "balance_loss_mlp": 1.02727377, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 1.8877969931312746, + "language_loss": 0.77109879, + "learning_rate": 5.517154918363065e-08, + "loss": 0.79245496, + "num_input_tokens_seen": 332899350, + "step": 15423, + "time_per_iteration": 2.4579670429229736 + }, + { + "auxiliary_loss_clip": 0.01094644, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.0340966, + "balance_loss_mlp": 1.01784825, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 2.1074795708809946, + "language_loss": 0.75507158, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77632165, + "num_input_tokens_seen": 332918105, + "step": 15424, + "time_per_iteration": 2.4743595123291016 + }, + { + "auxiliary_loss_clip": 0.0101941, + "auxiliary_loss_mlp": 0.01004559, + "balance_loss_clip": 1.00589311, + "balance_loss_mlp": 1.00348032, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 0.780796460050688, + "language_loss": 0.60669684, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62693655, + "num_input_tokens_seen": 332969490, + "step": 15425, + "time_per_iteration": 2.8726694583892822 + }, + { + "auxiliary_loss_clip": 0.01083224, + "auxiliary_loss_mlp": 0.00782595, + "balance_loss_clip": 1.03589129, + "balance_loss_mlp": 1.00881016, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 1.412293674811702, + "language_loss": 0.70652258, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72518075, + "num_input_tokens_seen": 332988805, + "step": 15426, + "time_per_iteration": 2.584205389022827 + }, + { + "auxiliary_loss_clip": 0.01079747, + "auxiliary_loss_mlp": 0.01024603, + "balance_loss_clip": 1.03490734, + "balance_loss_mlp": 1.01343298, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 1.8961350588585713, + "language_loss": 0.83281028, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85385382, + "num_input_tokens_seen": 333007960, + "step": 15427, + "time_per_iteration": 2.5107314586639404 + }, + { + "auxiliary_loss_clip": 0.01070489, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.03457546, + "balance_loss_mlp": 1.02142966, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 1.6277949652723425, + "language_loss": 0.76959455, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79063523, + "num_input_tokens_seen": 333026035, + "step": 15428, + "time_per_iteration": 2.5276072025299072 + }, + { + "auxiliary_loss_clip": 0.01063801, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.03083348, + "balance_loss_mlp": 1.02055788, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 1.8638357917778532, + "language_loss": 0.74536353, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.76633352, + "num_input_tokens_seen": 333045590, + "step": 15429, + "time_per_iteration": 2.5658814907073975 + }, + { + "auxiliary_loss_clip": 0.01066215, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.03267145, + "balance_loss_mlp": 1.02080882, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 1.8823653225345707, + "language_loss": 0.74906266, + "learning_rate": 5.45374333601647e-08, + "loss": 0.77005112, + "num_input_tokens_seen": 333063355, + "step": 15430, + "time_per_iteration": 2.516108989715576 + }, + { + "auxiliary_loss_clip": 0.01092662, + "auxiliary_loss_mlp": 0.01031704, + "balance_loss_clip": 1.03373158, + "balance_loss_mlp": 1.01900828, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 1.4866121812942439, + "language_loss": 0.76504046, + "learning_rate": 5.444714044648391e-08, + "loss": 0.78628409, + "num_input_tokens_seen": 333088045, + "step": 15431, + "time_per_iteration": 2.637157440185547 + }, + { + "auxiliary_loss_clip": 0.01091587, + "auxiliary_loss_mlp": 0.01028248, + "balance_loss_clip": 1.03551066, + "balance_loss_mlp": 1.01700068, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 2.1312729126972996, + "language_loss": 0.70574641, + "learning_rate": 5.4356921308363e-08, + "loss": 0.72694474, + "num_input_tokens_seen": 333108005, + "step": 15432, + "time_per_iteration": 2.4997785091400146 + }, + { + "auxiliary_loss_clip": 0.01056614, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.0340302, + "balance_loss_mlp": 1.01759529, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 2.2549730354773163, + "language_loss": 0.82722044, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.84807611, + "num_input_tokens_seen": 333124335, + "step": 15433, + "time_per_iteration": 2.57077693939209 + }, + { + "auxiliary_loss_clip": 0.01099966, + "auxiliary_loss_mlp": 0.01028118, + "balance_loss_clip": 1.03461647, + "balance_loss_mlp": 1.01750207, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 1.888437172768609, + "language_loss": 0.66904479, + "learning_rate": 5.417670437248056e-08, + "loss": 0.69032568, + "num_input_tokens_seen": 333143995, + "step": 15434, + "time_per_iteration": 2.4776577949523926 + }, + { + "auxiliary_loss_clip": 0.01074265, + "auxiliary_loss_mlp": 0.01027807, + "balance_loss_clip": 1.03336883, + "balance_loss_mlp": 1.01657128, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 1.714811976950717, + "language_loss": 0.68567276, + "learning_rate": 5.40867065815529e-08, + "loss": 0.70669347, + "num_input_tokens_seen": 333162805, + "step": 15435, + "time_per_iteration": 2.538637161254883 + }, + { + "auxiliary_loss_clip": 0.01103907, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.03479719, + "balance_loss_mlp": 1.01937819, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 1.9685261560246479, + "language_loss": 0.72037375, + "learning_rate": 5.399678257985263e-08, + "loss": 0.74173069, + "num_input_tokens_seen": 333175770, + "step": 15436, + "time_per_iteration": 2.4414782524108887 + }, + { + "auxiliary_loss_clip": 0.0107827, + "auxiliary_loss_mlp": 0.01029742, + "balance_loss_clip": 1.03401315, + "balance_loss_mlp": 1.01772606, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 2.2399922615829366, + "language_loss": 0.66954112, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69062126, + "num_input_tokens_seen": 333194775, + "step": 15437, + "time_per_iteration": 2.549983501434326 + }, + { + "auxiliary_loss_clip": 0.01092371, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.03571486, + "balance_loss_mlp": 1.01941943, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 2.0744151849336636, + "language_loss": 0.71102142, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73226738, + "num_input_tokens_seen": 333208920, + "step": 15438, + "time_per_iteration": 2.4475314617156982 + }, + { + "auxiliary_loss_clip": 0.01104009, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.0348196, + "balance_loss_mlp": 1.01772261, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 2.2013506338290774, + "language_loss": 0.6444633, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.66580206, + "num_input_tokens_seen": 333229350, + "step": 15439, + "time_per_iteration": 2.49297833442688 + }, + { + "auxiliary_loss_clip": 0.01081419, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.03455591, + "balance_loss_mlp": 1.0162642, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 2.639818521345491, + "language_loss": 0.70320225, + "learning_rate": 5.363782453347876e-08, + "loss": 0.72429931, + "num_input_tokens_seen": 333246125, + "step": 15440, + "time_per_iteration": 2.5373897552490234 + }, + { + "auxiliary_loss_clip": 0.01069291, + "auxiliary_loss_mlp": 0.0078428, + "balance_loss_clip": 1.03422213, + "balance_loss_mlp": 1.00808036, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 1.605873433536233, + "language_loss": 0.76860571, + "learning_rate": 5.354826952900682e-08, + "loss": 0.78714144, + "num_input_tokens_seen": 333263685, + "step": 15441, + "time_per_iteration": 2.627082347869873 + }, + { + "auxiliary_loss_clip": 0.01086824, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.03476918, + "balance_loss_mlp": 1.01754761, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 1.6108503036482649, + "language_loss": 0.63818848, + "learning_rate": 5.345878833417949e-08, + "loss": 0.6593346, + "num_input_tokens_seen": 333282435, + "step": 15442, + "time_per_iteration": 2.4826526641845703 + }, + { + "auxiliary_loss_clip": 0.01069256, + "auxiliary_loss_mlp": 0.01039946, + "balance_loss_clip": 1.03593886, + "balance_loss_mlp": 1.02635622, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 2.013619196925685, + "language_loss": 0.80664122, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.82773322, + "num_input_tokens_seen": 333300400, + "step": 15443, + "time_per_iteration": 2.527553081512451 + }, + { + "auxiliary_loss_clip": 0.01093697, + "auxiliary_loss_mlp": 0.0078494, + "balance_loss_clip": 1.03472304, + "balance_loss_mlp": 1.01227629, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 1.8270474195825444, + "language_loss": 0.65384269, + "learning_rate": 5.328004738702896e-08, + "loss": 0.672629, + "num_input_tokens_seen": 333318980, + "step": 15444, + "time_per_iteration": 2.491356134414673 + }, + { + "auxiliary_loss_clip": 0.01062443, + "auxiliary_loss_mlp": 0.01032078, + "balance_loss_clip": 1.0350368, + "balance_loss_mlp": 1.0201745, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 2.168998446700989, + "language_loss": 0.72800446, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.74894965, + "num_input_tokens_seen": 333334135, + "step": 15445, + "time_per_iteration": 2.536442518234253 + }, + { + "auxiliary_loss_clip": 0.01087802, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.03388965, + "balance_loss_mlp": 1.01998115, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 1.9406220087185442, + "language_loss": 0.7123338, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73353767, + "num_input_tokens_seen": 333353325, + "step": 15446, + "time_per_iteration": 2.49086332321167 + }, + { + "auxiliary_loss_clip": 0.01044249, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.03227544, + "balance_loss_mlp": 1.01638567, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 1.7547306280819275, + "language_loss": 0.69272208, + "learning_rate": 5.301248962337523e-08, + "loss": 0.71345073, + "num_input_tokens_seen": 333371110, + "step": 15447, + "time_per_iteration": 2.5949947834014893 + }, + { + "auxiliary_loss_clip": 0.01096189, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.03297496, + "balance_loss_mlp": 1.01864719, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 1.8391273972542055, + "language_loss": 0.72654808, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74780226, + "num_input_tokens_seen": 333391420, + "step": 15448, + "time_per_iteration": 2.456153154373169 + }, + { + "auxiliary_loss_clip": 0.01102069, + "auxiliary_loss_mlp": 0.01028006, + "balance_loss_clip": 1.03415418, + "balance_loss_mlp": 1.01479721, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 1.6387654177658173, + "language_loss": 0.74063987, + "learning_rate": 5.283448692511072e-08, + "loss": 0.7619406, + "num_input_tokens_seen": 333410365, + "step": 15449, + "time_per_iteration": 3.8281748294830322 + }, + { + "auxiliary_loss_clip": 0.01102611, + "auxiliary_loss_mlp": 0.00783214, + "balance_loss_clip": 1.03379226, + "balance_loss_mlp": 1.01020932, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 1.8330137750397721, + "language_loss": 0.67407215, + "learning_rate": 5.27455963293586e-08, + "loss": 0.69293046, + "num_input_tokens_seen": 333430000, + "step": 15450, + "time_per_iteration": 2.5545027256011963 + }, + { + "auxiliary_loss_clip": 0.01072049, + "auxiliary_loss_mlp": 0.01027195, + "balance_loss_clip": 1.0336442, + "balance_loss_mlp": 1.01519048, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 1.9008800172699156, + "language_loss": 0.71660686, + "learning_rate": 5.265677957368875e-08, + "loss": 0.73759931, + "num_input_tokens_seen": 333445800, + "step": 15451, + "time_per_iteration": 3.8965165615081787 + }, + { + "auxiliary_loss_clip": 0.01079039, + "auxiliary_loss_mlp": 0.01043849, + "balance_loss_clip": 1.0326767, + "balance_loss_mlp": 1.03046775, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 2.012711465812543, + "language_loss": 0.73305357, + "learning_rate": 5.25680366614687e-08, + "loss": 0.75428247, + "num_input_tokens_seen": 333461550, + "step": 15452, + "time_per_iteration": 2.467413902282715 + }, + { + "auxiliary_loss_clip": 0.01086064, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.03803575, + "balance_loss_mlp": 1.02028191, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 1.6463889642352727, + "language_loss": 0.7423954, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76358271, + "num_input_tokens_seen": 333478835, + "step": 15453, + "time_per_iteration": 3.915983200073242 + }, + { + "auxiliary_loss_clip": 0.01000565, + "auxiliary_loss_mlp": 0.01001041, + "balance_loss_clip": 1.01029754, + "balance_loss_mlp": 0.99994463, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 0.8182908789499966, + "language_loss": 0.6071986, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62721467, + "num_input_tokens_seen": 333535250, + "step": 15454, + "time_per_iteration": 3.05910587310791 + }, + { + "auxiliary_loss_clip": 0.01078916, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.03232718, + "balance_loss_mlp": 1.02448428, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 1.6828635112979977, + "language_loss": 0.68833029, + "learning_rate": 5.230225101914709e-08, + "loss": 0.70949841, + "num_input_tokens_seen": 333553805, + "step": 15455, + "time_per_iteration": 2.516469717025757 + }, + { + "auxiliary_loss_clip": 0.01067571, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.03657985, + "balance_loss_mlp": 1.0226295, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 6.6324225800736025, + "language_loss": 0.64639515, + "learning_rate": 5.22138035143509e-08, + "loss": 0.66742754, + "num_input_tokens_seen": 333572800, + "step": 15456, + "time_per_iteration": 2.556259870529175 + }, + { + "auxiliary_loss_clip": 0.01059314, + "auxiliary_loss_mlp": 0.01030241, + "balance_loss_clip": 1.03401494, + "balance_loss_mlp": 1.01783705, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 2.0424474973249307, + "language_loss": 0.6879319, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70882744, + "num_input_tokens_seen": 333588520, + "step": 15457, + "time_per_iteration": 2.557140827178955 + }, + { + "auxiliary_loss_clip": 0.01081238, + "auxiliary_loss_mlp": 0.01025801, + "balance_loss_clip": 1.03318048, + "balance_loss_mlp": 1.01444006, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 2.0196655821843064, + "language_loss": 0.8068229, + "learning_rate": 5.203713008885291e-08, + "loss": 0.82789332, + "num_input_tokens_seen": 333603435, + "step": 15458, + "time_per_iteration": 2.4694786071777344 + }, + { + "auxiliary_loss_clip": 0.01093773, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.03412843, + "balance_loss_mlp": 1.02045393, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 1.587371724271458, + "language_loss": 0.72381294, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74507523, + "num_input_tokens_seen": 333623305, + "step": 15459, + "time_per_iteration": 2.5055882930755615 + }, + { + "auxiliary_loss_clip": 0.01068301, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.03611398, + "balance_loss_mlp": 1.02410388, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 2.084138991294169, + "language_loss": 0.58635759, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.60740077, + "num_input_tokens_seen": 333641205, + "step": 15460, + "time_per_iteration": 4.046386003494263 + }, + { + "auxiliary_loss_clip": 0.01056133, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.03387082, + "balance_loss_mlp": 1.02060866, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 1.7868684977279137, + "language_loss": 0.80504006, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82593518, + "num_input_tokens_seen": 333659615, + "step": 15461, + "time_per_iteration": 2.6313486099243164 + }, + { + "auxiliary_loss_clip": 0.0107881, + "auxiliary_loss_mlp": 0.01025726, + "balance_loss_clip": 1.03351128, + "balance_loss_mlp": 1.01394796, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 1.8167243234309478, + "language_loss": 0.78833532, + "learning_rate": 5.168466966796869e-08, + "loss": 0.80938065, + "num_input_tokens_seen": 333678985, + "step": 15462, + "time_per_iteration": 2.513051748275757 + }, + { + "auxiliary_loss_clip": 0.01061497, + "auxiliary_loss_mlp": 0.01027808, + "balance_loss_clip": 1.02985752, + "balance_loss_mlp": 1.01539838, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 1.7952466371220337, + "language_loss": 0.62353015, + "learning_rate": 5.159673925518282e-08, + "loss": 0.64442319, + "num_input_tokens_seen": 333696410, + "step": 15463, + "time_per_iteration": 2.51395320892334 + }, + { + "auxiliary_loss_clip": 0.01077921, + "auxiliary_loss_mlp": 0.01027095, + "balance_loss_clip": 1.030617, + "balance_loss_mlp": 1.01553142, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 1.5000243734351142, + "language_loss": 0.71391416, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73496437, + "num_input_tokens_seen": 333716615, + "step": 15464, + "time_per_iteration": 2.5802695751190186 + }, + { + "auxiliary_loss_clip": 0.0107697, + "auxiliary_loss_mlp": 0.01029987, + "balance_loss_clip": 1.03203249, + "balance_loss_mlp": 1.01807809, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 1.7873810120371691, + "language_loss": 0.77466083, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79573029, + "num_input_tokens_seen": 333732800, + "step": 15465, + "time_per_iteration": 2.472368001937866 + }, + { + "auxiliary_loss_clip": 0.0097874, + "auxiliary_loss_mlp": 0.01002534, + "balance_loss_clip": 1.01079082, + "balance_loss_mlp": 1.00137162, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.6938523396329327, + "language_loss": 0.56450802, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58432078, + "num_input_tokens_seen": 333799300, + "step": 15466, + "time_per_iteration": 3.3992204666137695 + }, + { + "auxiliary_loss_clip": 0.01080587, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.03203821, + "balance_loss_mlp": 1.02243888, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 1.4025755472881758, + "language_loss": 0.72720993, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.7483688, + "num_input_tokens_seen": 333820360, + "step": 15467, + "time_per_iteration": 2.661621570587158 + }, + { + "auxiliary_loss_clip": 0.01082837, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.03506052, + "balance_loss_mlp": 1.01864266, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 1.871569804077974, + "language_loss": 0.72010851, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.74124759, + "num_input_tokens_seen": 333840415, + "step": 15468, + "time_per_iteration": 2.5491104125976562 + }, + { + "auxiliary_loss_clip": 0.01090797, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.03329575, + "balance_loss_mlp": 1.02060914, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 1.6752592265777597, + "language_loss": 0.75599241, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77724433, + "num_input_tokens_seen": 333859910, + "step": 15469, + "time_per_iteration": 2.4881863594055176 + }, + { + "auxiliary_loss_clip": 0.01076423, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.03464091, + "balance_loss_mlp": 1.02101827, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 2.1607661973072356, + "language_loss": 0.75564313, + "learning_rate": 5.098329529416379e-08, + "loss": 0.77673757, + "num_input_tokens_seen": 333880495, + "step": 15470, + "time_per_iteration": 2.564033269882202 + }, + { + "auxiliary_loss_clip": 0.01068945, + "auxiliary_loss_mlp": 0.01028493, + "balance_loss_clip": 1.03426814, + "balance_loss_mlp": 1.01759124, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 1.4791940481755939, + "language_loss": 0.74397671, + "learning_rate": 5.089595604367902e-08, + "loss": 0.76495111, + "num_input_tokens_seen": 333897640, + "step": 15471, + "time_per_iteration": 2.5318474769592285 + }, + { + "auxiliary_loss_clip": 0.01087421, + "auxiliary_loss_mlp": 0.01028755, + "balance_loss_clip": 1.03350401, + "balance_loss_mlp": 1.01676214, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 2.447466065519673, + "language_loss": 0.69845939, + "learning_rate": 5.080869070341487e-08, + "loss": 0.71962112, + "num_input_tokens_seen": 333913670, + "step": 15472, + "time_per_iteration": 2.462728977203369 + }, + { + "auxiliary_loss_clip": 0.01074283, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.03346121, + "balance_loss_mlp": 1.02155113, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 1.8977869514473331, + "language_loss": 0.88259459, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.90366447, + "num_input_tokens_seen": 333934105, + "step": 15473, + "time_per_iteration": 2.50498104095459 + }, + { + "auxiliary_loss_clip": 0.01081147, + "auxiliary_loss_mlp": 0.01035799, + "balance_loss_clip": 1.03496599, + "balance_loss_mlp": 1.02229905, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 1.9115938135971617, + "language_loss": 0.64471453, + "learning_rate": 5.063438176678203e-08, + "loss": 0.66588402, + "num_input_tokens_seen": 333953635, + "step": 15474, + "time_per_iteration": 2.54056453704834 + }, + { + "auxiliary_loss_clip": 0.01103268, + "auxiliary_loss_mlp": 0.01032651, + "balance_loss_clip": 1.03474247, + "balance_loss_mlp": 1.02079606, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 1.7272735630499318, + "language_loss": 0.74395394, + "learning_rate": 5.054733817702339e-08, + "loss": 0.76531315, + "num_input_tokens_seen": 333971825, + "step": 15475, + "time_per_iteration": 2.4207701683044434 + }, + { + "auxiliary_loss_clip": 0.01089401, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.03282225, + "balance_loss_mlp": 1.01604271, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 2.1947457106217882, + "language_loss": 0.66769385, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68886077, + "num_input_tokens_seen": 333990120, + "step": 15476, + "time_per_iteration": 2.5585567951202393 + }, + { + "auxiliary_loss_clip": 0.01059355, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.0352695, + "balance_loss_mlp": 1.01837826, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 1.9014785368514089, + "language_loss": 0.68573046, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.70663214, + "num_input_tokens_seen": 334007970, + "step": 15477, + "time_per_iteration": 2.54782772064209 + }, + { + "auxiliary_loss_clip": 0.01082786, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.03600514, + "balance_loss_mlp": 1.01768112, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 1.7437746280391557, + "language_loss": 0.58449262, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60560781, + "num_input_tokens_seen": 334027120, + "step": 15478, + "time_per_iteration": 2.57336163520813 + }, + { + "auxiliary_loss_clip": 0.0108152, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.03540134, + "balance_loss_mlp": 1.01915288, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 1.721964097341294, + "language_loss": 0.78596717, + "learning_rate": 5.01999030853566e-08, + "loss": 0.80710644, + "num_input_tokens_seen": 334042785, + "step": 15479, + "time_per_iteration": 2.4898979663848877 + }, + { + "auxiliary_loss_clip": 0.01104404, + "auxiliary_loss_mlp": 0.0103366, + "balance_loss_clip": 1.03537488, + "balance_loss_mlp": 1.02212012, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 1.6376356236709904, + "language_loss": 0.68858469, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.70996535, + "num_input_tokens_seen": 334063480, + "step": 15480, + "time_per_iteration": 2.578665256500244 + }, + { + "auxiliary_loss_clip": 0.01103793, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.03549671, + "balance_loss_mlp": 1.01691771, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 1.7442121070114822, + "language_loss": 0.67852598, + "learning_rate": 5.002662914604583e-08, + "loss": 0.69985211, + "num_input_tokens_seen": 334082005, + "step": 15481, + "time_per_iteration": 2.4239730834960938 + }, + { + "auxiliary_loss_clip": 0.01076233, + "auxiliary_loss_mlp": 0.01032364, + "balance_loss_clip": 1.03126299, + "balance_loss_mlp": 1.01970363, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 1.8920002367837583, + "language_loss": 0.74505603, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76614189, + "num_input_tokens_seen": 334101375, + "step": 15482, + "time_per_iteration": 2.520158290863037 + }, + { + "auxiliary_loss_clip": 0.0108987, + "auxiliary_loss_mlp": 0.01029197, + "balance_loss_clip": 1.03237998, + "balance_loss_mlp": 1.01760435, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 3.120587550206097, + "language_loss": 0.8008129, + "learning_rate": 4.985365097947469e-08, + "loss": 0.8220036, + "num_input_tokens_seen": 334119460, + "step": 15483, + "time_per_iteration": 2.4460291862487793 + }, + { + "auxiliary_loss_clip": 0.01078089, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.03478789, + "balance_loss_mlp": 1.01907003, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 1.847882433549484, + "language_loss": 0.74930096, + "learning_rate": 4.976727281916782e-08, + "loss": 0.77039242, + "num_input_tokens_seen": 334136065, + "step": 15484, + "time_per_iteration": 2.48758602142334 + }, + { + "auxiliary_loss_clip": 0.01083635, + "auxiliary_loss_mlp": 0.0103302, + "balance_loss_clip": 1.03830552, + "balance_loss_mlp": 1.02064633, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 2.1761509454310213, + "language_loss": 0.75995433, + "learning_rate": 4.968096861188087e-08, + "loss": 0.7811209, + "num_input_tokens_seen": 334153690, + "step": 15485, + "time_per_iteration": 2.4905846118927 + }, + { + "auxiliary_loss_clip": 0.01059041, + "auxiliary_loss_mlp": 0.01031341, + "balance_loss_clip": 1.03264129, + "balance_loss_mlp": 1.01770914, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 1.7125932370063621, + "language_loss": 0.78545862, + "learning_rate": 4.959473836088723e-08, + "loss": 0.80636239, + "num_input_tokens_seen": 334171880, + "step": 15486, + "time_per_iteration": 2.5919508934020996 + }, + { + "auxiliary_loss_clip": 0.01074176, + "auxiliary_loss_mlp": 0.0102814, + "balance_loss_clip": 1.03695214, + "balance_loss_mlp": 1.0154258, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 4.411187977766473, + "language_loss": 0.76814902, + "learning_rate": 4.950858206945674e-08, + "loss": 0.78917217, + "num_input_tokens_seen": 334190005, + "step": 15487, + "time_per_iteration": 2.5567755699157715 + }, + { + "auxiliary_loss_clip": 0.0107361, + "auxiliary_loss_mlp": 0.0102644, + "balance_loss_clip": 1.03438878, + "balance_loss_mlp": 1.01383924, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 4.238485473268984, + "language_loss": 0.67295229, + "learning_rate": 4.942249974085633e-08, + "loss": 0.6939528, + "num_input_tokens_seen": 334209545, + "step": 15488, + "time_per_iteration": 4.029850006103516 + }, + { + "auxiliary_loss_clip": 0.01078741, + "auxiliary_loss_mlp": 0.01030396, + "balance_loss_clip": 1.03364098, + "balance_loss_mlp": 1.01820111, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 1.7226598339155603, + "language_loss": 0.74971867, + "learning_rate": 4.933649137834983e-08, + "loss": 0.77081001, + "num_input_tokens_seen": 334228900, + "step": 15489, + "time_per_iteration": 3.89424729347229 + }, + { + "auxiliary_loss_clip": 0.0110504, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.03492439, + "balance_loss_mlp": 1.01686597, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 2.243174480892999, + "language_loss": 0.80555081, + "learning_rate": 4.925055698519931e-08, + "loss": 0.82689363, + "num_input_tokens_seen": 334245500, + "step": 15490, + "time_per_iteration": 2.4214470386505127 + }, + { + "auxiliary_loss_clip": 0.01058182, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.03556705, + "balance_loss_mlp": 1.018641, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 2.4430295687596386, + "language_loss": 0.71927524, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.74017012, + "num_input_tokens_seen": 334264370, + "step": 15491, + "time_per_iteration": 4.128579616546631 + }, + { + "auxiliary_loss_clip": 0.01078973, + "auxiliary_loss_mlp": 0.00783024, + "balance_loss_clip": 1.03291965, + "balance_loss_mlp": 1.00931311, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 1.8395238186678844, + "language_loss": 0.74404228, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76266223, + "num_input_tokens_seen": 334283905, + "step": 15492, + "time_per_iteration": 2.598203182220459 + }, + { + "auxiliary_loss_clip": 0.0101889, + "auxiliary_loss_mlp": 0.00999992, + "balance_loss_clip": 1.00499797, + "balance_loss_mlp": 0.99881762, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.7127484987900294, + "language_loss": 0.53430343, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55449224, + "num_input_tokens_seen": 334339925, + "step": 15493, + "time_per_iteration": 2.9703893661499023 + }, + { + "auxiliary_loss_clip": 0.01092448, + "auxiliary_loss_mlp": 0.01029188, + "balance_loss_clip": 1.03391993, + "balance_loss_mlp": 1.01793504, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 1.6751800677997006, + "language_loss": 0.70901692, + "learning_rate": 4.890755917128531e-08, + "loss": 0.73023319, + "num_input_tokens_seen": 334357225, + "step": 15494, + "time_per_iteration": 2.4962096214294434 + }, + { + "auxiliary_loss_clip": 0.01093927, + "auxiliary_loss_mlp": 0.01027613, + "balance_loss_clip": 1.03387678, + "balance_loss_mlp": 1.01560235, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 1.6213232993414854, + "language_loss": 0.68269932, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70391464, + "num_input_tokens_seen": 334375945, + "step": 15495, + "time_per_iteration": 2.5363314151763916 + }, + { + "auxiliary_loss_clip": 0.01099868, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.0328393, + "balance_loss_mlp": 1.02433431, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 1.915913762923675, + "language_loss": 0.61600518, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.637357, + "num_input_tokens_seen": 334395310, + "step": 15496, + "time_per_iteration": 2.512446165084839 + }, + { + "auxiliary_loss_clip": 0.01092425, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.0346024, + "balance_loss_mlp": 1.01919329, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 1.5810137622274227, + "language_loss": 0.77250952, + "learning_rate": 4.865108764847825e-08, + "loss": 0.79374373, + "num_input_tokens_seen": 334416965, + "step": 15497, + "time_per_iteration": 2.583953857421875 + }, + { + "auxiliary_loss_clip": 0.01096573, + "auxiliary_loss_mlp": 0.00785881, + "balance_loss_clip": 1.03658676, + "balance_loss_mlp": 1.01219702, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 1.7797766571067648, + "language_loss": 0.66823196, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68705648, + "num_input_tokens_seen": 334435620, + "step": 15498, + "time_per_iteration": 3.92992901802063 + }, + { + "auxiliary_loss_clip": 0.0108268, + "auxiliary_loss_mlp": 0.01037119, + "balance_loss_clip": 1.03501105, + "balance_loss_mlp": 1.02492964, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 1.7281008640361637, + "language_loss": 0.79843867, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.8196367, + "num_input_tokens_seen": 334456210, + "step": 15499, + "time_per_iteration": 2.5320544242858887 + }, + { + "auxiliary_loss_clip": 0.01060001, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.03336787, + "balance_loss_mlp": 1.01758862, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 1.4222760746861716, + "language_loss": 0.76916218, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.79006219, + "num_input_tokens_seen": 334475485, + "step": 15500, + "time_per_iteration": 2.5794692039489746 + }, + { + "auxiliary_loss_clip": 0.01067975, + "auxiliary_loss_mlp": 0.01023133, + "balance_loss_clip": 1.03127313, + "balance_loss_mlp": 1.01160502, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 1.9618571774084552, + "language_loss": 0.72217214, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.74308324, + "num_input_tokens_seen": 334494740, + "step": 15501, + "time_per_iteration": 2.5605766773223877 + }, + { + "auxiliary_loss_clip": 0.01105864, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.03542423, + "balance_loss_mlp": 1.02249205, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 1.723885160170039, + "language_loss": 0.6627984, + "learning_rate": 4.822511506047666e-08, + "loss": 0.68420464, + "num_input_tokens_seen": 334511910, + "step": 15502, + "time_per_iteration": 2.418593645095825 + }, + { + "auxiliary_loss_clip": 0.01094747, + "auxiliary_loss_mlp": 0.00784884, + "balance_loss_clip": 1.03504062, + "balance_loss_mlp": 1.01189542, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 1.4093453331448773, + "language_loss": 0.65876698, + "learning_rate": 4.814014256446586e-08, + "loss": 0.67756325, + "num_input_tokens_seen": 334533150, + "step": 15503, + "time_per_iteration": 2.528770923614502 + }, + { + "auxiliary_loss_clip": 0.01067428, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.03158188, + "balance_loss_mlp": 1.0236609, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 1.5662289965736342, + "language_loss": 0.75053215, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77157199, + "num_input_tokens_seen": 334550940, + "step": 15504, + "time_per_iteration": 2.557810068130493 + }, + { + "auxiliary_loss_clip": 0.01094515, + "auxiliary_loss_mlp": 0.00784607, + "balance_loss_clip": 1.03543746, + "balance_loss_mlp": 1.01000094, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 5.979121210175421, + "language_loss": 0.71249986, + "learning_rate": 4.797041961982762e-08, + "loss": 0.73129106, + "num_input_tokens_seen": 334570935, + "step": 15505, + "time_per_iteration": 2.5425987243652344 + }, + { + "auxiliary_loss_clip": 0.01083514, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.03379118, + "balance_loss_mlp": 1.01734793, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 1.8739041335278073, + "language_loss": 0.75406373, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77519858, + "num_input_tokens_seen": 334589315, + "step": 15506, + "time_per_iteration": 2.4860713481903076 + }, + { + "auxiliary_loss_clip": 0.01064047, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.03282475, + "balance_loss_mlp": 1.01637733, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 1.8351445126661334, + "language_loss": 0.83017945, + "learning_rate": 4.780099275981597e-08, + "loss": 0.851098, + "num_input_tokens_seen": 334608990, + "step": 15507, + "time_per_iteration": 2.5753467082977295 + }, + { + "auxiliary_loss_clip": 0.0110473, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.03496099, + "balance_loss_mlp": 1.01975965, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 1.4564449451306887, + "language_loss": 0.67833197, + "learning_rate": 4.771639036957742e-08, + "loss": 0.69969487, + "num_input_tokens_seen": 334628655, + "step": 15508, + "time_per_iteration": 2.4422402381896973 + }, + { + "auxiliary_loss_clip": 0.01068983, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.03563571, + "balance_loss_mlp": 1.01908231, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 1.628027514705778, + "language_loss": 0.71919239, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.74019277, + "num_input_tokens_seen": 334648295, + "step": 15509, + "time_per_iteration": 2.5762388706207275 + }, + { + "auxiliary_loss_clip": 0.01092222, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.03387463, + "balance_loss_mlp": 1.02036572, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 2.101325209076579, + "language_loss": 0.74551237, + "learning_rate": 4.754740768467624e-08, + "loss": 0.76675862, + "num_input_tokens_seen": 334666280, + "step": 15510, + "time_per_iteration": 2.4660046100616455 + }, + { + "auxiliary_loss_clip": 0.01095551, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.03310657, + "balance_loss_mlp": 1.01618373, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 1.5051994410292724, + "language_loss": 0.70422053, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72545958, + "num_input_tokens_seen": 334688830, + "step": 15511, + "time_per_iteration": 2.553065538406372 + }, + { + "auxiliary_loss_clip": 0.01077232, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.03350592, + "balance_loss_mlp": 1.02527213, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 1.784709660094255, + "language_loss": 0.78034186, + "learning_rate": 4.737872114856412e-08, + "loss": 0.8014915, + "num_input_tokens_seen": 334705205, + "step": 15512, + "time_per_iteration": 2.506049156188965 + }, + { + "auxiliary_loss_clip": 0.01101532, + "auxiliary_loss_mlp": 0.0102778, + "balance_loss_clip": 1.03356934, + "balance_loss_mlp": 1.01538277, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 1.426125854159295, + "language_loss": 0.80328441, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.82457757, + "num_input_tokens_seen": 334723830, + "step": 15513, + "time_per_iteration": 2.5029296875 + }, + { + "auxiliary_loss_clip": 0.01089119, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.03771067, + "balance_loss_mlp": 1.01856267, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 2.006099880763913, + "language_loss": 0.80142713, + "learning_rate": 4.721033078682768e-08, + "loss": 0.82263571, + "num_input_tokens_seen": 334740825, + "step": 15514, + "time_per_iteration": 2.4727063179016113 + }, + { + "auxiliary_loss_clip": 0.01077555, + "auxiliary_loss_mlp": 0.01037006, + "balance_loss_clip": 1.0365026, + "balance_loss_mlp": 1.02522779, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 1.7564535612918173, + "language_loss": 0.71562952, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.7367751, + "num_input_tokens_seen": 334765825, + "step": 15515, + "time_per_iteration": 2.7149147987365723 + }, + { + "auxiliary_loss_clip": 0.01086746, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.03483808, + "balance_loss_mlp": 1.02004194, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 2.444788436432688, + "language_loss": 0.80735946, + "learning_rate": 4.704223662500806e-08, + "loss": 0.82855749, + "num_input_tokens_seen": 334782680, + "step": 15516, + "time_per_iteration": 2.471937417984009 + }, + { + "auxiliary_loss_clip": 0.01067184, + "auxiliary_loss_mlp": 0.01038659, + "balance_loss_clip": 1.03284955, + "balance_loss_mlp": 1.02529597, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 1.655901220347938, + "language_loss": 0.8057375, + "learning_rate": 4.695830062703643e-08, + "loss": 0.82679594, + "num_input_tokens_seen": 334800160, + "step": 15517, + "time_per_iteration": 2.5508930683135986 + }, + { + "auxiliary_loss_clip": 0.01084051, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.03460526, + "balance_loss_mlp": 1.01822209, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 1.882374073254307, + "language_loss": 0.7450012, + "learning_rate": 4.687443868860219e-08, + "loss": 0.76614618, + "num_input_tokens_seen": 334815840, + "step": 15518, + "time_per_iteration": 2.4868078231811523 + }, + { + "auxiliary_loss_clip": 0.01080262, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.03400517, + "balance_loss_mlp": 1.01880074, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 2.0900213429569017, + "language_loss": 0.75615287, + "learning_rate": 4.679065081288458e-08, + "loss": 0.77726746, + "num_input_tokens_seen": 334834735, + "step": 15519, + "time_per_iteration": 2.52270245552063 + }, + { + "auxiliary_loss_clip": 0.01049017, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.03215909, + "balance_loss_mlp": 1.02108002, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 2.0270770459292233, + "language_loss": 0.83084309, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.85167146, + "num_input_tokens_seen": 334853490, + "step": 15520, + "time_per_iteration": 2.604471445083618 + }, + { + "auxiliary_loss_clip": 0.01089939, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.03244627, + "balance_loss_mlp": 1.01708031, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 1.6068848894157632, + "language_loss": 0.7662074, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78739685, + "num_input_tokens_seen": 334873675, + "step": 15521, + "time_per_iteration": 2.4885141849517822 + }, + { + "auxiliary_loss_clip": 0.010953, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_clip": 1.03707552, + "balance_loss_mlp": 1.01940811, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 1.7177867917623644, + "language_loss": 0.77848732, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.79974985, + "num_input_tokens_seen": 334890970, + "step": 15522, + "time_per_iteration": 2.453705072402954 + }, + { + "auxiliary_loss_clip": 0.01068803, + "auxiliary_loss_mlp": 0.00783069, + "balance_loss_clip": 1.03230977, + "balance_loss_mlp": 1.01034081, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 2.6446528829272298, + "language_loss": 0.62708914, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.64560783, + "num_input_tokens_seen": 334906635, + "step": 15523, + "time_per_iteration": 2.528109312057495 + }, + { + "auxiliary_loss_clip": 0.01081344, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.03426123, + "balance_loss_mlp": 1.02078843, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 1.5647328304117871, + "language_loss": 0.68109632, + "learning_rate": 4.63728224861577e-08, + "loss": 0.70223713, + "num_input_tokens_seen": 334926230, + "step": 15524, + "time_per_iteration": 2.532000780105591 + }, + { + "auxiliary_loss_clip": 0.01059983, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.03170598, + "balance_loss_mlp": 1.02185619, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 1.7122214993255171, + "language_loss": 0.73888987, + "learning_rate": 4.628947905336589e-08, + "loss": 0.75983441, + "num_input_tokens_seen": 334946680, + "step": 15525, + "time_per_iteration": 2.6029868125915527 + }, + { + "auxiliary_loss_clip": 0.01049303, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.03119624, + "balance_loss_mlp": 1.02566123, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 1.7692827987530468, + "language_loss": 0.83849341, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.85936701, + "num_input_tokens_seen": 334964785, + "step": 15526, + "time_per_iteration": 2.58610200881958 + }, + { + "auxiliary_loss_clip": 0.01059531, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.03460693, + "balance_loss_mlp": 1.01676214, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 1.875832778413, + "language_loss": 0.68893361, + "learning_rate": 4.61230144456366e-08, + "loss": 0.70982134, + "num_input_tokens_seen": 334982400, + "step": 15527, + "time_per_iteration": 3.934974193572998 + }, + { + "auxiliary_loss_clip": 0.01106639, + "auxiliary_loss_mlp": 0.01029971, + "balance_loss_clip": 1.03570569, + "balance_loss_mlp": 1.01630414, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 1.9028159455912952, + "language_loss": 0.64884359, + "learning_rate": 4.603989327701141e-08, + "loss": 0.67020965, + "num_input_tokens_seen": 334999685, + "step": 15528, + "time_per_iteration": 3.8269989490509033 + }, + { + "auxiliary_loss_clip": 0.01104768, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.03431773, + "balance_loss_mlp": 1.02067065, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 1.7922427129233063, + "language_loss": 0.74562997, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.76701295, + "num_input_tokens_seen": 335019160, + "step": 15529, + "time_per_iteration": 3.8532514572143555 + }, + { + "auxiliary_loss_clip": 0.01056321, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.03073692, + "balance_loss_mlp": 1.0174669, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 1.708738296913162, + "language_loss": 0.63120186, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.65205741, + "num_input_tokens_seen": 335037350, + "step": 15530, + "time_per_iteration": 2.5583834648132324 + }, + { + "auxiliary_loss_clip": 0.01079763, + "auxiliary_loss_mlp": 0.01028003, + "balance_loss_clip": 1.03394532, + "balance_loss_mlp": 1.01630306, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 2.3717172724104514, + "language_loss": 0.7228055, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74388319, + "num_input_tokens_seen": 335056060, + "step": 15531, + "time_per_iteration": 2.5294857025146484 + }, + { + "auxiliary_loss_clip": 0.0108084, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.03242373, + "balance_loss_mlp": 1.02117956, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 1.6849369807760675, + "language_loss": 0.70789474, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.72903883, + "num_input_tokens_seen": 335075410, + "step": 15532, + "time_per_iteration": 2.558807611465454 + }, + { + "auxiliary_loss_clip": 0.0110487, + "auxiliary_loss_mlp": 0.00782104, + "balance_loss_clip": 1.03457427, + "balance_loss_mlp": 1.00758767, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 1.781644393979094, + "language_loss": 0.72965479, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.74852455, + "num_input_tokens_seen": 335095190, + "step": 15533, + "time_per_iteration": 2.5111985206604004 + }, + { + "auxiliary_loss_clip": 0.01069211, + "auxiliary_loss_mlp": 0.01025536, + "balance_loss_clip": 1.03304183, + "balance_loss_mlp": 1.01387715, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 1.8724124058007001, + "language_loss": 0.79493463, + "learning_rate": 4.554272235700507e-08, + "loss": 0.81588209, + "num_input_tokens_seen": 335113825, + "step": 15534, + "time_per_iteration": 2.5196497440338135 + }, + { + "auxiliary_loss_clip": 0.01098209, + "auxiliary_loss_mlp": 0.01025898, + "balance_loss_clip": 1.03526783, + "balance_loss_mlp": 1.01547861, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 1.6618576278559234, + "language_loss": 0.74430954, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76555061, + "num_input_tokens_seen": 335136425, + "step": 15535, + "time_per_iteration": 2.4999678134918213 + }, + { + "auxiliary_loss_clip": 0.01089417, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.03605938, + "balance_loss_mlp": 1.01718712, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 1.8240551766841684, + "language_loss": 0.77616763, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79735869, + "num_input_tokens_seen": 335157925, + "step": 15536, + "time_per_iteration": 2.5602850914001465 + }, + { + "auxiliary_loss_clip": 0.01072675, + "auxiliary_loss_mlp": 0.01029546, + "balance_loss_clip": 1.03314936, + "balance_loss_mlp": 1.01780415, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 1.4149907381786824, + "language_loss": 0.80741596, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.82843822, + "num_input_tokens_seen": 335177840, + "step": 15537, + "time_per_iteration": 4.004602909088135 + }, + { + "auxiliary_loss_clip": 0.01084839, + "auxiliary_loss_mlp": 0.01029367, + "balance_loss_clip": 1.03578043, + "balance_loss_mlp": 1.01755953, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 1.6164588827924442, + "language_loss": 0.77901196, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.80015403, + "num_input_tokens_seen": 335199470, + "step": 15538, + "time_per_iteration": 2.5617246627807617 + }, + { + "auxiliary_loss_clip": 0.01076753, + "auxiliary_loss_mlp": 0.01030461, + "balance_loss_clip": 1.03457642, + "balance_loss_mlp": 1.01811719, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 1.464415386756092, + "language_loss": 0.73173964, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75281173, + "num_input_tokens_seen": 335218885, + "step": 15539, + "time_per_iteration": 2.52237606048584 + }, + { + "auxiliary_loss_clip": 0.01064023, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.03527284, + "balance_loss_mlp": 1.01752329, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 1.4420601480670017, + "language_loss": 0.6473943, + "learning_rate": 4.504821951247373e-08, + "loss": 0.66831601, + "num_input_tokens_seen": 335239485, + "step": 15540, + "time_per_iteration": 2.6876113414764404 + }, + { + "auxiliary_loss_clip": 0.01089866, + "auxiliary_loss_mlp": 0.01029142, + "balance_loss_clip": 1.0324533, + "balance_loss_mlp": 1.01794207, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 1.6175961403978396, + "language_loss": 0.76332635, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78451639, + "num_input_tokens_seen": 335258355, + "step": 15541, + "time_per_iteration": 2.4866137504577637 + }, + { + "auxiliary_loss_clip": 0.01089748, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.0361805, + "balance_loss_mlp": 1.02078378, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 1.9783038229993095, + "language_loss": 0.67218268, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.69340396, + "num_input_tokens_seen": 335276835, + "step": 15542, + "time_per_iteration": 2.5345969200134277 + }, + { + "auxiliary_loss_clip": 0.01059497, + "auxiliary_loss_mlp": 0.01028836, + "balance_loss_clip": 1.03218055, + "balance_loss_mlp": 1.01622951, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 1.94642339565805, + "language_loss": 0.69938195, + "learning_rate": 4.480196882960907e-08, + "loss": 0.72026533, + "num_input_tokens_seen": 335296220, + "step": 15543, + "time_per_iteration": 2.519374132156372 + }, + { + "auxiliary_loss_clip": 0.01094024, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.03284502, + "balance_loss_mlp": 1.01753438, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 1.91313169106958, + "language_loss": 0.69267917, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.7139284, + "num_input_tokens_seen": 335316335, + "step": 15544, + "time_per_iteration": 2.538534164428711 + }, + { + "auxiliary_loss_clip": 0.0109039, + "auxiliary_loss_mlp": 0.01041504, + "balance_loss_clip": 1.03402114, + "balance_loss_mlp": 1.02778256, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 2.189177398127204, + "language_loss": 0.77239031, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79370922, + "num_input_tokens_seen": 335335545, + "step": 15545, + "time_per_iteration": 2.4885804653167725 + }, + { + "auxiliary_loss_clip": 0.01095634, + "auxiliary_loss_mlp": 0.01029275, + "balance_loss_clip": 1.03609443, + "balance_loss_mlp": 1.01806295, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 1.652150752444054, + "language_loss": 0.68995297, + "learning_rate": 4.455638541847495e-08, + "loss": 0.71120203, + "num_input_tokens_seen": 335355350, + "step": 15546, + "time_per_iteration": 2.4931247234344482 + }, + { + "auxiliary_loss_clip": 0.01063522, + "auxiliary_loss_mlp": 0.01026802, + "balance_loss_clip": 1.03285861, + "balance_loss_mlp": 1.01471972, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 1.8019676126535786, + "language_loss": 0.82041991, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84132314, + "num_input_tokens_seen": 335375160, + "step": 15547, + "time_per_iteration": 2.5903806686401367 + }, + { + "auxiliary_loss_clip": 0.01084825, + "auxiliary_loss_mlp": 0.01039253, + "balance_loss_clip": 1.0312103, + "balance_loss_mlp": 1.02658105, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 2.0246855894143576, + "language_loss": 0.83730298, + "learning_rate": 4.439303389230087e-08, + "loss": 0.85854375, + "num_input_tokens_seen": 335394080, + "step": 15548, + "time_per_iteration": 2.480454206466675 + }, + { + "auxiliary_loss_clip": 0.01096427, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.03522193, + "balance_loss_mlp": 1.02341628, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 2.7748504347477128, + "language_loss": 0.66122162, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.68255281, + "num_input_tokens_seen": 335414230, + "step": 15549, + "time_per_iteration": 2.591153860092163 + }, + { + "auxiliary_loss_clip": 0.01094557, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.03590751, + "balance_loss_mlp": 1.01928735, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 1.8182911724930118, + "language_loss": 0.80341744, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82467961, + "num_input_tokens_seen": 335432890, + "step": 15550, + "time_per_iteration": 2.5077593326568604 + }, + { + "auxiliary_loss_clip": 0.01091646, + "auxiliary_loss_mlp": 0.01031104, + "balance_loss_clip": 1.03806794, + "balance_loss_mlp": 1.01960075, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 1.6116817716121221, + "language_loss": 0.75547755, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.77670503, + "num_input_tokens_seen": 335452085, + "step": 15551, + "time_per_iteration": 2.4805312156677246 + }, + { + "auxiliary_loss_clip": 0.01045854, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.03102386, + "balance_loss_mlp": 1.01910162, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 1.6605431583562031, + "language_loss": 0.73739576, + "learning_rate": 4.406722074642255e-08, + "loss": 0.75815248, + "num_input_tokens_seen": 335472130, + "step": 15552, + "time_per_iteration": 2.655322313308716 + }, + { + "auxiliary_loss_clip": 0.01054589, + "auxiliary_loss_mlp": 0.01042113, + "balance_loss_clip": 1.03078151, + "balance_loss_mlp": 1.02942336, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 1.6286016735036362, + "language_loss": 0.77349615, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79446322, + "num_input_tokens_seen": 335489970, + "step": 15553, + "time_per_iteration": 2.590271472930908 + }, + { + "auxiliary_loss_clip": 0.01068957, + "auxiliary_loss_mlp": 0.010369, + "balance_loss_clip": 1.03269649, + "balance_loss_mlp": 1.02382874, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 1.5220401182866605, + "language_loss": 0.78318226, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80424082, + "num_input_tokens_seen": 335509125, + "step": 15554, + "time_per_iteration": 2.526153326034546 + }, + { + "auxiliary_loss_clip": 0.01073275, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.03099811, + "balance_loss_mlp": 1.0203793, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 1.5095076937663103, + "language_loss": 0.69315624, + "learning_rate": 4.382363965244695e-08, + "loss": 0.71420562, + "num_input_tokens_seen": 335525620, + "step": 15555, + "time_per_iteration": 2.4846901893615723 + }, + { + "auxiliary_loss_clip": 0.01019482, + "auxiliary_loss_mlp": 0.0104166, + "balance_loss_clip": 1.0323509, + "balance_loss_mlp": 1.02808857, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 1.5254084767921123, + "language_loss": 0.75487661, + "learning_rate": 4.374259430715965e-08, + "loss": 0.77548802, + "num_input_tokens_seen": 335547565, + "step": 15556, + "time_per_iteration": 2.8624703884124756 + }, + { + "auxiliary_loss_clip": 0.0108075, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.03262925, + "balance_loss_mlp": 1.02333868, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 1.5015612989669962, + "language_loss": 0.72217929, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74333203, + "num_input_tokens_seen": 335570285, + "step": 15557, + "time_per_iteration": 2.8201675415039062 + }, + { + "auxiliary_loss_clip": 0.01103263, + "auxiliary_loss_mlp": 0.01030304, + "balance_loss_clip": 1.0346359, + "balance_loss_mlp": 1.01807284, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 1.460000314000897, + "language_loss": 0.62918186, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65051758, + "num_input_tokens_seen": 335588600, + "step": 15558, + "time_per_iteration": 2.45874285697937 + }, + { + "auxiliary_loss_clip": 0.01080334, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.03442693, + "balance_loss_mlp": 1.01679432, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 2.4403208962736453, + "language_loss": 0.73357213, + "learning_rate": 4.34999033724388e-08, + "loss": 0.75467128, + "num_input_tokens_seen": 335606235, + "step": 15559, + "time_per_iteration": 2.522536516189575 + }, + { + "auxiliary_loss_clip": 0.01056541, + "auxiliary_loss_mlp": 0.00782434, + "balance_loss_clip": 1.03214812, + "balance_loss_mlp": 1.00869298, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 1.46086109139203, + "language_loss": 0.63338178, + "learning_rate": 4.341915477147062e-08, + "loss": 0.65177155, + "num_input_tokens_seen": 335628240, + "step": 15560, + "time_per_iteration": 2.7130584716796875 + }, + { + "auxiliary_loss_clip": 0.01042093, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.0341264, + "balance_loss_mlp": 1.01759017, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 2.133332639169153, + "language_loss": 0.64167798, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.66242075, + "num_input_tokens_seen": 335643755, + "step": 15561, + "time_per_iteration": 2.589303970336914 + }, + { + "auxiliary_loss_clip": 0.01104794, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.03702986, + "balance_loss_mlp": 1.02069521, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 1.6974251297329317, + "language_loss": 0.75538802, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77676642, + "num_input_tokens_seen": 335665160, + "step": 15562, + "time_per_iteration": 2.472651481628418 + }, + { + "auxiliary_loss_clip": 0.01016931, + "auxiliary_loss_mlp": 0.01003866, + "balance_loss_clip": 1.00592875, + "balance_loss_mlp": 1.00278091, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 0.9552031710714227, + "language_loss": 0.62341452, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64362246, + "num_input_tokens_seen": 335715240, + "step": 15563, + "time_per_iteration": 2.935854196548462 + }, + { + "auxiliary_loss_clip": 0.01060154, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.03418505, + "balance_loss_mlp": 1.02275467, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 1.6745331742670497, + "language_loss": 0.78207445, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80302036, + "num_input_tokens_seen": 335734970, + "step": 15564, + "time_per_iteration": 2.5964019298553467 + }, + { + "auxiliary_loss_clip": 0.01103939, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.03330445, + "balance_loss_mlp": 1.0177815, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 2.3134140605200773, + "language_loss": 0.78271449, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80406392, + "num_input_tokens_seen": 335753435, + "step": 15565, + "time_per_iteration": 3.804476737976074 + }, + { + "auxiliary_loss_clip": 0.0108939, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.03325403, + "balance_loss_mlp": 1.01795244, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 2.6296625264031213, + "language_loss": 0.71882188, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.74000937, + "num_input_tokens_seen": 335772105, + "step": 15566, + "time_per_iteration": 4.06824517250061 + }, + { + "auxiliary_loss_clip": 0.01064182, + "auxiliary_loss_mlp": 0.00785293, + "balance_loss_clip": 1.03070736, + "balance_loss_mlp": 1.01090121, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 1.7528182682493063, + "language_loss": 0.6767031, + "learning_rate": 4.285599216057889e-08, + "loss": 0.69519782, + "num_input_tokens_seen": 335789125, + "step": 15567, + "time_per_iteration": 4.095608711242676 + }, + { + "auxiliary_loss_clip": 0.01073379, + "auxiliary_loss_mlp": 0.01034356, + "balance_loss_clip": 1.03403223, + "balance_loss_mlp": 1.02251279, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 2.134274737954368, + "language_loss": 0.61629212, + "learning_rate": 4.277583719504418e-08, + "loss": 0.63736951, + "num_input_tokens_seen": 335810995, + "step": 15568, + "time_per_iteration": 2.5983786582946777 + }, + { + "auxiliary_loss_clip": 0.01078251, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.03091824, + "balance_loss_mlp": 1.02136779, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 1.6324050566663537, + "language_loss": 0.78777957, + "learning_rate": 4.269575644764556e-08, + "loss": 0.80889595, + "num_input_tokens_seen": 335830580, + "step": 15569, + "time_per_iteration": 2.5157172679901123 + }, + { + "auxiliary_loss_clip": 0.01085042, + "auxiliary_loss_mlp": 0.01034701, + "balance_loss_clip": 1.03498125, + "balance_loss_mlp": 1.02233887, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 2.5920239322832543, + "language_loss": 0.69530004, + "learning_rate": 4.261574992142014e-08, + "loss": 0.71649748, + "num_input_tokens_seen": 335846515, + "step": 15570, + "time_per_iteration": 2.5494847297668457 + }, + { + "auxiliary_loss_clip": 0.01087736, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.03464067, + "balance_loss_mlp": 1.01598179, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 1.8946242780044513, + "language_loss": 0.78422999, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.8053897, + "num_input_tokens_seen": 335863350, + "step": 15571, + "time_per_iteration": 2.4553892612457275 + }, + { + "auxiliary_loss_clip": 0.01070586, + "auxiliary_loss_mlp": 0.01029648, + "balance_loss_clip": 1.03260922, + "balance_loss_mlp": 1.01757193, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 1.9634167178483677, + "language_loss": 0.77774912, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79875147, + "num_input_tokens_seen": 335880510, + "step": 15572, + "time_per_iteration": 2.519888162612915 + }, + { + "auxiliary_loss_clip": 0.01078793, + "auxiliary_loss_mlp": 0.01036048, + "balance_loss_clip": 1.03271425, + "balance_loss_mlp": 1.02400136, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 1.7111176677663573, + "language_loss": 0.77909601, + "learning_rate": 4.237617570010688e-08, + "loss": 0.80024445, + "num_input_tokens_seen": 335899440, + "step": 15573, + "time_per_iteration": 2.4984281063079834 + }, + { + "auxiliary_loss_clip": 0.01066835, + "auxiliary_loss_mlp": 0.01025429, + "balance_loss_clip": 1.03148341, + "balance_loss_mlp": 1.01382375, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 2.3201639120050372, + "language_loss": 0.74448657, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.76540917, + "num_input_tokens_seen": 335919540, + "step": 15574, + "time_per_iteration": 2.5509278774261475 + }, + { + "auxiliary_loss_clip": 0.01045349, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.03185749, + "balance_loss_mlp": 1.02069902, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 1.6931168754083834, + "language_loss": 0.68090749, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70169109, + "num_input_tokens_seen": 335939665, + "step": 15575, + "time_per_iteration": 2.638585090637207 + }, + { + "auxiliary_loss_clip": 0.01076513, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.03376579, + "balance_loss_mlp": 1.0197252, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 1.4742520459973771, + "language_loss": 0.6518563, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67294186, + "num_input_tokens_seen": 335958580, + "step": 15576, + "time_per_iteration": 3.856295585632324 + }, + { + "auxiliary_loss_clip": 0.01089143, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.03183329, + "balance_loss_mlp": 1.01395166, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 2.50532690552505, + "language_loss": 0.75380242, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.77496874, + "num_input_tokens_seen": 335974965, + "step": 15577, + "time_per_iteration": 2.450474262237549 + }, + { + "auxiliary_loss_clip": 0.01057653, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.03279018, + "balance_loss_mlp": 1.01652944, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 2.0486315553052363, + "language_loss": 0.52466697, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.54553443, + "num_input_tokens_seen": 335996575, + "step": 15578, + "time_per_iteration": 2.6473519802093506 + }, + { + "auxiliary_loss_clip": 0.01041831, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.03035831, + "balance_loss_mlp": 1.02124071, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 1.5269573121483597, + "language_loss": 0.70395857, + "learning_rate": 4.189903163783692e-08, + "loss": 0.7247088, + "num_input_tokens_seen": 336017265, + "step": 15579, + "time_per_iteration": 2.6275346279144287 + }, + { + "auxiliary_loss_clip": 0.01077894, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.0327673, + "balance_loss_mlp": 1.01780772, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 2.0541864902112277, + "language_loss": 0.76324558, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78431743, + "num_input_tokens_seen": 336035905, + "step": 15580, + "time_per_iteration": 2.5222973823547363 + }, + { + "auxiliary_loss_clip": 0.01093841, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.03552032, + "balance_loss_mlp": 1.01954651, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 2.210633979288992, + "language_loss": 0.66365004, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68491459, + "num_input_tokens_seen": 336055585, + "step": 15581, + "time_per_iteration": 2.524878978729248 + }, + { + "auxiliary_loss_clip": 0.01093317, + "auxiliary_loss_mlp": 0.01029997, + "balance_loss_clip": 1.03524947, + "balance_loss_mlp": 1.01787889, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 1.5826395223481204, + "language_loss": 0.7667371, + "learning_rate": 4.166146195972042e-08, + "loss": 0.78797024, + "num_input_tokens_seen": 336076695, + "step": 15582, + "time_per_iteration": 2.5147645473480225 + }, + { + "auxiliary_loss_clip": 0.01029426, + "auxiliary_loss_mlp": 0.01037392, + "balance_loss_clip": 1.03165472, + "balance_loss_mlp": 1.02479732, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 1.636056406694439, + "language_loss": 0.7390942, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.75976235, + "num_input_tokens_seen": 336094740, + "step": 15583, + "time_per_iteration": 2.635038137435913 + }, + { + "auxiliary_loss_clip": 0.01108425, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.03626192, + "balance_loss_mlp": 1.0206666, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 2.093083238641521, + "language_loss": 0.83827704, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.85969615, + "num_input_tokens_seen": 336113985, + "step": 15584, + "time_per_iteration": 2.47880482673645 + }, + { + "auxiliary_loss_clip": 0.01093328, + "auxiliary_loss_mlp": 0.00783589, + "balance_loss_clip": 1.03562856, + "balance_loss_mlp": 1.00883293, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 1.6149113125621728, + "language_loss": 0.71973538, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.73850453, + "num_input_tokens_seen": 336136395, + "step": 15585, + "time_per_iteration": 2.6569066047668457 + }, + { + "auxiliary_loss_clip": 0.01067436, + "auxiliary_loss_mlp": 0.01022702, + "balance_loss_clip": 1.03264523, + "balance_loss_mlp": 1.01185369, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 1.6934746855788905, + "language_loss": 0.8074494, + "learning_rate": 4.134574204836316e-08, + "loss": 0.82835078, + "num_input_tokens_seen": 336156345, + "step": 15586, + "time_per_iteration": 2.5395936965942383 + }, + { + "auxiliary_loss_clip": 0.01063909, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.03444815, + "balance_loss_mlp": 1.01866055, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 1.5983802360998167, + "language_loss": 0.76377451, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78471917, + "num_input_tokens_seen": 336176760, + "step": 15587, + "time_per_iteration": 2.564521312713623 + }, + { + "auxiliary_loss_clip": 0.01085079, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.03389096, + "balance_loss_mlp": 1.01797009, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 3.479769378787924, + "language_loss": 0.87373734, + "learning_rate": 4.118832771491387e-08, + "loss": 0.89488846, + "num_input_tokens_seen": 336193285, + "step": 15588, + "time_per_iteration": 2.469794273376465 + }, + { + "auxiliary_loss_clip": 0.01100903, + "auxiliary_loss_mlp": 0.00782856, + "balance_loss_clip": 1.03514826, + "balance_loss_mlp": 1.00962949, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 1.7070276042960943, + "language_loss": 0.78147912, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80031675, + "num_input_tokens_seen": 336211425, + "step": 15589, + "time_per_iteration": 2.4685447216033936 + }, + { + "auxiliary_loss_clip": 0.01100138, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.03385937, + "balance_loss_mlp": 1.02057099, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 1.8492168798496216, + "language_loss": 0.77646977, + "learning_rate": 4.103121049480163e-08, + "loss": 0.7977978, + "num_input_tokens_seen": 336230205, + "step": 15590, + "time_per_iteration": 2.415260076522827 + }, + { + "auxiliary_loss_clip": 0.0108255, + "auxiliary_loss_mlp": 0.01037709, + "balance_loss_clip": 1.03454554, + "balance_loss_mlp": 1.02401829, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 1.6067893068379164, + "language_loss": 0.71460617, + "learning_rate": 4.095276330969577e-08, + "loss": 0.73580885, + "num_input_tokens_seen": 336252440, + "step": 15591, + "time_per_iteration": 2.6125824451446533 + }, + { + "auxiliary_loss_clip": 0.01092444, + "auxiliary_loss_mlp": 0.00784827, + "balance_loss_clip": 1.0356257, + "balance_loss_mlp": 1.01027405, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 1.728193854274124, + "language_loss": 0.53464353, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.55341625, + "num_input_tokens_seen": 336273845, + "step": 15592, + "time_per_iteration": 2.5215208530426025 + }, + { + "auxiliary_loss_clip": 0.01084928, + "auxiliary_loss_mlp": 0.01026814, + "balance_loss_clip": 1.03476214, + "balance_loss_mlp": 1.01547706, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 1.4332291918339004, + "language_loss": 0.67273021, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69384766, + "num_input_tokens_seen": 336292790, + "step": 15593, + "time_per_iteration": 2.5007576942443848 + }, + { + "auxiliary_loss_clip": 0.01082639, + "auxiliary_loss_mlp": 0.01029732, + "balance_loss_clip": 1.03341949, + "balance_loss_mlp": 1.01822233, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 1.5229958666692947, + "language_loss": 0.73963177, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76075542, + "num_input_tokens_seen": 336312600, + "step": 15594, + "time_per_iteration": 2.5383472442626953 + }, + { + "auxiliary_loss_clip": 0.01090107, + "auxiliary_loss_mlp": 0.01024909, + "balance_loss_clip": 1.03364706, + "balance_loss_mlp": 1.01426947, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 1.540696949240773, + "language_loss": 0.73769128, + "learning_rate": 4.063971747165351e-08, + "loss": 0.7588414, + "num_input_tokens_seen": 336332770, + "step": 15595, + "time_per_iteration": 2.5188863277435303 + }, + { + "auxiliary_loss_clip": 0.01083526, + "auxiliary_loss_mlp": 0.01026718, + "balance_loss_clip": 1.0346204, + "balance_loss_mlp": 1.01549411, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 1.8153901089531408, + "language_loss": 0.76134443, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78244692, + "num_input_tokens_seen": 336351445, + "step": 15596, + "time_per_iteration": 2.5518603324890137 + }, + { + "auxiliary_loss_clip": 0.01082727, + "auxiliary_loss_mlp": 0.0103022, + "balance_loss_clip": 1.0348959, + "balance_loss_mlp": 1.01885891, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 1.6664075391431528, + "language_loss": 0.78612888, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.80725831, + "num_input_tokens_seen": 336368690, + "step": 15597, + "time_per_iteration": 2.501997709274292 + }, + { + "auxiliary_loss_clip": 0.01107368, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.03545916, + "balance_loss_mlp": 1.01970577, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 1.4889264215040359, + "language_loss": 0.80901223, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83040768, + "num_input_tokens_seen": 336388165, + "step": 15598, + "time_per_iteration": 2.436891555786133 + }, + { + "auxiliary_loss_clip": 0.0107232, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.03459692, + "balance_loss_mlp": 1.01823568, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 1.9486662852124939, + "language_loss": 0.63348681, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.65451288, + "num_input_tokens_seen": 336406475, + "step": 15599, + "time_per_iteration": 2.559999942779541 + }, + { + "auxiliary_loss_clip": 0.01063273, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.03381431, + "balance_loss_mlp": 1.01942766, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 1.7076943677960992, + "language_loss": 0.73744512, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75839752, + "num_input_tokens_seen": 336424690, + "step": 15600, + "time_per_iteration": 2.5022454261779785 + }, + { + "auxiliary_loss_clip": 0.01077019, + "auxiliary_loss_mlp": 0.01028879, + "balance_loss_clip": 1.03521895, + "balance_loss_mlp": 1.01824522, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 1.7756664405819595, + "language_loss": 0.69114965, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.71220863, + "num_input_tokens_seen": 336443055, + "step": 15601, + "time_per_iteration": 2.5188684463500977 + }, + { + "auxiliary_loss_clip": 0.01019439, + "auxiliary_loss_mlp": 0.01002242, + "balance_loss_clip": 1.00602126, + "balance_loss_mlp": 1.00109184, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.7597375324088025, + "language_loss": 0.58120269, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60141945, + "num_input_tokens_seen": 336510190, + "step": 15602, + "time_per_iteration": 3.2495241165161133 + }, + { + "auxiliary_loss_clip": 0.01036408, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.03351712, + "balance_loss_mlp": 1.02125168, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 2.487079707929017, + "language_loss": 0.72237426, + "learning_rate": 4.001719234324663e-08, + "loss": 0.74307394, + "num_input_tokens_seen": 336529250, + "step": 15603, + "time_per_iteration": 2.6305859088897705 + }, + { + "auxiliary_loss_clip": 0.01095784, + "auxiliary_loss_mlp": 0.01030128, + "balance_loss_clip": 1.03249979, + "balance_loss_mlp": 1.0191251, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 1.5535400399216845, + "language_loss": 0.76047754, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78173667, + "num_input_tokens_seen": 336548530, + "step": 15604, + "time_per_iteration": 3.824373960494995 + }, + { + "auxiliary_loss_clip": 0.01079626, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.03313971, + "balance_loss_mlp": 1.0214169, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 1.8067637887083097, + "language_loss": 0.65374434, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.67489856, + "num_input_tokens_seen": 336568510, + "step": 15605, + "time_per_iteration": 3.927949905395508 + }, + { + "auxiliary_loss_clip": 0.01071536, + "auxiliary_loss_mlp": 0.00783174, + "balance_loss_clip": 1.03460872, + "balance_loss_mlp": 1.0070734, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 1.808691527457172, + "language_loss": 0.67094612, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.68949318, + "num_input_tokens_seen": 336592020, + "step": 15606, + "time_per_iteration": 4.134126424789429 + }, + { + "auxiliary_loss_clip": 0.01087296, + "auxiliary_loss_mlp": 0.01026419, + "balance_loss_clip": 1.03257799, + "balance_loss_mlp": 1.01525533, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 1.68669680893608, + "language_loss": 0.77246177, + "learning_rate": 3.970771343058166e-08, + "loss": 0.79359895, + "num_input_tokens_seen": 336610010, + "step": 15607, + "time_per_iteration": 2.4889976978302 + }, + { + "auxiliary_loss_clip": 0.01094256, + "auxiliary_loss_mlp": 0.01028124, + "balance_loss_clip": 1.03522444, + "balance_loss_mlp": 1.01676977, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 1.8814036347555276, + "language_loss": 0.82669014, + "learning_rate": 3.963052953128776e-08, + "loss": 0.84791392, + "num_input_tokens_seen": 336628520, + "step": 15608, + "time_per_iteration": 2.4793779850006104 + }, + { + "auxiliary_loss_clip": 0.01094746, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.03708315, + "balance_loss_mlp": 1.02135372, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 1.6309027347178864, + "language_loss": 0.68701613, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.70829993, + "num_input_tokens_seen": 336647365, + "step": 15609, + "time_per_iteration": 2.4664385318756104 + }, + { + "auxiliary_loss_clip": 0.0107365, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.03344452, + "balance_loss_mlp": 1.01662111, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 2.4349668752143723, + "language_loss": 0.75232929, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77335984, + "num_input_tokens_seen": 336667165, + "step": 15610, + "time_per_iteration": 2.5551717281341553 + }, + { + "auxiliary_loss_clip": 0.0104795, + "auxiliary_loss_mlp": 0.01025218, + "balance_loss_clip": 1.03465891, + "balance_loss_mlp": 1.01395273, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 1.8944609371327303, + "language_loss": 0.74920487, + "learning_rate": 3.939942386953987e-08, + "loss": 0.76993656, + "num_input_tokens_seen": 336684130, + "step": 15611, + "time_per_iteration": 2.5956838130950928 + }, + { + "auxiliary_loss_clip": 0.01064641, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.03627443, + "balance_loss_mlp": 1.01792097, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 1.7645101559276006, + "language_loss": 0.65826923, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.67921323, + "num_input_tokens_seen": 336701520, + "step": 15612, + "time_per_iteration": 2.513424873352051 + }, + { + "auxiliary_loss_clip": 0.0109041, + "auxiliary_loss_mlp": 0.01027228, + "balance_loss_clip": 1.03443384, + "balance_loss_mlp": 1.01635015, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 1.6670714604930348, + "language_loss": 0.56812477, + "learning_rate": 3.924572515435742e-08, + "loss": 0.58930123, + "num_input_tokens_seen": 336720675, + "step": 15613, + "time_per_iteration": 2.4972872734069824 + }, + { + "auxiliary_loss_clip": 0.01079828, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.03230977, + "balance_loss_mlp": 1.02137995, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 2.1680228825308174, + "language_loss": 0.70746958, + "learning_rate": 3.916898732330764e-08, + "loss": 0.72859728, + "num_input_tokens_seen": 336741005, + "step": 15614, + "time_per_iteration": 3.956528902053833 + }, + { + "auxiliary_loss_clip": 0.01095325, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.0352273, + "balance_loss_mlp": 1.01866865, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 1.84497174102472, + "language_loss": 0.80985773, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83112276, + "num_input_tokens_seen": 336757990, + "step": 15615, + "time_per_iteration": 2.4610280990600586 + }, + { + "auxiliary_loss_clip": 0.01077357, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.03346944, + "balance_loss_mlp": 1.0183239, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 1.6703099635584093, + "language_loss": 0.71688116, + "learning_rate": 3.901573472884134e-08, + "loss": 0.73795354, + "num_input_tokens_seen": 336777705, + "step": 15616, + "time_per_iteration": 2.5810277462005615 + }, + { + "auxiliary_loss_clip": 0.01104291, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.03619444, + "balance_loss_mlp": 1.01498652, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 1.9197540841564078, + "language_loss": 0.66095978, + "learning_rate": 3.89392199712355e-08, + "loss": 0.68227446, + "num_input_tokens_seen": 336798275, + "step": 15617, + "time_per_iteration": 2.468583345413208 + }, + { + "auxiliary_loss_clip": 0.01095477, + "auxiliary_loss_mlp": 0.01034475, + "balance_loss_clip": 1.03528595, + "balance_loss_mlp": 1.02116585, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 1.999209418275975, + "language_loss": 0.73500848, + "learning_rate": 3.886277957725092e-08, + "loss": 0.75630802, + "num_input_tokens_seen": 336813835, + "step": 15618, + "time_per_iteration": 2.4835472106933594 + }, + { + "auxiliary_loss_clip": 0.01108949, + "auxiliary_loss_mlp": 0.01029071, + "balance_loss_clip": 1.03608859, + "balance_loss_mlp": 1.01572585, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 2.897151286801757, + "language_loss": 0.70055056, + "learning_rate": 3.878641354978662e-08, + "loss": 0.72193074, + "num_input_tokens_seen": 336832210, + "step": 15619, + "time_per_iteration": 2.4261631965637207 + }, + { + "auxiliary_loss_clip": 0.01080947, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.03447032, + "balance_loss_mlp": 1.01939869, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 1.702931350129787, + "language_loss": 0.77531093, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.79644096, + "num_input_tokens_seen": 336851380, + "step": 15620, + "time_per_iteration": 2.559323310852051 + }, + { + "auxiliary_loss_clip": 0.01087742, + "auxiliary_loss_mlp": 0.01027814, + "balance_loss_clip": 1.03265679, + "balance_loss_mlp": 1.01593494, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 2.079356462925298, + "language_loss": 0.74018419, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.76133966, + "num_input_tokens_seen": 336868525, + "step": 15621, + "time_per_iteration": 2.435532331466675 + }, + { + "auxiliary_loss_clip": 0.01073289, + "auxiliary_loss_mlp": 0.01031832, + "balance_loss_clip": 1.03336406, + "balance_loss_mlp": 1.01937485, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 1.8025218715184799, + "language_loss": 0.66009492, + "learning_rate": 3.855776169545688e-08, + "loss": 0.68114614, + "num_input_tokens_seen": 336886200, + "step": 15622, + "time_per_iteration": 2.50565767288208 + }, + { + "auxiliary_loss_clip": 0.01077214, + "auxiliary_loss_mlp": 0.0104309, + "balance_loss_clip": 1.0325036, + "balance_loss_mlp": 1.02976274, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 1.5926581501070718, + "language_loss": 0.71839237, + "learning_rate": 3.848169316300209e-08, + "loss": 0.73959535, + "num_input_tokens_seen": 336905815, + "step": 15623, + "time_per_iteration": 2.4906768798828125 + }, + { + "auxiliary_loss_clip": 0.01097274, + "auxiliary_loss_mlp": 0.01029956, + "balance_loss_clip": 1.03785825, + "balance_loss_mlp": 1.01799917, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 2.0711933140084984, + "language_loss": 0.7226541, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74392635, + "num_input_tokens_seen": 336928460, + "step": 15624, + "time_per_iteration": 2.6029536724090576 + }, + { + "auxiliary_loss_clip": 0.01067315, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.03196979, + "balance_loss_mlp": 1.01909781, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 1.9925254704082167, + "language_loss": 0.89412034, + "learning_rate": 3.832977924388614e-08, + "loss": 0.91510391, + "num_input_tokens_seen": 336948320, + "step": 15625, + "time_per_iteration": 2.52925705909729 + }, + { + "auxiliary_loss_clip": 0.01092166, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.03493702, + "balance_loss_mlp": 1.01783955, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 1.7396492295535817, + "language_loss": 0.83765614, + "learning_rate": 3.825393386298592e-08, + "loss": 0.85888433, + "num_input_tokens_seen": 336967670, + "step": 15626, + "time_per_iteration": 2.4949300289154053 + }, + { + "auxiliary_loss_clip": 0.01012355, + "auxiliary_loss_mlp": 0.01003256, + "balance_loss_clip": 1.00791645, + "balance_loss_mlp": 1.0020045, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 0.7759774673790854, + "language_loss": 0.56157869, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58173484, + "num_input_tokens_seen": 337028395, + "step": 15627, + "time_per_iteration": 3.071989059448242 + }, + { + "auxiliary_loss_clip": 0.01054101, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.03279734, + "balance_loss_mlp": 1.02558327, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 1.4415790746816062, + "language_loss": 0.70253813, + "learning_rate": 3.810246627288105e-08, + "loss": 0.72346568, + "num_input_tokens_seen": 337048150, + "step": 15628, + "time_per_iteration": 2.59606671333313 + }, + { + "auxiliary_loss_clip": 0.01091129, + "auxiliary_loss_mlp": 0.01028494, + "balance_loss_clip": 1.03406906, + "balance_loss_mlp": 1.01638198, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 1.9004798770695726, + "language_loss": 0.75587064, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.77706695, + "num_input_tokens_seen": 337069315, + "step": 15629, + "time_per_iteration": 2.5131468772888184 + }, + { + "auxiliary_loss_clip": 0.01042302, + "auxiliary_loss_mlp": 0.0103406, + "balance_loss_clip": 1.03154254, + "balance_loss_mlp": 1.02128053, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 1.8388086530186554, + "language_loss": 0.74254978, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76331335, + "num_input_tokens_seen": 337087765, + "step": 15630, + "time_per_iteration": 2.5988919734954834 + }, + { + "auxiliary_loss_clip": 0.01069051, + "auxiliary_loss_mlp": 0.01030545, + "balance_loss_clip": 1.03371143, + "balance_loss_mlp": 1.0188868, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 2.201823068216722, + "language_loss": 0.69073892, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71173477, + "num_input_tokens_seen": 337106265, + "step": 15631, + "time_per_iteration": 2.4863619804382324 + }, + { + "auxiliary_loss_clip": 0.01053212, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.03291154, + "balance_loss_mlp": 1.02198982, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 1.6202466852892417, + "language_loss": 0.75209808, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77296299, + "num_input_tokens_seen": 337126090, + "step": 15632, + "time_per_iteration": 2.5887320041656494 + }, + { + "auxiliary_loss_clip": 0.01098767, + "auxiliary_loss_mlp": 0.01035509, + "balance_loss_clip": 1.03677344, + "balance_loss_mlp": 1.02221072, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 1.6608415997978991, + "language_loss": 0.74258232, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76392508, + "num_input_tokens_seen": 337145655, + "step": 15633, + "time_per_iteration": 2.495952606201172 + }, + { + "auxiliary_loss_clip": 0.01106079, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.03559375, + "balance_loss_mlp": 1.02230024, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 1.749247723318036, + "language_loss": 0.72578341, + "learning_rate": 3.764984908264823e-08, + "loss": 0.74719805, + "num_input_tokens_seen": 337164805, + "step": 15634, + "time_per_iteration": 2.5107321739196777 + }, + { + "auxiliary_loss_clip": 0.01094454, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.03352809, + "balance_loss_mlp": 1.01782537, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 1.7061775040608376, + "language_loss": 0.69017518, + "learning_rate": 3.75746733114144e-08, + "loss": 0.71142638, + "num_input_tokens_seen": 337182280, + "step": 15635, + "time_per_iteration": 2.4536962509155273 + }, + { + "auxiliary_loss_clip": 0.01052242, + "auxiliary_loss_mlp": 0.01025025, + "balance_loss_clip": 1.03543079, + "balance_loss_mlp": 1.01376569, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 1.4872834361896294, + "language_loss": 0.7435869, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76435953, + "num_input_tokens_seen": 337203495, + "step": 15636, + "time_per_iteration": 2.5863916873931885 + }, + { + "auxiliary_loss_clip": 0.01093707, + "auxiliary_loss_mlp": 0.01033954, + "balance_loss_clip": 1.03534591, + "balance_loss_mlp": 1.02127576, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 1.9317416959277358, + "language_loss": 0.8285619, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.84983855, + "num_input_tokens_seen": 337220435, + "step": 15637, + "time_per_iteration": 2.426757574081421 + }, + { + "auxiliary_loss_clip": 0.01056775, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.03324938, + "balance_loss_mlp": 1.01804018, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 2.598937591284569, + "language_loss": 0.69316196, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.71403271, + "num_input_tokens_seen": 337238095, + "step": 15638, + "time_per_iteration": 2.5606420040130615 + }, + { + "auxiliary_loss_clip": 0.0108728, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.03294575, + "balance_loss_mlp": 1.02608502, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 1.7426454839201664, + "language_loss": 0.84790587, + "learning_rate": 3.727471440859498e-08, + "loss": 0.86914563, + "num_input_tokens_seen": 337256645, + "step": 15639, + "time_per_iteration": 2.4934165477752686 + }, + { + "auxiliary_loss_clip": 0.01078643, + "auxiliary_loss_mlp": 0.00783335, + "balance_loss_clip": 1.03200781, + "balance_loss_mlp": 1.0097326, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 1.4460972055289312, + "language_loss": 0.78161514, + "learning_rate": 3.719991074263662e-08, + "loss": 0.80023497, + "num_input_tokens_seen": 337278360, + "step": 15640, + "time_per_iteration": 2.5743680000305176 + }, + { + "auxiliary_loss_clip": 0.01095427, + "auxiliary_loss_mlp": 0.01032764, + "balance_loss_clip": 1.03485751, + "balance_loss_mlp": 1.02090907, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 1.5578220104234615, + "language_loss": 0.74046171, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76174355, + "num_input_tokens_seen": 337302480, + "step": 15641, + "time_per_iteration": 2.5387516021728516 + }, + { + "auxiliary_loss_clip": 0.01094963, + "auxiliary_loss_mlp": 0.01034675, + "balance_loss_clip": 1.03421485, + "balance_loss_mlp": 1.02131152, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 1.8419473506056603, + "language_loss": 0.82275391, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84405023, + "num_input_tokens_seen": 337316600, + "step": 15642, + "time_per_iteration": 2.4356019496917725 + }, + { + "auxiliary_loss_clip": 0.01085604, + "auxiliary_loss_mlp": 0.0102971, + "balance_loss_clip": 1.03353477, + "balance_loss_mlp": 1.018713, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 2.2359974476560325, + "language_loss": 0.68467659, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70582974, + "num_input_tokens_seen": 337336895, + "step": 15643, + "time_per_iteration": 5.2143096923828125 + }, + { + "auxiliary_loss_clip": 0.01094767, + "auxiliary_loss_mlp": 0.0103785, + "balance_loss_clip": 1.03663301, + "balance_loss_mlp": 1.02488017, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 1.959400775663905, + "language_loss": 0.76580298, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.78712916, + "num_input_tokens_seen": 337355105, + "step": 15644, + "time_per_iteration": 2.4593496322631836 + }, + { + "auxiliary_loss_clip": 0.0108479, + "auxiliary_loss_mlp": 0.01029824, + "balance_loss_clip": 1.03233635, + "balance_loss_mlp": 1.01855838, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 1.5445104534505583, + "language_loss": 0.67192495, + "learning_rate": 3.682700891311974e-08, + "loss": 0.69307107, + "num_input_tokens_seen": 337374905, + "step": 15645, + "time_per_iteration": 3.972628116607666 + }, + { + "auxiliary_loss_clip": 0.01080082, + "auxiliary_loss_mlp": 0.00781488, + "balance_loss_clip": 1.03476369, + "balance_loss_mlp": 1.00718069, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 1.4732281544785932, + "language_loss": 0.70308292, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72169864, + "num_input_tokens_seen": 337397130, + "step": 15646, + "time_per_iteration": 2.581941843032837 + }, + { + "auxiliary_loss_clip": 0.01086537, + "auxiliary_loss_mlp": 0.01028035, + "balance_loss_clip": 1.03286147, + "balance_loss_mlp": 1.01653695, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 1.584961726221311, + "language_loss": 0.74424946, + "learning_rate": 3.667836926755208e-08, + "loss": 0.76539528, + "num_input_tokens_seen": 337418660, + "step": 15647, + "time_per_iteration": 2.4914541244506836 + }, + { + "auxiliary_loss_clip": 0.01010806, + "auxiliary_loss_mlp": 0.01002127, + "balance_loss_clip": 1.00810754, + "balance_loss_mlp": 1.00113726, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 0.8856472729930946, + "language_loss": 0.635315, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65544432, + "num_input_tokens_seen": 337478055, + "step": 15648, + "time_per_iteration": 3.2263569831848145 + }, + { + "auxiliary_loss_clip": 0.01099221, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.03376043, + "balance_loss_mlp": 1.01909661, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 1.4547953232275144, + "language_loss": 0.66908121, + "learning_rate": 3.653002741939337e-08, + "loss": 0.6903708, + "num_input_tokens_seen": 337499405, + "step": 15649, + "time_per_iteration": 2.4591097831726074 + }, + { + "auxiliary_loss_clip": 0.01070349, + "auxiliary_loss_mlp": 0.01028486, + "balance_loss_clip": 1.03119326, + "balance_loss_mlp": 1.01690519, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 2.155291871068878, + "language_loss": 0.77596426, + "learning_rate": 3.645596817637586e-08, + "loss": 0.79695261, + "num_input_tokens_seen": 337517195, + "step": 15650, + "time_per_iteration": 2.518077850341797 + }, + { + "auxiliary_loss_clip": 0.01057867, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.03593385, + "balance_loss_mlp": 1.01803923, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 1.6967572992831086, + "language_loss": 0.74414819, + "learning_rate": 3.638198339114451e-08, + "loss": 0.76502103, + "num_input_tokens_seen": 337535245, + "step": 15651, + "time_per_iteration": 2.5812008380889893 + }, + { + "auxiliary_loss_clip": 0.0110227, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.03449845, + "balance_loss_mlp": 1.02089024, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 1.7307795360203881, + "language_loss": 0.72489846, + "learning_rate": 3.630807306650507e-08, + "loss": 0.74624956, + "num_input_tokens_seen": 337553040, + "step": 15652, + "time_per_iteration": 3.8349947929382324 + }, + { + "auxiliary_loss_clip": 0.01068449, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.03421938, + "balance_loss_mlp": 1.01955616, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 1.7195852813036407, + "language_loss": 0.66279745, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68380594, + "num_input_tokens_seen": 337574580, + "step": 15653, + "time_per_iteration": 2.58687424659729 + }, + { + "auxiliary_loss_clip": 0.0110384, + "auxiliary_loss_mlp": 0.01033245, + "balance_loss_clip": 1.03476989, + "balance_loss_mlp": 1.02085924, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 2.3154982367892782, + "language_loss": 0.77530783, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.79667866, + "num_input_tokens_seen": 337593010, + "step": 15654, + "time_per_iteration": 2.4335625171661377 + }, + { + "auxiliary_loss_clip": 0.01097227, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.03395033, + "balance_loss_mlp": 1.01659465, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 1.460375891806464, + "language_loss": 0.7002095, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.72146749, + "num_input_tokens_seen": 337616170, + "step": 15655, + "time_per_iteration": 2.6295151710510254 + }, + { + "auxiliary_loss_clip": 0.01103539, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.03439283, + "balance_loss_mlp": 1.0186981, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 1.913269440874927, + "language_loss": 0.72244883, + "learning_rate": 3.601317642987944e-08, + "loss": 0.74380481, + "num_input_tokens_seen": 337635215, + "step": 15656, + "time_per_iteration": 2.4098451137542725 + }, + { + "auxiliary_loss_clip": 0.01077273, + "auxiliary_loss_mlp": 0.01027959, + "balance_loss_clip": 1.03384256, + "balance_loss_mlp": 1.01640153, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 1.760065280532338, + "language_loss": 0.77769065, + "learning_rate": 3.593963845018377e-08, + "loss": 0.79874289, + "num_input_tokens_seen": 337654195, + "step": 15657, + "time_per_iteration": 2.534491777420044 + }, + { + "auxiliary_loss_clip": 0.01065549, + "auxiliary_loss_mlp": 0.01027309, + "balance_loss_clip": 1.03146863, + "balance_loss_mlp": 1.01485789, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 2.0900480415180263, + "language_loss": 0.84083956, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86176813, + "num_input_tokens_seen": 337671810, + "step": 15658, + "time_per_iteration": 2.491455554962158 + }, + { + "auxiliary_loss_clip": 0.01108679, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.03605294, + "balance_loss_mlp": 1.01738298, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 1.767511329437255, + "language_loss": 0.70302105, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.72441941, + "num_input_tokens_seen": 337689410, + "step": 15659, + "time_per_iteration": 2.4331743717193604 + }, + { + "auxiliary_loss_clip": 0.01073906, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.0321064, + "balance_loss_mlp": 1.02437103, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 1.6385778080637707, + "language_loss": 0.7954824, + "learning_rate": 3.571947138643172e-08, + "loss": 0.8165735, + "num_input_tokens_seen": 337709950, + "step": 15660, + "time_per_iteration": 2.5331966876983643 + }, + { + "auxiliary_loss_clip": 0.01067584, + "auxiliary_loss_mlp": 0.01026976, + "balance_loss_clip": 1.03235781, + "balance_loss_mlp": 1.01559711, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 1.414190290963814, + "language_loss": 0.68078941, + "learning_rate": 3.564623133290201e-08, + "loss": 0.70173502, + "num_input_tokens_seen": 337731320, + "step": 15661, + "time_per_iteration": 2.5992352962493896 + }, + { + "auxiliary_loss_clip": 0.01088127, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.03245473, + "balance_loss_mlp": 1.01936293, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 2.022563726965061, + "language_loss": 0.66442215, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68561667, + "num_input_tokens_seen": 337747720, + "step": 15662, + "time_per_iteration": 2.440382957458496 + }, + { + "auxiliary_loss_clip": 0.01007882, + "auxiliary_loss_mlp": 0.00999919, + "balance_loss_clip": 1.00527692, + "balance_loss_mlp": 0.99880993, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7749794122221303, + "language_loss": 0.59313601, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61321402, + "num_input_tokens_seen": 337806930, + "step": 15663, + "time_per_iteration": 3.1881017684936523 + }, + { + "auxiliary_loss_clip": 0.01098434, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.03548408, + "balance_loss_mlp": 1.0190264, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 1.9317907782806034, + "language_loss": 0.66764963, + "learning_rate": 3.542695811435914e-08, + "loss": 0.68895859, + "num_input_tokens_seen": 337828100, + "step": 15664, + "time_per_iteration": 2.5654635429382324 + }, + { + "auxiliary_loss_clip": 0.01079431, + "auxiliary_loss_mlp": 0.01026486, + "balance_loss_clip": 1.03761995, + "balance_loss_mlp": 1.01532221, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 2.0450056971553563, + "language_loss": 0.73074615, + "learning_rate": 3.535401603143207e-08, + "loss": 0.75180531, + "num_input_tokens_seen": 337844805, + "step": 15665, + "time_per_iteration": 2.4893178939819336 + }, + { + "auxiliary_loss_clip": 0.0110108, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.03542984, + "balance_loss_mlp": 1.01811516, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 1.9595407608638742, + "language_loss": 0.63600206, + "learning_rate": 3.528114844807773e-08, + "loss": 0.65730727, + "num_input_tokens_seen": 337860490, + "step": 15666, + "time_per_iteration": 2.4029088020324707 + }, + { + "auxiliary_loss_clip": 0.0106691, + "auxiliary_loss_mlp": 0.01032741, + "balance_loss_clip": 1.03354192, + "balance_loss_mlp": 1.0202601, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 2.202105490437277, + "language_loss": 0.79241467, + "learning_rate": 3.520835536705902e-08, + "loss": 0.81341124, + "num_input_tokens_seen": 337878360, + "step": 15667, + "time_per_iteration": 2.551560163497925 + }, + { + "auxiliary_loss_clip": 0.01099912, + "auxiliary_loss_mlp": 0.01027614, + "balance_loss_clip": 1.0334723, + "balance_loss_mlp": 1.0168376, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 1.670953841185381, + "language_loss": 0.75171089, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.77298611, + "num_input_tokens_seen": 337895635, + "step": 15668, + "time_per_iteration": 2.4362730979919434 + }, + { + "auxiliary_loss_clip": 0.01058742, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.03515005, + "balance_loss_mlp": 1.01779616, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 2.9343811417516683, + "language_loss": 0.59584165, + "learning_rate": 3.506299272306723e-08, + "loss": 0.61672747, + "num_input_tokens_seen": 337913940, + "step": 15669, + "time_per_iteration": 2.5626139640808105 + }, + { + "auxiliary_loss_clip": 0.01061899, + "auxiliary_loss_mlp": 0.01024924, + "balance_loss_clip": 1.0329926, + "balance_loss_mlp": 1.0136292, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 1.5343841697263154, + "language_loss": 0.77062893, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.79149711, + "num_input_tokens_seen": 337932015, + "step": 15670, + "time_per_iteration": 2.54091477394104 + }, + { + "auxiliary_loss_clip": 0.01102554, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.03426886, + "balance_loss_mlp": 1.02251911, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 1.7875645926522663, + "language_loss": 0.64910525, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67048192, + "num_input_tokens_seen": 337953345, + "step": 15671, + "time_per_iteration": 2.538486957550049 + }, + { + "auxiliary_loss_clip": 0.01078699, + "auxiliary_loss_mlp": 0.0103229, + "balance_loss_clip": 1.03403854, + "balance_loss_mlp": 1.01985049, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 1.599990490213978, + "language_loss": 0.79880714, + "learning_rate": 3.48455075935139e-08, + "loss": 0.81991702, + "num_input_tokens_seen": 337973685, + "step": 15672, + "time_per_iteration": 2.500953197479248 + }, + { + "auxiliary_loss_clip": 0.01070635, + "auxiliary_loss_mlp": 0.0103703, + "balance_loss_clip": 1.03350782, + "balance_loss_mlp": 1.02380991, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 2.0253401450432187, + "language_loss": 0.73314315, + "learning_rate": 3.47731615843776e-08, + "loss": 0.75421977, + "num_input_tokens_seen": 337989175, + "step": 15673, + "time_per_iteration": 2.520496368408203 + }, + { + "auxiliary_loss_clip": 0.0108578, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.03189969, + "balance_loss_mlp": 1.01686299, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 1.4465091768296927, + "language_loss": 0.70150232, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72265869, + "num_input_tokens_seen": 338011800, + "step": 15674, + "time_per_iteration": 2.56890869140625 + }, + { + "auxiliary_loss_clip": 0.0110289, + "auxiliary_loss_mlp": 0.01025594, + "balance_loss_clip": 1.03436375, + "balance_loss_mlp": 1.01413178, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 1.7556342710336021, + "language_loss": 0.81015992, + "learning_rate": 3.462869313364125e-08, + "loss": 0.83144474, + "num_input_tokens_seen": 338032120, + "step": 15675, + "time_per_iteration": 2.45219087600708 + }, + { + "auxiliary_loss_clip": 0.01078127, + "auxiliary_loss_mlp": 0.01028014, + "balance_loss_clip": 1.03625762, + "balance_loss_mlp": 1.016361, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 1.6627026204337043, + "language_loss": 0.62821412, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.64927554, + "num_input_tokens_seen": 338051880, + "step": 15676, + "time_per_iteration": 2.512535333633423 + }, + { + "auxiliary_loss_clip": 0.01084974, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.03465629, + "balance_loss_mlp": 1.0221746, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 1.7751120713609283, + "language_loss": 0.66843808, + "learning_rate": 3.448452279120984e-08, + "loss": 0.68962765, + "num_input_tokens_seen": 338069665, + "step": 15677, + "time_per_iteration": 2.508359432220459 + }, + { + "auxiliary_loss_clip": 0.01068971, + "auxiliary_loss_mlp": 0.01033313, + "balance_loss_clip": 1.03282404, + "balance_loss_mlp": 1.01988363, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 1.7407990943905052, + "language_loss": 0.64489031, + "learning_rate": 3.441254941744387e-08, + "loss": 0.66591316, + "num_input_tokens_seen": 338090490, + "step": 15678, + "time_per_iteration": 2.5544772148132324 + }, + { + "auxiliary_loss_clip": 0.01072923, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.03389382, + "balance_loss_mlp": 1.01699889, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 1.4945064969287034, + "language_loss": 0.74536288, + "learning_rate": 3.434065057895097e-08, + "loss": 0.76637781, + "num_input_tokens_seen": 338109825, + "step": 15679, + "time_per_iteration": 2.554229259490967 + }, + { + "auxiliary_loss_clip": 0.0108788, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.03659582, + "balance_loss_mlp": 1.01969767, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 2.4103518553209264, + "language_loss": 0.77639633, + "learning_rate": 3.426882627845762e-08, + "loss": 0.79759681, + "num_input_tokens_seen": 338125790, + "step": 15680, + "time_per_iteration": 2.482663869857788 + }, + { + "auxiliary_loss_clip": 0.01091117, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.03421128, + "balance_loss_mlp": 1.0214982, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 1.9114306755798662, + "language_loss": 0.75358343, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77482367, + "num_input_tokens_seen": 338145610, + "step": 15681, + "time_per_iteration": 3.925328493118286 + }, + { + "auxiliary_loss_clip": 0.0108568, + "auxiliary_loss_mlp": 0.0103561, + "balance_loss_clip": 1.03675246, + "balance_loss_mlp": 1.02325439, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 1.774822425757189, + "language_loss": 0.66181099, + "learning_rate": 3.412540130236086e-08, + "loss": 0.68302393, + "num_input_tokens_seen": 338165960, + "step": 15682, + "time_per_iteration": 3.9952878952026367 + }, + { + "auxiliary_loss_clip": 0.01068319, + "auxiliary_loss_mlp": 0.01027979, + "balance_loss_clip": 1.03228939, + "balance_loss_mlp": 1.01575434, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 2.0137971196133, + "language_loss": 0.76196301, + "learning_rate": 3.405380063219665e-08, + "loss": 0.78292596, + "num_input_tokens_seen": 338187215, + "step": 15683, + "time_per_iteration": 3.9438538551330566 + }, + { + "auxiliary_loss_clip": 0.01096394, + "auxiliary_loss_mlp": 0.01038267, + "balance_loss_clip": 1.03498816, + "balance_loss_mlp": 1.02513564, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 2.7198957585100096, + "language_loss": 0.75759923, + "learning_rate": 3.398227451090885e-08, + "loss": 0.77894592, + "num_input_tokens_seen": 338201825, + "step": 15684, + "time_per_iteration": 2.4250645637512207 + }, + { + "auxiliary_loss_clip": 0.01099085, + "auxiliary_loss_mlp": 0.01021726, + "balance_loss_clip": 1.03330731, + "balance_loss_mlp": 1.01070476, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 1.6887135532103126, + "language_loss": 0.77514446, + "learning_rate": 3.391082294121017e-08, + "loss": 0.79635251, + "num_input_tokens_seen": 338220865, + "step": 15685, + "time_per_iteration": 2.5002598762512207 + }, + { + "auxiliary_loss_clip": 0.01089204, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.03340626, + "balance_loss_mlp": 1.01921463, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 1.9584191055228868, + "language_loss": 0.75549257, + "learning_rate": 3.383944592581023e-08, + "loss": 0.77669132, + "num_input_tokens_seen": 338240160, + "step": 15686, + "time_per_iteration": 2.4820642471313477 + }, + { + "auxiliary_loss_clip": 0.0109323, + "auxiliary_loss_mlp": 0.01028668, + "balance_loss_clip": 1.03342128, + "balance_loss_mlp": 1.01583529, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 1.669328264835531, + "language_loss": 0.80759299, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82881194, + "num_input_tokens_seen": 338259305, + "step": 15687, + "time_per_iteration": 2.456772565841675 + }, + { + "auxiliary_loss_clip": 0.01085673, + "auxiliary_loss_mlp": 0.01033243, + "balance_loss_clip": 1.03539836, + "balance_loss_mlp": 1.01965952, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 2.1643458185271016, + "language_loss": 0.75992525, + "learning_rate": 3.369691556873011e-08, + "loss": 0.78111446, + "num_input_tokens_seen": 338274950, + "step": 15688, + "time_per_iteration": 2.4622299671173096 + }, + { + "auxiliary_loss_clip": 0.01074408, + "auxiliary_loss_mlp": 0.01027986, + "balance_loss_clip": 1.03243423, + "balance_loss_mlp": 1.0159874, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 1.7474172814670688, + "language_loss": 0.68591654, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.70694053, + "num_input_tokens_seen": 338295585, + "step": 15689, + "time_per_iteration": 2.56526780128479 + }, + { + "auxiliary_loss_clip": 0.01090121, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.03411829, + "balance_loss_mlp": 1.02469158, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 1.6934502263449016, + "language_loss": 0.80556005, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82681322, + "num_input_tokens_seen": 338314555, + "step": 15690, + "time_per_iteration": 2.464779853820801 + }, + { + "auxiliary_loss_clip": 0.01090631, + "auxiliary_loss_mlp": 0.01027904, + "balance_loss_clip": 1.03397298, + "balance_loss_mlp": 1.0163343, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 2.0216042002833707, + "language_loss": 0.5993464, + "learning_rate": 3.348367925792317e-08, + "loss": 0.6205318, + "num_input_tokens_seen": 338336260, + "step": 15691, + "time_per_iteration": 4.01296067237854 + }, + { + "auxiliary_loss_clip": 0.01066042, + "auxiliary_loss_mlp": 0.01029114, + "balance_loss_clip": 1.0339098, + "balance_loss_mlp": 1.01655555, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 1.6831999821465933, + "language_loss": 0.66760135, + "learning_rate": 3.341274962505514e-08, + "loss": 0.68855298, + "num_input_tokens_seen": 338354680, + "step": 15692, + "time_per_iteration": 2.565175771713257 + }, + { + "auxiliary_loss_clip": 0.01089761, + "auxiliary_loss_mlp": 0.01031287, + "balance_loss_clip": 1.03518808, + "balance_loss_mlp": 1.01916313, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 2.6901904301317447, + "language_loss": 0.75071502, + "learning_rate": 3.334189456537251e-08, + "loss": 0.77192545, + "num_input_tokens_seen": 338372490, + "step": 15693, + "time_per_iteration": 2.485797882080078 + }, + { + "auxiliary_loss_clip": 0.01070435, + "auxiliary_loss_mlp": 0.01035116, + "balance_loss_clip": 1.03407562, + "balance_loss_mlp": 1.02205086, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 1.5566043306936497, + "language_loss": 0.73086226, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75191778, + "num_input_tokens_seen": 338390870, + "step": 15694, + "time_per_iteration": 2.590803384780884 + }, + { + "auxiliary_loss_clip": 0.00997085, + "auxiliary_loss_mlp": 0.0100194, + "balance_loss_clip": 1.00461113, + "balance_loss_mlp": 1.00088501, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 0.6871429601106139, + "language_loss": 0.50603455, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52602482, + "num_input_tokens_seen": 338453075, + "step": 15695, + "time_per_iteration": 3.1890575885772705 + }, + { + "auxiliary_loss_clip": 0.01074789, + "auxiliary_loss_mlp": 0.01032766, + "balance_loss_clip": 1.03146875, + "balance_loss_mlp": 1.02092874, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 1.732658100683525, + "language_loss": 0.64924926, + "learning_rate": 3.312977685229335e-08, + "loss": 0.6703248, + "num_input_tokens_seen": 338471770, + "step": 15696, + "time_per_iteration": 2.5321311950683594 + }, + { + "auxiliary_loss_clip": 0.01092893, + "auxiliary_loss_mlp": 0.01027532, + "balance_loss_clip": 1.03522789, + "balance_loss_mlp": 1.01644516, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 1.8340174655480652, + "language_loss": 0.66048634, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68169057, + "num_input_tokens_seen": 338492190, + "step": 15697, + "time_per_iteration": 2.5046074390411377 + }, + { + "auxiliary_loss_clip": 0.01000338, + "auxiliary_loss_mlp": 0.01003406, + "balance_loss_clip": 1.00664282, + "balance_loss_mlp": 1.00220776, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.9094381436720116, + "language_loss": 0.63208926, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65212673, + "num_input_tokens_seen": 338552560, + "step": 15698, + "time_per_iteration": 3.074404001235962 + }, + { + "auxiliary_loss_clip": 0.01083565, + "auxiliary_loss_mlp": 0.01038896, + "balance_loss_clip": 1.03433585, + "balance_loss_mlp": 1.02631903, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 1.738110008328092, + "language_loss": 0.69944704, + "learning_rate": 3.291833039444092e-08, + "loss": 0.72067165, + "num_input_tokens_seen": 338571770, + "step": 15699, + "time_per_iteration": 2.5105068683624268 + }, + { + "auxiliary_loss_clip": 0.01064895, + "auxiliary_loss_mlp": 0.0102797, + "balance_loss_clip": 1.03237295, + "balance_loss_mlp": 1.01638937, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 1.954711165880113, + "language_loss": 0.74155378, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76248246, + "num_input_tokens_seen": 338587310, + "step": 15700, + "time_per_iteration": 2.5311925411224365 + }, + { + "auxiliary_loss_clip": 0.01037503, + "auxiliary_loss_mlp": 0.01029694, + "balance_loss_clip": 1.03223062, + "balance_loss_mlp": 1.01827931, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 1.5486579775127942, + "language_loss": 0.70962214, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.73029411, + "num_input_tokens_seen": 338606235, + "step": 15701, + "time_per_iteration": 2.600071430206299 + }, + { + "auxiliary_loss_clip": 0.01061321, + "auxiliary_loss_mlp": 0.0102706, + "balance_loss_clip": 1.03123927, + "balance_loss_mlp": 1.01527619, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 1.746682759368625, + "language_loss": 0.7774207, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.79830456, + "num_input_tokens_seen": 338624090, + "step": 15702, + "time_per_iteration": 2.5510122776031494 + }, + { + "auxiliary_loss_clip": 0.0108965, + "auxiliary_loss_mlp": 0.01040415, + "balance_loss_clip": 1.03325462, + "balance_loss_mlp": 1.02760553, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 1.587498964005069, + "language_loss": 0.66638446, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.68768507, + "num_input_tokens_seen": 338643695, + "step": 15703, + "time_per_iteration": 2.4655585289001465 + }, + { + "auxiliary_loss_clip": 0.01093744, + "auxiliary_loss_mlp": 0.01027039, + "balance_loss_clip": 1.03733015, + "balance_loss_mlp": 1.01462388, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 2.267236788513038, + "language_loss": 0.73019505, + "learning_rate": 3.256741150552833e-08, + "loss": 0.75140297, + "num_input_tokens_seen": 338664725, + "step": 15704, + "time_per_iteration": 2.5804500579833984 + }, + { + "auxiliary_loss_clip": 0.01089334, + "auxiliary_loss_mlp": 0.0103182, + "balance_loss_clip": 1.03461027, + "balance_loss_mlp": 1.01966119, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 1.7462012293071008, + "language_loss": 0.74243325, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76364475, + "num_input_tokens_seen": 338683990, + "step": 15705, + "time_per_iteration": 2.4650678634643555 + }, + { + "auxiliary_loss_clip": 0.01081239, + "auxiliary_loss_mlp": 0.01033646, + "balance_loss_clip": 1.0363214, + "balance_loss_mlp": 1.02268505, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 2.0178982960852574, + "language_loss": 0.76970136, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.79085022, + "num_input_tokens_seen": 338702025, + "step": 15706, + "time_per_iteration": 2.5066909790039062 + }, + { + "auxiliary_loss_clip": 0.0108838, + "auxiliary_loss_mlp": 0.01027714, + "balance_loss_clip": 1.03310847, + "balance_loss_mlp": 1.01701498, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 1.562203747922436, + "language_loss": 0.69338673, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71454763, + "num_input_tokens_seen": 338720920, + "step": 15707, + "time_per_iteration": 2.462198257446289 + }, + { + "auxiliary_loss_clip": 0.01095426, + "auxiliary_loss_mlp": 0.01026074, + "balance_loss_clip": 1.03098202, + "balance_loss_mlp": 1.01544619, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 1.7255546565955746, + "language_loss": 0.69244325, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71365821, + "num_input_tokens_seen": 338739590, + "step": 15708, + "time_per_iteration": 2.4740374088287354 + }, + { + "auxiliary_loss_clip": 0.01093335, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.03552103, + "balance_loss_mlp": 1.01910961, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 2.5932905281414538, + "language_loss": 0.70651042, + "learning_rate": 3.221835774749748e-08, + "loss": 0.72775197, + "num_input_tokens_seen": 338757240, + "step": 15709, + "time_per_iteration": 2.439553737640381 + }, + { + "auxiliary_loss_clip": 0.01061682, + "auxiliary_loss_mlp": 0.01032351, + "balance_loss_clip": 1.0354259, + "balance_loss_mlp": 1.02068019, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 2.0166719916740123, + "language_loss": 0.84560621, + "learning_rate": 3.214877084074774e-08, + "loss": 0.86654651, + "num_input_tokens_seen": 338773750, + "step": 15710, + "time_per_iteration": 2.566802978515625 + }, + { + "auxiliary_loss_clip": 0.01079072, + "auxiliary_loss_mlp": 0.01028479, + "balance_loss_clip": 1.03483796, + "balance_loss_mlp": 1.01577735, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 1.6780831326137393, + "language_loss": 0.71399403, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73506951, + "num_input_tokens_seen": 338792115, + "step": 15711, + "time_per_iteration": 2.5305678844451904 + }, + { + "auxiliary_loss_clip": 0.01091747, + "auxiliary_loss_mlp": 0.01027653, + "balance_loss_clip": 1.03513288, + "balance_loss_mlp": 1.01568496, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 1.6212708561934541, + "language_loss": 0.69102025, + "learning_rate": 3.200982089323179e-08, + "loss": 0.71221423, + "num_input_tokens_seen": 338812480, + "step": 15712, + "time_per_iteration": 2.5181126594543457 + }, + { + "auxiliary_loss_clip": 0.01097565, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.03731918, + "balance_loss_mlp": 1.0225817, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 2.2869355654776644, + "language_loss": 0.70252168, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.72385645, + "num_input_tokens_seen": 338829105, + "step": 15713, + "time_per_iteration": 2.427954912185669 + }, + { + "auxiliary_loss_clip": 0.01078258, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.03253317, + "balance_loss_mlp": 1.01858115, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 1.5866640754028876, + "language_loss": 0.76573288, + "learning_rate": 3.187116945125212e-08, + "loss": 0.78682667, + "num_input_tokens_seen": 338850670, + "step": 15714, + "time_per_iteration": 2.5610082149505615 + }, + { + "auxiliary_loss_clip": 0.01074615, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.03309488, + "balance_loss_mlp": 1.02026176, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 1.9495247818315684, + "language_loss": 0.6749351, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.69600695, + "num_input_tokens_seen": 338867795, + "step": 15715, + "time_per_iteration": 2.513051986694336 + }, + { + "auxiliary_loss_clip": 0.01072448, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.03467333, + "balance_loss_mlp": 1.01940477, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 1.7190119794870498, + "language_loss": 0.7496528, + "learning_rate": 3.173281653583948e-08, + "loss": 0.77069533, + "num_input_tokens_seen": 338887205, + "step": 15716, + "time_per_iteration": 2.5538980960845947 + }, + { + "auxiliary_loss_clip": 0.0108661, + "auxiliary_loss_mlp": 0.01032908, + "balance_loss_clip": 1.03726673, + "balance_loss_mlp": 1.02080798, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 1.970141956860539, + "language_loss": 0.62535632, + "learning_rate": 3.166375203215565e-08, + "loss": 0.64655149, + "num_input_tokens_seen": 338906130, + "step": 15717, + "time_per_iteration": 2.503058433532715 + }, + { + "auxiliary_loss_clip": 0.01087068, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.03529775, + "balance_loss_mlp": 1.01743126, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 1.797944376757105, + "language_loss": 0.79378897, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.81494975, + "num_input_tokens_seen": 338923045, + "step": 15718, + "time_per_iteration": 2.4711573123931885 + }, + { + "auxiliary_loss_clip": 0.01018436, + "auxiliary_loss_mlp": 0.01001232, + "balance_loss_clip": 1.00561619, + "balance_loss_mlp": 1.00021327, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.6965450312518885, + "language_loss": 0.57797384, + "learning_rate": 3.152584694592719e-08, + "loss": 0.59817052, + "num_input_tokens_seen": 338987545, + "step": 15719, + "time_per_iteration": 3.1398465633392334 + }, + { + "auxiliary_loss_clip": 0.01062304, + "auxiliary_loss_mlp": 0.00782912, + "balance_loss_clip": 1.03251743, + "balance_loss_mlp": 1.00826502, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 1.5059785981604858, + "language_loss": 0.75820386, + "learning_rate": 3.145700636861193e-08, + "loss": 0.77665603, + "num_input_tokens_seen": 339007830, + "step": 15720, + "time_per_iteration": 3.9339468479156494 + }, + { + "auxiliary_loss_clip": 0.01088049, + "auxiliary_loss_mlp": 0.01024401, + "balance_loss_clip": 1.03524518, + "balance_loss_mlp": 1.01403022, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 1.6410521607196302, + "language_loss": 0.72828203, + "learning_rate": 3.138824043864452e-08, + "loss": 0.74940652, + "num_input_tokens_seen": 339028980, + "step": 15721, + "time_per_iteration": 3.9112160205841064 + }, + { + "auxiliary_loss_clip": 0.0105964, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.03328371, + "balance_loss_mlp": 1.02036095, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 1.83163372094644, + "language_loss": 0.85126412, + "learning_rate": 3.131954915863244e-08, + "loss": 0.8721922, + "num_input_tokens_seen": 339047950, + "step": 15722, + "time_per_iteration": 4.004755735397339 + }, + { + "auxiliary_loss_clip": 0.01008144, + "auxiliary_loss_mlp": 0.01002667, + "balance_loss_clip": 1.00673735, + "balance_loss_mlp": 1.0015707, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.8996712334627891, + "language_loss": 0.64557803, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66568625, + "num_input_tokens_seen": 339104535, + "step": 15723, + "time_per_iteration": 3.0693395137786865 + }, + { + "auxiliary_loss_clip": 0.01064263, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.03540468, + "balance_loss_mlp": 1.01817489, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 1.8203193715256547, + "language_loss": 0.72945344, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.75040656, + "num_input_tokens_seen": 339122050, + "step": 15724, + "time_per_iteration": 2.525331497192383 + }, + { + "auxiliary_loss_clip": 0.01069326, + "auxiliary_loss_mlp": 0.01024897, + "balance_loss_clip": 1.03234506, + "balance_loss_mlp": 1.01342893, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 1.9964541227892152, + "language_loss": 0.84764981, + "learning_rate": 3.111392324436024e-08, + "loss": 0.86859202, + "num_input_tokens_seen": 339138940, + "step": 15725, + "time_per_iteration": 2.552743673324585 + }, + { + "auxiliary_loss_clip": 0.01083959, + "auxiliary_loss_mlp": 0.01026068, + "balance_loss_clip": 1.03548241, + "balance_loss_mlp": 1.01452208, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 1.8408164536048706, + "language_loss": 0.70893914, + "learning_rate": 3.104553059018822e-08, + "loss": 0.73003936, + "num_input_tokens_seen": 339158245, + "step": 15726, + "time_per_iteration": 2.5179543495178223 + }, + { + "auxiliary_loss_clip": 0.01079004, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.03457808, + "balance_loss_mlp": 1.01608825, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 1.8103834068806104, + "language_loss": 0.60666937, + "learning_rate": 3.097721259896735e-08, + "loss": 0.62775052, + "num_input_tokens_seen": 339178200, + "step": 15727, + "time_per_iteration": 2.5459678173065186 + }, + { + "auxiliary_loss_clip": 0.01089739, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.03348064, + "balance_loss_mlp": 1.02248621, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 1.869638453476036, + "language_loss": 0.81789267, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.83913094, + "num_input_tokens_seen": 339193950, + "step": 15728, + "time_per_iteration": 2.461965322494507 + }, + { + "auxiliary_loss_clip": 0.00990321, + "auxiliary_loss_mlp": 0.01025665, + "balance_loss_clip": 1.01658893, + "balance_loss_mlp": 1.02450264, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.7433309696627799, + "language_loss": 0.59004426, + "learning_rate": 3.08408006157368e-08, + "loss": 0.61020416, + "num_input_tokens_seen": 339252330, + "step": 15729, + "time_per_iteration": 3.184943199157715 + }, + { + "auxiliary_loss_clip": 0.01101746, + "auxiliary_loss_mlp": 0.01025702, + "balance_loss_clip": 1.0345161, + "balance_loss_mlp": 1.01356053, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 1.7444254232454885, + "language_loss": 0.76555836, + "learning_rate": 3.077270662890052e-08, + "loss": 0.78683287, + "num_input_tokens_seen": 339270325, + "step": 15730, + "time_per_iteration": 3.824986457824707 + }, + { + "auxiliary_loss_clip": 0.01077291, + "auxiliary_loss_mlp": 0.01031416, + "balance_loss_clip": 1.0333569, + "balance_loss_mlp": 1.01865458, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 1.3949992201121133, + "language_loss": 0.6256249, + "learning_rate": 3.070468731536047e-08, + "loss": 0.64671189, + "num_input_tokens_seen": 339291980, + "step": 15731, + "time_per_iteration": 2.5751113891601562 + }, + { + "auxiliary_loss_clip": 0.01094253, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.03347921, + "balance_loss_mlp": 1.01517391, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 2.089445034923707, + "language_loss": 0.6409384, + "learning_rate": 3.063674267769589e-08, + "loss": 0.66215897, + "num_input_tokens_seen": 339311795, + "step": 15732, + "time_per_iteration": 2.4941647052764893 + }, + { + "auxiliary_loss_clip": 0.01091561, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.03560674, + "balance_loss_mlp": 1.01436186, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 1.8713815485384226, + "language_loss": 0.84043074, + "learning_rate": 3.056887271848363e-08, + "loss": 0.8616153, + "num_input_tokens_seen": 339327745, + "step": 15733, + "time_per_iteration": 2.4687414169311523 + }, + { + "auxiliary_loss_clip": 0.01090506, + "auxiliary_loss_mlp": 0.01028001, + "balance_loss_clip": 1.03424132, + "balance_loss_mlp": 1.0170753, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 1.4103824401122265, + "language_loss": 0.72074938, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74193442, + "num_input_tokens_seen": 339346445, + "step": 15734, + "time_per_iteration": 2.484187364578247 + }, + { + "auxiliary_loss_clip": 0.01085854, + "auxiliary_loss_mlp": 0.01030837, + "balance_loss_clip": 1.03153563, + "balance_loss_mlp": 1.02082396, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 1.380010625509751, + "language_loss": 0.86801255, + "learning_rate": 3.043335684570692e-08, + "loss": 0.88917941, + "num_input_tokens_seen": 339367945, + "step": 15735, + "time_per_iteration": 2.5790066719055176 + }, + { + "auxiliary_loss_clip": 0.01082682, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.03253853, + "balance_loss_mlp": 1.01836109, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 1.8727464665118811, + "language_loss": 0.67166209, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69278765, + "num_input_tokens_seen": 339386060, + "step": 15736, + "time_per_iteration": 2.4909276962280273 + }, + { + "auxiliary_loss_clip": 0.00996498, + "auxiliary_loss_mlp": 0.01001729, + "balance_loss_clip": 1.02106452, + "balance_loss_mlp": 1.00056028, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.8748594616502149, + "language_loss": 0.65352571, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67350799, + "num_input_tokens_seen": 339446695, + "step": 15737, + "time_per_iteration": 3.2265865802764893 + }, + { + "auxiliary_loss_clip": 0.01017205, + "auxiliary_loss_mlp": 0.0099943, + "balance_loss_clip": 1.00786793, + "balance_loss_mlp": 0.99834484, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.8030137813572151, + "language_loss": 0.58807778, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.60824412, + "num_input_tokens_seen": 339510080, + "step": 15738, + "time_per_iteration": 3.0877442359924316 + }, + { + "auxiliary_loss_clip": 0.01090008, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.03315616, + "balance_loss_mlp": 1.01972532, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 1.8487552725869794, + "language_loss": 0.71546787, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73667049, + "num_input_tokens_seen": 339529335, + "step": 15739, + "time_per_iteration": 2.4843344688415527 + }, + { + "auxiliary_loss_clip": 0.01090488, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.03335643, + "balance_loss_mlp": 1.01753926, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 2.6962968216880907, + "language_loss": 0.6444968, + "learning_rate": 3.009587421648363e-08, + "loss": 0.66570163, + "num_input_tokens_seen": 339548820, + "step": 15740, + "time_per_iteration": 2.53670597076416 + }, + { + "auxiliary_loss_clip": 0.01077019, + "auxiliary_loss_mlp": 0.01030061, + "balance_loss_clip": 1.03384876, + "balance_loss_mlp": 1.01849151, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 1.6912422479717186, + "language_loss": 0.66473526, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.68580604, + "num_input_tokens_seen": 339566775, + "step": 15741, + "time_per_iteration": 2.532106876373291 + }, + { + "auxiliary_loss_clip": 0.01092702, + "auxiliary_loss_mlp": 0.01025649, + "balance_loss_clip": 1.03500867, + "balance_loss_mlp": 1.01390648, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 2.100812921943838, + "language_loss": 0.7581352, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.77931875, + "num_input_tokens_seen": 339581905, + "step": 15742, + "time_per_iteration": 2.4864628314971924 + }, + { + "auxiliary_loss_clip": 0.01088591, + "auxiliary_loss_mlp": 0.0103206, + "balance_loss_clip": 1.03276706, + "balance_loss_mlp": 1.02063382, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 1.754444055567874, + "language_loss": 0.72317123, + "learning_rate": 2.989428100602187e-08, + "loss": 0.74437773, + "num_input_tokens_seen": 339599870, + "step": 15743, + "time_per_iteration": 2.4477202892303467 + }, + { + "auxiliary_loss_clip": 0.01068589, + "auxiliary_loss_mlp": 0.01032105, + "balance_loss_clip": 1.03610587, + "balance_loss_mlp": 1.01956987, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 1.7244673826235897, + "language_loss": 0.79408062, + "learning_rate": 2.982723267901943e-08, + "loss": 0.81508756, + "num_input_tokens_seen": 339620250, + "step": 15744, + "time_per_iteration": 2.5766162872314453 + }, + { + "auxiliary_loss_clip": 0.0108102, + "auxiliary_loss_mlp": 0.01035632, + "balance_loss_clip": 1.03528023, + "balance_loss_mlp": 1.0235796, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 1.5065310425668796, + "language_loss": 0.78028268, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.80144924, + "num_input_tokens_seen": 339639900, + "step": 15745, + "time_per_iteration": 2.527313232421875 + }, + { + "auxiliary_loss_clip": 0.01078901, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.03280413, + "balance_loss_mlp": 1.01977277, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 1.4522058547884102, + "language_loss": 0.70195961, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.72307122, + "num_input_tokens_seen": 339658970, + "step": 15746, + "time_per_iteration": 2.5212342739105225 + }, + { + "auxiliary_loss_clip": 0.01077737, + "auxiliary_loss_mlp": 0.01027942, + "balance_loss_clip": 1.03483415, + "balance_loss_mlp": 1.01602054, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 2.0385207307507205, + "language_loss": 0.55796456, + "learning_rate": 2.962653596305964e-08, + "loss": 0.57902133, + "num_input_tokens_seen": 339675600, + "step": 15747, + "time_per_iteration": 2.4823431968688965 + }, + { + "auxiliary_loss_clip": 0.0097251, + "auxiliary_loss_mlp": 0.01004164, + "balance_loss_clip": 1.00798416, + "balance_loss_mlp": 1.00297832, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.6916005215148011, + "language_loss": 0.53206384, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55183065, + "num_input_tokens_seen": 339744505, + "step": 15748, + "time_per_iteration": 3.6436448097229004 + }, + { + "auxiliary_loss_clip": 0.01082177, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.03389287, + "balance_loss_mlp": 1.02225709, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 1.887762087276275, + "language_loss": 0.66565061, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68681639, + "num_input_tokens_seen": 339765810, + "step": 15749, + "time_per_iteration": 3.1324243545532227 + }, + { + "auxiliary_loss_clip": 0.01072641, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.03187168, + "balance_loss_mlp": 1.01911163, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 1.8784190224458064, + "language_loss": 0.76170605, + "learning_rate": 2.942651169791621e-08, + "loss": 0.78276002, + "num_input_tokens_seen": 339784125, + "step": 15750, + "time_per_iteration": 2.517817735671997 + }, + { + "auxiliary_loss_clip": 0.01091571, + "auxiliary_loss_mlp": 0.01027631, + "balance_loss_clip": 1.03473592, + "balance_loss_mlp": 1.01619887, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 1.6924569966836118, + "language_loss": 0.67803538, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.69922733, + "num_input_tokens_seen": 339803450, + "step": 15751, + "time_per_iteration": 2.494255542755127 + }, + { + "auxiliary_loss_clip": 0.01071779, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.03334391, + "balance_loss_mlp": 1.01448703, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 1.6092039805946716, + "language_loss": 0.65082532, + "learning_rate": 2.929353580532723e-08, + "loss": 0.67180276, + "num_input_tokens_seen": 339823215, + "step": 15752, + "time_per_iteration": 2.5496826171875 + }, + { + "auxiliary_loss_clip": 0.01088693, + "auxiliary_loss_mlp": 0.01035151, + "balance_loss_clip": 1.03271937, + "balance_loss_mlp": 1.02193069, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 1.5226950056679822, + "language_loss": 0.71451735, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.7357558, + "num_input_tokens_seen": 339842230, + "step": 15753, + "time_per_iteration": 2.5137991905212402 + }, + { + "auxiliary_loss_clip": 0.01106016, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.03438771, + "balance_loss_mlp": 1.01887536, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 1.8255918560104618, + "language_loss": 0.70240706, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72379923, + "num_input_tokens_seen": 339861640, + "step": 15754, + "time_per_iteration": 2.4509847164154053 + }, + { + "auxiliary_loss_clip": 0.01104631, + "auxiliary_loss_mlp": 0.0102726, + "balance_loss_clip": 1.03375709, + "balance_loss_mlp": 1.01569653, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 2.316303844200544, + "language_loss": 0.78782451, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.80914342, + "num_input_tokens_seen": 339878210, + "step": 15755, + "time_per_iteration": 2.4217464923858643 + }, + { + "auxiliary_loss_clip": 0.01065831, + "auxiliary_loss_mlp": 0.01037552, + "balance_loss_clip": 1.03369975, + "balance_loss_mlp": 1.02297914, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 3.0974011823095906, + "language_loss": 0.75649548, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.7775293, + "num_input_tokens_seen": 339894255, + "step": 15756, + "time_per_iteration": 2.5079708099365234 + }, + { + "auxiliary_loss_clip": 0.01078856, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.03126228, + "balance_loss_mlp": 1.01978242, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 2.1349843004249514, + "language_loss": 0.74381775, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.76492256, + "num_input_tokens_seen": 339912425, + "step": 15757, + "time_per_iteration": 2.492227554321289 + }, + { + "auxiliary_loss_clip": 0.01085612, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.034554, + "balance_loss_mlp": 1.01735401, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 2.0923234428554967, + "language_loss": 0.79665959, + "learning_rate": 2.889640171327512e-08, + "loss": 0.81781924, + "num_input_tokens_seen": 339929635, + "step": 15758, + "time_per_iteration": 4.468221187591553 + }, + { + "auxiliary_loss_clip": 0.010799, + "auxiliary_loss_mlp": 0.00782112, + "balance_loss_clip": 1.0337739, + "balance_loss_mlp": 1.00768125, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 1.6147534683338334, + "language_loss": 0.71794528, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.73656535, + "num_input_tokens_seen": 339951200, + "step": 15759, + "time_per_iteration": 3.9437901973724365 + }, + { + "auxiliary_loss_clip": 0.01086497, + "auxiliary_loss_mlp": 0.0102527, + "balance_loss_clip": 1.03682029, + "balance_loss_mlp": 1.01512492, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 1.5694203292181799, + "language_loss": 0.75798434, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77910197, + "num_input_tokens_seen": 339971820, + "step": 15760, + "time_per_iteration": 2.5118021965026855 + }, + { + "auxiliary_loss_clip": 0.01102646, + "auxiliary_loss_mlp": 0.00781882, + "balance_loss_clip": 1.0350244, + "balance_loss_mlp": 1.00795794, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 2.251547318487219, + "language_loss": 0.7280665, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.74691176, + "num_input_tokens_seen": 339989420, + "step": 15761, + "time_per_iteration": 3.952897310256958 + }, + { + "auxiliary_loss_clip": 0.01081355, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.03774571, + "balance_loss_mlp": 1.02232134, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 2.0335139764123893, + "language_loss": 0.71606266, + "learning_rate": 2.863314050734722e-08, + "loss": 0.73721188, + "num_input_tokens_seen": 340006690, + "step": 15762, + "time_per_iteration": 2.475796699523926 + }, + { + "auxiliary_loss_clip": 0.01105602, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.03388524, + "balance_loss_mlp": 1.0225141, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 17.419649784633222, + "language_loss": 0.66844934, + "learning_rate": 2.856751208570518e-08, + "loss": 0.68985957, + "num_input_tokens_seen": 340025480, + "step": 15763, + "time_per_iteration": 2.469226121902466 + }, + { + "auxiliary_loss_clip": 0.01103081, + "auxiliary_loss_mlp": 0.01034614, + "balance_loss_clip": 1.03346968, + "balance_loss_mlp": 1.02306831, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 1.8428787967534006, + "language_loss": 0.69539499, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.71677196, + "num_input_tokens_seen": 340043785, + "step": 15764, + "time_per_iteration": 2.463555335998535 + }, + { + "auxiliary_loss_clip": 0.01092102, + "auxiliary_loss_mlp": 0.00782408, + "balance_loss_clip": 1.0372231, + "balance_loss_mlp": 1.01110387, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 1.6306230006422098, + "language_loss": 0.70821917, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.72696429, + "num_input_tokens_seen": 340064360, + "step": 15765, + "time_per_iteration": 2.515057325363159 + }, + { + "auxiliary_loss_clip": 0.01014961, + "auxiliary_loss_mlp": 0.0100151, + "balance_loss_clip": 1.0044328, + "balance_loss_mlp": 1.0004673, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 0.8096867875937486, + "language_loss": 0.59125119, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61141586, + "num_input_tokens_seen": 340114425, + "step": 15766, + "time_per_iteration": 2.8709473609924316 + }, + { + "auxiliary_loss_clip": 0.01059825, + "auxiliary_loss_mlp": 0.01041781, + "balance_loss_clip": 1.03516769, + "balance_loss_mlp": 1.02930605, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 1.9068129919818604, + "language_loss": 0.74598032, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.76699638, + "num_input_tokens_seen": 340132200, + "step": 15767, + "time_per_iteration": 2.570392370223999 + }, + { + "auxiliary_loss_clip": 0.01076351, + "auxiliary_loss_mlp": 0.01036259, + "balance_loss_clip": 1.0347054, + "balance_loss_mlp": 1.02363491, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 2.5117901009127, + "language_loss": 0.73221081, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.75333691, + "num_input_tokens_seen": 340149175, + "step": 15768, + "time_per_iteration": 3.9684693813323975 + }, + { + "auxiliary_loss_clip": 0.00990687, + "auxiliary_loss_mlp": 0.01003843, + "balance_loss_clip": 1.00771284, + "balance_loss_mlp": 1.00259769, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.7365797676267513, + "language_loss": 0.55283141, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57277673, + "num_input_tokens_seen": 340208155, + "step": 15769, + "time_per_iteration": 3.199697732925415 + }, + { + "auxiliary_loss_clip": 0.01063277, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.03361011, + "balance_loss_mlp": 1.01906943, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 1.3935907067927642, + "language_loss": 0.77714282, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.79808575, + "num_input_tokens_seen": 340229275, + "step": 15770, + "time_per_iteration": 2.6410505771636963 + }, + { + "auxiliary_loss_clip": 0.01085832, + "auxiliary_loss_mlp": 0.01033727, + "balance_loss_clip": 1.03761542, + "balance_loss_mlp": 1.02136552, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 1.9120090013686777, + "language_loss": 0.79878294, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.81997854, + "num_input_tokens_seen": 340248920, + "step": 15771, + "time_per_iteration": 2.541445016860962 + }, + { + "auxiliary_loss_clip": 0.01069243, + "auxiliary_loss_mlp": 0.01027975, + "balance_loss_clip": 1.03213775, + "balance_loss_mlp": 1.01611328, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 1.6809017825773926, + "language_loss": 0.69727194, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71824408, + "num_input_tokens_seen": 340266775, + "step": 15772, + "time_per_iteration": 2.52559232711792 + }, + { + "auxiliary_loss_clip": 0.01087692, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.03391814, + "balance_loss_mlp": 1.0206238, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 1.4819111052292349, + "language_loss": 0.74030262, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.76150602, + "num_input_tokens_seen": 340285295, + "step": 15773, + "time_per_iteration": 2.4753904342651367 + }, + { + "auxiliary_loss_clip": 0.01070056, + "auxiliary_loss_mlp": 0.01034057, + "balance_loss_clip": 1.03426063, + "balance_loss_mlp": 1.02129006, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 2.1289643584983184, + "language_loss": 0.62645435, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.64749551, + "num_input_tokens_seen": 340304265, + "step": 15774, + "time_per_iteration": 2.605304002761841 + }, + { + "auxiliary_loss_clip": 0.01103612, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.03392422, + "balance_loss_mlp": 1.02044976, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 2.127633419883079, + "language_loss": 0.59594101, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.61730719, + "num_input_tokens_seen": 340323690, + "step": 15775, + "time_per_iteration": 2.4367942810058594 + }, + { + "auxiliary_loss_clip": 0.01083336, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.03573775, + "balance_loss_mlp": 1.01498008, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 1.5290812488564158, + "language_loss": 0.61894745, + "learning_rate": 2.772114638584555e-08, + "loss": 0.6400547, + "num_input_tokens_seen": 340345830, + "step": 15776, + "time_per_iteration": 2.6305439472198486 + }, + { + "auxiliary_loss_clip": 0.01075032, + "auxiliary_loss_mlp": 0.0102926, + "balance_loss_clip": 1.03209317, + "balance_loss_mlp": 1.01656437, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 2.2035920250500864, + "language_loss": 0.73454541, + "learning_rate": 2.765656478622458e-08, + "loss": 0.75558835, + "num_input_tokens_seen": 340365910, + "step": 15777, + "time_per_iteration": 2.507387638092041 + }, + { + "auxiliary_loss_clip": 0.01101635, + "auxiliary_loss_mlp": 0.01036509, + "balance_loss_clip": 1.03639722, + "balance_loss_mlp": 1.02349138, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 2.6269423544278965, + "language_loss": 0.72068989, + "learning_rate": 2.759205797806441e-08, + "loss": 0.74207139, + "num_input_tokens_seen": 340383935, + "step": 15778, + "time_per_iteration": 2.4943501949310303 + }, + { + "auxiliary_loss_clip": 0.01088651, + "auxiliary_loss_mlp": 0.00782777, + "balance_loss_clip": 1.03532469, + "balance_loss_mlp": 1.01131344, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 1.9398628418742216, + "language_loss": 0.69901133, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.71772569, + "num_input_tokens_seen": 340402760, + "step": 15779, + "time_per_iteration": 2.450079917907715 + }, + { + "auxiliary_loss_clip": 0.01104353, + "auxiliary_loss_mlp": 0.01033028, + "balance_loss_clip": 1.03546238, + "balance_loss_mlp": 1.02042127, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 2.773990953204372, + "language_loss": 0.77919459, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.8005684, + "num_input_tokens_seen": 340422105, + "step": 15780, + "time_per_iteration": 2.4414784908294678 + }, + { + "auxiliary_loss_clip": 0.01076934, + "auxiliary_loss_mlp": 0.00782652, + "balance_loss_clip": 1.03608787, + "balance_loss_mlp": 1.00766611, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 1.635009343598399, + "language_loss": 0.66166031, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68025625, + "num_input_tokens_seen": 340441160, + "step": 15781, + "time_per_iteration": 2.516613245010376 + }, + { + "auxiliary_loss_clip": 0.01101904, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.03462434, + "balance_loss_mlp": 1.01950324, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 1.963848462098364, + "language_loss": 0.79513669, + "learning_rate": 2.733477870890999e-08, + "loss": 0.81647027, + "num_input_tokens_seen": 340458200, + "step": 15782, + "time_per_iteration": 2.4435346126556396 + }, + { + "auxiliary_loss_clip": 0.01019035, + "auxiliary_loss_mlp": 0.01000743, + "balance_loss_clip": 1.00636888, + "balance_loss_mlp": 0.99977189, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 0.716363737515144, + "language_loss": 0.59761304, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.61781079, + "num_input_tokens_seen": 340526420, + "step": 15783, + "time_per_iteration": 3.183809518814087 + }, + { + "auxiliary_loss_clip": 0.01092527, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.03387773, + "balance_loss_mlp": 1.0178268, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 1.5956060253390945, + "language_loss": 0.7393921, + "learning_rate": 2.720658788656105e-08, + "loss": 0.76061869, + "num_input_tokens_seen": 340546325, + "step": 15784, + "time_per_iteration": 2.557420492172241 + }, + { + "auxiliary_loss_clip": 0.01050927, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.03056836, + "balance_loss_mlp": 1.01779735, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 1.7840657046010706, + "language_loss": 0.69835234, + "learning_rate": 2.714260468695806e-08, + "loss": 0.71917212, + "num_input_tokens_seen": 340565145, + "step": 15785, + "time_per_iteration": 2.5745930671691895 + }, + { + "auxiliary_loss_clip": 0.01105164, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.03456783, + "balance_loss_mlp": 1.01552105, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 1.5452186974056819, + "language_loss": 0.75942194, + "learning_rate": 2.707869629830495e-08, + "loss": 0.78075254, + "num_input_tokens_seen": 340585465, + "step": 15786, + "time_per_iteration": 2.506511926651001 + }, + { + "auxiliary_loss_clip": 0.010708, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.03488851, + "balance_loss_mlp": 1.01764107, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 1.739277643271959, + "language_loss": 0.78399742, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.80499327, + "num_input_tokens_seen": 340606010, + "step": 15787, + "time_per_iteration": 2.56524920463562 + }, + { + "auxiliary_loss_clip": 0.01093397, + "auxiliary_loss_mlp": 0.01026869, + "balance_loss_clip": 1.03729892, + "balance_loss_mlp": 1.01570535, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 1.7524077722492615, + "language_loss": 0.7641176, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78532022, + "num_input_tokens_seen": 340626135, + "step": 15788, + "time_per_iteration": 2.5197675228118896 + }, + { + "auxiliary_loss_clip": 0.01094168, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.03492022, + "balance_loss_mlp": 1.0188024, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 1.6621756550064397, + "language_loss": 0.71672767, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.73798788, + "num_input_tokens_seen": 340644870, + "step": 15789, + "time_per_iteration": 2.474813461303711 + }, + { + "auxiliary_loss_clip": 0.01064555, + "auxiliary_loss_mlp": 0.01028326, + "balance_loss_clip": 1.03513575, + "balance_loss_mlp": 1.01570129, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 1.8658891408638403, + "language_loss": 0.7323308, + "learning_rate": 2.682381090161989e-08, + "loss": 0.7532596, + "num_input_tokens_seen": 340663695, + "step": 15790, + "time_per_iteration": 2.545459032058716 + }, + { + "auxiliary_loss_clip": 0.01068802, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.03255415, + "balance_loss_mlp": 1.01893413, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 2.133260734075795, + "language_loss": 0.77504063, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.7960484, + "num_input_tokens_seen": 340682970, + "step": 15791, + "time_per_iteration": 2.531764030456543 + }, + { + "auxiliary_loss_clip": 0.01095654, + "auxiliary_loss_mlp": 0.01030896, + "balance_loss_clip": 1.03395283, + "balance_loss_mlp": 1.01791978, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 2.0532876894431706, + "language_loss": 0.73971725, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.76098275, + "num_input_tokens_seen": 340702275, + "step": 15792, + "time_per_iteration": 2.5234501361846924 + }, + { + "auxiliary_loss_clip": 0.01090907, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.03272128, + "balance_loss_mlp": 1.02182078, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 1.7692387623391352, + "language_loss": 0.77860653, + "learning_rate": 2.663343248754679e-08, + "loss": 0.79985297, + "num_input_tokens_seen": 340719060, + "step": 15793, + "time_per_iteration": 2.4378561973571777 + }, + { + "auxiliary_loss_clip": 0.01076856, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.0357759, + "balance_loss_mlp": 1.0189755, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 1.7148787338883655, + "language_loss": 0.774508, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.79558128, + "num_input_tokens_seen": 340737815, + "step": 15794, + "time_per_iteration": 2.529343843460083 + }, + { + "auxiliary_loss_clip": 0.01072123, + "auxiliary_loss_mlp": 0.00782725, + "balance_loss_clip": 1.03257346, + "balance_loss_mlp": 1.00904763, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 1.9557021946815252, + "language_loss": 0.61184525, + "learning_rate": 2.650688769211107e-08, + "loss": 0.63039374, + "num_input_tokens_seen": 340756035, + "step": 15795, + "time_per_iteration": 2.5264387130737305 + }, + { + "auxiliary_loss_clip": 0.01091222, + "auxiliary_loss_mlp": 0.01034544, + "balance_loss_clip": 1.03544378, + "balance_loss_mlp": 1.02170515, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 1.6492739542053454, + "language_loss": 0.79071045, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81196809, + "num_input_tokens_seen": 340775620, + "step": 15796, + "time_per_iteration": 2.5015182495117188 + }, + { + "auxiliary_loss_clip": 0.01092511, + "auxiliary_loss_mlp": 0.01027084, + "balance_loss_clip": 1.03446949, + "balance_loss_mlp": 1.01440585, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 1.90369149528977, + "language_loss": 0.75256324, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.77375919, + "num_input_tokens_seen": 340794510, + "step": 15797, + "time_per_iteration": 3.879220485687256 + }, + { + "auxiliary_loss_clip": 0.01072187, + "auxiliary_loss_mlp": 0.00783744, + "balance_loss_clip": 1.03566492, + "balance_loss_mlp": 1.00829399, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 1.8821457456825348, + "language_loss": 0.65448934, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.67304862, + "num_input_tokens_seen": 340812955, + "step": 15798, + "time_per_iteration": 3.9595048427581787 + }, + { + "auxiliary_loss_clip": 0.01094527, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.03618264, + "balance_loss_mlp": 1.01994061, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 1.9530108676336875, + "language_loss": 0.77377862, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79503769, + "num_input_tokens_seen": 340829200, + "step": 15799, + "time_per_iteration": 2.488452196121216 + }, + { + "auxiliary_loss_clip": 0.01088951, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.03533864, + "balance_loss_mlp": 1.02331543, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 3.6795499738586015, + "language_loss": 0.70516729, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.7264061, + "num_input_tokens_seen": 340848035, + "step": 15800, + "time_per_iteration": 3.903365135192871 + }, + { + "auxiliary_loss_clip": 0.01076499, + "auxiliary_loss_mlp": 0.010277, + "balance_loss_clip": 1.03132749, + "balance_loss_mlp": 1.01539803, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 1.6891568899043354, + "language_loss": 0.71769404, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.73873603, + "num_input_tokens_seen": 340870025, + "step": 15801, + "time_per_iteration": 2.544992208480835 + }, + { + "auxiliary_loss_clip": 0.01092879, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.03455734, + "balance_loss_mlp": 1.01767445, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 1.5943915372320756, + "language_loss": 0.80962557, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83084762, + "num_input_tokens_seen": 340892290, + "step": 15802, + "time_per_iteration": 2.543656587600708 + }, + { + "auxiliary_loss_clip": 0.0110562, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.03575218, + "balance_loss_mlp": 1.02093577, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 1.7350343494385365, + "language_loss": 0.67290884, + "learning_rate": 2.60037021038646e-08, + "loss": 0.6942926, + "num_input_tokens_seen": 340912260, + "step": 15803, + "time_per_iteration": 2.479767322540283 + }, + { + "auxiliary_loss_clip": 0.01078479, + "auxiliary_loss_mlp": 0.01034429, + "balance_loss_clip": 1.03448617, + "balance_loss_mlp": 1.02234733, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 1.6663953796416882, + "language_loss": 0.76063704, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.78176612, + "num_input_tokens_seen": 340928930, + "step": 15804, + "time_per_iteration": 2.597602367401123 + }, + { + "auxiliary_loss_clip": 0.01090275, + "auxiliary_loss_mlp": 0.01030241, + "balance_loss_clip": 1.03503191, + "balance_loss_mlp": 1.0178194, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 1.7640685878013294, + "language_loss": 0.73095143, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75215662, + "num_input_tokens_seen": 340946615, + "step": 15805, + "time_per_iteration": 2.445333242416382 + }, + { + "auxiliary_loss_clip": 0.01087171, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.03746223, + "balance_loss_mlp": 1.01920581, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 1.3822869233739026, + "language_loss": 0.80169916, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.82288218, + "num_input_tokens_seen": 340967545, + "step": 15806, + "time_per_iteration": 2.5630276203155518 + }, + { + "auxiliary_loss_clip": 0.01068085, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.03433514, + "balance_loss_mlp": 1.01904917, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 2.069573926869338, + "language_loss": 0.82449007, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.84548247, + "num_input_tokens_seen": 340984955, + "step": 15807, + "time_per_iteration": 3.938669443130493 + }, + { + "auxiliary_loss_clip": 0.01083546, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.03030133, + "balance_loss_mlp": 1.01838207, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 2.1077792846478585, + "language_loss": 0.71604741, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.73719019, + "num_input_tokens_seen": 341007300, + "step": 15808, + "time_per_iteration": 2.5681397914886475 + }, + { + "auxiliary_loss_clip": 0.01092937, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.03487444, + "balance_loss_mlp": 1.01583195, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 1.4303831795732436, + "language_loss": 0.69550538, + "learning_rate": 2.562945671948058e-08, + "loss": 0.71671236, + "num_input_tokens_seen": 341026695, + "step": 15809, + "time_per_iteration": 2.4718220233917236 + }, + { + "auxiliary_loss_clip": 0.01078901, + "auxiliary_loss_mlp": 0.01026909, + "balance_loss_clip": 1.03296113, + "balance_loss_mlp": 1.01474953, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 1.4757620464576282, + "language_loss": 0.75424945, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.7753076, + "num_input_tokens_seen": 341047080, + "step": 15810, + "time_per_iteration": 2.5411131381988525 + }, + { + "auxiliary_loss_clip": 0.01069759, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.03347993, + "balance_loss_mlp": 1.02671754, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 1.4659375829473602, + "language_loss": 0.80065638, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.82174647, + "num_input_tokens_seen": 341067310, + "step": 15811, + "time_per_iteration": 2.574913501739502 + }, + { + "auxiliary_loss_clip": 0.01077242, + "auxiliary_loss_mlp": 0.01034352, + "balance_loss_clip": 1.03192711, + "balance_loss_mlp": 1.02217507, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 1.912269909632188, + "language_loss": 0.70114124, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.72225714, + "num_input_tokens_seen": 341085110, + "step": 15812, + "time_per_iteration": 2.5552666187286377 + }, + { + "auxiliary_loss_clip": 0.01066514, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.03332686, + "balance_loss_mlp": 1.02139354, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 1.4742370314948827, + "language_loss": 0.65271592, + "learning_rate": 2.538145713158446e-08, + "loss": 0.67372966, + "num_input_tokens_seen": 341103190, + "step": 15813, + "time_per_iteration": 2.548431634902954 + }, + { + "auxiliary_loss_clip": 0.01094981, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.03522587, + "balance_loss_mlp": 1.02262449, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 1.305218582773568, + "language_loss": 0.70295465, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72425497, + "num_input_tokens_seen": 341125695, + "step": 15814, + "time_per_iteration": 2.519381046295166 + }, + { + "auxiliary_loss_clip": 0.01089995, + "auxiliary_loss_mlp": 0.01027313, + "balance_loss_clip": 1.03401709, + "balance_loss_mlp": 1.01616061, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 2.1074367067083917, + "language_loss": 0.63767576, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65884876, + "num_input_tokens_seen": 341143930, + "step": 15815, + "time_per_iteration": 2.5088648796081543 + }, + { + "auxiliary_loss_clip": 0.01078321, + "auxiliary_loss_mlp": 0.01027195, + "balance_loss_clip": 1.03311479, + "balance_loss_mlp": 1.01590025, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 1.7431932437583872, + "language_loss": 0.59083974, + "learning_rate": 2.519624364862061e-08, + "loss": 0.61189491, + "num_input_tokens_seen": 341164280, + "step": 15816, + "time_per_iteration": 2.5609776973724365 + }, + { + "auxiliary_loss_clip": 0.01102305, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.0341413, + "balance_loss_mlp": 1.02817118, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 1.7748465804071214, + "language_loss": 0.73405033, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75547409, + "num_input_tokens_seen": 341183670, + "step": 15817, + "time_per_iteration": 2.5068750381469727 + }, + { + "auxiliary_loss_clip": 0.01084349, + "auxiliary_loss_mlp": 0.01036433, + "balance_loss_clip": 1.03538394, + "balance_loss_mlp": 1.02257538, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 1.9164186357723525, + "language_loss": 0.5995816, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62078941, + "num_input_tokens_seen": 341201900, + "step": 15818, + "time_per_iteration": 2.4741029739379883 + }, + { + "auxiliary_loss_clip": 0.01105067, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.03642011, + "balance_loss_mlp": 1.0236001, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 1.8253364354015957, + "language_loss": 0.69215631, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71357119, + "num_input_tokens_seen": 341218340, + "step": 15819, + "time_per_iteration": 2.4231510162353516 + }, + { + "auxiliary_loss_clip": 0.01064118, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.03698516, + "balance_loss_mlp": 1.01960266, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 1.8214581283879954, + "language_loss": 0.74001884, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76097548, + "num_input_tokens_seen": 341235885, + "step": 15820, + "time_per_iteration": 2.5565836429595947 + }, + { + "auxiliary_loss_clip": 0.01084988, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.034729, + "balance_loss_mlp": 1.02194262, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 1.9504858846616753, + "language_loss": 0.78377986, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.80497074, + "num_input_tokens_seen": 341255280, + "step": 15821, + "time_per_iteration": 2.508463144302368 + }, + { + "auxiliary_loss_clip": 0.01062877, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.03298628, + "balance_loss_mlp": 1.01513779, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 1.4292536969028966, + "language_loss": 0.70962906, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.7305336, + "num_input_tokens_seen": 341279055, + "step": 15822, + "time_per_iteration": 2.666663885116577 + }, + { + "auxiliary_loss_clip": 0.01090449, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.03719068, + "balance_loss_mlp": 1.02296233, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 1.5444687857150488, + "language_loss": 0.66176677, + "learning_rate": 2.47666999302647e-08, + "loss": 0.68301439, + "num_input_tokens_seen": 341298560, + "step": 15823, + "time_per_iteration": 2.493187189102173 + }, + { + "auxiliary_loss_clip": 0.0108673, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.03398335, + "balance_loss_mlp": 1.01888609, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 1.7554851369351057, + "language_loss": 0.77110642, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79228008, + "num_input_tokens_seen": 341316650, + "step": 15824, + "time_per_iteration": 2.4776666164398193 + }, + { + "auxiliary_loss_clip": 0.01106021, + "auxiliary_loss_mlp": 0.01029615, + "balance_loss_clip": 1.03446698, + "balance_loss_mlp": 1.01710403, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 2.2869579692698, + "language_loss": 0.73770916, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.75906551, + "num_input_tokens_seen": 341336185, + "step": 15825, + "time_per_iteration": 2.5130016803741455 + }, + { + "auxiliary_loss_clip": 0.0101773, + "auxiliary_loss_mlp": 0.00999582, + "balance_loss_clip": 1.00888503, + "balance_loss_mlp": 0.99839586, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.8399890992317496, + "language_loss": 0.53441012, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55458319, + "num_input_tokens_seen": 341395795, + "step": 15826, + "time_per_iteration": 3.0280754566192627 + }, + { + "auxiliary_loss_clip": 0.01080712, + "auxiliary_loss_mlp": 0.01033609, + "balance_loss_clip": 1.03540111, + "balance_loss_mlp": 1.02176547, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 2.0659928324825696, + "language_loss": 0.72756743, + "learning_rate": 2.452289414874076e-08, + "loss": 0.74871063, + "num_input_tokens_seen": 341415675, + "step": 15827, + "time_per_iteration": 2.5618202686309814 + }, + { + "auxiliary_loss_clip": 0.01083174, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.03549767, + "balance_loss_mlp": 1.01826525, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 1.7668195415698418, + "language_loss": 0.7437458, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.764884, + "num_input_tokens_seen": 341432990, + "step": 15828, + "time_per_iteration": 2.486964225769043 + }, + { + "auxiliary_loss_clip": 0.01069984, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.03561711, + "balance_loss_mlp": 1.02312076, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 1.5283464326892506, + "language_loss": 0.72743958, + "learning_rate": 2.440144071047978e-08, + "loss": 0.7484771, + "num_input_tokens_seen": 341454100, + "step": 15829, + "time_per_iteration": 2.602888345718384 + }, + { + "auxiliary_loss_clip": 0.01089691, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.03351986, + "balance_loss_mlp": 1.02130222, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 1.737944048075394, + "language_loss": 0.61228538, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.63352382, + "num_input_tokens_seen": 341472955, + "step": 15830, + "time_per_iteration": 2.461042881011963 + }, + { + "auxiliary_loss_clip": 0.01089197, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.03527331, + "balance_loss_mlp": 1.01830971, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 1.7892146351909157, + "language_loss": 0.72833723, + "learning_rate": 2.428028693179729e-08, + "loss": 0.74954712, + "num_input_tokens_seen": 341490165, + "step": 15831, + "time_per_iteration": 2.463923454284668 + }, + { + "auxiliary_loss_clip": 0.01057914, + "auxiliary_loss_mlp": 0.01026096, + "balance_loss_clip": 1.03246176, + "balance_loss_mlp": 1.01472378, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 1.6724690354080216, + "language_loss": 0.65326363, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67410374, + "num_input_tokens_seen": 341508055, + "step": 15832, + "time_per_iteration": 2.529538631439209 + }, + { + "auxiliary_loss_clip": 0.01088739, + "auxiliary_loss_mlp": 0.01035711, + "balance_loss_clip": 1.03631306, + "balance_loss_mlp": 1.02290773, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 1.7442237738598922, + "language_loss": 0.7818135, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.80305803, + "num_input_tokens_seen": 341526155, + "step": 15833, + "time_per_iteration": 2.4611093997955322 + }, + { + "auxiliary_loss_clip": 0.01065751, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.03425562, + "balance_loss_mlp": 1.01899922, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 2.03547465286947, + "language_loss": 0.74714839, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.76811218, + "num_input_tokens_seen": 341540450, + "step": 15834, + "time_per_iteration": 2.512953281402588 + }, + { + "auxiliary_loss_clip": 0.01092704, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.0371058, + "balance_loss_mlp": 1.02038622, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 1.8605002767312375, + "language_loss": 0.75897944, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78024244, + "num_input_tokens_seen": 341557865, + "step": 15835, + "time_per_iteration": 3.9317095279693604 + }, + { + "auxiliary_loss_clip": 0.01082221, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.03332758, + "balance_loss_mlp": 1.02066517, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 1.8477706336015232, + "language_loss": 0.66175669, + "learning_rate": 2.397871361623238e-08, + "loss": 0.6829145, + "num_input_tokens_seen": 341573890, + "step": 15836, + "time_per_iteration": 3.860098123550415 + }, + { + "auxiliary_loss_clip": 0.01068095, + "auxiliary_loss_mlp": 0.01026995, + "balance_loss_clip": 1.03413701, + "balance_loss_mlp": 1.01503289, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 1.5438794261565254, + "language_loss": 0.7039516, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72490251, + "num_input_tokens_seen": 341593770, + "step": 15837, + "time_per_iteration": 2.5573978424072266 + }, + { + "auxiliary_loss_clip": 0.01104932, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.03435564, + "balance_loss_mlp": 1.02057481, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 1.7790885203319187, + "language_loss": 0.73150539, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.75289363, + "num_input_tokens_seen": 341612065, + "step": 15838, + "time_per_iteration": 3.8181777000427246 + }, + { + "auxiliary_loss_clip": 0.01072382, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.03352904, + "balance_loss_mlp": 1.01856935, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 1.824782594850877, + "language_loss": 0.77885187, + "learning_rate": 2.379866877970449e-08, + "loss": 0.79988432, + "num_input_tokens_seen": 341631365, + "step": 15839, + "time_per_iteration": 2.601278066635132 + }, + { + "auxiliary_loss_clip": 0.01077127, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.0354358, + "balance_loss_mlp": 1.01850772, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 1.5957655276427791, + "language_loss": 0.80614173, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82721442, + "num_input_tokens_seen": 341650300, + "step": 15840, + "time_per_iteration": 2.5174708366394043 + }, + { + "auxiliary_loss_clip": 0.01073996, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.03477633, + "balance_loss_mlp": 1.01763582, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 2.026905134805001, + "language_loss": 0.73302275, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.75404036, + "num_input_tokens_seen": 341667680, + "step": 15841, + "time_per_iteration": 2.5062003135681152 + }, + { + "auxiliary_loss_clip": 0.01074728, + "auxiliary_loss_mlp": 0.01025823, + "balance_loss_clip": 1.03378129, + "balance_loss_mlp": 1.01464117, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 1.7390059113760672, + "language_loss": 0.79188633, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.81289178, + "num_input_tokens_seen": 341685760, + "step": 15842, + "time_per_iteration": 2.4788448810577393 + }, + { + "auxiliary_loss_clip": 0.01081693, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.03671527, + "balance_loss_mlp": 1.01876187, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 2.126296297783467, + "language_loss": 0.7219938, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74311465, + "num_input_tokens_seen": 341705300, + "step": 15843, + "time_per_iteration": 2.5729849338531494 + }, + { + "auxiliary_loss_clip": 0.01082342, + "auxiliary_loss_mlp": 0.00783434, + "balance_loss_clip": 1.0371455, + "balance_loss_mlp": 1.0100168, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 1.6287826616386587, + "language_loss": 0.77995163, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.79860938, + "num_input_tokens_seen": 341724565, + "step": 15844, + "time_per_iteration": 2.5378477573394775 + }, + { + "auxiliary_loss_clip": 0.01069137, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.03267288, + "balance_loss_mlp": 1.02026939, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 1.853214113900597, + "language_loss": 0.69867152, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.71970713, + "num_input_tokens_seen": 341743605, + "step": 15845, + "time_per_iteration": 2.5468053817749023 + }, + { + "auxiliary_loss_clip": 0.01066542, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.03378165, + "balance_loss_mlp": 1.02136803, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 1.5392031543414788, + "language_loss": 0.75443453, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77542949, + "num_input_tokens_seen": 341763475, + "step": 15846, + "time_per_iteration": 3.916409730911255 + }, + { + "auxiliary_loss_clip": 0.01073494, + "auxiliary_loss_mlp": 0.01026302, + "balance_loss_clip": 1.03565645, + "balance_loss_mlp": 1.01427364, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 1.684247268328158, + "language_loss": 0.77890539, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.79990333, + "num_input_tokens_seen": 341781265, + "step": 15847, + "time_per_iteration": 2.543130397796631 + }, + { + "auxiliary_loss_clip": 0.01061364, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.03366303, + "balance_loss_mlp": 1.02431941, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 3.5864885047640116, + "language_loss": 0.78213125, + "learning_rate": 2.326258115328672e-08, + "loss": 0.80309826, + "num_input_tokens_seen": 341798825, + "step": 15848, + "time_per_iteration": 2.524172306060791 + }, + { + "auxiliary_loss_clip": 0.01085317, + "auxiliary_loss_mlp": 0.01040823, + "balance_loss_clip": 1.03518462, + "balance_loss_mlp": 1.02757931, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 1.5215903444977577, + "language_loss": 0.71800828, + "learning_rate": 2.320339062183674e-08, + "loss": 0.73926967, + "num_input_tokens_seen": 341819480, + "step": 15849, + "time_per_iteration": 2.534142255783081 + }, + { + "auxiliary_loss_clip": 0.01098366, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.03641462, + "balance_loss_mlp": 1.0222255, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 1.5309617850694412, + "language_loss": 0.7550301, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77636051, + "num_input_tokens_seen": 341838035, + "step": 15850, + "time_per_iteration": 2.4764444828033447 + }, + { + "auxiliary_loss_clip": 0.01075817, + "auxiliary_loss_mlp": 0.01031981, + "balance_loss_clip": 1.03404403, + "balance_loss_mlp": 1.02073348, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 2.0774587272627767, + "language_loss": 0.72468954, + "learning_rate": 2.308523444215482e-08, + "loss": 0.74576759, + "num_input_tokens_seen": 341855895, + "step": 15851, + "time_per_iteration": 2.521024465560913 + }, + { + "auxiliary_loss_clip": 0.0107581, + "auxiliary_loss_mlp": 0.01026624, + "balance_loss_clip": 1.03302526, + "balance_loss_mlp": 1.01491761, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 1.6747515749976736, + "language_loss": 0.79956126, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.82058561, + "num_input_tokens_seen": 341875240, + "step": 15852, + "time_per_iteration": 2.5138373374938965 + }, + { + "auxiliary_loss_clip": 0.01093221, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.03443575, + "balance_loss_mlp": 1.01918232, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 1.430628164698909, + "language_loss": 0.59536552, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.61661172, + "num_input_tokens_seen": 341901020, + "step": 15853, + "time_per_iteration": 2.686136484146118 + }, + { + "auxiliary_loss_clip": 0.01079239, + "auxiliary_loss_mlp": 0.01027675, + "balance_loss_clip": 1.03298354, + "balance_loss_mlp": 1.01639199, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 1.7878408677228705, + "language_loss": 0.72778761, + "learning_rate": 2.290856241425998e-08, + "loss": 0.74885672, + "num_input_tokens_seen": 341919365, + "step": 15854, + "time_per_iteration": 2.475715398788452 + }, + { + "auxiliary_loss_clip": 0.01077164, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.03256917, + "balance_loss_mlp": 1.01902783, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 2.0851841522624843, + "language_loss": 0.67564768, + "learning_rate": 2.284982167833127e-08, + "loss": 0.69672805, + "num_input_tokens_seen": 341939985, + "step": 15855, + "time_per_iteration": 2.5620648860931396 + }, + { + "auxiliary_loss_clip": 0.01103052, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.03388608, + "balance_loss_mlp": 1.01826262, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 1.629506521833898, + "language_loss": 0.76389605, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78522164, + "num_input_tokens_seen": 341959255, + "step": 15856, + "time_per_iteration": 2.485079526901245 + }, + { + "auxiliary_loss_clip": 0.01077015, + "auxiliary_loss_mlp": 0.0103178, + "balance_loss_clip": 1.03159678, + "balance_loss_mlp": 1.0204494, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 1.5507016478038937, + "language_loss": 0.77926612, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.80035406, + "num_input_tokens_seen": 341977205, + "step": 15857, + "time_per_iteration": 2.5377187728881836 + }, + { + "auxiliary_loss_clip": 0.01021847, + "auxiliary_loss_mlp": 0.01000092, + "balance_loss_clip": 1.00910711, + "balance_loss_mlp": 0.99911445, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.7179338411891907, + "language_loss": 0.62578142, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64600074, + "num_input_tokens_seen": 342038545, + "step": 15858, + "time_per_iteration": 3.069474697113037 + }, + { + "auxiliary_loss_clip": 0.01053307, + "auxiliary_loss_mlp": 0.01027174, + "balance_loss_clip": 1.03219891, + "balance_loss_mlp": 1.01583695, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 1.5408877475956821, + "language_loss": 0.56461173, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.58541656, + "num_input_tokens_seen": 342058195, + "step": 15859, + "time_per_iteration": 2.5668094158172607 + }, + { + "auxiliary_loss_clip": 0.01098456, + "auxiliary_loss_mlp": 0.01026662, + "balance_loss_clip": 1.03307772, + "balance_loss_mlp": 1.01584387, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 2.0534819275266556, + "language_loss": 0.81446421, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.83571541, + "num_input_tokens_seen": 342075025, + "step": 15860, + "time_per_iteration": 2.4284727573394775 + }, + { + "auxiliary_loss_clip": 0.01052393, + "auxiliary_loss_mlp": 0.00781612, + "balance_loss_clip": 1.03175092, + "balance_loss_mlp": 1.00676179, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 1.7067169191048637, + "language_loss": 0.66800481, + "learning_rate": 2.249895178891159e-08, + "loss": 0.68634486, + "num_input_tokens_seen": 342094595, + "step": 15861, + "time_per_iteration": 2.5683600902557373 + }, + { + "auxiliary_loss_clip": 0.0109303, + "auxiliary_loss_mlp": 0.01035807, + "balance_loss_clip": 1.03399515, + "balance_loss_mlp": 1.02354026, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 1.5597929271137883, + "language_loss": 0.65243548, + "learning_rate": 2.244073591573037e-08, + "loss": 0.67372382, + "num_input_tokens_seen": 342115970, + "step": 15862, + "time_per_iteration": 2.5532166957855225 + }, + { + "auxiliary_loss_clip": 0.01064013, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.03301215, + "balance_loss_mlp": 1.01950121, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 1.5416348558810435, + "language_loss": 0.67325675, + "learning_rate": 2.238259503179485e-08, + "loss": 0.69420612, + "num_input_tokens_seen": 342134080, + "step": 15863, + "time_per_iteration": 2.5403642654418945 + }, + { + "auxiliary_loss_clip": 0.01081588, + "auxiliary_loss_mlp": 0.01031318, + "balance_loss_clip": 1.03234744, + "balance_loss_mlp": 1.01911139, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 1.9944465333130657, + "language_loss": 0.78469139, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80582047, + "num_input_tokens_seen": 342154725, + "step": 15864, + "time_per_iteration": 2.572067975997925 + }, + { + "auxiliary_loss_clip": 0.01071204, + "auxiliary_loss_mlp": 0.01027463, + "balance_loss_clip": 1.03412437, + "balance_loss_mlp": 1.01587546, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 1.864845933610587, + "language_loss": 0.59606862, + "learning_rate": 2.226653824047586e-08, + "loss": 0.6170553, + "num_input_tokens_seen": 342172275, + "step": 15865, + "time_per_iteration": 2.5102875232696533 + }, + { + "auxiliary_loss_clip": 0.01062242, + "auxiliary_loss_mlp": 0.01033772, + "balance_loss_clip": 1.03378201, + "balance_loss_mlp": 1.02046847, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 1.6257306941614837, + "language_loss": 0.70337856, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.72433871, + "num_input_tokens_seen": 342190880, + "step": 15866, + "time_per_iteration": 2.525197744369507 + }, + { + "auxiliary_loss_clip": 0.01079358, + "auxiliary_loss_mlp": 0.01036436, + "balance_loss_clip": 1.03286862, + "balance_loss_mlp": 1.0227567, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 2.977823540106525, + "language_loss": 0.8506614, + "learning_rate": 2.215078143255855e-08, + "loss": 0.87181932, + "num_input_tokens_seen": 342208165, + "step": 15867, + "time_per_iteration": 2.534639835357666 + }, + { + "auxiliary_loss_clip": 0.01017152, + "auxiliary_loss_mlp": 0.01001325, + "balance_loss_clip": 1.00760841, + "balance_loss_mlp": 1.00022817, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.7521506106406901, + "language_loss": 0.61824012, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.63842487, + "num_input_tokens_seen": 342277110, + "step": 15868, + "time_per_iteration": 3.135977029800415 + }, + { + "auxiliary_loss_clip": 0.01067512, + "auxiliary_loss_mlp": 0.01024871, + "balance_loss_clip": 1.03514481, + "balance_loss_mlp": 1.01286054, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 1.9173589722002555, + "language_loss": 0.6003319, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.62125576, + "num_input_tokens_seen": 342294695, + "step": 15869, + "time_per_iteration": 2.5337462425231934 + }, + { + "auxiliary_loss_clip": 0.01066168, + "auxiliary_loss_mlp": 0.0078335, + "balance_loss_clip": 1.03346527, + "balance_loss_mlp": 1.00693035, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 1.6001316371746634, + "language_loss": 0.71192479, + "learning_rate": 2.197770872795579e-08, + "loss": 0.73041999, + "num_input_tokens_seen": 342314970, + "step": 15870, + "time_per_iteration": 2.582280158996582 + }, + { + "auxiliary_loss_clip": 0.01063737, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.03230786, + "balance_loss_mlp": 1.0168395, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 2.042693649949192, + "language_loss": 0.77122903, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.79215556, + "num_input_tokens_seen": 342334255, + "step": 15871, + "time_per_iteration": 2.5413522720336914 + }, + { + "auxiliary_loss_clip": 0.01092505, + "auxiliary_loss_mlp": 0.01031137, + "balance_loss_clip": 1.03414178, + "balance_loss_mlp": 1.01866817, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 3.153754606355243, + "language_loss": 0.57876813, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60000461, + "num_input_tokens_seen": 342354730, + "step": 15872, + "time_per_iteration": 2.560175657272339 + }, + { + "auxiliary_loss_clip": 0.01081953, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.03425765, + "balance_loss_mlp": 1.02003741, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 1.4675918810490436, + "language_loss": 0.74670589, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.76786047, + "num_input_tokens_seen": 342374565, + "step": 15873, + "time_per_iteration": 2.512266159057617 + }, + { + "auxiliary_loss_clip": 0.01104429, + "auxiliary_loss_mlp": 0.0103235, + "balance_loss_clip": 1.03521252, + "balance_loss_mlp": 1.01951158, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 1.7975136243037892, + "language_loss": 0.62555313, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.64692092, + "num_input_tokens_seen": 342394590, + "step": 15874, + "time_per_iteration": 3.8409314155578613 + }, + { + "auxiliary_loss_clip": 0.01078685, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.03318107, + "balance_loss_mlp": 1.02262688, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 1.9713664730112697, + "language_loss": 0.89525914, + "learning_rate": 2.169075438538104e-08, + "loss": 0.91639602, + "num_input_tokens_seen": 342410445, + "step": 15875, + "time_per_iteration": 3.8611690998077393 + }, + { + "auxiliary_loss_clip": 0.0110697, + "auxiliary_loss_mlp": 0.01030486, + "balance_loss_clip": 1.03516388, + "balance_loss_mlp": 1.01774907, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 1.864892585422974, + "language_loss": 0.67928803, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.70066261, + "num_input_tokens_seen": 342430970, + "step": 15876, + "time_per_iteration": 3.866117000579834 + }, + { + "auxiliary_loss_clip": 0.01092498, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.03468013, + "balance_loss_mlp": 1.02047372, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 2.366582710257036, + "language_loss": 0.69171625, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.71297306, + "num_input_tokens_seen": 342449505, + "step": 15877, + "time_per_iteration": 2.5050644874572754 + }, + { + "auxiliary_loss_clip": 0.01062309, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.03207695, + "balance_loss_mlp": 1.01770186, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 1.6313240088401524, + "language_loss": 0.70724195, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.72817278, + "num_input_tokens_seen": 342470390, + "step": 15878, + "time_per_iteration": 2.5530529022216797 + }, + { + "auxiliary_loss_clip": 0.01099684, + "auxiliary_loss_mlp": 0.01027372, + "balance_loss_clip": 1.03297305, + "balance_loss_mlp": 1.01532543, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 1.3371619427169126, + "language_loss": 0.68147421, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70274472, + "num_input_tokens_seen": 342492560, + "step": 15879, + "time_per_iteration": 2.4686648845672607 + }, + { + "auxiliary_loss_clip": 0.01067101, + "auxiliary_loss_mlp": 0.00781001, + "balance_loss_clip": 1.03228498, + "balance_loss_mlp": 1.00807786, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 2.0183362548772665, + "language_loss": 0.84688979, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.86537087, + "num_input_tokens_seen": 342512315, + "step": 15880, + "time_per_iteration": 2.5973219871520996 + }, + { + "auxiliary_loss_clip": 0.01037747, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.03145993, + "balance_loss_mlp": 1.01675344, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 1.7056098341307906, + "language_loss": 0.72041547, + "learning_rate": 2.134888478151753e-08, + "loss": 0.74109292, + "num_input_tokens_seen": 342533060, + "step": 15881, + "time_per_iteration": 2.7903385162353516 + }, + { + "auxiliary_loss_clip": 0.01091244, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.0354346, + "balance_loss_mlp": 1.02042985, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 1.8639762982942607, + "language_loss": 0.71368051, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.7349143, + "num_input_tokens_seen": 342550830, + "step": 15882, + "time_per_iteration": 2.7413032054901123 + }, + { + "auxiliary_loss_clip": 0.01083643, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.03508282, + "balance_loss_mlp": 1.01758611, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 1.8219019145363355, + "language_loss": 0.65962875, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.68075609, + "num_input_tokens_seen": 342575070, + "step": 15883, + "time_per_iteration": 2.83160400390625 + }, + { + "auxiliary_loss_clip": 0.0109444, + "auxiliary_loss_mlp": 0.01030277, + "balance_loss_clip": 1.03673887, + "balance_loss_mlp": 1.0175333, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 2.32612362158497, + "language_loss": 0.77854931, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.79979646, + "num_input_tokens_seen": 342592215, + "step": 15884, + "time_per_iteration": 2.460235834121704 + }, + { + "auxiliary_loss_clip": 0.01104263, + "auxiliary_loss_mlp": 0.01027465, + "balance_loss_clip": 1.03318357, + "balance_loss_mlp": 1.014853, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 1.821064364021651, + "language_loss": 0.7764082, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.79772544, + "num_input_tokens_seen": 342610030, + "step": 15885, + "time_per_iteration": 3.8125693798065186 + }, + { + "auxiliary_loss_clip": 0.01103807, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.03369856, + "balance_loss_mlp": 1.0206306, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 1.9843598840079646, + "language_loss": 0.70143163, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.72279292, + "num_input_tokens_seen": 342626475, + "step": 15886, + "time_per_iteration": 2.4310245513916016 + }, + { + "auxiliary_loss_clip": 0.01072698, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.0364337, + "balance_loss_mlp": 1.01746917, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 2.287701414637279, + "language_loss": 0.72758663, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.74862427, + "num_input_tokens_seen": 342646645, + "step": 15887, + "time_per_iteration": 2.552844524383545 + }, + { + "auxiliary_loss_clip": 0.01078765, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.03237367, + "balance_loss_mlp": 1.01773071, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 1.9453588122248182, + "language_loss": 0.56819957, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.58928001, + "num_input_tokens_seen": 342663615, + "step": 15888, + "time_per_iteration": 2.4864511489868164 + }, + { + "auxiliary_loss_clip": 0.01017199, + "auxiliary_loss_mlp": 0.0100418, + "balance_loss_clip": 1.00485325, + "balance_loss_mlp": 1.00310099, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.7077151954024432, + "language_loss": 0.57893729, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.59915113, + "num_input_tokens_seen": 342728275, + "step": 15889, + "time_per_iteration": 3.1545443534851074 + }, + { + "auxiliary_loss_clip": 0.01103646, + "auxiliary_loss_mlp": 0.01028397, + "balance_loss_clip": 1.03306675, + "balance_loss_mlp": 1.01640463, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 1.4507435544215574, + "language_loss": 0.66792548, + "learning_rate": 2.084114508877466e-08, + "loss": 0.68924588, + "num_input_tokens_seen": 342748860, + "step": 15890, + "time_per_iteration": 2.449159622192383 + }, + { + "auxiliary_loss_clip": 0.01103782, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.03540564, + "balance_loss_mlp": 1.01917005, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 1.6937172853886684, + "language_loss": 0.74241722, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.76376128, + "num_input_tokens_seen": 342769705, + "step": 15891, + "time_per_iteration": 2.4783828258514404 + }, + { + "auxiliary_loss_clip": 0.01066398, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.03277802, + "balance_loss_mlp": 1.01985824, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 2.033006488381614, + "language_loss": 0.77910948, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80007875, + "num_input_tokens_seen": 342787000, + "step": 15892, + "time_per_iteration": 2.50307559967041 + }, + { + "auxiliary_loss_clip": 0.01102123, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.0344274, + "balance_loss_mlp": 1.01887679, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 1.4905786309427618, + "language_loss": 0.69698453, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.71832037, + "num_input_tokens_seen": 342807795, + "step": 15893, + "time_per_iteration": 2.483534812927246 + }, + { + "auxiliary_loss_clip": 0.01082959, + "auxiliary_loss_mlp": 0.00783337, + "balance_loss_clip": 1.0361371, + "balance_loss_mlp": 1.01002085, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 3.122639449341315, + "language_loss": 0.65683615, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.67549914, + "num_input_tokens_seen": 342825490, + "step": 15894, + "time_per_iteration": 2.4751827716827393 + }, + { + "auxiliary_loss_clip": 0.01093545, + "auxiliary_loss_mlp": 0.01028845, + "balance_loss_clip": 1.03434062, + "balance_loss_mlp": 1.01622128, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 1.7107317071260963, + "language_loss": 0.81595016, + "learning_rate": 2.056169412853581e-08, + "loss": 0.83717412, + "num_input_tokens_seen": 342844965, + "step": 15895, + "time_per_iteration": 2.5018489360809326 + }, + { + "auxiliary_loss_clip": 0.01077588, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.03431439, + "balance_loss_mlp": 1.01849937, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 1.5484988346562092, + "language_loss": 0.72365338, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74473459, + "num_input_tokens_seen": 342865915, + "step": 15896, + "time_per_iteration": 2.5496573448181152 + }, + { + "auxiliary_loss_clip": 0.01100199, + "auxiliary_loss_mlp": 0.01027385, + "balance_loss_clip": 1.03295565, + "balance_loss_mlp": 1.0159111, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 2.0959852451371224, + "language_loss": 0.79495335, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81622916, + "num_input_tokens_seen": 342884000, + "step": 15897, + "time_per_iteration": 2.4810736179351807 + }, + { + "auxiliary_loss_clip": 0.01078248, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.03218579, + "balance_loss_mlp": 1.01789045, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 1.5723189149753873, + "language_loss": 0.72778618, + "learning_rate": 2.03949242614303e-08, + "loss": 0.7488777, + "num_input_tokens_seen": 342903095, + "step": 15898, + "time_per_iteration": 2.531891107559204 + }, + { + "auxiliary_loss_clip": 0.0100224, + "auxiliary_loss_mlp": 0.01003129, + "balance_loss_clip": 1.01006377, + "balance_loss_mlp": 1.00213385, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.8807555281079319, + "language_loss": 0.52347249, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54352617, + "num_input_tokens_seen": 342958155, + "step": 15899, + "time_per_iteration": 3.108670711517334 + }, + { + "auxiliary_loss_clip": 0.0109695, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.03550601, + "balance_loss_mlp": 1.02039063, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 1.9373974881680784, + "language_loss": 0.68501699, + "learning_rate": 2.028411968062782e-08, + "loss": 0.70632482, + "num_input_tokens_seen": 342972500, + "step": 15900, + "time_per_iteration": 2.4427835941314697 + }, + { + "auxiliary_loss_clip": 0.01093638, + "auxiliary_loss_mlp": 0.00783083, + "balance_loss_clip": 1.03343046, + "balance_loss_mlp": 1.01027989, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 1.9249939122078557, + "language_loss": 0.83099747, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.84976459, + "num_input_tokens_seen": 342989035, + "step": 15901, + "time_per_iteration": 2.476273775100708 + }, + { + "auxiliary_loss_clip": 0.00998521, + "auxiliary_loss_mlp": 0.0099945, + "balance_loss_clip": 1.00773287, + "balance_loss_mlp": 0.99809724, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.7203863851705833, + "language_loss": 0.54294777, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56292748, + "num_input_tokens_seen": 343051675, + "step": 15902, + "time_per_iteration": 3.2151782512664795 + }, + { + "auxiliary_loss_clip": 0.01079034, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.03382325, + "balance_loss_mlp": 1.02101421, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 1.8772837286569182, + "language_loss": 0.85376465, + "learning_rate": 2.01184758473425e-08, + "loss": 0.87486601, + "num_input_tokens_seen": 343068895, + "step": 15903, + "time_per_iteration": 2.5020482540130615 + }, + { + "auxiliary_loss_clip": 0.01077062, + "auxiliary_loss_mlp": 0.00784475, + "balance_loss_clip": 1.03304076, + "balance_loss_mlp": 1.00797319, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 2.0031155071391327, + "language_loss": 0.80552649, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.82414186, + "num_input_tokens_seen": 343087115, + "step": 15904, + "time_per_iteration": 2.470367670059204 + }, + { + "auxiliary_loss_clip": 0.01094005, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.03427243, + "balance_loss_mlp": 1.01796889, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 1.9791369124534814, + "language_loss": 0.60092962, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.62217796, + "num_input_tokens_seen": 343105575, + "step": 15905, + "time_per_iteration": 2.5156326293945312 + }, + { + "auxiliary_loss_clip": 0.01090473, + "auxiliary_loss_mlp": 0.01028604, + "balance_loss_clip": 1.03356647, + "balance_loss_mlp": 1.01732039, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 1.9196538500108913, + "language_loss": 0.70226109, + "learning_rate": 1.995350770979254e-08, + "loss": 0.72345185, + "num_input_tokens_seen": 343123025, + "step": 15906, + "time_per_iteration": 2.4661355018615723 + }, + { + "auxiliary_loss_clip": 0.01061068, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.03648043, + "balance_loss_mlp": 1.01516628, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 1.7802784999816448, + "language_loss": 0.70990217, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.73079121, + "num_input_tokens_seen": 343141625, + "step": 15907, + "time_per_iteration": 2.569262742996216 + }, + { + "auxiliary_loss_clip": 0.01070794, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.03435373, + "balance_loss_mlp": 1.01823008, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 2.064882260422441, + "language_loss": 0.69771278, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.71872377, + "num_input_tokens_seen": 343161300, + "step": 15908, + "time_per_iteration": 2.5468592643737793 + }, + { + "auxiliary_loss_clip": 0.01081601, + "auxiliary_loss_mlp": 0.00784102, + "balance_loss_clip": 1.03543615, + "balance_loss_mlp": 1.01125503, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 1.7051790600594432, + "language_loss": 0.83403242, + "learning_rate": 1.978921532427802e-08, + "loss": 0.85268945, + "num_input_tokens_seen": 343177815, + "step": 15909, + "time_per_iteration": 2.4965710639953613 + }, + { + "auxiliary_loss_clip": 0.01091991, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.03347206, + "balance_loss_mlp": 1.02230155, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 1.7855566785126014, + "language_loss": 0.67365772, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.69491708, + "num_input_tokens_seen": 343198140, + "step": 15910, + "time_per_iteration": 2.500535488128662 + }, + { + "auxiliary_loss_clip": 0.01096903, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.03670621, + "balance_loss_mlp": 1.0221343, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 1.6787040607623185, + "language_loss": 0.74329042, + "learning_rate": 1.968006251276444e-08, + "loss": 0.76460493, + "num_input_tokens_seen": 343218280, + "step": 15911, + "time_per_iteration": 2.5039381980895996 + }, + { + "auxiliary_loss_clip": 0.01091699, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.03340507, + "balance_loss_mlp": 1.01622963, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 2.032423997204952, + "language_loss": 0.69474465, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.71593827, + "num_input_tokens_seen": 343236850, + "step": 15912, + "time_per_iteration": 2.461393356323242 + }, + { + "auxiliary_loss_clip": 0.01082522, + "auxiliary_loss_mlp": 0.0103643, + "balance_loss_clip": 1.03345442, + "balance_loss_mlp": 1.02371633, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 2.555779127438761, + "language_loss": 0.72365755, + "learning_rate": 1.95712100769696e-08, + "loss": 0.744847, + "num_input_tokens_seen": 343253065, + "step": 15913, + "time_per_iteration": 3.95328950881958 + }, + { + "auxiliary_loss_clip": 0.01023164, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.03333855, + "balance_loss_mlp": 1.0180254, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 2.283099362244623, + "language_loss": 0.73285878, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75338507, + "num_input_tokens_seen": 343270330, + "step": 15914, + "time_per_iteration": 4.021246433258057 + }, + { + "auxiliary_loss_clip": 0.01101227, + "auxiliary_loss_mlp": 0.01027598, + "balance_loss_clip": 1.03381062, + "balance_loss_mlp": 1.0157845, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 1.4378548058613165, + "language_loss": 0.67189288, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.69318116, + "num_input_tokens_seen": 343289625, + "step": 15915, + "time_per_iteration": 4.015488147735596 + }, + { + "auxiliary_loss_clip": 0.01089565, + "auxiliary_loss_mlp": 0.01027943, + "balance_loss_clip": 1.03467917, + "balance_loss_mlp": 1.01645684, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 1.8036919903093458, + "language_loss": 0.64365667, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.66483176, + "num_input_tokens_seen": 343309200, + "step": 15916, + "time_per_iteration": 2.4763660430908203 + }, + { + "auxiliary_loss_clip": 0.01097063, + "auxiliary_loss_mlp": 0.01026813, + "balance_loss_clip": 1.03365397, + "balance_loss_mlp": 1.0158217, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 1.8474433729558768, + "language_loss": 0.80644041, + "learning_rate": 1.935440639853536e-08, + "loss": 0.82767916, + "num_input_tokens_seen": 343326270, + "step": 15917, + "time_per_iteration": 2.457047939300537 + }, + { + "auxiliary_loss_clip": 0.01072449, + "auxiliary_loss_mlp": 0.01034547, + "balance_loss_clip": 1.03441083, + "balance_loss_mlp": 1.02231646, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 1.7220559109285867, + "language_loss": 0.73059672, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.75166667, + "num_input_tokens_seen": 343344430, + "step": 15918, + "time_per_iteration": 2.4847288131713867 + }, + { + "auxiliary_loss_clip": 0.01001566, + "auxiliary_loss_mlp": 0.01005793, + "balance_loss_clip": 1.00740218, + "balance_loss_mlp": 1.00469041, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6310971926736959, + "language_loss": 0.53151751, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55159116, + "num_input_tokens_seen": 343416155, + "step": 15919, + "time_per_iteration": 3.259613513946533 + }, + { + "auxiliary_loss_clip": 0.01100323, + "auxiliary_loss_mlp": 0.0103803, + "balance_loss_clip": 1.03860426, + "balance_loss_mlp": 1.02493465, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 2.5404845048967295, + "language_loss": 0.75121951, + "learning_rate": 1.919259224843972e-08, + "loss": 0.77260309, + "num_input_tokens_seen": 343431715, + "step": 15920, + "time_per_iteration": 2.4567558765411377 + }, + { + "auxiliary_loss_clip": 0.01068175, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.03470254, + "balance_loss_mlp": 1.01559401, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 1.6047541313617237, + "language_loss": 0.78924513, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.8102085, + "num_input_tokens_seen": 343450425, + "step": 15921, + "time_per_iteration": 2.5462357997894287 + }, + { + "auxiliary_loss_clip": 0.01096427, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.03328824, + "balance_loss_mlp": 1.01616907, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 1.817280350435024, + "language_loss": 0.50979364, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.53104746, + "num_input_tokens_seen": 343470445, + "step": 15922, + "time_per_iteration": 2.586251974105835 + }, + { + "auxiliary_loss_clip": 0.01051863, + "auxiliary_loss_mlp": 0.0103918, + "balance_loss_clip": 1.0334419, + "balance_loss_mlp": 1.02508986, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 2.2591931496330533, + "language_loss": 0.83395851, + "learning_rate": 1.903145411006557e-08, + "loss": 0.85486889, + "num_input_tokens_seen": 343485200, + "step": 15923, + "time_per_iteration": 2.5547213554382324 + }, + { + "auxiliary_loss_clip": 0.01076349, + "auxiliary_loss_mlp": 0.01030245, + "balance_loss_clip": 1.03163981, + "balance_loss_mlp": 1.0191828, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 1.5881823127418098, + "language_loss": 0.75241256, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77347863, + "num_input_tokens_seen": 343505080, + "step": 15924, + "time_per_iteration": 3.956639051437378 + }, + { + "auxiliary_loss_clip": 0.01081747, + "auxiliary_loss_mlp": 0.01028355, + "balance_loss_clip": 1.03414202, + "balance_loss_mlp": 1.01637435, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 1.82226157630626, + "language_loss": 0.86174458, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88284564, + "num_input_tokens_seen": 343523995, + "step": 15925, + "time_per_iteration": 2.523029327392578 + }, + { + "auxiliary_loss_clip": 0.01074689, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.03449059, + "balance_loss_mlp": 1.01944852, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 1.944468710259776, + "language_loss": 0.75721955, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.77828932, + "num_input_tokens_seen": 343542015, + "step": 15926, + "time_per_iteration": 2.5605249404907227 + }, + { + "auxiliary_loss_clip": 0.01077984, + "auxiliary_loss_mlp": 0.01030571, + "balance_loss_clip": 1.03597796, + "balance_loss_mlp": 1.01954389, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 1.6041897822134794, + "language_loss": 0.77534223, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.79642779, + "num_input_tokens_seen": 343561680, + "step": 15927, + "time_per_iteration": 2.504793167114258 + }, + { + "auxiliary_loss_clip": 0.01057699, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.03486514, + "balance_loss_mlp": 1.01848531, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 2.929314003374916, + "language_loss": 0.68521905, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.70611501, + "num_input_tokens_seen": 343585290, + "step": 15928, + "time_per_iteration": 2.6596686840057373 + }, + { + "auxiliary_loss_clip": 0.01082723, + "auxiliary_loss_mlp": 0.01030747, + "balance_loss_clip": 1.03558528, + "balance_loss_mlp": 1.01837301, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 1.8608948082465002, + "language_loss": 0.82104409, + "learning_rate": 1.871120608822485e-08, + "loss": 0.84217882, + "num_input_tokens_seen": 343604045, + "step": 15929, + "time_per_iteration": 2.489361524581909 + }, + { + "auxiliary_loss_clip": 0.01072197, + "auxiliary_loss_mlp": 0.01046649, + "balance_loss_clip": 1.03397441, + "balance_loss_mlp": 1.03280842, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 1.6000973528018319, + "language_loss": 0.72025931, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74144769, + "num_input_tokens_seen": 343626595, + "step": 15930, + "time_per_iteration": 2.589919328689575 + }, + { + "auxiliary_loss_clip": 0.01039819, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.03186536, + "balance_loss_mlp": 1.0167501, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 1.4561256930207347, + "language_loss": 0.62134254, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64203745, + "num_input_tokens_seen": 343646195, + "step": 15931, + "time_per_iteration": 2.567692995071411 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.01026906, + "balance_loss_clip": 1.03472543, + "balance_loss_mlp": 1.01594484, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 1.840946639389772, + "language_loss": 0.68882704, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71010494, + "num_input_tokens_seen": 343663665, + "step": 15932, + "time_per_iteration": 2.4318737983703613 + }, + { + "auxiliary_loss_clip": 0.01075455, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.03466582, + "balance_loss_mlp": 1.02245128, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 1.8553278407037948, + "language_loss": 0.75707257, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77818394, + "num_input_tokens_seen": 343682145, + "step": 15933, + "time_per_iteration": 2.493351936340332 + }, + { + "auxiliary_loss_clip": 0.00997326, + "auxiliary_loss_mlp": 0.01000596, + "balance_loss_clip": 1.01121902, + "balance_loss_mlp": 0.9995411, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.7020029629969564, + "language_loss": 0.57321984, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59319907, + "num_input_tokens_seen": 343744685, + "step": 15934, + "time_per_iteration": 3.3038177490234375 + }, + { + "auxiliary_loss_clip": 0.01026518, + "auxiliary_loss_mlp": 0.00760509, + "balance_loss_clip": 1.00381088, + "balance_loss_mlp": 0.99941331, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.9185406796379341, + "language_loss": 0.65951848, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67738873, + "num_input_tokens_seen": 343801835, + "step": 15935, + "time_per_iteration": 3.006756544113159 + }, + { + "auxiliary_loss_clip": 0.01017347, + "auxiliary_loss_mlp": 0.01002173, + "balance_loss_clip": 1.02157879, + "balance_loss_mlp": 1.00116563, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 0.7834039313500597, + "language_loss": 0.57042682, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59062195, + "num_input_tokens_seen": 343861515, + "step": 15936, + "time_per_iteration": 3.106759548187256 + }, + { + "auxiliary_loss_clip": 0.01044541, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.03354883, + "balance_loss_mlp": 1.01692557, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 1.785429528012442, + "language_loss": 0.7834543, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80419111, + "num_input_tokens_seen": 343881240, + "step": 15937, + "time_per_iteration": 2.66329026222229 + }, + { + "auxiliary_loss_clip": 0.01091265, + "auxiliary_loss_mlp": 0.01036436, + "balance_loss_clip": 1.03490853, + "balance_loss_mlp": 1.0230732, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 1.7461192056981982, + "language_loss": 0.68426871, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70554572, + "num_input_tokens_seen": 343900885, + "step": 15938, + "time_per_iteration": 2.472263813018799 + }, + { + "auxiliary_loss_clip": 0.01070775, + "auxiliary_loss_mlp": 0.01029346, + "balance_loss_clip": 1.03347349, + "balance_loss_mlp": 1.01763964, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 2.473922718314474, + "language_loss": 0.66139513, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.68239635, + "num_input_tokens_seen": 343918460, + "step": 15939, + "time_per_iteration": 2.573601007461548 + }, + { + "auxiliary_loss_clip": 0.01074122, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.0322777, + "balance_loss_mlp": 1.02426386, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 1.562492813331926, + "language_loss": 0.73754221, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.7586484, + "num_input_tokens_seen": 343938030, + "step": 15940, + "time_per_iteration": 2.539907693862915 + }, + { + "auxiliary_loss_clip": 0.01104598, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.03524601, + "balance_loss_mlp": 1.01820314, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 1.6364897199906014, + "language_loss": 0.72977346, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.75113004, + "num_input_tokens_seen": 343956635, + "step": 15941, + "time_per_iteration": 2.495635747909546 + }, + { + "auxiliary_loss_clip": 0.01080352, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.03433156, + "balance_loss_mlp": 1.02115536, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 1.458752176432393, + "language_loss": 0.71354914, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.73467934, + "num_input_tokens_seen": 343976625, + "step": 15942, + "time_per_iteration": 2.5285017490386963 + }, + { + "auxiliary_loss_clip": 0.01104556, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.03455448, + "balance_loss_mlp": 1.01905847, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 1.526650814871503, + "language_loss": 0.71940994, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74077547, + "num_input_tokens_seen": 343997790, + "step": 15943, + "time_per_iteration": 2.5558314323425293 + }, + { + "auxiliary_loss_clip": 0.01095528, + "auxiliary_loss_mlp": 0.01035494, + "balance_loss_clip": 1.03638625, + "balance_loss_mlp": 1.02284539, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 1.6147344520935178, + "language_loss": 0.6866678, + "learning_rate": 1.792242006001965e-08, + "loss": 0.70797801, + "num_input_tokens_seen": 344016935, + "step": 15944, + "time_per_iteration": 2.47809100151062 + }, + { + "auxiliary_loss_clip": 0.01101633, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.03299522, + "balance_loss_mlp": 1.02140784, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 2.273716686536164, + "language_loss": 0.6584022, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.67975283, + "num_input_tokens_seen": 344035590, + "step": 15945, + "time_per_iteration": 2.446030616760254 + }, + { + "auxiliary_loss_clip": 0.00975742, + "auxiliary_loss_mlp": 0.00999664, + "balance_loss_clip": 1.01493168, + "balance_loss_mlp": 0.99847233, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 0.7526189972699443, + "language_loss": 0.61869299, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.63844705, + "num_input_tokens_seen": 344100845, + "step": 15946, + "time_per_iteration": 3.693606376647949 + }, + { + "auxiliary_loss_clip": 0.01101618, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.03490508, + "balance_loss_mlp": 1.0177424, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 1.7476571459982133, + "language_loss": 0.75108135, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.7723918, + "num_input_tokens_seen": 344121780, + "step": 15947, + "time_per_iteration": 4.246658802032471 + }, + { + "auxiliary_loss_clip": 0.01075682, + "auxiliary_loss_mlp": 0.01026016, + "balance_loss_clip": 1.03352523, + "balance_loss_mlp": 1.0150373, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 2.4694366021133756, + "language_loss": 0.69625229, + "learning_rate": 1.771493294473747e-08, + "loss": 0.7172693, + "num_input_tokens_seen": 344140150, + "step": 15948, + "time_per_iteration": 2.549262285232544 + }, + { + "auxiliary_loss_clip": 0.01058209, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.03334522, + "balance_loss_mlp": 1.01817465, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 2.890605891112174, + "language_loss": 0.78635979, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.80723643, + "num_input_tokens_seen": 344158200, + "step": 15949, + "time_per_iteration": 2.6112027168273926 + }, + { + "auxiliary_loss_clip": 0.01106084, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.03681552, + "balance_loss_mlp": 1.02163196, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 1.844508798425589, + "language_loss": 0.69033539, + "learning_rate": 1.761164038992602e-08, + "loss": 0.71173602, + "num_input_tokens_seen": 344174720, + "step": 15950, + "time_per_iteration": 2.495490789413452 + }, + { + "auxiliary_loss_clip": 0.01083908, + "auxiliary_loss_mlp": 0.01030868, + "balance_loss_clip": 1.03530264, + "balance_loss_mlp": 1.01981735, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 1.6703485801188571, + "language_loss": 0.86066127, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88180906, + "num_input_tokens_seen": 344192580, + "step": 15951, + "time_per_iteration": 2.539576530456543 + }, + { + "auxiliary_loss_clip": 0.0108144, + "auxiliary_loss_mlp": 0.01037971, + "balance_loss_clip": 1.03421223, + "balance_loss_mlp": 1.02549028, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 2.0163504894454247, + "language_loss": 0.79947764, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.82067174, + "num_input_tokens_seen": 344210345, + "step": 15952, + "time_per_iteration": 3.920907974243164 + }, + { + "auxiliary_loss_clip": 0.01093128, + "auxiliary_loss_mlp": 0.01029653, + "balance_loss_clip": 1.03657246, + "balance_loss_mlp": 1.0171597, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 3.056460079727345, + "language_loss": 0.695207, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.71643472, + "num_input_tokens_seen": 344229540, + "step": 15953, + "time_per_iteration": 3.8917765617370605 + }, + { + "auxiliary_loss_clip": 0.01042666, + "auxiliary_loss_mlp": 0.01034832, + "balance_loss_clip": 1.03316236, + "balance_loss_mlp": 1.02101016, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 3.9526374006240603, + "language_loss": 0.58715278, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.60792774, + "num_input_tokens_seen": 344247830, + "step": 15954, + "time_per_iteration": 4.014133453369141 + }, + { + "auxiliary_loss_clip": 0.01094105, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.03428793, + "balance_loss_mlp": 1.02139759, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 2.4237121218608864, + "language_loss": 0.73829228, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.75957936, + "num_input_tokens_seen": 344267760, + "step": 15955, + "time_per_iteration": 2.5334370136260986 + }, + { + "auxiliary_loss_clip": 0.01081593, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.0337199, + "balance_loss_mlp": 1.02115297, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 1.8279548051095775, + "language_loss": 0.6242488, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.64540386, + "num_input_tokens_seen": 344284905, + "step": 15956, + "time_per_iteration": 2.502157211303711 + }, + { + "auxiliary_loss_clip": 0.01064796, + "auxiliary_loss_mlp": 0.01030221, + "balance_loss_clip": 1.03447366, + "balance_loss_mlp": 1.01805544, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 1.7845114101480872, + "language_loss": 0.59916019, + "learning_rate": 1.725248447997507e-08, + "loss": 0.62011039, + "num_input_tokens_seen": 344302025, + "step": 15957, + "time_per_iteration": 2.518641233444214 + }, + { + "auxiliary_loss_clip": 0.0107768, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.03700614, + "balance_loss_mlp": 1.0234077, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 1.9762818713525976, + "language_loss": 0.74276549, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76390439, + "num_input_tokens_seen": 344321935, + "step": 15958, + "time_per_iteration": 2.6308443546295166 + }, + { + "auxiliary_loss_clip": 0.01081437, + "auxiliary_loss_mlp": 0.00782049, + "balance_loss_clip": 1.03289688, + "balance_loss_mlp": 1.00730979, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 1.5812675597854473, + "language_loss": 0.7471199, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.7657547, + "num_input_tokens_seen": 344340405, + "step": 15959, + "time_per_iteration": 2.5526771545410156 + }, + { + "auxiliary_loss_clip": 0.01088496, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.03336513, + "balance_loss_mlp": 1.01885676, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 3.3187045744891526, + "language_loss": 0.65457022, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.67576742, + "num_input_tokens_seen": 344359925, + "step": 15960, + "time_per_iteration": 2.4939675331115723 + }, + { + "auxiliary_loss_clip": 0.01099978, + "auxiliary_loss_mlp": 0.01035642, + "balance_loss_clip": 1.0336833, + "balance_loss_mlp": 1.02341723, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 1.701340091593819, + "language_loss": 0.78265959, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.80401576, + "num_input_tokens_seen": 344379100, + "step": 15961, + "time_per_iteration": 2.4523284435272217 + }, + { + "auxiliary_loss_clip": 0.0106423, + "auxiliary_loss_mlp": 0.01025198, + "balance_loss_clip": 1.03463006, + "balance_loss_mlp": 1.01358676, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 2.1992366644086805, + "language_loss": 0.75598967, + "learning_rate": 1.699820008484698e-08, + "loss": 0.77688396, + "num_input_tokens_seen": 344396895, + "step": 15962, + "time_per_iteration": 3.956773519515991 + }, + { + "auxiliary_loss_clip": 0.01084282, + "auxiliary_loss_mlp": 0.01031975, + "balance_loss_clip": 1.03499246, + "balance_loss_mlp": 1.01922011, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 1.8573725654866093, + "language_loss": 0.71223193, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.7333945, + "num_input_tokens_seen": 344415115, + "step": 15963, + "time_per_iteration": 2.5508811473846436 + }, + { + "auxiliary_loss_clip": 0.01072655, + "auxiliary_loss_mlp": 0.01027618, + "balance_loss_clip": 1.03455973, + "balance_loss_mlp": 1.01591718, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 1.7693670494968199, + "language_loss": 0.74076569, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76176846, + "num_input_tokens_seen": 344435185, + "step": 15964, + "time_per_iteration": 2.52687406539917 + }, + { + "auxiliary_loss_clip": 0.00998772, + "auxiliary_loss_mlp": 0.01005062, + "balance_loss_clip": 1.00831807, + "balance_loss_mlp": 1.00399518, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 0.8838922077936839, + "language_loss": 0.57640922, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59644759, + "num_input_tokens_seen": 344488950, + "step": 15965, + "time_per_iteration": 3.1429336071014404 + }, + { + "auxiliary_loss_clip": 0.01104034, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.03438258, + "balance_loss_mlp": 1.01779222, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 2.7568724861155594, + "language_loss": 0.79096282, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.8122952, + "num_input_tokens_seen": 344506740, + "step": 15966, + "time_per_iteration": 2.448906183242798 + }, + { + "auxiliary_loss_clip": 0.01076144, + "auxiliary_loss_mlp": 0.01029899, + "balance_loss_clip": 1.03097892, + "balance_loss_mlp": 1.01811469, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 1.5928923510809871, + "language_loss": 0.79547703, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81653738, + "num_input_tokens_seen": 344526670, + "step": 15967, + "time_per_iteration": 2.540665864944458 + }, + { + "auxiliary_loss_clip": 0.0105004, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.03277445, + "balance_loss_mlp": 1.01664639, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 1.861431279007753, + "language_loss": 0.80382127, + "learning_rate": 1.669554028728348e-08, + "loss": 0.8246243, + "num_input_tokens_seen": 344541995, + "step": 15968, + "time_per_iteration": 2.5250611305236816 + }, + { + "auxiliary_loss_clip": 0.01057507, + "auxiliary_loss_mlp": 0.01040017, + "balance_loss_clip": 1.03446269, + "balance_loss_mlp": 1.02568829, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 2.540066048339152, + "language_loss": 0.67441642, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.69539165, + "num_input_tokens_seen": 344559980, + "step": 15969, + "time_per_iteration": 2.613860607147217 + }, + { + "auxiliary_loss_clip": 0.01091632, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.03416753, + "balance_loss_mlp": 1.02223063, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 2.600879792806001, + "language_loss": 0.78999299, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.81124729, + "num_input_tokens_seen": 344577765, + "step": 15970, + "time_per_iteration": 2.451685905456543 + }, + { + "auxiliary_loss_clip": 0.01090456, + "auxiliary_loss_mlp": 0.01028605, + "balance_loss_clip": 1.03542995, + "balance_loss_mlp": 1.01681542, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 1.531143516419581, + "language_loss": 0.77233666, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79352725, + "num_input_tokens_seen": 344597650, + "step": 15971, + "time_per_iteration": 2.564420461654663 + }, + { + "auxiliary_loss_clip": 0.01084693, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.03251171, + "balance_loss_mlp": 1.01352882, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 1.8992173161421073, + "language_loss": 0.67115504, + "learning_rate": 1.64952712054669e-08, + "loss": 0.69226128, + "num_input_tokens_seen": 344613580, + "step": 15972, + "time_per_iteration": 2.508312702178955 + }, + { + "auxiliary_loss_clip": 0.01090326, + "auxiliary_loss_mlp": 0.0078303, + "balance_loss_clip": 1.033144, + "balance_loss_mlp": 1.0092088, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 2.273500496911293, + "language_loss": 0.76383913, + "learning_rate": 1.644539196701844e-08, + "loss": 0.78257263, + "num_input_tokens_seen": 344626910, + "step": 15973, + "time_per_iteration": 2.462385416030884 + }, + { + "auxiliary_loss_clip": 0.01061788, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_clip": 1.03663623, + "balance_loss_mlp": 1.02974892, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 1.5939080674324668, + "language_loss": 0.6936242, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71467191, + "num_input_tokens_seen": 344644330, + "step": 15974, + "time_per_iteration": 2.565502166748047 + }, + { + "auxiliary_loss_clip": 0.01092352, + "auxiliary_loss_mlp": 0.01026285, + "balance_loss_clip": 1.03304088, + "balance_loss_mlp": 1.01375055, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 1.7735175208376301, + "language_loss": 0.67880708, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.69999349, + "num_input_tokens_seen": 344663910, + "step": 15975, + "time_per_iteration": 2.4924046993255615 + }, + { + "auxiliary_loss_clip": 0.01099698, + "auxiliary_loss_mlp": 0.01025606, + "balance_loss_clip": 1.03456903, + "balance_loss_mlp": 1.01381636, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 2.1644077170915534, + "language_loss": 0.55098391, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.5722369, + "num_input_tokens_seen": 344682320, + "step": 15976, + "time_per_iteration": 2.4620418548583984 + }, + { + "auxiliary_loss_clip": 0.01074451, + "auxiliary_loss_mlp": 0.01025254, + "balance_loss_clip": 1.03093445, + "balance_loss_mlp": 1.013834, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 1.7208234643043387, + "language_loss": 0.68327129, + "learning_rate": 1.624662719799219e-08, + "loss": 0.7042684, + "num_input_tokens_seen": 344701355, + "step": 15977, + "time_per_iteration": 2.5575196743011475 + }, + { + "auxiliary_loss_clip": 0.0109153, + "auxiliary_loss_mlp": 0.01034754, + "balance_loss_clip": 1.03231239, + "balance_loss_mlp": 1.02243948, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 4.225359531313664, + "language_loss": 0.81974339, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.84100622, + "num_input_tokens_seen": 344717980, + "step": 15978, + "time_per_iteration": 2.4253478050231934 + }, + { + "auxiliary_loss_clip": 0.0109604, + "auxiliary_loss_mlp": 0.01031739, + "balance_loss_clip": 1.03471649, + "balance_loss_mlp": 1.01934135, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 2.313349113266716, + "language_loss": 0.83538067, + "learning_rate": 1.614769615070921e-08, + "loss": 0.8566584, + "num_input_tokens_seen": 344733480, + "step": 15979, + "time_per_iteration": 2.483630418777466 + }, + { + "auxiliary_loss_clip": 0.01102604, + "auxiliary_loss_mlp": 0.01035217, + "balance_loss_clip": 1.03371191, + "balance_loss_mlp": 1.02391028, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 1.593333317970244, + "language_loss": 0.80055827, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82193649, + "num_input_tokens_seen": 344752130, + "step": 15980, + "time_per_iteration": 2.4503917694091797 + }, + { + "auxiliary_loss_clip": 0.01094292, + "auxiliary_loss_mlp": 0.01026914, + "balance_loss_clip": 1.0336895, + "balance_loss_mlp": 1.01509464, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 1.788279271267182, + "language_loss": 0.68606603, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70727807, + "num_input_tokens_seen": 344771195, + "step": 15981, + "time_per_iteration": 2.529557228088379 + }, + { + "auxiliary_loss_clip": 0.01089463, + "auxiliary_loss_mlp": 0.00782236, + "balance_loss_clip": 1.0328548, + "balance_loss_mlp": 1.00882733, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 1.5806678756939394, + "language_loss": 0.69476908, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71348608, + "num_input_tokens_seen": 344793150, + "step": 15982, + "time_per_iteration": 2.5220072269439697 + }, + { + "auxiliary_loss_clip": 0.00999551, + "auxiliary_loss_mlp": 0.00998667, + "balance_loss_clip": 1.01090169, + "balance_loss_mlp": 0.99753428, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6766048582056688, + "language_loss": 0.53315926, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55314147, + "num_input_tokens_seen": 344852855, + "step": 15983, + "time_per_iteration": 3.2357494831085205 + }, + { + "auxiliary_loss_clip": 0.01102835, + "auxiliary_loss_mlp": 0.01035437, + "balance_loss_clip": 1.03478205, + "balance_loss_mlp": 1.02331352, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 2.034600760498007, + "language_loss": 0.67900097, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.70038372, + "num_input_tokens_seen": 344869830, + "step": 15984, + "time_per_iteration": 2.422638177871704 + }, + { + "auxiliary_loss_clip": 0.01069171, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.03444147, + "balance_loss_mlp": 1.02076304, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 1.548563230852079, + "language_loss": 0.67036319, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.69137168, + "num_input_tokens_seen": 344888905, + "step": 15985, + "time_per_iteration": 2.5665576457977295 + }, + { + "auxiliary_loss_clip": 0.01105262, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.03524506, + "balance_loss_mlp": 1.01637101, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 1.8261844705058505, + "language_loss": 0.78758872, + "learning_rate": 1.580380726142283e-08, + "loss": 0.80892265, + "num_input_tokens_seen": 344907160, + "step": 15986, + "time_per_iteration": 2.4237544536590576 + }, + { + "auxiliary_loss_clip": 0.01064648, + "auxiliary_loss_mlp": 0.01030823, + "balance_loss_clip": 1.03811765, + "balance_loss_mlp": 1.01736987, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 1.920179736460202, + "language_loss": 0.63717663, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.65813136, + "num_input_tokens_seen": 344922400, + "step": 15987, + "time_per_iteration": 2.5615077018737793 + }, + { + "auxiliary_loss_clip": 0.01099654, + "auxiliary_loss_mlp": 0.01027062, + "balance_loss_clip": 1.03469634, + "balance_loss_mlp": 1.01617789, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 1.9251422517689372, + "language_loss": 0.66597074, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.68723786, + "num_input_tokens_seen": 344941910, + "step": 15988, + "time_per_iteration": 2.4665749073028564 + }, + { + "auxiliary_loss_clip": 0.01092117, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.03423762, + "balance_loss_mlp": 1.02769947, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 1.78032056768738, + "language_loss": 0.74491572, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.76622337, + "num_input_tokens_seen": 344960020, + "step": 15989, + "time_per_iteration": 2.4695193767547607 + }, + { + "auxiliary_loss_clip": 0.010123, + "auxiliary_loss_mlp": 0.01020138, + "balance_loss_clip": 1.00980735, + "balance_loss_mlp": 1.01887441, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 0.8091620993324363, + "language_loss": 0.63139313, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65171754, + "num_input_tokens_seen": 345018290, + "step": 15990, + "time_per_iteration": 3.047435998916626 + }, + { + "auxiliary_loss_clip": 0.01090413, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.03304076, + "balance_loss_mlp": 1.02302718, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 1.736660014773818, + "language_loss": 0.77319348, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.79444361, + "num_input_tokens_seen": 345040235, + "step": 15991, + "time_per_iteration": 3.969411611557007 + }, + { + "auxiliary_loss_clip": 0.01108074, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.03463066, + "balance_loss_mlp": 1.01828063, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 2.3289571919437937, + "language_loss": 0.84835327, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.86975074, + "num_input_tokens_seen": 345054540, + "step": 15992, + "time_per_iteration": 2.4499351978302 + }, + { + "auxiliary_loss_clip": 0.01080153, + "auxiliary_loss_mlp": 0.01030487, + "balance_loss_clip": 1.03227544, + "balance_loss_mlp": 1.01718283, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 2.0521653971536584, + "language_loss": 0.73063666, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.75174308, + "num_input_tokens_seen": 345074035, + "step": 15993, + "time_per_iteration": 4.165271282196045 + }, + { + "auxiliary_loss_clip": 0.01073132, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.03391886, + "balance_loss_mlp": 1.01654696, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 1.6518585351993422, + "language_loss": 0.68144357, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70246279, + "num_input_tokens_seen": 345099270, + "step": 15994, + "time_per_iteration": 2.6639606952667236 + }, + { + "auxiliary_loss_clip": 0.01073989, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.03685284, + "balance_loss_mlp": 1.01962936, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 2.3184824465214935, + "language_loss": 0.84501851, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.86607707, + "num_input_tokens_seen": 345116975, + "step": 15995, + "time_per_iteration": 2.596580982208252 + }, + { + "auxiliary_loss_clip": 0.01095647, + "auxiliary_loss_mlp": 0.01035199, + "balance_loss_clip": 1.03456175, + "balance_loss_mlp": 1.02234852, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 1.9553300907135167, + "language_loss": 0.75885862, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.7801671, + "num_input_tokens_seen": 345133645, + "step": 15996, + "time_per_iteration": 2.4444386959075928 + }, + { + "auxiliary_loss_clip": 0.01078589, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.03547835, + "balance_loss_mlp": 1.01815367, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 1.9605969332909137, + "language_loss": 0.76914352, + "learning_rate": 1.52708595287494e-08, + "loss": 0.79023385, + "num_input_tokens_seen": 345150740, + "step": 15997, + "time_per_iteration": 2.5120232105255127 + }, + { + "auxiliary_loss_clip": 0.0109791, + "auxiliary_loss_mlp": 0.00782572, + "balance_loss_clip": 1.03270364, + "balance_loss_mlp": 1.00975525, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 1.585036739514364, + "language_loss": 0.6729691, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69177395, + "num_input_tokens_seen": 345170365, + "step": 15998, + "time_per_iteration": 2.5352861881256104 + }, + { + "auxiliary_loss_clip": 0.01075213, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.03105021, + "balance_loss_mlp": 1.01700234, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 1.579153587394206, + "language_loss": 0.7274133, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.74846536, + "num_input_tokens_seen": 345188930, + "step": 15999, + "time_per_iteration": 2.5189874172210693 + }, + { + "auxiliary_loss_clip": 0.0107641, + "auxiliary_loss_mlp": 0.01025306, + "balance_loss_clip": 1.03311276, + "balance_loss_mlp": 1.01410687, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 2.295223217282639, + "language_loss": 0.65300161, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.6740188, + "num_input_tokens_seen": 345209615, + "step": 16000, + "time_per_iteration": 2.544492244720459 + }, + { + "auxiliary_loss_clip": 0.01070786, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.03298163, + "balance_loss_mlp": 1.01577115, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 1.9979351054199816, + "language_loss": 0.75340462, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.7743994, + "num_input_tokens_seen": 345229175, + "step": 16001, + "time_per_iteration": 3.9537296295166016 + }, + { + "auxiliary_loss_clip": 0.01089534, + "auxiliary_loss_mlp": 0.01029612, + "balance_loss_clip": 1.03340936, + "balance_loss_mlp": 1.01716077, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 1.5556524399370568, + "language_loss": 0.68205309, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70324457, + "num_input_tokens_seen": 345247815, + "step": 16002, + "time_per_iteration": 2.483450412750244 + }, + { + "auxiliary_loss_clip": 0.01092963, + "auxiliary_loss_mlp": 0.01029228, + "balance_loss_clip": 1.03608775, + "balance_loss_mlp": 1.01745641, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 1.2731129115465383, + "language_loss": 0.64648652, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.6677084, + "num_input_tokens_seen": 345269935, + "step": 16003, + "time_per_iteration": 2.543663263320923 + }, + { + "auxiliary_loss_clip": 0.01061755, + "auxiliary_loss_mlp": 0.01038772, + "balance_loss_clip": 1.03590775, + "balance_loss_mlp": 1.02702975, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 1.9404291412697514, + "language_loss": 0.75785029, + "learning_rate": 1.493645226826512e-08, + "loss": 0.77885556, + "num_input_tokens_seen": 345288310, + "step": 16004, + "time_per_iteration": 2.5748291015625 + }, + { + "auxiliary_loss_clip": 0.01090997, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.03413069, + "balance_loss_mlp": 1.01810384, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 1.8500444845330368, + "language_loss": 0.78786063, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.80907083, + "num_input_tokens_seen": 345306615, + "step": 16005, + "time_per_iteration": 2.468085527420044 + }, + { + "auxiliary_loss_clip": 0.01088616, + "auxiliary_loss_mlp": 0.01025437, + "balance_loss_clip": 1.03303885, + "balance_loss_mlp": 1.01428545, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 1.9096305820117256, + "language_loss": 0.67975008, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.70089066, + "num_input_tokens_seen": 345331935, + "step": 16006, + "time_per_iteration": 2.792125940322876 + }, + { + "auxiliary_loss_clip": 0.01070538, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.03302443, + "balance_loss_mlp": 1.02190971, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 1.5431727653498615, + "language_loss": 0.783602, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80464035, + "num_input_tokens_seen": 345351510, + "step": 16007, + "time_per_iteration": 2.510564088821411 + }, + { + "auxiliary_loss_clip": 0.01104866, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.03570259, + "balance_loss_mlp": 1.01872826, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 1.8260463018719035, + "language_loss": 0.67902362, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.70038062, + "num_input_tokens_seen": 345367750, + "step": 16008, + "time_per_iteration": 2.432136297225952 + }, + { + "auxiliary_loss_clip": 0.01081661, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.03543043, + "balance_loss_mlp": 1.01809788, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 2.245899471813522, + "language_loss": 0.73096418, + "learning_rate": 1.469984811730529e-08, + "loss": 0.75209558, + "num_input_tokens_seen": 345384790, + "step": 16009, + "time_per_iteration": 2.510167121887207 + }, + { + "auxiliary_loss_clip": 0.0108865, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.03261375, + "balance_loss_mlp": 1.01862526, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 1.7271619066245045, + "language_loss": 0.75508302, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.77627075, + "num_input_tokens_seen": 345403390, + "step": 16010, + "time_per_iteration": 2.4993793964385986 + }, + { + "auxiliary_loss_clip": 0.01096598, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.03814363, + "balance_loss_mlp": 1.02055693, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 1.8959726033850308, + "language_loss": 0.69725734, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71857905, + "num_input_tokens_seen": 345418685, + "step": 16011, + "time_per_iteration": 2.456112861633301 + }, + { + "auxiliary_loss_clip": 0.01089991, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.03465378, + "balance_loss_mlp": 1.02050924, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 1.878334486520256, + "language_loss": 0.68371403, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70492816, + "num_input_tokens_seen": 345442380, + "step": 16012, + "time_per_iteration": 2.7931275367736816 + }, + { + "auxiliary_loss_clip": 0.01086687, + "auxiliary_loss_mlp": 0.01036203, + "balance_loss_clip": 1.03297198, + "balance_loss_mlp": 1.02254701, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 1.731118946445511, + "language_loss": 0.72635806, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.74758697, + "num_input_tokens_seen": 345463815, + "step": 16013, + "time_per_iteration": 2.5931649208068848 + }, + { + "auxiliary_loss_clip": 0.01077734, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.03517985, + "balance_loss_mlp": 1.01872134, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 2.778854341437809, + "language_loss": 0.63048095, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65157342, + "num_input_tokens_seen": 345484525, + "step": 16014, + "time_per_iteration": 2.699995756149292 + }, + { + "auxiliary_loss_clip": 0.01077691, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.03426814, + "balance_loss_mlp": 1.01780152, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 1.63361480034155, + "language_loss": 0.71975005, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.74080932, + "num_input_tokens_seen": 345508295, + "step": 16015, + "time_per_iteration": 2.7087485790252686 + }, + { + "auxiliary_loss_clip": 0.0106734, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.03163409, + "balance_loss_mlp": 1.01737118, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 1.9225120526689747, + "language_loss": 0.769508, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.79047722, + "num_input_tokens_seen": 345525155, + "step": 16016, + "time_per_iteration": 2.5729217529296875 + }, + { + "auxiliary_loss_clip": 0.0102687, + "auxiliary_loss_mlp": 0.01004347, + "balance_loss_clip": 1.00416875, + "balance_loss_mlp": 1.00331616, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.8115399115612564, + "language_loss": 0.63143641, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.6517486, + "num_input_tokens_seen": 345578905, + "step": 16017, + "time_per_iteration": 2.9742543697357178 + }, + { + "auxiliary_loss_clip": 0.01085796, + "auxiliary_loss_mlp": 0.01028099, + "balance_loss_clip": 1.03444672, + "balance_loss_mlp": 1.01633906, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 1.9855174264081203, + "language_loss": 0.66455615, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.68569517, + "num_input_tokens_seen": 345598965, + "step": 16018, + "time_per_iteration": 2.60267972946167 + }, + { + "auxiliary_loss_clip": 0.01061435, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.03517842, + "balance_loss_mlp": 1.02311563, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 2.073459052462633, + "language_loss": 0.79427183, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.81523514, + "num_input_tokens_seen": 345617945, + "step": 16019, + "time_per_iteration": 2.5372681617736816 + }, + { + "auxiliary_loss_clip": 0.01065265, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.0317812, + "balance_loss_mlp": 1.0211463, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 1.4303457450508714, + "language_loss": 0.71911711, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.74008882, + "num_input_tokens_seen": 345637920, + "step": 16020, + "time_per_iteration": 2.5722877979278564 + }, + { + "auxiliary_loss_clip": 0.01079977, + "auxiliary_loss_mlp": 0.01025032, + "balance_loss_clip": 1.03495073, + "balance_loss_mlp": 1.01406467, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 1.628393952796008, + "language_loss": 0.76930571, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.7903558, + "num_input_tokens_seen": 345656195, + "step": 16021, + "time_per_iteration": 2.5457935333251953 + }, + { + "auxiliary_loss_clip": 0.0107003, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.03349519, + "balance_loss_mlp": 1.0187068, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 2.0914329531886118, + "language_loss": 0.65242612, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.67346025, + "num_input_tokens_seen": 345676700, + "step": 16022, + "time_per_iteration": 2.5744314193725586 + }, + { + "auxiliary_loss_clip": 0.01075955, + "auxiliary_loss_mlp": 0.01035585, + "balance_loss_clip": 1.03203225, + "balance_loss_mlp": 1.02371228, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 1.821145908043204, + "language_loss": 0.73258787, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.75370324, + "num_input_tokens_seen": 345696725, + "step": 16023, + "time_per_iteration": 2.5300230979919434 + }, + { + "auxiliary_loss_clip": 0.01087407, + "auxiliary_loss_mlp": 0.01030457, + "balance_loss_clip": 1.03286123, + "balance_loss_mlp": 1.01773155, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 1.9613630380762088, + "language_loss": 0.81598991, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.83716857, + "num_input_tokens_seen": 345716245, + "step": 16024, + "time_per_iteration": 2.507746696472168 + }, + { + "auxiliary_loss_clip": 0.0109613, + "auxiliary_loss_mlp": 0.01034797, + "balance_loss_clip": 1.03464961, + "balance_loss_mlp": 1.0223639, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 1.4016839376822616, + "language_loss": 0.8145498, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83585906, + "num_input_tokens_seen": 345739060, + "step": 16025, + "time_per_iteration": 2.5009255409240723 + }, + { + "auxiliary_loss_clip": 0.010952, + "auxiliary_loss_mlp": 0.01028978, + "balance_loss_clip": 1.03439975, + "balance_loss_mlp": 1.01700997, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 2.0098562156821296, + "language_loss": 0.76277959, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.78402138, + "num_input_tokens_seen": 345758325, + "step": 16026, + "time_per_iteration": 2.514521360397339 + }, + { + "auxiliary_loss_clip": 0.01069004, + "auxiliary_loss_mlp": 0.00784612, + "balance_loss_clip": 1.03211665, + "balance_loss_mlp": 1.0074358, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 1.5768420732893675, + "language_loss": 0.63350654, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.65204269, + "num_input_tokens_seen": 345778530, + "step": 16027, + "time_per_iteration": 2.5627801418304443 + }, + { + "auxiliary_loss_clip": 0.01094791, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.03559721, + "balance_loss_mlp": 1.01517117, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 1.859214015578949, + "language_loss": 0.869506, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.89072573, + "num_input_tokens_seen": 345796535, + "step": 16028, + "time_per_iteration": 2.490760564804077 + }, + { + "auxiliary_loss_clip": 0.01001298, + "auxiliary_loss_mlp": 0.01006773, + "balance_loss_clip": 1.02766299, + "balance_loss_mlp": 1.00533056, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 0.6860139976300149, + "language_loss": 0.531847, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.55192769, + "num_input_tokens_seen": 345859700, + "step": 16029, + "time_per_iteration": 3.1740922927856445 + }, + { + "auxiliary_loss_clip": 0.01104919, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.03510535, + "balance_loss_mlp": 1.01799488, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 2.2162493607025335, + "language_loss": 0.73984313, + "learning_rate": 1.372666546129797e-08, + "loss": 0.76119298, + "num_input_tokens_seen": 345878760, + "step": 16030, + "time_per_iteration": 5.244468450546265 + }, + { + "auxiliary_loss_clip": 0.01079048, + "auxiliary_loss_mlp": 0.01034505, + "balance_loss_clip": 1.03460979, + "balance_loss_mlp": 1.02269697, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 2.214385269747224, + "language_loss": 0.66055197, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.68168747, + "num_input_tokens_seen": 345900445, + "step": 16031, + "time_per_iteration": 3.976991653442383 + }, + { + "auxiliary_loss_clip": 0.01017804, + "auxiliary_loss_mlp": 0.00760268, + "balance_loss_clip": 1.00565612, + "balance_loss_mlp": 0.99955767, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8373816030118958, + "language_loss": 0.60722744, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62500823, + "num_input_tokens_seen": 345961020, + "step": 16032, + "time_per_iteration": 3.1391727924346924 + }, + { + "auxiliary_loss_clip": 0.0108329, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.03397071, + "balance_loss_mlp": 1.01962185, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 1.7941771309154309, + "language_loss": 0.66085958, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68199265, + "num_input_tokens_seen": 345980210, + "step": 16033, + "time_per_iteration": 2.5555052757263184 + }, + { + "auxiliary_loss_clip": 0.01047018, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.03411794, + "balance_loss_mlp": 1.01719069, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 1.7592433679242558, + "language_loss": 0.65426296, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.67502236, + "num_input_tokens_seen": 345998280, + "step": 16034, + "time_per_iteration": 2.609494686126709 + }, + { + "auxiliary_loss_clip": 0.01061712, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.0333724, + "balance_loss_mlp": 1.01664948, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 2.105545482063162, + "language_loss": 0.73842007, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.75932914, + "num_input_tokens_seen": 346015545, + "step": 16035, + "time_per_iteration": 2.608625650405884 + }, + { + "auxiliary_loss_clip": 0.01104809, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.03692293, + "balance_loss_mlp": 1.01731372, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 2.75658569667586, + "language_loss": 0.82099104, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.84233105, + "num_input_tokens_seen": 346034055, + "step": 16036, + "time_per_iteration": 2.4736435413360596 + }, + { + "auxiliary_loss_clip": 0.0108276, + "auxiliary_loss_mlp": 0.01029271, + "balance_loss_clip": 1.03478026, + "balance_loss_mlp": 1.01711774, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 1.7135482608023438, + "language_loss": 0.69832861, + "learning_rate": 1.340965177371789e-08, + "loss": 0.71944892, + "num_input_tokens_seen": 346054130, + "step": 16037, + "time_per_iteration": 2.6174240112304688 + }, + { + "auxiliary_loss_clip": 0.01102541, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.03352141, + "balance_loss_mlp": 1.01644683, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 1.719510332759985, + "language_loss": 0.63041854, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65172637, + "num_input_tokens_seen": 346072990, + "step": 16038, + "time_per_iteration": 2.503009557723999 + }, + { + "auxiliary_loss_clip": 0.01069308, + "auxiliary_loss_mlp": 0.00786582, + "balance_loss_clip": 1.03230166, + "balance_loss_mlp": 1.00921392, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 1.6365645649626694, + "language_loss": 0.70971847, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.72827733, + "num_input_tokens_seen": 346093745, + "step": 16039, + "time_per_iteration": 3.98201847076416 + }, + { + "auxiliary_loss_clip": 0.01060354, + "auxiliary_loss_mlp": 0.01029412, + "balance_loss_clip": 1.03388357, + "balance_loss_mlp": 1.01689517, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 2.1578932189031725, + "language_loss": 0.72850174, + "learning_rate": 1.327491870605657e-08, + "loss": 0.74939942, + "num_input_tokens_seen": 346110115, + "step": 16040, + "time_per_iteration": 2.547440528869629 + }, + { + "auxiliary_loss_clip": 0.0109412, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.03404701, + "balance_loss_mlp": 1.01905107, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 2.4574770166315267, + "language_loss": 0.72739863, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.74865699, + "num_input_tokens_seen": 346127165, + "step": 16041, + "time_per_iteration": 2.471254825592041 + }, + { + "auxiliary_loss_clip": 0.01076602, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.03291535, + "balance_loss_mlp": 1.01853085, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 1.8901939294794061, + "language_loss": 0.72061896, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.74168372, + "num_input_tokens_seen": 346145950, + "step": 16042, + "time_per_iteration": 2.48492169380188 + }, + { + "auxiliary_loss_clip": 0.0106724, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.03513896, + "balance_loss_mlp": 1.02062249, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 1.8924139978818204, + "language_loss": 0.81031954, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83131742, + "num_input_tokens_seen": 346165005, + "step": 16043, + "time_per_iteration": 2.5964367389678955 + }, + { + "auxiliary_loss_clip": 0.01075135, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.03413963, + "balance_loss_mlp": 1.01780403, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 1.536026390570106, + "language_loss": 0.71552771, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.73657024, + "num_input_tokens_seen": 346185095, + "step": 16044, + "time_per_iteration": 2.523463726043701 + }, + { + "auxiliary_loss_clip": 0.01077659, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.03226638, + "balance_loss_mlp": 1.0169307, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 2.0645774137715485, + "language_loss": 0.70167834, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.72274578, + "num_input_tokens_seen": 346202580, + "step": 16045, + "time_per_iteration": 2.5066866874694824 + }, + { + "auxiliary_loss_clip": 0.01043618, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.03361154, + "balance_loss_mlp": 1.01861858, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 1.682924991280473, + "language_loss": 0.75218034, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.77293438, + "num_input_tokens_seen": 346219395, + "step": 16046, + "time_per_iteration": 2.566162347793579 + }, + { + "auxiliary_loss_clip": 0.01096515, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.03469706, + "balance_loss_mlp": 1.02279246, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 1.5159700532315727, + "language_loss": 0.623052, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64437699, + "num_input_tokens_seen": 346239715, + "step": 16047, + "time_per_iteration": 2.519449234008789 + }, + { + "auxiliary_loss_clip": 0.01082922, + "auxiliary_loss_mlp": 0.0103131, + "balance_loss_clip": 1.03742766, + "balance_loss_mlp": 1.0194428, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 1.895462187743246, + "language_loss": 0.69404483, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71518713, + "num_input_tokens_seen": 346258500, + "step": 16048, + "time_per_iteration": 2.5210745334625244 + }, + { + "auxiliary_loss_clip": 0.01094116, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.03453541, + "balance_loss_mlp": 1.01489019, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 2.05393091038773, + "language_loss": 0.63770831, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.65892756, + "num_input_tokens_seen": 346279110, + "step": 16049, + "time_per_iteration": 2.577864408493042 + }, + { + "auxiliary_loss_clip": 0.01094665, + "auxiliary_loss_mlp": 0.01027019, + "balance_loss_clip": 1.03593612, + "balance_loss_mlp": 1.01499057, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 1.7497546467349627, + "language_loss": 0.70840919, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.72962606, + "num_input_tokens_seen": 346297860, + "step": 16050, + "time_per_iteration": 2.467543840408325 + }, + { + "auxiliary_loss_clip": 0.01093608, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.03193521, + "balance_loss_mlp": 1.01920199, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 2.7194820931117425, + "language_loss": 0.69797409, + "learning_rate": 1.278669873970606e-08, + "loss": 0.71923745, + "num_input_tokens_seen": 346319860, + "step": 16051, + "time_per_iteration": 2.680999279022217 + }, + { + "auxiliary_loss_clip": 0.01018274, + "auxiliary_loss_mlp": 0.0100625, + "balance_loss_clip": 1.00593352, + "balance_loss_mlp": 1.0051173, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 0.8442510368605294, + "language_loss": 0.59144664, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61169183, + "num_input_tokens_seen": 346379025, + "step": 16052, + "time_per_iteration": 3.124314308166504 + }, + { + "auxiliary_loss_clip": 0.01098806, + "auxiliary_loss_mlp": 0.01024842, + "balance_loss_clip": 1.03285718, + "balance_loss_mlp": 1.01314783, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 1.7648483056059674, + "language_loss": 0.74730718, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76854372, + "num_input_tokens_seen": 346402250, + "step": 16053, + "time_per_iteration": 2.5330936908721924 + }, + { + "auxiliary_loss_clip": 0.01079424, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.03395748, + "balance_loss_mlp": 1.02102435, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 2.513919907141199, + "language_loss": 0.68465644, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70578307, + "num_input_tokens_seen": 346419555, + "step": 16054, + "time_per_iteration": 2.5026016235351562 + }, + { + "auxiliary_loss_clip": 0.0108577, + "auxiliary_loss_mlp": 0.00781236, + "balance_loss_clip": 1.03519559, + "balance_loss_mlp": 1.00658727, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1.5696573250001915, + "language_loss": 0.62118304, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.63985312, + "num_input_tokens_seen": 346441245, + "step": 16055, + "time_per_iteration": 2.5683608055114746 + }, + { + "auxiliary_loss_clip": 0.0106345, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.03442931, + "balance_loss_mlp": 1.01902914, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 1.898937599243159, + "language_loss": 0.76818621, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.78912878, + "num_input_tokens_seen": 346460065, + "step": 16056, + "time_per_iteration": 2.632518768310547 + }, + { + "auxiliary_loss_clip": 0.01077242, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.03546214, + "balance_loss_mlp": 1.02002561, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 1.5492937143879804, + "language_loss": 0.71686149, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73795193, + "num_input_tokens_seen": 346478005, + "step": 16057, + "time_per_iteration": 2.5222768783569336 + }, + { + "auxiliary_loss_clip": 0.01101442, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.03462291, + "balance_loss_mlp": 1.01983881, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 1.7947857022493083, + "language_loss": 0.71822476, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.73955238, + "num_input_tokens_seen": 346497575, + "step": 16058, + "time_per_iteration": 2.492741107940674 + }, + { + "auxiliary_loss_clip": 0.01089611, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.03328574, + "balance_loss_mlp": 1.021505, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 1.4895501815590615, + "language_loss": 0.74173975, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76296544, + "num_input_tokens_seen": 346520000, + "step": 16059, + "time_per_iteration": 2.528712034225464 + }, + { + "auxiliary_loss_clip": 0.01084826, + "auxiliary_loss_mlp": 0.01028778, + "balance_loss_clip": 1.03440154, + "balance_loss_mlp": 1.01728654, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 1.8434447996818184, + "language_loss": 0.73492503, + "learning_rate": 1.239402791721722e-08, + "loss": 0.75606114, + "num_input_tokens_seen": 346541605, + "step": 16060, + "time_per_iteration": 2.6995675563812256 + }, + { + "auxiliary_loss_clip": 0.0107825, + "auxiliary_loss_mlp": 0.01032354, + "balance_loss_clip": 1.0339483, + "balance_loss_mlp": 1.02132678, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 1.6098017139710945, + "language_loss": 0.76768136, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.78878736, + "num_input_tokens_seen": 346560955, + "step": 16061, + "time_per_iteration": 2.5559070110321045 + }, + { + "auxiliary_loss_clip": 0.01009471, + "auxiliary_loss_mlp": 0.01001463, + "balance_loss_clip": 1.00665975, + "balance_loss_mlp": 1.00026476, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.7765228807830403, + "language_loss": 0.64146769, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66157705, + "num_input_tokens_seen": 346621615, + "step": 16062, + "time_per_iteration": 3.1695029735565186 + }, + { + "auxiliary_loss_clip": 0.01052235, + "auxiliary_loss_mlp": 0.01026485, + "balance_loss_clip": 1.0310998, + "balance_loss_mlp": 1.01564896, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 2.18018496882028, + "language_loss": 0.93415219, + "learning_rate": 1.226449424760867e-08, + "loss": 0.95493937, + "num_input_tokens_seen": 346637460, + "step": 16063, + "time_per_iteration": 2.551731586456299 + }, + { + "auxiliary_loss_clip": 0.01094486, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.03601265, + "balance_loss_mlp": 1.01969182, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 1.765581965309665, + "language_loss": 0.82041323, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84167004, + "num_input_tokens_seen": 346655625, + "step": 16064, + "time_per_iteration": 2.4939305782318115 + }, + { + "auxiliary_loss_clip": 0.01091032, + "auxiliary_loss_mlp": 0.00786212, + "balance_loss_clip": 1.03674006, + "balance_loss_mlp": 1.01503956, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 1.5564173287897225, + "language_loss": 0.83844769, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.85722017, + "num_input_tokens_seen": 346675220, + "step": 16065, + "time_per_iteration": 2.5282247066497803 + }, + { + "auxiliary_loss_clip": 0.01078713, + "auxiliary_loss_mlp": 0.01028116, + "balance_loss_clip": 1.03321064, + "balance_loss_mlp": 1.01612997, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 1.7385829003824058, + "language_loss": 0.67467868, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.69574702, + "num_input_tokens_seen": 346694710, + "step": 16066, + "time_per_iteration": 2.527076005935669 + }, + { + "auxiliary_loss_clip": 0.01101187, + "auxiliary_loss_mlp": 0.01025275, + "balance_loss_clip": 1.03304851, + "balance_loss_mlp": 1.01374722, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 2.2684463574282057, + "language_loss": 0.82199621, + "learning_rate": 1.209283794752558e-08, + "loss": 0.84326077, + "num_input_tokens_seen": 346712645, + "step": 16067, + "time_per_iteration": 2.4386308193206787 + }, + { + "auxiliary_loss_clip": 0.01080632, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.03383684, + "balance_loss_mlp": 1.01689863, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 1.713301006719681, + "language_loss": 0.69206065, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.71315551, + "num_input_tokens_seen": 346732375, + "step": 16068, + "time_per_iteration": 2.550292491912842 + }, + { + "auxiliary_loss_clip": 0.01081228, + "auxiliary_loss_mlp": 0.01030202, + "balance_loss_clip": 1.03177166, + "balance_loss_mlp": 1.01962829, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 1.859611012912049, + "language_loss": 0.67774498, + "learning_rate": 1.20074620808146e-08, + "loss": 0.69885933, + "num_input_tokens_seen": 346750430, + "step": 16069, + "time_per_iteration": 5.208314657211304 + }, + { + "auxiliary_loss_clip": 0.01082576, + "auxiliary_loss_mlp": 0.01026509, + "balance_loss_clip": 1.03598797, + "balance_loss_mlp": 1.0153985, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 1.8254223981597708, + "language_loss": 0.89094281, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.91203368, + "num_input_tokens_seen": 346768455, + "step": 16070, + "time_per_iteration": 4.080590724945068 + }, + { + "auxiliary_loss_clip": 0.01106703, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.03750825, + "balance_loss_mlp": 1.02238095, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 1.7723512383639426, + "language_loss": 0.76999915, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.7914176, + "num_input_tokens_seen": 346786530, + "step": 16071, + "time_per_iteration": 2.458963632583618 + }, + { + "auxiliary_loss_clip": 0.01077824, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.03190231, + "balance_loss_mlp": 1.02143502, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 1.7176188029169175, + "language_loss": 0.6567533, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.67787552, + "num_input_tokens_seen": 346804635, + "step": 16072, + "time_per_iteration": 2.526878833770752 + }, + { + "auxiliary_loss_clip": 0.01095983, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.03576386, + "balance_loss_mlp": 1.0204556, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 1.6342111261647885, + "language_loss": 0.7751075, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.79638743, + "num_input_tokens_seen": 346823070, + "step": 16073, + "time_per_iteration": 2.5119826793670654 + }, + { + "auxiliary_loss_clip": 0.01106046, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.03578663, + "balance_loss_mlp": 1.02127159, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 2.498468990656494, + "language_loss": 0.76165718, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.78305268, + "num_input_tokens_seen": 346841180, + "step": 16074, + "time_per_iteration": 2.4602203369140625 + }, + { + "auxiliary_loss_clip": 0.01081407, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.03600907, + "balance_loss_mlp": 1.01768339, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 1.5865302923346496, + "language_loss": 0.75713927, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.77825475, + "num_input_tokens_seen": 346864250, + "step": 16075, + "time_per_iteration": 2.570364236831665 + }, + { + "auxiliary_loss_clip": 0.01074208, + "auxiliary_loss_mlp": 0.01033899, + "balance_loss_clip": 1.03673363, + "balance_loss_mlp": 1.02181721, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 1.8581525250032593, + "language_loss": 0.78831637, + "learning_rate": 1.171102125547696e-08, + "loss": 0.80939746, + "num_input_tokens_seen": 346881955, + "step": 16076, + "time_per_iteration": 2.5619616508483887 + }, + { + "auxiliary_loss_clip": 0.01083739, + "auxiliary_loss_mlp": 0.01042056, + "balance_loss_clip": 1.03568125, + "balance_loss_mlp": 1.02878809, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 1.6466558394104436, + "language_loss": 0.72457671, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74583471, + "num_input_tokens_seen": 346900445, + "step": 16077, + "time_per_iteration": 2.494022846221924 + }, + { + "auxiliary_loss_clip": 0.01088996, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.03310359, + "balance_loss_mlp": 1.01968443, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 1.7502676683251068, + "language_loss": 0.59188616, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61309761, + "num_input_tokens_seen": 346920135, + "step": 16078, + "time_per_iteration": 3.925840377807617 + }, + { + "auxiliary_loss_clip": 0.01095923, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.03518128, + "balance_loss_mlp": 1.01879001, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 2.227878398678544, + "language_loss": 0.72026813, + "learning_rate": 1.158510609718899e-08, + "loss": 0.74154109, + "num_input_tokens_seen": 346940450, + "step": 16079, + "time_per_iteration": 2.5090653896331787 + }, + { + "auxiliary_loss_clip": 0.01089063, + "auxiliary_loss_mlp": 0.01028141, + "balance_loss_clip": 1.03457618, + "balance_loss_mlp": 1.01709008, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 1.5432721370010796, + "language_loss": 0.72123086, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74240291, + "num_input_tokens_seen": 346960935, + "step": 16080, + "time_per_iteration": 2.5058460235595703 + }, + { + "auxiliary_loss_clip": 0.01072015, + "auxiliary_loss_mlp": 0.0103413, + "balance_loss_clip": 1.03178668, + "balance_loss_mlp": 1.02113008, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 1.8930093890385007, + "language_loss": 0.73751402, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.7585755, + "num_input_tokens_seen": 346980100, + "step": 16081, + "time_per_iteration": 2.528735399246216 + }, + { + "auxiliary_loss_clip": 0.01075661, + "auxiliary_loss_mlp": 0.01027457, + "balance_loss_clip": 1.03154576, + "balance_loss_mlp": 1.01526785, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 1.5525961324106754, + "language_loss": 0.67288685, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69391799, + "num_input_tokens_seen": 347001250, + "step": 16082, + "time_per_iteration": 2.544196367263794 + }, + { + "auxiliary_loss_clip": 0.01060806, + "auxiliary_loss_mlp": 0.0103874, + "balance_loss_clip": 1.03050041, + "balance_loss_mlp": 1.02491164, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 1.4573655553865428, + "language_loss": 0.76788127, + "learning_rate": 1.141827483932789e-08, + "loss": 0.78887677, + "num_input_tokens_seen": 347022975, + "step": 16083, + "time_per_iteration": 2.622837543487549 + }, + { + "auxiliary_loss_clip": 0.01056757, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.03482676, + "balance_loss_mlp": 1.01989758, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 1.880244983347926, + "language_loss": 0.79129589, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.81218314, + "num_input_tokens_seen": 347038780, + "step": 16084, + "time_per_iteration": 2.597515344619751 + }, + { + "auxiliary_loss_clip": 0.01095364, + "auxiliary_loss_mlp": 0.01027757, + "balance_loss_clip": 1.03321862, + "balance_loss_mlp": 1.01534128, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 2.1670951539316823, + "language_loss": 0.6774087, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.69863987, + "num_input_tokens_seen": 347056705, + "step": 16085, + "time_per_iteration": 2.491541624069214 + }, + { + "auxiliary_loss_clip": 0.01084766, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.03584075, + "balance_loss_mlp": 1.01648712, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 2.044192717761249, + "language_loss": 0.68191355, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.70305699, + "num_input_tokens_seen": 347075710, + "step": 16086, + "time_per_iteration": 2.5439982414245605 + }, + { + "auxiliary_loss_clip": 0.01091243, + "auxiliary_loss_mlp": 0.01033445, + "balance_loss_clip": 1.03396475, + "balance_loss_mlp": 1.02104139, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 1.8083685554362006, + "language_loss": 0.78777748, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80902433, + "num_input_tokens_seen": 347092325, + "step": 16087, + "time_per_iteration": 2.477534294128418 + }, + { + "auxiliary_loss_clip": 0.01075284, + "auxiliary_loss_mlp": 0.01025387, + "balance_loss_clip": 1.03210139, + "balance_loss_mlp": 1.01347244, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 1.9096377103119597, + "language_loss": 0.71508497, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73609167, + "num_input_tokens_seen": 347110595, + "step": 16088, + "time_per_iteration": 2.502833604812622 + }, + { + "auxiliary_loss_clip": 0.01100099, + "auxiliary_loss_mlp": 0.00781938, + "balance_loss_clip": 1.03420639, + "balance_loss_mlp": 1.00877643, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 1.4028687045816384, + "language_loss": 0.7037077, + "learning_rate": 1.117029020040916e-08, + "loss": 0.7225281, + "num_input_tokens_seen": 347131625, + "step": 16089, + "time_per_iteration": 2.537194013595581 + }, + { + "auxiliary_loss_clip": 0.01105729, + "auxiliary_loss_mlp": 0.01029033, + "balance_loss_clip": 1.03545034, + "balance_loss_mlp": 1.01677799, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 2.5821732716342765, + "language_loss": 0.74447393, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.76582146, + "num_input_tokens_seen": 347147910, + "step": 16090, + "time_per_iteration": 2.428001642227173 + }, + { + "auxiliary_loss_clip": 0.01083776, + "auxiliary_loss_mlp": 0.01028104, + "balance_loss_clip": 1.03497374, + "balance_loss_mlp": 1.01598668, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 1.6175532865756246, + "language_loss": 0.68857104, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.7096898, + "num_input_tokens_seen": 347168805, + "step": 16091, + "time_per_iteration": 2.5714657306671143 + }, + { + "auxiliary_loss_clip": 0.01102338, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.03486133, + "balance_loss_mlp": 1.02004826, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 2.0330270074761576, + "language_loss": 0.77050167, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.79185343, + "num_input_tokens_seen": 347189455, + "step": 16092, + "time_per_iteration": 2.4676384925842285 + }, + { + "auxiliary_loss_clip": 0.01103695, + "auxiliary_loss_mlp": 0.01026801, + "balance_loss_clip": 1.03576231, + "balance_loss_mlp": 1.01564908, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 1.9108113806024958, + "language_loss": 0.76180547, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.78311044, + "num_input_tokens_seen": 347206030, + "step": 16093, + "time_per_iteration": 2.432854413986206 + }, + { + "auxiliary_loss_clip": 0.01079201, + "auxiliary_loss_mlp": 0.01025883, + "balance_loss_clip": 1.03667724, + "balance_loss_mlp": 1.01307952, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 1.4876912668177518, + "language_loss": 0.69158554, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71263635, + "num_input_tokens_seen": 347226250, + "step": 16094, + "time_per_iteration": 2.5492799282073975 + }, + { + "auxiliary_loss_clip": 0.01094161, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.03428984, + "balance_loss_mlp": 1.01786137, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 1.487542063985486, + "language_loss": 0.75994343, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.78117979, + "num_input_tokens_seen": 347247350, + "step": 16095, + "time_per_iteration": 2.5340163707733154 + }, + { + "auxiliary_loss_clip": 0.01108181, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.03694165, + "balance_loss_mlp": 1.02118826, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 1.7990288477045595, + "language_loss": 0.70108485, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.72250283, + "num_input_tokens_seen": 347266870, + "step": 16096, + "time_per_iteration": 2.4454047679901123 + }, + { + "auxiliary_loss_clip": 0.01083925, + "auxiliary_loss_mlp": 0.01027313, + "balance_loss_clip": 1.03501916, + "balance_loss_mlp": 1.01537967, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 1.5876220659709945, + "language_loss": 0.71811795, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.73923028, + "num_input_tokens_seen": 347290120, + "step": 16097, + "time_per_iteration": 2.7591240406036377 + }, + { + "auxiliary_loss_clip": 0.01101838, + "auxiliary_loss_mlp": 0.01033591, + "balance_loss_clip": 1.03381968, + "balance_loss_mlp": 1.02182543, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 1.5409112629551556, + "language_loss": 0.78229433, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80364859, + "num_input_tokens_seen": 347308785, + "step": 16098, + "time_per_iteration": 2.4671366214752197 + }, + { + "auxiliary_loss_clip": 0.01075352, + "auxiliary_loss_mlp": 0.01025917, + "balance_loss_clip": 1.03499877, + "balance_loss_mlp": 1.01458037, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 1.996999076325595, + "language_loss": 0.90983355, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.93084621, + "num_input_tokens_seen": 347326375, + "step": 16099, + "time_per_iteration": 2.505222797393799 + }, + { + "auxiliary_loss_clip": 0.01093641, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.03405046, + "balance_loss_mlp": 1.01644695, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 2.3015652498104084, + "language_loss": 0.66364199, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.68486726, + "num_input_tokens_seen": 347348250, + "step": 16100, + "time_per_iteration": 2.5802457332611084 + }, + { + "auxiliary_loss_clip": 0.01063083, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.03618085, + "balance_loss_mlp": 1.02024531, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 1.5306921931645212, + "language_loss": 0.73376226, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.75471663, + "num_input_tokens_seen": 347367400, + "step": 16101, + "time_per_iteration": 2.571345329284668 + }, + { + "auxiliary_loss_clip": 0.0108038, + "auxiliary_loss_mlp": 0.01028802, + "balance_loss_clip": 1.0343256, + "balance_loss_mlp": 1.01648211, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 1.5046455827228338, + "language_loss": 0.73306119, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.75415301, + "num_input_tokens_seen": 347387600, + "step": 16102, + "time_per_iteration": 2.550658702850342 + }, + { + "auxiliary_loss_clip": 0.01075967, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.03679836, + "balance_loss_mlp": 1.01972198, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 2.509475571476384, + "language_loss": 0.77273786, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.79382086, + "num_input_tokens_seen": 347406915, + "step": 16103, + "time_per_iteration": 2.6016759872436523 + }, + { + "auxiliary_loss_clip": 0.01082192, + "auxiliary_loss_mlp": 0.01028391, + "balance_loss_clip": 1.03306973, + "balance_loss_mlp": 1.01656568, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 1.5647060830973825, + "language_loss": 0.80302584, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82413173, + "num_input_tokens_seen": 347425140, + "step": 16104, + "time_per_iteration": 2.5205013751983643 + }, + { + "auxiliary_loss_clip": 0.01083005, + "auxiliary_loss_mlp": 0.01033608, + "balance_loss_clip": 1.03033233, + "balance_loss_mlp": 1.02246785, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 1.4604262304179652, + "language_loss": 0.78013909, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.80130517, + "num_input_tokens_seen": 347446350, + "step": 16105, + "time_per_iteration": 2.5648958683013916 + }, + { + "auxiliary_loss_clip": 0.01005458, + "auxiliary_loss_mlp": 0.01001602, + "balance_loss_clip": 1.00414777, + "balance_loss_mlp": 1.00055873, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.824730152804101, + "language_loss": 0.56717724, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58724785, + "num_input_tokens_seen": 347510135, + "step": 16106, + "time_per_iteration": 4.574390649795532 + }, + { + "auxiliary_loss_clip": 0.01007769, + "auxiliary_loss_mlp": 0.00999815, + "balance_loss_clip": 1.01307607, + "balance_loss_mlp": 0.99854529, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.8833024070502788, + "language_loss": 0.61547947, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63555527, + "num_input_tokens_seen": 347562505, + "step": 16107, + "time_per_iteration": 4.394185781478882 + }, + { + "auxiliary_loss_clip": 0.01094405, + "auxiliary_loss_mlp": 0.0103922, + "balance_loss_clip": 1.0352813, + "balance_loss_mlp": 1.02575493, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 2.0634952984997854, + "language_loss": 0.73733836, + "learning_rate": 1.040291854638875e-08, + "loss": 0.75867462, + "num_input_tokens_seen": 347579150, + "step": 16108, + "time_per_iteration": 3.859767198562622 + }, + { + "auxiliary_loss_clip": 0.01089296, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.03418529, + "balance_loss_mlp": 1.01418734, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 2.0617974433445037, + "language_loss": 0.56677091, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.58793497, + "num_input_tokens_seen": 347596705, + "step": 16109, + "time_per_iteration": 2.4933395385742188 + }, + { + "auxiliary_loss_clip": 0.01018755, + "auxiliary_loss_mlp": 0.01001597, + "balance_loss_clip": 1.00522017, + "balance_loss_mlp": 1.00053632, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.6694668363503478, + "language_loss": 0.54238904, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56259257, + "num_input_tokens_seen": 347661870, + "step": 16110, + "time_per_iteration": 3.089578866958618 + }, + { + "auxiliary_loss_clip": 0.01037182, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.03404677, + "balance_loss_mlp": 1.02389908, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 1.4068991479946813, + "language_loss": 0.62191427, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64266413, + "num_input_tokens_seen": 347684295, + "step": 16111, + "time_per_iteration": 2.784716844558716 + }, + { + "auxiliary_loss_clip": 0.01078636, + "auxiliary_loss_mlp": 0.01026734, + "balance_loss_clip": 1.03250206, + "balance_loss_mlp": 1.01640463, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 4.574918490940266, + "language_loss": 0.74939787, + "learning_rate": 1.024483677309118e-08, + "loss": 0.77045166, + "num_input_tokens_seen": 347702585, + "step": 16112, + "time_per_iteration": 2.5633316040039062 + }, + { + "auxiliary_loss_clip": 0.01091008, + "auxiliary_loss_mlp": 0.01026575, + "balance_loss_clip": 1.03383517, + "balance_loss_mlp": 1.01572144, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 1.8297485952623755, + "language_loss": 0.66721696, + "learning_rate": 1.020550495531558e-08, + "loss": 0.68839276, + "num_input_tokens_seen": 347721810, + "step": 16113, + "time_per_iteration": 2.5308640003204346 + }, + { + "auxiliary_loss_clip": 0.01016971, + "auxiliary_loss_mlp": 0.0100547, + "balance_loss_clip": 1.00598454, + "balance_loss_mlp": 1.00447488, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.6910904593141285, + "language_loss": 0.5657571, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.58598149, + "num_input_tokens_seen": 347782330, + "step": 16114, + "time_per_iteration": 3.094142198562622 + }, + { + "auxiliary_loss_clip": 0.0107755, + "auxiliary_loss_mlp": 0.01033941, + "balance_loss_clip": 1.03446698, + "balance_loss_mlp": 1.02178729, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 1.9512016837213002, + "language_loss": 0.8276183, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.84873319, + "num_input_tokens_seen": 347794835, + "step": 16115, + "time_per_iteration": 2.4400620460510254 + }, + { + "auxiliary_loss_clip": 0.01087228, + "auxiliary_loss_mlp": 0.0102803, + "balance_loss_clip": 1.03350282, + "balance_loss_mlp": 1.01685989, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 1.6196973853305432, + "language_loss": 0.72330427, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74445677, + "num_input_tokens_seen": 347814320, + "step": 16116, + "time_per_iteration": 3.884894609451294 + }, + { + "auxiliary_loss_clip": 0.01063384, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.03339529, + "balance_loss_mlp": 1.02116454, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 2.3882747809388514, + "language_loss": 0.75519651, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.7761693, + "num_input_tokens_seen": 347832125, + "step": 16117, + "time_per_iteration": 2.523899555206299 + }, + { + "auxiliary_loss_clip": 0.01103992, + "auxiliary_loss_mlp": 0.01029363, + "balance_loss_clip": 1.03355479, + "balance_loss_mlp": 1.01707888, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 2.0496819304967127, + "language_loss": 0.77539992, + "learning_rate": 1.000997769426548e-08, + "loss": 0.7967335, + "num_input_tokens_seen": 347850765, + "step": 16118, + "time_per_iteration": 2.4586479663848877 + }, + { + "auxiliary_loss_clip": 0.01082457, + "auxiliary_loss_mlp": 0.00785377, + "balance_loss_clip": 1.03543615, + "balance_loss_mlp": 1.01159, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 1.766593161299943, + "language_loss": 0.78108001, + "learning_rate": 9.971098618001272e-09, + "loss": 0.79975832, + "num_input_tokens_seen": 347870125, + "step": 16119, + "time_per_iteration": 2.5324463844299316 + }, + { + "auxiliary_loss_clip": 0.01053338, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.03143072, + "balance_loss_mlp": 1.02253282, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 1.3664166631291177, + "language_loss": 0.7555747, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77645493, + "num_input_tokens_seen": 347890615, + "step": 16120, + "time_per_iteration": 2.6001393795013428 + }, + { + "auxiliary_loss_clip": 0.01091952, + "auxiliary_loss_mlp": 0.01027548, + "balance_loss_clip": 1.03387344, + "balance_loss_mlp": 1.01603293, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 1.8289097616934527, + "language_loss": 0.69644076, + "learning_rate": 9.89356685323095e-09, + "loss": 0.71763575, + "num_input_tokens_seen": 347908685, + "step": 16121, + "time_per_iteration": 2.487438440322876 + }, + { + "auxiliary_loss_clip": 0.01091098, + "auxiliary_loss_mlp": 0.01028379, + "balance_loss_clip": 1.033849, + "balance_loss_mlp": 1.01672029, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 1.7606472749833983, + "language_loss": 0.69406259, + "learning_rate": 9.854914167664486e-09, + "loss": 0.71525741, + "num_input_tokens_seen": 347926385, + "step": 16122, + "time_per_iteration": 2.535815715789795 + }, + { + "auxiliary_loss_clip": 0.01064075, + "auxiliary_loss_mlp": 0.01031973, + "balance_loss_clip": 1.03196049, + "balance_loss_mlp": 1.01973009, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 1.8735126831177165, + "language_loss": 0.75870919, + "learning_rate": 9.81633694859907e-09, + "loss": 0.77966964, + "num_input_tokens_seen": 347945290, + "step": 16123, + "time_per_iteration": 2.5234906673431396 + }, + { + "auxiliary_loss_clip": 0.01067494, + "auxiliary_loss_mlp": 0.01036239, + "balance_loss_clip": 1.0328424, + "balance_loss_mlp": 1.0224582, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 1.637716870250452, + "language_loss": 0.74511194, + "learning_rate": 9.777835197497753e-09, + "loss": 0.76614922, + "num_input_tokens_seen": 347966330, + "step": 16124, + "time_per_iteration": 2.583019971847534 + }, + { + "auxiliary_loss_clip": 0.01092359, + "auxiliary_loss_mlp": 0.01033793, + "balance_loss_clip": 1.03405023, + "balance_loss_mlp": 1.02225399, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 1.9753030356668775, + "language_loss": 0.74582863, + "learning_rate": 9.739408915820258e-09, + "loss": 0.7670902, + "num_input_tokens_seen": 347982590, + "step": 16125, + "time_per_iteration": 2.5477070808410645 + }, + { + "auxiliary_loss_clip": 0.01017399, + "auxiliary_loss_mlp": 0.01001188, + "balance_loss_clip": 1.00504446, + "balance_loss_mlp": 1.00016844, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.8634075605124992, + "language_loss": 0.61491394, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63509983, + "num_input_tokens_seen": 348043310, + "step": 16126, + "time_per_iteration": 3.0799570083618164 + }, + { + "auxiliary_loss_clip": 0.01091431, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.03576374, + "balance_loss_mlp": 1.02287936, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 1.6327366409593627, + "language_loss": 0.74959648, + "learning_rate": 9.662782766562738e-09, + "loss": 0.77085668, + "num_input_tokens_seen": 348062200, + "step": 16127, + "time_per_iteration": 2.561328411102295 + }, + { + "auxiliary_loss_clip": 0.01058727, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.03170729, + "balance_loss_mlp": 1.01827812, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 1.906666294349261, + "language_loss": 0.69106275, + "learning_rate": 9.62458290188839e-09, + "loss": 0.71195364, + "num_input_tokens_seen": 348080685, + "step": 16128, + "time_per_iteration": 2.599687099456787 + }, + { + "auxiliary_loss_clip": 0.01071269, + "auxiliary_loss_mlp": 0.01033741, + "balance_loss_clip": 1.03523993, + "balance_loss_mlp": 1.02199852, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 1.5504658814744137, + "language_loss": 0.65615124, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67720133, + "num_input_tokens_seen": 348102500, + "step": 16129, + "time_per_iteration": 2.717768669128418 + }, + { + "auxiliary_loss_clip": 0.01071426, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.03559411, + "balance_loss_mlp": 1.01699615, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 1.9274633359266975, + "language_loss": 0.63070703, + "learning_rate": 9.548409599691166e-09, + "loss": 0.65171319, + "num_input_tokens_seen": 348122515, + "step": 16130, + "time_per_iteration": 2.609807252883911 + }, + { + "auxiliary_loss_clip": 0.0109453, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.03406513, + "balance_loss_mlp": 1.01567721, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 2.2865511489193255, + "language_loss": 0.69976842, + "learning_rate": 9.510436165056867e-09, + "loss": 0.72099102, + "num_input_tokens_seen": 348138775, + "step": 16131, + "time_per_iteration": 2.4826691150665283 + }, + { + "auxiliary_loss_clip": 0.0110508, + "auxiliary_loss_mlp": 0.00783307, + "balance_loss_clip": 1.03488541, + "balance_loss_mlp": 1.01033306, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 2.165814589916953, + "language_loss": 0.76258737, + "learning_rate": 9.472538209986058e-09, + "loss": 0.78147125, + "num_input_tokens_seen": 348157115, + "step": 16132, + "time_per_iteration": 2.5020503997802734 + }, + { + "auxiliary_loss_clip": 0.01072534, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.0370028, + "balance_loss_mlp": 1.02173638, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 2.27552743023492, + "language_loss": 0.78936398, + "learning_rate": 9.434715735916477e-09, + "loss": 0.81043106, + "num_input_tokens_seen": 348173035, + "step": 16133, + "time_per_iteration": 2.583000659942627 + }, + { + "auxiliary_loss_clip": 0.01072179, + "auxiliary_loss_mlp": 0.01026963, + "balance_loss_clip": 1.03360105, + "balance_loss_mlp": 1.01619279, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 1.6088870463495266, + "language_loss": 0.64771187, + "learning_rate": 9.396968744281863e-09, + "loss": 0.66870332, + "num_input_tokens_seen": 348192960, + "step": 16134, + "time_per_iteration": 2.524009943008423 + }, + { + "auxiliary_loss_clip": 0.01079317, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.03209484, + "balance_loss_mlp": 1.01655269, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 2.3547408155569247, + "language_loss": 0.8086096, + "learning_rate": 9.359297236513519e-09, + "loss": 0.82969415, + "num_input_tokens_seen": 348212805, + "step": 16135, + "time_per_iteration": 2.5438811779022217 + }, + { + "auxiliary_loss_clip": 0.01094379, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.03425205, + "balance_loss_mlp": 1.01710916, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 6.1055864995245415, + "language_loss": 0.73194706, + "learning_rate": 9.321701214040079e-09, + "loss": 0.75318801, + "num_input_tokens_seen": 348232900, + "step": 16136, + "time_per_iteration": 2.515380859375 + }, + { + "auxiliary_loss_clip": 0.01102333, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.03522062, + "balance_loss_mlp": 1.02063072, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 1.4442914528739381, + "language_loss": 0.76036823, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78170806, + "num_input_tokens_seen": 348253065, + "step": 16137, + "time_per_iteration": 2.486067533493042 + }, + { + "auxiliary_loss_clip": 0.00996909, + "auxiliary_loss_mlp": 0.01001033, + "balance_loss_clip": 1.01989293, + "balance_loss_mlp": 0.99997199, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 0.7674504019640235, + "language_loss": 0.54930681, + "learning_rate": 9.246735630678015e-09, + "loss": 0.56928623, + "num_input_tokens_seen": 348316075, + "step": 16138, + "time_per_iteration": 3.323807954788208 + }, + { + "auxiliary_loss_clip": 0.01082512, + "auxiliary_loss_mlp": 0.01028423, + "balance_loss_clip": 1.03399634, + "balance_loss_mlp": 1.01656795, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 3.2336141196843453, + "language_loss": 0.70778835, + "learning_rate": 9.209366072632007e-09, + "loss": 0.72889769, + "num_input_tokens_seen": 348337605, + "step": 16139, + "time_per_iteration": 2.7408053874969482 + }, + { + "auxiliary_loss_clip": 0.01095776, + "auxiliary_loss_mlp": 0.01027594, + "balance_loss_clip": 1.03701401, + "balance_loss_mlp": 1.01530981, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 1.4309894534234784, + "language_loss": 0.72393018, + "learning_rate": 9.172072005566134e-09, + "loss": 0.74516392, + "num_input_tokens_seen": 348359430, + "step": 16140, + "time_per_iteration": 2.5062978267669678 + }, + { + "auxiliary_loss_clip": 0.01098051, + "auxiliary_loss_mlp": 0.00785711, + "balance_loss_clip": 1.03694677, + "balance_loss_mlp": 1.01065135, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 2.252397564136839, + "language_loss": 0.68622077, + "learning_rate": 9.13485343089504e-09, + "loss": 0.70505834, + "num_input_tokens_seen": 348377890, + "step": 16141, + "time_per_iteration": 2.4686646461486816 + }, + { + "auxiliary_loss_clip": 0.01087463, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.03241086, + "balance_loss_mlp": 1.01806879, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 2.025160035014419, + "language_loss": 0.68337595, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70454836, + "num_input_tokens_seen": 348396550, + "step": 16142, + "time_per_iteration": 2.497830629348755 + }, + { + "auxiliary_loss_clip": 0.010476, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.03198922, + "balance_loss_mlp": 1.01603961, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 1.854384377186835, + "language_loss": 0.5587483, + "learning_rate": 9.060642764378457e-09, + "loss": 0.57950604, + "num_input_tokens_seen": 348417120, + "step": 16143, + "time_per_iteration": 2.632920742034912 + }, + { + "auxiliary_loss_clip": 0.01093628, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.03554368, + "balance_loss_mlp": 1.02079964, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 2.3077839461810212, + "language_loss": 0.6776967, + "learning_rate": 9.023650675347382e-09, + "loss": 0.69895542, + "num_input_tokens_seen": 348437750, + "step": 16144, + "time_per_iteration": 2.569767475128174 + }, + { + "auxiliary_loss_clip": 0.01093099, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.03606343, + "balance_loss_mlp": 1.02650857, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 1.6796592398635923, + "language_loss": 0.72247493, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74378455, + "num_input_tokens_seen": 348460935, + "step": 16145, + "time_per_iteration": 4.1486687660217285 + }, + { + "auxiliary_loss_clip": 0.01080958, + "auxiliary_loss_mlp": 0.01027032, + "balance_loss_clip": 1.03296626, + "balance_loss_mlp": 1.0146637, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 3.0932082202534392, + "language_loss": 0.81105614, + "learning_rate": 8.949892992753395e-09, + "loss": 0.83213603, + "num_input_tokens_seen": 348474480, + "step": 16146, + "time_per_iteration": 3.908280611038208 + }, + { + "auxiliary_loss_clip": 0.00998243, + "auxiliary_loss_mlp": 0.01000253, + "balance_loss_clip": 1.00960422, + "balance_loss_mlp": 0.99916804, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 0.8238243437000202, + "language_loss": 0.54565907, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56564403, + "num_input_tokens_seen": 348541220, + "step": 16147, + "time_per_iteration": 4.629950284957886 + }, + { + "auxiliary_loss_clip": 0.01069304, + "auxiliary_loss_mlp": 0.00785814, + "balance_loss_clip": 1.03206158, + "balance_loss_mlp": 1.01149821, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 3.1925548701959103, + "language_loss": 0.6117605, + "learning_rate": 8.876437313434682e-09, + "loss": 0.63031173, + "num_input_tokens_seen": 348559230, + "step": 16148, + "time_per_iteration": 2.6058900356292725 + }, + { + "auxiliary_loss_clip": 0.01066153, + "auxiliary_loss_mlp": 0.01032963, + "balance_loss_clip": 1.03383422, + "balance_loss_mlp": 1.02163851, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 1.7301271068643318, + "language_loss": 0.73695552, + "learning_rate": 8.839822728487155e-09, + "loss": 0.75794667, + "num_input_tokens_seen": 348577850, + "step": 16149, + "time_per_iteration": 2.558091640472412 + }, + { + "auxiliary_loss_clip": 0.0109062, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.03195071, + "balance_loss_mlp": 1.02577901, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 2.9565339729309925, + "language_loss": 0.75575656, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77704167, + "num_input_tokens_seen": 348598345, + "step": 16150, + "time_per_iteration": 2.6718592643737793 + }, + { + "auxiliary_loss_clip": 0.0108901, + "auxiliary_loss_mlp": 0.01029242, + "balance_loss_clip": 1.03691792, + "balance_loss_mlp": 1.01447189, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 1.985481969584915, + "language_loss": 0.73826706, + "learning_rate": 8.766820074958214e-09, + "loss": 0.7594496, + "num_input_tokens_seen": 348616300, + "step": 16151, + "time_per_iteration": 2.507359743118286 + }, + { + "auxiliary_loss_clip": 0.01089609, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.03409958, + "balance_loss_mlp": 1.01553392, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 1.739490842395945, + "language_loss": 0.74895394, + "learning_rate": 8.730432009145027e-09, + "loss": 0.77012122, + "num_input_tokens_seen": 348633845, + "step": 16152, + "time_per_iteration": 2.516207695007324 + }, + { + "auxiliary_loss_clip": 0.01068235, + "auxiliary_loss_mlp": 0.01031547, + "balance_loss_clip": 1.03470433, + "balance_loss_mlp": 1.0196197, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 2.0643528615496365, + "language_loss": 0.67212987, + "learning_rate": 8.694119452473448e-09, + "loss": 0.69312775, + "num_input_tokens_seen": 348653070, + "step": 16153, + "time_per_iteration": 2.5508322715759277 + }, + { + "auxiliary_loss_clip": 0.01051393, + "auxiliary_loss_mlp": 0.01029278, + "balance_loss_clip": 1.03343964, + "balance_loss_mlp": 1.01809585, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 1.5455554401605685, + "language_loss": 0.70772052, + "learning_rate": 8.65788240632037e-09, + "loss": 0.72852719, + "num_input_tokens_seen": 348672145, + "step": 16154, + "time_per_iteration": 2.649338483810425 + }, + { + "auxiliary_loss_clip": 0.01056144, + "auxiliary_loss_mlp": 0.01030788, + "balance_loss_clip": 1.03871322, + "balance_loss_mlp": 1.017699, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 1.8007078676722663, + "language_loss": 0.80792856, + "learning_rate": 8.621720872059812e-09, + "loss": 0.82879788, + "num_input_tokens_seen": 348690615, + "step": 16155, + "time_per_iteration": 3.990067958831787 + }, + { + "auxiliary_loss_clip": 0.01092466, + "auxiliary_loss_mlp": 0.00783177, + "balance_loss_clip": 1.03527379, + "balance_loss_mlp": 1.00794983, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 2.4649858388807173, + "language_loss": 0.67459249, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69334888, + "num_input_tokens_seen": 348708665, + "step": 16156, + "time_per_iteration": 2.4739787578582764 + }, + { + "auxiliary_loss_clip": 0.0109243, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.0327419, + "balance_loss_mlp": 1.02294087, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 2.342929152849355, + "language_loss": 0.90993047, + "learning_rate": 8.54962434469919e-09, + "loss": 0.93120182, + "num_input_tokens_seen": 348726105, + "step": 16157, + "time_per_iteration": 2.4954421520233154 + }, + { + "auxiliary_loss_clip": 0.01073456, + "auxiliary_loss_mlp": 0.00785132, + "balance_loss_clip": 1.03515601, + "balance_loss_mlp": 1.01403952, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 1.7872677384797977, + "language_loss": 0.72713435, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74572027, + "num_input_tokens_seen": 348743360, + "step": 16158, + "time_per_iteration": 2.5450830459594727 + }, + { + "auxiliary_loss_clip": 0.01054379, + "auxiliary_loss_mlp": 0.01036805, + "balance_loss_clip": 1.03238833, + "balance_loss_mlp": 1.02451408, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 2.145022321195413, + "language_loss": 0.59979057, + "learning_rate": 8.477829881326836e-09, + "loss": 0.62070245, + "num_input_tokens_seen": 348759045, + "step": 16159, + "time_per_iteration": 2.56588077545166 + }, + { + "auxiliary_loss_clip": 0.0109818, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.03327727, + "balance_loss_mlp": 1.01630092, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 1.718273813012951, + "language_loss": 0.79058826, + "learning_rate": 8.44204592704112e-09, + "loss": 0.81184143, + "num_input_tokens_seen": 348779910, + "step": 16160, + "time_per_iteration": 2.5141422748565674 + }, + { + "auxiliary_loss_clip": 0.01027021, + "auxiliary_loss_mlp": 0.00999477, + "balance_loss_clip": 1.00426364, + "balance_loss_mlp": 0.99845773, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7762122080328414, + "language_loss": 0.54294264, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56320763, + "num_input_tokens_seen": 348838995, + "step": 16161, + "time_per_iteration": 3.083940029144287 + }, + { + "auxiliary_loss_clip": 0.01089143, + "auxiliary_loss_mlp": 0.00782874, + "balance_loss_clip": 1.03464746, + "balance_loss_mlp": 1.00879729, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 1.7181306642673049, + "language_loss": 0.71971166, + "learning_rate": 8.3707045800554e-09, + "loss": 0.73843187, + "num_input_tokens_seen": 348858090, + "step": 16162, + "time_per_iteration": 2.4690141677856445 + }, + { + "auxiliary_loss_clip": 0.01065878, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.03073668, + "balance_loss_mlp": 1.01921463, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 1.5614321595512972, + "language_loss": 0.78598356, + "learning_rate": 8.335147190060787e-09, + "loss": 0.80695772, + "num_input_tokens_seen": 348877885, + "step": 16163, + "time_per_iteration": 2.606612205505371 + }, + { + "auxiliary_loss_clip": 0.01077658, + "auxiliary_loss_mlp": 0.01026157, + "balance_loss_clip": 1.03425252, + "balance_loss_mlp": 1.01460004, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 1.7451181591120961, + "language_loss": 0.72885108, + "learning_rate": 8.299665324196903e-09, + "loss": 0.74988919, + "num_input_tokens_seen": 348897720, + "step": 16164, + "time_per_iteration": 2.5045957565307617 + }, + { + "auxiliary_loss_clip": 0.01042933, + "auxiliary_loss_mlp": 0.01044308, + "balance_loss_clip": 1.03191137, + "balance_loss_mlp": 1.02894771, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 2.209397170391226, + "language_loss": 0.83857238, + "learning_rate": 8.264258983809114e-09, + "loss": 0.85944474, + "num_input_tokens_seen": 348915410, + "step": 16165, + "time_per_iteration": 2.591508626937866 + }, + { + "auxiliary_loss_clip": 0.01067182, + "auxiliary_loss_mlp": 0.01025273, + "balance_loss_clip": 1.03343964, + "balance_loss_mlp": 1.01477706, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 1.5215445805720342, + "language_loss": 0.79244673, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81337124, + "num_input_tokens_seen": 348934335, + "step": 16166, + "time_per_iteration": 2.539766550064087 + }, + { + "auxiliary_loss_clip": 0.0107687, + "auxiliary_loss_mlp": 0.01026612, + "balance_loss_clip": 1.03514838, + "balance_loss_mlp": 1.01520967, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 1.891886675364507, + "language_loss": 0.70374143, + "learning_rate": 8.193672884830195e-09, + "loss": 0.72477627, + "num_input_tokens_seen": 348952405, + "step": 16167, + "time_per_iteration": 2.5113425254821777 + }, + { + "auxiliary_loss_clip": 0.01075023, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.03729606, + "balance_loss_mlp": 1.02099776, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 1.4883053494366814, + "language_loss": 0.75161469, + "learning_rate": 8.158493128915812e-09, + "loss": 0.77269053, + "num_input_tokens_seen": 348973580, + "step": 16168, + "time_per_iteration": 2.557081699371338 + }, + { + "auxiliary_loss_clip": 0.01047989, + "auxiliary_loss_mlp": 0.01047114, + "balance_loss_clip": 1.03259993, + "balance_loss_mlp": 1.03257692, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 2.5451745547725064, + "language_loss": 0.73157007, + "learning_rate": 8.123388903830797e-09, + "loss": 0.75252116, + "num_input_tokens_seen": 348992035, + "step": 16169, + "time_per_iteration": 2.628107786178589 + }, + { + "auxiliary_loss_clip": 0.01069389, + "auxiliary_loss_mlp": 0.01036626, + "balance_loss_clip": 1.03197157, + "balance_loss_mlp": 1.02276218, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 2.0061879287365767, + "language_loss": 0.57746732, + "learning_rate": 8.088360210906309e-09, + "loss": 0.59852749, + "num_input_tokens_seen": 349013160, + "step": 16170, + "time_per_iteration": 2.608646869659424 + }, + { + "auxiliary_loss_clip": 0.01072505, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.03394914, + "balance_loss_mlp": 1.01832795, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 1.7407565283189785, + "language_loss": 0.71744668, + "learning_rate": 8.053407051471062e-09, + "loss": 0.73848295, + "num_input_tokens_seen": 349033485, + "step": 16171, + "time_per_iteration": 2.5768849849700928 + }, + { + "auxiliary_loss_clip": 0.0106968, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.0341146, + "balance_loss_mlp": 1.0260334, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 2.0119266476064603, + "language_loss": 0.68341005, + "learning_rate": 8.018529426850218e-09, + "loss": 0.70448846, + "num_input_tokens_seen": 349051705, + "step": 16172, + "time_per_iteration": 2.52040433883667 + }, + { + "auxiliary_loss_clip": 0.01087954, + "auxiliary_loss_mlp": 0.01030424, + "balance_loss_clip": 1.03308296, + "balance_loss_mlp": 1.01864052, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 2.04649380818029, + "language_loss": 0.85831416, + "learning_rate": 7.983727338366274e-09, + "loss": 0.87949795, + "num_input_tokens_seen": 349070825, + "step": 16173, + "time_per_iteration": 2.545644998550415 + }, + { + "auxiliary_loss_clip": 0.01054921, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.0323869, + "balance_loss_mlp": 1.01920223, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 1.8941872669699826, + "language_loss": 0.64398479, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66487134, + "num_input_tokens_seen": 349089730, + "step": 16174, + "time_per_iteration": 2.575087547302246 + }, + { + "auxiliary_loss_clip": 0.0109165, + "auxiliary_loss_mlp": 0.0102519, + "balance_loss_clip": 1.03437364, + "balance_loss_mlp": 1.01377559, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 1.7203317234380966, + "language_loss": 0.77465701, + "learning_rate": 7.914349775085538e-09, + "loss": 0.79582542, + "num_input_tokens_seen": 349111315, + "step": 16175, + "time_per_iteration": 2.5237858295440674 + }, + { + "auxiliary_loss_clip": 0.01092071, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.03448594, + "balance_loss_mlp": 1.01833296, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 2.2363963307563233, + "language_loss": 0.56517339, + "learning_rate": 7.879774302919307e-09, + "loss": 0.58640254, + "num_input_tokens_seen": 349129495, + "step": 16176, + "time_per_iteration": 2.460719108581543 + }, + { + "auxiliary_loss_clip": 0.01081335, + "auxiliary_loss_mlp": 0.01028224, + "balance_loss_clip": 1.03504908, + "balance_loss_mlp": 1.01751935, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 2.54703437448003, + "language_loss": 0.72398221, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74507785, + "num_input_tokens_seen": 349148850, + "step": 16177, + "time_per_iteration": 2.564497709274292 + }, + { + "auxiliary_loss_clip": 0.01081568, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.03233683, + "balance_loss_mlp": 1.01926064, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 1.989116861241083, + "language_loss": 0.68519974, + "learning_rate": 7.810849984090984e-09, + "loss": 0.70632911, + "num_input_tokens_seen": 349167620, + "step": 16178, + "time_per_iteration": 2.5347790718078613 + }, + { + "auxiliary_loss_clip": 0.0105141, + "auxiliary_loss_mlp": 0.01032999, + "balance_loss_clip": 1.03163576, + "balance_loss_mlp": 1.02044046, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 1.7763795408075758, + "language_loss": 0.67535228, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69619644, + "num_input_tokens_seen": 349185845, + "step": 16179, + "time_per_iteration": 2.690690517425537 + }, + { + "auxiliary_loss_clip": 0.01079814, + "auxiliary_loss_mlp": 0.00780876, + "balance_loss_clip": 1.03511083, + "balance_loss_mlp": 1.0081439, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 1.8013697714235808, + "language_loss": 0.77077919, + "learning_rate": 7.742227841308624e-09, + "loss": 0.78938609, + "num_input_tokens_seen": 349204525, + "step": 16180, + "time_per_iteration": 2.5353312492370605 + }, + { + "auxiliary_loss_clip": 0.0109411, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.03379619, + "balance_loss_mlp": 1.02084112, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 1.5109252118041612, + "language_loss": 0.76317549, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78444755, + "num_input_tokens_seen": 349228075, + "step": 16181, + "time_per_iteration": 2.583693504333496 + }, + { + "auxiliary_loss_clip": 0.01100695, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.03347087, + "balance_loss_mlp": 1.02007031, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 1.4407788177280763, + "language_loss": 0.6330477, + "learning_rate": 7.67390788498079e-09, + "loss": 0.65436971, + "num_input_tokens_seen": 349246990, + "step": 16182, + "time_per_iteration": 2.4226245880126953 + }, + { + "auxiliary_loss_clip": 0.0103241, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.03459787, + "balance_loss_mlp": 1.02470827, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 1.734714363453598, + "language_loss": 0.62670994, + "learning_rate": 7.639861229977507e-09, + "loss": 0.64741242, + "num_input_tokens_seen": 349265890, + "step": 16183, + "time_per_iteration": 4.300455808639526 + }, + { + "auxiliary_loss_clip": 0.01081554, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.033759, + "balance_loss_mlp": 1.02034545, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 1.5835142311492654, + "language_loss": 0.78003073, + "learning_rate": 7.605890125470527e-09, + "loss": 0.80117273, + "num_input_tokens_seen": 349285275, + "step": 16184, + "time_per_iteration": 2.748594284057617 + }, + { + "auxiliary_loss_clip": 0.01064615, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.03090703, + "balance_loss_mlp": 1.01833034, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 2.25985126193625, + "language_loss": 0.79582012, + "learning_rate": 7.571994572747709e-09, + "loss": 0.81677151, + "num_input_tokens_seen": 349301515, + "step": 16185, + "time_per_iteration": 3.901275157928467 + }, + { + "auxiliary_loss_clip": 0.01072377, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.03419912, + "balance_loss_mlp": 1.02048659, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 1.737289845144855, + "language_loss": 0.78047454, + "learning_rate": 7.538174573094469e-09, + "loss": 0.80152154, + "num_input_tokens_seen": 349319590, + "step": 16186, + "time_per_iteration": 3.9025180339813232 + }, + { + "auxiliary_loss_clip": 0.0107957, + "auxiliary_loss_mlp": 0.01026123, + "balance_loss_clip": 1.03407824, + "balance_loss_mlp": 1.01409483, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 1.610072765201185, + "language_loss": 0.65491402, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67597091, + "num_input_tokens_seen": 349339230, + "step": 16187, + "time_per_iteration": 2.5603628158569336 + }, + { + "auxiliary_loss_clip": 0.01076774, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.03211546, + "balance_loss_mlp": 1.02190495, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 1.5900557529596175, + "language_loss": 0.80368471, + "learning_rate": 7.47076123812418e-09, + "loss": 0.82479489, + "num_input_tokens_seen": 349361155, + "step": 16188, + "time_per_iteration": 2.6004462242126465 + }, + { + "auxiliary_loss_clip": 0.01065954, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.03090048, + "balance_loss_mlp": 1.01853192, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 1.8598405884265867, + "language_loss": 0.78345621, + "learning_rate": 7.437167905363084e-09, + "loss": 0.80441171, + "num_input_tokens_seen": 349379335, + "step": 16189, + "time_per_iteration": 2.605949878692627 + }, + { + "auxiliary_loss_clip": 0.01086843, + "auxiliary_loss_mlp": 0.01026482, + "balance_loss_clip": 1.03210521, + "balance_loss_mlp": 1.01470447, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 1.725605401230886, + "language_loss": 0.50994343, + "learning_rate": 7.403650130784367e-09, + "loss": 0.53107667, + "num_input_tokens_seen": 349401575, + "step": 16190, + "time_per_iteration": 2.616671085357666 + }, + { + "auxiliary_loss_clip": 0.0109174, + "auxiliary_loss_mlp": 0.01026556, + "balance_loss_clip": 1.0342598, + "balance_loss_mlp": 1.0145216, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 1.599473103023743, + "language_loss": 0.81100726, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.83219022, + "num_input_tokens_seen": 349420650, + "step": 16191, + "time_per_iteration": 2.5053176879882812 + }, + { + "auxiliary_loss_clip": 0.01087839, + "auxiliary_loss_mlp": 0.01027403, + "balance_loss_clip": 1.03261256, + "balance_loss_mlp": 1.01622117, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 1.9240648366275974, + "language_loss": 0.82702613, + "learning_rate": 7.336841261255111e-09, + "loss": 0.84817851, + "num_input_tokens_seen": 349436830, + "step": 16192, + "time_per_iteration": 2.439758062362671 + }, + { + "auxiliary_loss_clip": 0.01045279, + "auxiliary_loss_mlp": 0.01037855, + "balance_loss_clip": 1.03696847, + "balance_loss_mlp": 1.02467656, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 1.986827920520046, + "language_loss": 0.75161612, + "learning_rate": 7.303550168837658e-09, + "loss": 0.77244747, + "num_input_tokens_seen": 349454325, + "step": 16193, + "time_per_iteration": 4.105252027511597 + }, + { + "auxiliary_loss_clip": 0.01073011, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.03269362, + "balance_loss_mlp": 1.02139068, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 1.9117253201455835, + "language_loss": 0.85124832, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87229615, + "num_input_tokens_seen": 349470230, + "step": 16194, + "time_per_iteration": 2.539759874343872 + }, + { + "auxiliary_loss_clip": 0.01066774, + "auxiliary_loss_mlp": 0.01034871, + "balance_loss_clip": 1.0347321, + "balance_loss_mlp": 1.02284312, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 1.5314056868889963, + "language_loss": 0.75719351, + "learning_rate": 7.237194675009828e-09, + "loss": 0.77820992, + "num_input_tokens_seen": 349486250, + "step": 16195, + "time_per_iteration": 2.4988338947296143 + }, + { + "auxiliary_loss_clip": 0.0100777, + "auxiliary_loss_mlp": 0.00999608, + "balance_loss_clip": 1.01363599, + "balance_loss_mlp": 0.99839193, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.7052914727605334, + "language_loss": 0.52520251, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54527628, + "num_input_tokens_seen": 349545865, + "step": 16196, + "time_per_iteration": 3.114932060241699 + }, + { + "auxiliary_loss_clip": 0.0107893, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.03589928, + "balance_loss_mlp": 1.01777601, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 1.5449591268917202, + "language_loss": 0.76301289, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78409147, + "num_input_tokens_seen": 349566080, + "step": 16197, + "time_per_iteration": 2.572955369949341 + }, + { + "auxiliary_loss_clip": 0.01106411, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.03492272, + "balance_loss_mlp": 1.01893187, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 1.7603078546933115, + "language_loss": 0.67396808, + "learning_rate": 7.13822818063492e-09, + "loss": 0.69534415, + "num_input_tokens_seen": 349585665, + "step": 16198, + "time_per_iteration": 2.4785072803497314 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.03311491, + "balance_loss_mlp": 1.01543355, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 1.8405775387434822, + "language_loss": 0.77539003, + "learning_rate": 7.10539048654768e-09, + "loss": 0.79668736, + "num_input_tokens_seen": 349605125, + "step": 16199, + "time_per_iteration": 2.439615488052368 + }, + { + "auxiliary_loss_clip": 0.01078231, + "auxiliary_loss_mlp": 0.01031926, + "balance_loss_clip": 1.03465521, + "balance_loss_mlp": 1.02029681, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 2.3569813097820056, + "language_loss": 0.79409075, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81519228, + "num_input_tokens_seen": 349623360, + "step": 16200, + "time_per_iteration": 2.5391414165496826 + }, + { + "auxiliary_loss_clip": 0.01052254, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.03742003, + "balance_loss_mlp": 1.02299345, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 1.9618109719462753, + "language_loss": 0.67954451, + "learning_rate": 7.039941811905592e-09, + "loss": 0.70042133, + "num_input_tokens_seen": 349644390, + "step": 16201, + "time_per_iteration": 2.6456668376922607 + }, + { + "auxiliary_loss_clip": 0.01071767, + "auxiliary_loss_mlp": 0.01030552, + "balance_loss_clip": 1.0336318, + "balance_loss_mlp": 1.0189352, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 1.74333248478659, + "language_loss": 0.72693992, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.74796319, + "num_input_tokens_seen": 349663200, + "step": 16202, + "time_per_iteration": 2.5729103088378906 + }, + { + "auxiliary_loss_clip": 0.01082488, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.03651237, + "balance_loss_mlp": 1.01632476, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 2.699894638001778, + "language_loss": 0.72986746, + "learning_rate": 6.974795430241265e-09, + "loss": 0.75098014, + "num_input_tokens_seen": 349681975, + "step": 16203, + "time_per_iteration": 2.5111920833587646 + }, + { + "auxiliary_loss_clip": 0.01102111, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.03414845, + "balance_loss_mlp": 1.0202384, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 1.8927200278486223, + "language_loss": 0.77422678, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79557061, + "num_input_tokens_seen": 349701185, + "step": 16204, + "time_per_iteration": 2.4865241050720215 + }, + { + "auxiliary_loss_clip": 0.01086189, + "auxiliary_loss_mlp": 0.010333, + "balance_loss_clip": 1.03673553, + "balance_loss_mlp": 1.02034187, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 1.996078393612719, + "language_loss": 0.79661882, + "learning_rate": 6.909951351435905e-09, + "loss": 0.81781375, + "num_input_tokens_seen": 349720360, + "step": 16205, + "time_per_iteration": 2.5131676197052 + }, + { + "auxiliary_loss_clip": 0.01102268, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.03472543, + "balance_loss_mlp": 1.01882648, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 1.645253948501529, + "language_loss": 0.74461412, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76594162, + "num_input_tokens_seen": 349741040, + "step": 16206, + "time_per_iteration": 2.5108933448791504 + }, + { + "auxiliary_loss_clip": 0.01047491, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.03247452, + "balance_loss_mlp": 1.01576269, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 2.3748894671054583, + "language_loss": 0.8368926, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.8576498, + "num_input_tokens_seen": 349758895, + "step": 16207, + "time_per_iteration": 2.5878524780273438 + }, + { + "auxiliary_loss_clip": 0.01090334, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.03375554, + "balance_loss_mlp": 1.02156544, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 1.878114784587456, + "language_loss": 0.70740348, + "learning_rate": 6.813252072591425e-09, + "loss": 0.72863686, + "num_input_tokens_seen": 349779740, + "step": 16208, + "time_per_iteration": 2.548093795776367 + }, + { + "auxiliary_loss_clip": 0.01064242, + "auxiliary_loss_mlp": 0.01027161, + "balance_loss_clip": 1.03308439, + "balance_loss_mlp": 1.01681399, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 1.6242257841218073, + "language_loss": 0.77528834, + "learning_rate": 6.781170141698878e-09, + "loss": 0.79620242, + "num_input_tokens_seen": 349796820, + "step": 16209, + "time_per_iteration": 2.5201070308685303 + }, + { + "auxiliary_loss_clip": 0.01068817, + "auxiliary_loss_mlp": 0.00784214, + "balance_loss_clip": 1.03332257, + "balance_loss_mlp": 1.0073539, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 1.8340441606208908, + "language_loss": 0.78866637, + "learning_rate": 6.749163793864144e-09, + "loss": 0.80719662, + "num_input_tokens_seen": 349816550, + "step": 16210, + "time_per_iteration": 2.5759286880493164 + }, + { + "auxiliary_loss_clip": 0.01077633, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.03201413, + "balance_loss_mlp": 1.02279711, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 2.1389464114487136, + "language_loss": 0.77833802, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.79945934, + "num_input_tokens_seen": 349834350, + "step": 16211, + "time_per_iteration": 2.5445737838745117 + }, + { + "auxiliary_loss_clip": 0.01069347, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.03314388, + "balance_loss_mlp": 1.02054477, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 1.8933659705430461, + "language_loss": 0.7785427, + "learning_rate": 6.685377852219787e-09, + "loss": 0.79957539, + "num_input_tokens_seen": 349853460, + "step": 16212, + "time_per_iteration": 2.5458004474639893 + }, + { + "auxiliary_loss_clip": 0.01071161, + "auxiliary_loss_mlp": 0.01033176, + "balance_loss_clip": 1.03290009, + "balance_loss_mlp": 1.02149367, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 1.5148682597354217, + "language_loss": 0.80195928, + "learning_rate": 6.653598260829118e-09, + "loss": 0.8230027, + "num_input_tokens_seen": 349874830, + "step": 16213, + "time_per_iteration": 2.5943148136138916 + }, + { + "auxiliary_loss_clip": 0.01056872, + "auxiliary_loss_mlp": 0.01026723, + "balance_loss_clip": 1.03107309, + "balance_loss_mlp": 1.01517725, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 1.847879114485383, + "language_loss": 0.66464078, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68547672, + "num_input_tokens_seen": 349893690, + "step": 16214, + "time_per_iteration": 2.5532827377319336 + }, + { + "auxiliary_loss_clip": 0.01086558, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.03630018, + "balance_loss_mlp": 1.01947534, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 1.572585041133039, + "language_loss": 0.74518317, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76636851, + "num_input_tokens_seen": 349912480, + "step": 16215, + "time_per_iteration": 2.5229077339172363 + }, + { + "auxiliary_loss_clip": 0.01051472, + "auxiliary_loss_mlp": 0.01031475, + "balance_loss_clip": 1.03232598, + "balance_loss_mlp": 1.01952434, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 1.6613456250563972, + "language_loss": 0.67061186, + "learning_rate": 6.558713018834483e-09, + "loss": 0.69144136, + "num_input_tokens_seen": 349932470, + "step": 16216, + "time_per_iteration": 2.6965579986572266 + }, + { + "auxiliary_loss_clip": 0.01049866, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.03303981, + "balance_loss_mlp": 1.01674271, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 1.8380577437855554, + "language_loss": 0.71811211, + "learning_rate": 6.527235786226937e-09, + "loss": 0.73890662, + "num_input_tokens_seen": 349949060, + "step": 16217, + "time_per_iteration": 2.5712411403656006 + }, + { + "auxiliary_loss_clip": 0.01070445, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.03669345, + "balance_loss_mlp": 1.01887655, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 1.5191773847985837, + "language_loss": 0.78427213, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80528581, + "num_input_tokens_seen": 349968010, + "step": 16218, + "time_per_iteration": 2.5960443019866943 + }, + { + "auxiliary_loss_clip": 0.01075961, + "auxiliary_loss_mlp": 0.01028669, + "balance_loss_clip": 1.03478599, + "balance_loss_mlp": 1.0166285, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 2.019418481114559, + "language_loss": 0.7723974, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79344368, + "num_input_tokens_seen": 349985270, + "step": 16219, + "time_per_iteration": 2.5324318408966064 + }, + { + "auxiliary_loss_clip": 0.01083858, + "auxiliary_loss_mlp": 0.01030202, + "balance_loss_clip": 1.03505301, + "balance_loss_mlp": 1.01819158, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 1.7517362957251144, + "language_loss": 0.81123257, + "learning_rate": 6.433257649285817e-09, + "loss": 0.83237314, + "num_input_tokens_seen": 350003935, + "step": 16220, + "time_per_iteration": 2.524190664291382 + }, + { + "auxiliary_loss_clip": 0.0110021, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.03334093, + "balance_loss_mlp": 1.01884544, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 1.9096040856918477, + "language_loss": 0.75284684, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77414781, + "num_input_tokens_seen": 350023595, + "step": 16221, + "time_per_iteration": 2.459115982055664 + }, + { + "auxiliary_loss_clip": 0.01068234, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.03357852, + "balance_loss_mlp": 1.018255, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 1.62602700671209, + "language_loss": 0.66646689, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68744636, + "num_input_tokens_seen": 350045920, + "step": 16222, + "time_per_iteration": 3.9954135417938232 + }, + { + "auxiliary_loss_clip": 0.01089424, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.0332942, + "balance_loss_mlp": 1.01848876, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 1.6355602942662584, + "language_loss": 0.88495791, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.90615135, + "num_input_tokens_seen": 350063925, + "step": 16223, + "time_per_iteration": 3.915684223175049 + }, + { + "auxiliary_loss_clip": 0.01040754, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.03257382, + "balance_loss_mlp": 1.01962996, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 1.6998813325021103, + "language_loss": 0.74973035, + "learning_rate": 6.309011819690457e-09, + "loss": 0.77045047, + "num_input_tokens_seen": 350080900, + "step": 16224, + "time_per_iteration": 4.066991806030273 + }, + { + "auxiliary_loss_clip": 0.01011983, + "auxiliary_loss_mlp": 0.01004932, + "balance_loss_clip": 1.00930357, + "balance_loss_mlp": 1.00388908, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.8070556716042739, + "language_loss": 0.59187698, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61204612, + "num_input_tokens_seen": 350144550, + "step": 16225, + "time_per_iteration": 3.1212191581726074 + }, + { + "auxiliary_loss_clip": 0.01069566, + "auxiliary_loss_mlp": 0.00781407, + "balance_loss_clip": 1.03397882, + "balance_loss_mlp": 1.00776112, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 1.7136801442173597, + "language_loss": 0.69101763, + "learning_rate": 6.247342505960818e-09, + "loss": 0.70952737, + "num_input_tokens_seen": 350164050, + "step": 16226, + "time_per_iteration": 2.597308874130249 + }, + { + "auxiliary_loss_clip": 0.01090324, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.03476739, + "balance_loss_mlp": 1.02730513, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 1.6463911048757767, + "language_loss": 0.83218205, + "learning_rate": 6.216621253462894e-09, + "loss": 0.85348415, + "num_input_tokens_seen": 350181350, + "step": 16227, + "time_per_iteration": 2.477663993835449 + }, + { + "auxiliary_loss_clip": 0.01101127, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.03432798, + "balance_loss_mlp": 1.01869917, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 1.7184108113514194, + "language_loss": 0.7761488, + "learning_rate": 6.185975605430549e-09, + "loss": 0.79745829, + "num_input_tokens_seen": 350199765, + "step": 16228, + "time_per_iteration": 2.464442729949951 + }, + { + "auxiliary_loss_clip": 0.01020155, + "auxiliary_loss_mlp": 0.01001106, + "balance_loss_clip": 1.00622511, + "balance_loss_mlp": 1.00009918, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.8461153539665102, + "language_loss": 0.55921006, + "learning_rate": 6.155405563025962e-09, + "loss": 0.57942271, + "num_input_tokens_seen": 350256420, + "step": 16229, + "time_per_iteration": 3.0275909900665283 + }, + { + "auxiliary_loss_clip": 0.0109301, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.03394103, + "balance_loss_mlp": 1.01827884, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 1.593286354364073, + "language_loss": 0.75375754, + "learning_rate": 6.124911127407984e-09, + "loss": 0.77499187, + "num_input_tokens_seen": 350276270, + "step": 16230, + "time_per_iteration": 2.5159547328948975 + }, + { + "auxiliary_loss_clip": 0.01077489, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.03449619, + "balance_loss_mlp": 1.01804781, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 1.7981831691033496, + "language_loss": 0.72065485, + "learning_rate": 6.094492299733245e-09, + "loss": 0.74171931, + "num_input_tokens_seen": 350295000, + "step": 16231, + "time_per_iteration": 2.4809131622314453 + }, + { + "auxiliary_loss_clip": 0.0108455, + "auxiliary_loss_mlp": 0.0102704, + "balance_loss_clip": 1.03638947, + "balance_loss_mlp": 1.01473212, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 1.8220781198946452, + "language_loss": 0.76599866, + "learning_rate": 6.064149081155267e-09, + "loss": 0.78711462, + "num_input_tokens_seen": 350314980, + "step": 16232, + "time_per_iteration": 3.9515202045440674 + }, + { + "auxiliary_loss_clip": 0.01011073, + "auxiliary_loss_mlp": 0.01001988, + "balance_loss_clip": 1.01191604, + "balance_loss_mlp": 1.00075376, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.7388833109841759, + "language_loss": 0.53799868, + "learning_rate": 6.033881472824465e-09, + "loss": 0.55812931, + "num_input_tokens_seen": 350371985, + "step": 16233, + "time_per_iteration": 2.989121437072754 + }, + { + "auxiliary_loss_clip": 0.01103019, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.03395987, + "balance_loss_mlp": 1.02054238, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 1.6991760460212633, + "language_loss": 0.71560621, + "learning_rate": 6.003689475888807e-09, + "loss": 0.73696232, + "num_input_tokens_seen": 350390590, + "step": 16234, + "time_per_iteration": 2.458466053009033 + }, + { + "auxiliary_loss_clip": 0.01095381, + "auxiliary_loss_mlp": 0.01032683, + "balance_loss_clip": 1.03516197, + "balance_loss_mlp": 1.02031517, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 2.2380437918665765, + "language_loss": 0.78690183, + "learning_rate": 5.973573091493156e-09, + "loss": 0.80818248, + "num_input_tokens_seen": 350403770, + "step": 16235, + "time_per_iteration": 2.4426708221435547 + }, + { + "auxiliary_loss_clip": 0.01094422, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.0365963, + "balance_loss_mlp": 1.01708627, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 1.745300429973555, + "language_loss": 0.76562017, + "learning_rate": 5.943532320779265e-09, + "loss": 0.7868675, + "num_input_tokens_seen": 350421870, + "step": 16236, + "time_per_iteration": 2.492396354675293 + }, + { + "auxiliary_loss_clip": 0.01091466, + "auxiliary_loss_mlp": 0.01027409, + "balance_loss_clip": 1.03294849, + "balance_loss_mlp": 1.0159533, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 1.7447040005389491, + "language_loss": 0.75475562, + "learning_rate": 5.913567164886446e-09, + "loss": 0.77594435, + "num_input_tokens_seen": 350440025, + "step": 16237, + "time_per_iteration": 2.464792013168335 + }, + { + "auxiliary_loss_clip": 0.01057853, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.03231788, + "balance_loss_mlp": 1.02010822, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 1.7131957699876106, + "language_loss": 0.73351359, + "learning_rate": 5.8836776249509e-09, + "loss": 0.75443482, + "num_input_tokens_seen": 350459435, + "step": 16238, + "time_per_iteration": 2.6084163188934326 + }, + { + "auxiliary_loss_clip": 0.01082279, + "auxiliary_loss_mlp": 0.00782932, + "balance_loss_clip": 1.03347445, + "balance_loss_mlp": 1.0073154, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 2.2340644851838953, + "language_loss": 0.84179014, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.86044228, + "num_input_tokens_seen": 350472655, + "step": 16239, + "time_per_iteration": 2.501715898513794 + }, + { + "auxiliary_loss_clip": 0.01063689, + "auxiliary_loss_mlp": 0.01039913, + "balance_loss_clip": 1.03243995, + "balance_loss_mlp": 1.02572083, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 2.770517820672524, + "language_loss": 0.60721666, + "learning_rate": 5.824125397483115e-09, + "loss": 0.62825269, + "num_input_tokens_seen": 350488160, + "step": 16240, + "time_per_iteration": 2.5278990268707275 + }, + { + "auxiliary_loss_clip": 0.01065051, + "auxiliary_loss_mlp": 0.01029945, + "balance_loss_clip": 1.03386521, + "balance_loss_mlp": 1.01810789, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 2.093651005077876, + "language_loss": 0.82889271, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.84984267, + "num_input_tokens_seen": 350506065, + "step": 16241, + "time_per_iteration": 2.5139880180358887 + }, + { + "auxiliary_loss_clip": 0.01064031, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.03213573, + "balance_loss_mlp": 1.02566504, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 6.266782557522362, + "language_loss": 0.83402902, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85504264, + "num_input_tokens_seen": 350524495, + "step": 16242, + "time_per_iteration": 2.5607330799102783 + }, + { + "auxiliary_loss_clip": 0.01093044, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.03539336, + "balance_loss_mlp": 1.0193826, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 1.5477688419629865, + "language_loss": 0.75314415, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77438784, + "num_input_tokens_seen": 350544185, + "step": 16243, + "time_per_iteration": 2.4760124683380127 + }, + { + "auxiliary_loss_clip": 0.01091422, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.03341877, + "balance_loss_mlp": 1.02315032, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 1.54860748723667, + "language_loss": 0.69773817, + "learning_rate": 5.705928383713754e-09, + "loss": 0.71901095, + "num_input_tokens_seen": 350562675, + "step": 16244, + "time_per_iteration": 2.492584228515625 + }, + { + "auxiliary_loss_clip": 0.01084295, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.03666544, + "balance_loss_mlp": 1.02017879, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 1.7584451958874965, + "language_loss": 0.83791471, + "learning_rate": 5.676568187055197e-09, + "loss": 0.85908735, + "num_input_tokens_seen": 350581535, + "step": 16245, + "time_per_iteration": 2.535740852355957 + }, + { + "auxiliary_loss_clip": 0.01052431, + "auxiliary_loss_mlp": 0.01026374, + "balance_loss_clip": 1.03374076, + "balance_loss_mlp": 1.01499581, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 1.4185966726580308, + "language_loss": 0.78682923, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80761725, + "num_input_tokens_seen": 350601615, + "step": 16246, + "time_per_iteration": 2.590569019317627 + }, + { + "auxiliary_loss_clip": 0.01096334, + "auxiliary_loss_mlp": 0.01031266, + "balance_loss_clip": 1.03352177, + "balance_loss_mlp": 1.02087736, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 1.4942541918846772, + "language_loss": 0.74054772, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.76182371, + "num_input_tokens_seen": 350619580, + "step": 16247, + "time_per_iteration": 2.443958282470703 + }, + { + "auxiliary_loss_clip": 0.01047395, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.03144598, + "balance_loss_mlp": 1.02119553, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 1.7374556662951117, + "language_loss": 0.79983008, + "learning_rate": 5.58894135118404e-09, + "loss": 0.82064289, + "num_input_tokens_seen": 350640015, + "step": 16248, + "time_per_iteration": 2.6566905975341797 + }, + { + "auxiliary_loss_clip": 0.01044841, + "auxiliary_loss_mlp": 0.01041071, + "balance_loss_clip": 1.0359894, + "balance_loss_mlp": 1.0266645, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 1.7209302516547393, + "language_loss": 0.78978133, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81064051, + "num_input_tokens_seen": 350659155, + "step": 16249, + "time_per_iteration": 2.6156554222106934 + }, + { + "auxiliary_loss_clip": 0.01089408, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.03487587, + "balance_loss_mlp": 1.02060556, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 1.939715292768675, + "language_loss": 0.66626632, + "learning_rate": 5.530901600093507e-09, + "loss": 0.68748862, + "num_input_tokens_seen": 350676615, + "step": 16250, + "time_per_iteration": 2.474362850189209 + }, + { + "auxiliary_loss_clip": 0.01027021, + "auxiliary_loss_mlp": 0.01001323, + "balance_loss_clip": 1.00432515, + "balance_loss_mlp": 1.00027418, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.7978868102749015, + "language_loss": 0.59920645, + "learning_rate": 5.501995169700846e-09, + "loss": 0.61948991, + "num_input_tokens_seen": 350736805, + "step": 16251, + "time_per_iteration": 3.102309226989746 + }, + { + "auxiliary_loss_clip": 0.01089195, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.03376174, + "balance_loss_mlp": 1.01809859, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 1.7543046536507896, + "language_loss": 0.78549391, + "learning_rate": 5.473164370872307e-09, + "loss": 0.80669034, + "num_input_tokens_seen": 350753600, + "step": 16252, + "time_per_iteration": 2.486893892288208 + }, + { + "auxiliary_loss_clip": 0.01086153, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.03291428, + "balance_loss_mlp": 1.02343822, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 2.673117242616465, + "language_loss": 0.64451087, + "learning_rate": 5.444409204701461e-09, + "loss": 0.66573483, + "num_input_tokens_seen": 350771225, + "step": 16253, + "time_per_iteration": 2.451687812805176 + }, + { + "auxiliary_loss_clip": 0.01097853, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.03704131, + "balance_loss_mlp": 1.01905274, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 2.1499575619235056, + "language_loss": 0.7691288, + "learning_rate": 5.415729672278324e-09, + "loss": 0.79043138, + "num_input_tokens_seen": 350789100, + "step": 16254, + "time_per_iteration": 2.4666433334350586 + }, + { + "auxiliary_loss_clip": 0.01095633, + "auxiliary_loss_mlp": 0.01032023, + "balance_loss_clip": 1.03436255, + "balance_loss_mlp": 1.02004814, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 1.6607793284969148, + "language_loss": 0.63967144, + "learning_rate": 5.387125774690471e-09, + "loss": 0.66094804, + "num_input_tokens_seen": 350811085, + "step": 16255, + "time_per_iteration": 2.617396593093872 + }, + { + "auxiliary_loss_clip": 0.01075243, + "auxiliary_loss_mlp": 0.00783181, + "balance_loss_clip": 1.03388095, + "balance_loss_mlp": 1.00793254, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 1.5516011154263585, + "language_loss": 0.75917006, + "learning_rate": 5.358597513023033e-09, + "loss": 0.77775431, + "num_input_tokens_seen": 350831065, + "step": 16256, + "time_per_iteration": 2.583519220352173 + }, + { + "auxiliary_loss_clip": 0.01101962, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.03580904, + "balance_loss_mlp": 1.02001941, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 1.9081525251327776, + "language_loss": 0.78285342, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80419707, + "num_input_tokens_seen": 350849675, + "step": 16257, + "time_per_iteration": 2.4482641220092773 + }, + { + "auxiliary_loss_clip": 0.01087499, + "auxiliary_loss_mlp": 0.01031064, + "balance_loss_clip": 1.03455901, + "balance_loss_mlp": 1.01883912, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 1.5255828600591865, + "language_loss": 0.75312555, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77431118, + "num_input_tokens_seen": 350868955, + "step": 16258, + "time_per_iteration": 2.528136730194092 + }, + { + "auxiliary_loss_clip": 0.01022085, + "auxiliary_loss_mlp": 0.01002, + "balance_loss_clip": 1.00937772, + "balance_loss_mlp": 1.00096881, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.6775450540278893, + "language_loss": 0.59749937, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61774015, + "num_input_tokens_seen": 350935110, + "step": 16259, + "time_per_iteration": 3.1597633361816406 + }, + { + "auxiliary_loss_clip": 0.01087959, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.03558588, + "balance_loss_mlp": 1.02028418, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 1.6369724513672157, + "language_loss": 0.73420841, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.75541747, + "num_input_tokens_seen": 350953220, + "step": 16260, + "time_per_iteration": 3.926499128341675 + }, + { + "auxiliary_loss_clip": 0.01092848, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.03422284, + "balance_loss_mlp": 1.02018309, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 1.8423963563168593, + "language_loss": 0.79787612, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.81912637, + "num_input_tokens_seen": 350971915, + "step": 16261, + "time_per_iteration": 2.466264009475708 + }, + { + "auxiliary_loss_clip": 0.01095237, + "auxiliary_loss_mlp": 0.01026963, + "balance_loss_clip": 1.03496373, + "balance_loss_mlp": 1.01456559, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 2.3029204704853465, + "language_loss": 0.74233532, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76355731, + "num_input_tokens_seen": 350990470, + "step": 16262, + "time_per_iteration": 3.8842849731445312 + }, + { + "auxiliary_loss_clip": 0.01095362, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.03614593, + "balance_loss_mlp": 1.01474142, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 1.9858229864593178, + "language_loss": 0.70018446, + "learning_rate": 5.16101757762133e-09, + "loss": 0.72141379, + "num_input_tokens_seen": 351010755, + "step": 16263, + "time_per_iteration": 3.9354703426361084 + }, + { + "auxiliary_loss_clip": 0.01094286, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.03546333, + "balance_loss_mlp": 1.0168885, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 1.6565432065760262, + "language_loss": 0.66074932, + "learning_rate": 5.133094442018038e-09, + "loss": 0.68197751, + "num_input_tokens_seen": 351029965, + "step": 16264, + "time_per_iteration": 2.507380962371826 + }, + { + "auxiliary_loss_clip": 0.01058319, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.0365746, + "balance_loss_mlp": 1.0144434, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 1.8166731814322394, + "language_loss": 0.73122817, + "learning_rate": 5.105246951967679e-09, + "loss": 0.75208843, + "num_input_tokens_seen": 351046205, + "step": 16265, + "time_per_iteration": 2.5599944591522217 + }, + { + "auxiliary_loss_clip": 0.01090949, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.03364062, + "balance_loss_mlp": 1.01964676, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 1.7519255520092936, + "language_loss": 0.68630755, + "learning_rate": 5.077475108526297e-09, + "loss": 0.70753443, + "num_input_tokens_seen": 351065390, + "step": 16266, + "time_per_iteration": 2.504560947418213 + }, + { + "auxiliary_loss_clip": 0.01055266, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.03170741, + "balance_loss_mlp": 1.01663876, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 1.6988029094706205, + "language_loss": 0.86898011, + "learning_rate": 5.049778912747049e-09, + "loss": 0.88980901, + "num_input_tokens_seen": 351084355, + "step": 16267, + "time_per_iteration": 2.5580835342407227 + }, + { + "auxiliary_loss_clip": 0.0103842, + "auxiliary_loss_mlp": 0.01025238, + "balance_loss_clip": 1.03213668, + "balance_loss_mlp": 1.01285791, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 1.7941591570982727, + "language_loss": 0.7030713, + "learning_rate": 5.022158365679985e-09, + "loss": 0.72370791, + "num_input_tokens_seen": 351105870, + "step": 16268, + "time_per_iteration": 2.823263645172119 + }, + { + "auxiliary_loss_clip": 0.01081348, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.03238034, + "balance_loss_mlp": 1.01531005, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 1.6428766100074628, + "language_loss": 0.73847181, + "learning_rate": 4.994613468372711e-09, + "loss": 0.75955427, + "num_input_tokens_seen": 351124760, + "step": 16269, + "time_per_iteration": 2.9180221557617188 + }, + { + "auxiliary_loss_clip": 0.01079976, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.03422761, + "balance_loss_mlp": 1.01769936, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 1.7891978858489963, + "language_loss": 0.71052825, + "learning_rate": 4.967144221869501e-09, + "loss": 0.7316426, + "num_input_tokens_seen": 351142820, + "step": 16270, + "time_per_iteration": 3.969299793243408 + }, + { + "auxiliary_loss_clip": 0.01105279, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.03597367, + "balance_loss_mlp": 1.02198017, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 1.6884069534369714, + "language_loss": 0.64061701, + "learning_rate": 4.939750627212191e-09, + "loss": 0.66201168, + "num_input_tokens_seen": 351164805, + "step": 16271, + "time_per_iteration": 2.5651535987854004 + }, + { + "auxiliary_loss_clip": 0.010777, + "auxiliary_loss_mlp": 0.01034197, + "balance_loss_clip": 1.03552699, + "balance_loss_mlp": 1.02190018, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 1.4102331923360485, + "language_loss": 0.70429349, + "learning_rate": 4.912432685439505e-09, + "loss": 0.72541249, + "num_input_tokens_seen": 351187005, + "step": 16272, + "time_per_iteration": 2.5497801303863525 + }, + { + "auxiliary_loss_clip": 0.01047096, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.03676915, + "balance_loss_mlp": 1.01806688, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 2.161180634388928, + "language_loss": 0.66537404, + "learning_rate": 4.88519039758728e-09, + "loss": 0.68614745, + "num_input_tokens_seen": 351208450, + "step": 16273, + "time_per_iteration": 2.663785457611084 + }, + { + "auxiliary_loss_clip": 0.01073192, + "auxiliary_loss_mlp": 0.01024952, + "balance_loss_clip": 1.03575635, + "balance_loss_mlp": 1.01264977, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 1.6063525760313986, + "language_loss": 0.741238, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76221943, + "num_input_tokens_seen": 351229585, + "step": 16274, + "time_per_iteration": 2.5975968837738037 + }, + { + "auxiliary_loss_clip": 0.01078831, + "auxiliary_loss_mlp": 0.01033954, + "balance_loss_clip": 1.0342586, + "balance_loss_mlp": 1.02247977, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 1.5167373764830245, + "language_loss": 0.77784091, + "learning_rate": 4.830932787773579e-09, + "loss": 0.79896873, + "num_input_tokens_seen": 351249525, + "step": 16275, + "time_per_iteration": 2.572425127029419 + }, + { + "auxiliary_loss_clip": 0.01040041, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.03403807, + "balance_loss_mlp": 1.0182538, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 1.5385263873240587, + "language_loss": 0.70860839, + "learning_rate": 4.803917467869567e-09, + "loss": 0.72931695, + "num_input_tokens_seen": 351272530, + "step": 16276, + "time_per_iteration": 2.833160400390625 + }, + { + "auxiliary_loss_clip": 0.01071833, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.03064191, + "balance_loss_mlp": 1.01754165, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 1.7182551875951353, + "language_loss": 0.85659516, + "learning_rate": 4.776977806000726e-09, + "loss": 0.8776077, + "num_input_tokens_seen": 351288530, + "step": 16277, + "time_per_iteration": 2.498908042907715 + }, + { + "auxiliary_loss_clip": 0.01085493, + "auxiliary_loss_mlp": 0.01027961, + "balance_loss_clip": 1.03284693, + "balance_loss_mlp": 1.01559329, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 1.6938799610416937, + "language_loss": 0.70864826, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.72978282, + "num_input_tokens_seen": 351305890, + "step": 16278, + "time_per_iteration": 2.47751784324646 + }, + { + "auxiliary_loss_clip": 0.0109139, + "auxiliary_loss_mlp": 0.01028374, + "balance_loss_clip": 1.03447986, + "balance_loss_mlp": 1.01610756, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 3.2733739991378266, + "language_loss": 0.84162128, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86281902, + "num_input_tokens_seen": 351325010, + "step": 16279, + "time_per_iteration": 2.509918689727783 + }, + { + "auxiliary_loss_clip": 0.01092115, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.03295267, + "balance_loss_mlp": 1.01779437, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 6.749540849732605, + "language_loss": 0.7919724, + "learning_rate": 4.696612778808395e-09, + "loss": 0.81319815, + "num_input_tokens_seen": 351343060, + "step": 16280, + "time_per_iteration": 2.456779956817627 + }, + { + "auxiliary_loss_clip": 0.01067759, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.03457379, + "balance_loss_mlp": 1.02400827, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 1.696310832410057, + "language_loss": 0.79671878, + "learning_rate": 4.669975759268085e-09, + "loss": 0.8177439, + "num_input_tokens_seen": 351363260, + "step": 16281, + "time_per_iteration": 2.5739331245422363 + }, + { + "auxiliary_loss_clip": 0.01089589, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.03506851, + "balance_loss_mlp": 1.01840675, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 1.6595047925338953, + "language_loss": 0.80319178, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82439828, + "num_input_tokens_seen": 351382610, + "step": 16282, + "time_per_iteration": 2.50358510017395 + }, + { + "auxiliary_loss_clip": 0.01081275, + "auxiliary_loss_mlp": 0.01036864, + "balance_loss_clip": 1.03350759, + "balance_loss_mlp": 1.02493119, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 2.0609193741732263, + "language_loss": 0.83304763, + "learning_rate": 4.616928710538204e-09, + "loss": 0.85422897, + "num_input_tokens_seen": 351401075, + "step": 16283, + "time_per_iteration": 2.520723342895508 + }, + { + "auxiliary_loss_clip": 0.01087754, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.03536701, + "balance_loss_mlp": 1.01914418, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 1.828113114070613, + "language_loss": 0.72176898, + "learning_rate": 4.590518683360134e-09, + "loss": 0.7429558, + "num_input_tokens_seen": 351419275, + "step": 16284, + "time_per_iteration": 2.502812147140503 + }, + { + "auxiliary_loss_clip": 0.01091322, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.03552961, + "balance_loss_mlp": 1.02388525, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 1.9069354358165265, + "language_loss": 0.64489257, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66615629, + "num_input_tokens_seen": 351437375, + "step": 16285, + "time_per_iteration": 2.481660842895508 + }, + { + "auxiliary_loss_clip": 0.01078262, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.03579056, + "balance_loss_mlp": 1.01911116, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 1.65269952083964, + "language_loss": 0.70676941, + "learning_rate": 4.537925628385286e-09, + "loss": 0.72785801, + "num_input_tokens_seen": 351457810, + "step": 16286, + "time_per_iteration": 2.528374195098877 + }, + { + "auxiliary_loss_clip": 0.01085007, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.03315747, + "balance_loss_mlp": 1.01780796, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 1.3260794439643635, + "language_loss": 0.58034068, + "learning_rate": 4.511742602582691e-09, + "loss": 0.60148615, + "num_input_tokens_seen": 351478825, + "step": 16287, + "time_per_iteration": 2.5230419635772705 + }, + { + "auxiliary_loss_clip": 0.01092053, + "auxiliary_loss_mlp": 0.01034824, + "balance_loss_clip": 1.03491867, + "balance_loss_mlp": 1.02258682, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 2.1375926696746523, + "language_loss": 0.81760538, + "learning_rate": 4.485635245894626e-09, + "loss": 0.83887422, + "num_input_tokens_seen": 351498785, + "step": 16288, + "time_per_iteration": 2.5043258666992188 + }, + { + "auxiliary_loss_clip": 0.01077978, + "auxiliary_loss_mlp": 0.00782891, + "balance_loss_clip": 1.0334934, + "balance_loss_mlp": 1.00938964, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 1.582569030056652, + "language_loss": 0.71470737, + "learning_rate": 4.459603559311631e-09, + "loss": 0.733316, + "num_input_tokens_seen": 351520235, + "step": 16289, + "time_per_iteration": 2.572805881500244 + }, + { + "auxiliary_loss_clip": 0.01068038, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.0359441, + "balance_loss_mlp": 1.02146626, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 2.317553890869716, + "language_loss": 0.75786638, + "learning_rate": 4.43364754382003e-09, + "loss": 0.77888, + "num_input_tokens_seen": 351538900, + "step": 16290, + "time_per_iteration": 2.4999938011169434 + }, + { + "auxiliary_loss_clip": 0.01093533, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.03489304, + "balance_loss_mlp": 1.0189693, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 1.709490148723921, + "language_loss": 0.67132944, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69258678, + "num_input_tokens_seen": 351558715, + "step": 16291, + "time_per_iteration": 2.4954793453216553 + }, + { + "auxiliary_loss_clip": 0.01106499, + "auxiliary_loss_mlp": 0.00783904, + "balance_loss_clip": 1.03490496, + "balance_loss_mlp": 1.00864077, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 1.6105768614772433, + "language_loss": 0.62772226, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64662635, + "num_input_tokens_seen": 351578450, + "step": 16292, + "time_per_iteration": 2.529905080795288 + }, + { + "auxiliary_loss_clip": 0.01069646, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.03496099, + "balance_loss_mlp": 1.02003002, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 6.046983533586835, + "language_loss": 0.73671758, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75773144, + "num_input_tokens_seen": 351597195, + "step": 16293, + "time_per_iteration": 2.5624260902404785 + }, + { + "auxiliary_loss_clip": 0.01095151, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.03357196, + "balance_loss_mlp": 1.01643705, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 2.0568146472039968, + "language_loss": 0.83959502, + "learning_rate": 4.330580212414503e-09, + "loss": 0.86083537, + "num_input_tokens_seen": 351617460, + "step": 16294, + "time_per_iteration": 2.5159389972686768 + }, + { + "auxiliary_loss_clip": 0.01066423, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.0319978, + "balance_loss_mlp": 1.01992047, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 1.8990128698314028, + "language_loss": 0.71732116, + "learning_rate": 4.305002567088767e-09, + "loss": 0.73829877, + "num_input_tokens_seen": 351635900, + "step": 16295, + "time_per_iteration": 2.52286434173584 + }, + { + "auxiliary_loss_clip": 0.01098147, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.03649509, + "balance_loss_mlp": 1.02461338, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 1.638820665537276, + "language_loss": 0.80451328, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.82586271, + "num_input_tokens_seen": 351655400, + "step": 16296, + "time_per_iteration": 2.4674484729766846 + }, + { + "auxiliary_loss_clip": 0.01076939, + "auxiliary_loss_mlp": 0.01034995, + "balance_loss_clip": 1.03420436, + "balance_loss_mlp": 1.02313948, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 1.9031795854675597, + "language_loss": 0.75614059, + "learning_rate": 4.254074308266853e-09, + "loss": 0.77725995, + "num_input_tokens_seen": 351675505, + "step": 16297, + "time_per_iteration": 2.5529961585998535 + }, + { + "auxiliary_loss_clip": 0.0109472, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.03453219, + "balance_loss_mlp": 1.01993084, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 1.4964263960004178, + "language_loss": 0.78181493, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80308163, + "num_input_tokens_seen": 351697920, + "step": 16298, + "time_per_iteration": 2.536107063293457 + }, + { + "auxiliary_loss_clip": 0.01088866, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.03357041, + "balance_loss_mlp": 1.01493907, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 1.5203437042227115, + "language_loss": 0.72401571, + "learning_rate": 4.203448764984019e-09, + "loss": 0.74517, + "num_input_tokens_seen": 351717615, + "step": 16299, + "time_per_iteration": 3.8942174911499023 + }, + { + "auxiliary_loss_clip": 0.01080182, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.03337932, + "balance_loss_mlp": 1.01619101, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 2.303369486809455, + "language_loss": 0.88887405, + "learning_rate": 4.178249514071419e-09, + "loss": 0.90996599, + "num_input_tokens_seen": 351735260, + "step": 16300, + "time_per_iteration": 2.5311107635498047 + }, + { + "auxiliary_loss_clip": 0.01094471, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.03416157, + "balance_loss_mlp": 1.01648939, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 2.2679382180632, + "language_loss": 0.78186989, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.80310249, + "num_input_tokens_seen": 351755800, + "step": 16301, + "time_per_iteration": 5.292080640792847 + }, + { + "auxiliary_loss_clip": 0.01081988, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.03390443, + "balance_loss_mlp": 1.02192044, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 2.1171939736852177, + "language_loss": 0.75019646, + "learning_rate": 4.128078058480921e-09, + "loss": 0.77135861, + "num_input_tokens_seen": 351774790, + "step": 16302, + "time_per_iteration": 2.499150276184082 + }, + { + "auxiliary_loss_clip": 0.01082424, + "auxiliary_loss_mlp": 0.01029004, + "balance_loss_clip": 1.03526616, + "balance_loss_mlp": 1.01692784, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 1.622858123935546, + "language_loss": 0.79719305, + "learning_rate": 4.103105855705724e-09, + "loss": 0.81830728, + "num_input_tokens_seen": 351792855, + "step": 16303, + "time_per_iteration": 2.5940306186676025 + }, + { + "auxiliary_loss_clip": 0.01062929, + "auxiliary_loss_mlp": 0.01036127, + "balance_loss_clip": 1.03413153, + "balance_loss_mlp": 1.02296007, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 2.203373991588886, + "language_loss": 0.83669484, + "learning_rate": 4.078209337540883e-09, + "loss": 0.85768539, + "num_input_tokens_seen": 351811450, + "step": 16304, + "time_per_iteration": 2.547268867492676 + }, + { + "auxiliary_loss_clip": 0.01067265, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.0354706, + "balance_loss_mlp": 1.01605964, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 1.8524709146713345, + "language_loss": 0.70073617, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72167659, + "num_input_tokens_seen": 351831960, + "step": 16305, + "time_per_iteration": 2.5799946784973145 + }, + { + "auxiliary_loss_clip": 0.01074038, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.03435564, + "balance_loss_mlp": 1.0253185, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 2.803386974399174, + "language_loss": 0.71798027, + "learning_rate": 4.028643358815032e-09, + "loss": 0.73911166, + "num_input_tokens_seen": 351851585, + "step": 16306, + "time_per_iteration": 2.4845826625823975 + }, + { + "auxiliary_loss_clip": 0.01077166, + "auxiliary_loss_mlp": 0.01030922, + "balance_loss_clip": 1.03250313, + "balance_loss_mlp": 1.0196147, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 1.5761913448996532, + "language_loss": 0.73537695, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75645781, + "num_input_tokens_seen": 351871085, + "step": 16307, + "time_per_iteration": 2.5613269805908203 + }, + { + "auxiliary_loss_clip": 0.01070079, + "auxiliary_loss_mlp": 0.01027762, + "balance_loss_clip": 1.03469324, + "balance_loss_mlp": 1.01694989, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 1.4860832458444164, + "language_loss": 0.74857283, + "learning_rate": 3.979380129822018e-09, + "loss": 0.76955116, + "num_input_tokens_seen": 351891775, + "step": 16308, + "time_per_iteration": 2.561248540878296 + }, + { + "auxiliary_loss_clip": 0.01009049, + "auxiliary_loss_mlp": 0.01003206, + "balance_loss_clip": 1.00647902, + "balance_loss_mlp": 1.00202584, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.7608720995556525, + "language_loss": 0.57945073, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59957325, + "num_input_tokens_seen": 351946770, + "step": 16309, + "time_per_iteration": 4.349445104598999 + }, + { + "auxiliary_loss_clip": 0.01060394, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.03283381, + "balance_loss_mlp": 1.01904523, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 1.73916753614286, + "language_loss": 0.6592654, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68017948, + "num_input_tokens_seen": 351966155, + "step": 16310, + "time_per_iteration": 2.650777816772461 + }, + { + "auxiliary_loss_clip": 0.01008368, + "auxiliary_loss_mlp": 0.01001373, + "balance_loss_clip": 1.00904787, + "balance_loss_mlp": 1.00027585, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 0.8448863448804085, + "language_loss": 0.54595488, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56605232, + "num_input_tokens_seen": 352031655, + "step": 16311, + "time_per_iteration": 3.2005562782287598 + }, + { + "auxiliary_loss_clip": 0.01091398, + "auxiliary_loss_mlp": 0.01024408, + "balance_loss_clip": 1.03373551, + "balance_loss_mlp": 1.01290452, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 1.4580688029076263, + "language_loss": 0.79952979, + "learning_rate": 3.881761950876638e-09, + "loss": 0.82068789, + "num_input_tokens_seen": 352051920, + "step": 16312, + "time_per_iteration": 2.5342788696289062 + }, + { + "auxiliary_loss_clip": 0.01080246, + "auxiliary_loss_mlp": 0.01027635, + "balance_loss_clip": 1.03433418, + "balance_loss_mlp": 1.01621461, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 1.710625280744429, + "language_loss": 0.63489783, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.65597665, + "num_input_tokens_seen": 352069315, + "step": 16313, + "time_per_iteration": 2.471071243286133 + }, + { + "auxiliary_loss_clip": 0.01093018, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.03602517, + "balance_loss_mlp": 1.01717854, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 5.044553281999545, + "language_loss": 0.7218833, + "learning_rate": 3.833407015731316e-09, + "loss": 0.74310541, + "num_input_tokens_seen": 352089480, + "step": 16314, + "time_per_iteration": 2.490083694458008 + }, + { + "auxiliary_loss_clip": 0.01000441, + "auxiliary_loss_mlp": 0.01003961, + "balance_loss_clip": 1.00755525, + "balance_loss_mlp": 1.00280464, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.6894059505380258, + "language_loss": 0.51710522, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53714919, + "num_input_tokens_seen": 352150000, + "step": 16315, + "time_per_iteration": 3.1547727584838867 + }, + { + "auxiliary_loss_clip": 0.01090823, + "auxiliary_loss_mlp": 0.01028172, + "balance_loss_clip": 1.03272891, + "balance_loss_mlp": 1.01692426, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 1.414290347012709, + "language_loss": 0.69671547, + "learning_rate": 3.785354859932033e-09, + "loss": 0.7179054, + "num_input_tokens_seen": 352170990, + "step": 16316, + "time_per_iteration": 2.4956912994384766 + }, + { + "auxiliary_loss_clip": 0.01102788, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.03359842, + "balance_loss_mlp": 1.01733804, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 1.7578885156152424, + "language_loss": 0.55404592, + "learning_rate": 3.76144232656661e-09, + "loss": 0.57536495, + "num_input_tokens_seen": 352195335, + "step": 16317, + "time_per_iteration": 2.607006072998047 + }, + { + "auxiliary_loss_clip": 0.01049731, + "auxiliary_loss_mlp": 0.01038207, + "balance_loss_clip": 1.03066683, + "balance_loss_mlp": 1.02600646, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 1.5718784049131997, + "language_loss": 0.73009551, + "learning_rate": 3.737605490767404e-09, + "loss": 0.75097489, + "num_input_tokens_seen": 352214170, + "step": 16318, + "time_per_iteration": 2.5882198810577393 + }, + { + "auxiliary_loss_clip": 0.01078111, + "auxiliary_loss_mlp": 0.0102656, + "balance_loss_clip": 1.03270006, + "balance_loss_mlp": 1.01533604, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 2.112847916258105, + "language_loss": 0.82266569, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84371245, + "num_input_tokens_seen": 352231470, + "step": 16319, + "time_per_iteration": 2.4891841411590576 + }, + { + "auxiliary_loss_clip": 0.01020369, + "auxiliary_loss_mlp": 0.01011872, + "balance_loss_clip": 1.00786138, + "balance_loss_mlp": 1.01046491, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.7240579239361407, + "language_loss": 0.53562981, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55595225, + "num_input_tokens_seen": 352291770, + "step": 16320, + "time_per_iteration": 2.976208209991455 + }, + { + "auxiliary_loss_clip": 0.01054072, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.03273177, + "balance_loss_mlp": 1.02023387, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 1.8183424686599858, + "language_loss": 0.73495209, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.7558167, + "num_input_tokens_seen": 352310735, + "step": 16321, + "time_per_iteration": 2.623300790786743 + }, + { + "auxiliary_loss_clip": 0.0108253, + "auxiliary_loss_mlp": 0.01032785, + "balance_loss_clip": 1.03596878, + "balance_loss_mlp": 1.02072096, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 1.860305761239585, + "language_loss": 0.78628796, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.80744106, + "num_input_tokens_seen": 352329545, + "step": 16322, + "time_per_iteration": 2.5481505393981934 + }, + { + "auxiliary_loss_clip": 0.0109138, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.03441119, + "balance_loss_mlp": 1.0209918, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 1.493214467037949, + "language_loss": 0.80667365, + "learning_rate": 3.619556806799595e-09, + "loss": 0.82791638, + "num_input_tokens_seen": 352352080, + "step": 16323, + "time_per_iteration": 2.535888195037842 + }, + { + "auxiliary_loss_clip": 0.01105487, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.03550744, + "balance_loss_mlp": 1.02031469, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 2.0154981229982707, + "language_loss": 0.84284937, + "learning_rate": 3.596174175278799e-09, + "loss": 0.86422211, + "num_input_tokens_seen": 352366455, + "step": 16324, + "time_per_iteration": 2.4116899967193604 + }, + { + "auxiliary_loss_clip": 0.01080182, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.03481841, + "balance_loss_mlp": 1.01786542, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 2.5790910065856942, + "language_loss": 0.74333262, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76443315, + "num_input_tokens_seen": 352386090, + "step": 16325, + "time_per_iteration": 2.6507415771484375 + }, + { + "auxiliary_loss_clip": 0.01050923, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.03487325, + "balance_loss_mlp": 1.02013743, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 2.0448152533604955, + "language_loss": 0.76442111, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78523999, + "num_input_tokens_seen": 352404000, + "step": 16326, + "time_per_iteration": 2.563978672027588 + }, + { + "auxiliary_loss_clip": 0.01070842, + "auxiliary_loss_mlp": 0.01028423, + "balance_loss_clip": 1.03474164, + "balance_loss_mlp": 1.01635289, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 1.6807064560241587, + "language_loss": 0.6761241, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.69711673, + "num_input_tokens_seen": 352423540, + "step": 16327, + "time_per_iteration": 2.5692334175109863 + }, + { + "auxiliary_loss_clip": 0.01096336, + "auxiliary_loss_mlp": 0.01038853, + "balance_loss_clip": 1.03425467, + "balance_loss_mlp": 1.02550209, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 1.5713057705287012, + "language_loss": 0.73775464, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.75910652, + "num_input_tokens_seen": 352445530, + "step": 16328, + "time_per_iteration": 2.5366382598876953 + }, + { + "auxiliary_loss_clip": 0.01088053, + "auxiliary_loss_mlp": 0.01038423, + "balance_loss_clip": 1.03500605, + "balance_loss_mlp": 1.02522099, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 1.7957225333662308, + "language_loss": 0.80941272, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.83067751, + "num_input_tokens_seen": 352466325, + "step": 16329, + "time_per_iteration": 2.560040235519409 + }, + { + "auxiliary_loss_clip": 0.01106808, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.0346967, + "balance_loss_mlp": 1.01702046, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 2.2637304586965192, + "language_loss": 0.76410383, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78547168, + "num_input_tokens_seen": 352485505, + "step": 16330, + "time_per_iteration": 2.4795854091644287 + }, + { + "auxiliary_loss_clip": 0.01111558, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.0358777, + "balance_loss_mlp": 1.01759398, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 2.9680364312606335, + "language_loss": 0.66343927, + "learning_rate": 3.434615511252126e-09, + "loss": 0.68487525, + "num_input_tokens_seen": 352505360, + "step": 16331, + "time_per_iteration": 2.5004591941833496 + }, + { + "auxiliary_loss_clip": 0.01090548, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.03322697, + "balance_loss_mlp": 1.0185734, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 1.7052312319395275, + "language_loss": 0.73114979, + "learning_rate": 3.411838534981948e-09, + "loss": 0.75235766, + "num_input_tokens_seen": 352524035, + "step": 16332, + "time_per_iteration": 2.4848275184631348 + }, + { + "auxiliary_loss_clip": 0.01091247, + "auxiliary_loss_mlp": 0.01027528, + "balance_loss_clip": 1.03524804, + "balance_loss_mlp": 1.01665568, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 1.7546547347606805, + "language_loss": 0.7715168, + "learning_rate": 3.389137269534936e-09, + "loss": 0.79270452, + "num_input_tokens_seen": 352543210, + "step": 16333, + "time_per_iteration": 2.476016044616699 + }, + { + "auxiliary_loss_clip": 0.010847, + "auxiliary_loss_mlp": 0.00781811, + "balance_loss_clip": 1.03361416, + "balance_loss_mlp": 1.00853372, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 2.0589387910106316, + "language_loss": 0.73018444, + "learning_rate": 3.366511715771958e-09, + "loss": 0.74884957, + "num_input_tokens_seen": 352559770, + "step": 16334, + "time_per_iteration": 2.4710867404937744 + }, + { + "auxiliary_loss_clip": 0.01054401, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.03571129, + "balance_loss_mlp": 1.02065945, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 1.7712889810796995, + "language_loss": 0.78452671, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.8053956, + "num_input_tokens_seen": 352577690, + "step": 16335, + "time_per_iteration": 2.57309889793396 + }, + { + "auxiliary_loss_clip": 0.01080473, + "auxiliary_loss_mlp": 0.01040017, + "balance_loss_clip": 1.03379977, + "balance_loss_mlp": 1.02582502, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 2.208568412586143, + "language_loss": 0.64325249, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.66445744, + "num_input_tokens_seen": 352598850, + "step": 16336, + "time_per_iteration": 2.615187406539917 + }, + { + "auxiliary_loss_clip": 0.01071935, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.03447866, + "balance_loss_mlp": 1.02139187, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 1.9374776045552953, + "language_loss": 0.72596574, + "learning_rate": 3.299089333152372e-09, + "loss": 0.74703133, + "num_input_tokens_seen": 352616130, + "step": 16337, + "time_per_iteration": 3.8979709148406982 + }, + { + "auxiliary_loss_clip": 0.01094575, + "auxiliary_loss_mlp": 0.01030241, + "balance_loss_clip": 1.03454375, + "balance_loss_mlp": 1.01702046, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 1.69165896582704, + "language_loss": 0.72923529, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.75048345, + "num_input_tokens_seen": 352636885, + "step": 16338, + "time_per_iteration": 2.4932589530944824 + }, + { + "auxiliary_loss_clip": 0.01044766, + "auxiliary_loss_mlp": 0.01032587, + "balance_loss_clip": 1.03214717, + "balance_loss_mlp": 1.02049315, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 1.6218319056350827, + "language_loss": 0.81569004, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.83646357, + "num_input_tokens_seen": 352657905, + "step": 16339, + "time_per_iteration": 4.046782732009888 + }, + { + "auxiliary_loss_clip": 0.01053478, + "auxiliary_loss_mlp": 0.01038284, + "balance_loss_clip": 1.03252423, + "balance_loss_mlp": 1.02535009, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 2.549932932095697, + "language_loss": 0.62158203, + "learning_rate": 3.232348386403405e-09, + "loss": 0.64249969, + "num_input_tokens_seen": 352676320, + "step": 16340, + "time_per_iteration": 3.983675718307495 + }, + { + "auxiliary_loss_clip": 0.01106465, + "auxiliary_loss_mlp": 0.01032356, + "balance_loss_clip": 1.03556585, + "balance_loss_mlp": 1.01986301, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 1.9680628395731437, + "language_loss": 0.8593328, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.88072103, + "num_input_tokens_seen": 352692665, + "step": 16341, + "time_per_iteration": 2.464702606201172 + }, + { + "auxiliary_loss_clip": 0.01077482, + "auxiliary_loss_mlp": 0.01027768, + "balance_loss_clip": 1.033355, + "balance_loss_mlp": 1.01579392, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 1.471446440057394, + "language_loss": 0.67273968, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69379216, + "num_input_tokens_seen": 352716130, + "step": 16342, + "time_per_iteration": 2.5644898414611816 + }, + { + "auxiliary_loss_clip": 0.01102728, + "auxiliary_loss_mlp": 0.01025696, + "balance_loss_clip": 1.03346276, + "balance_loss_mlp": 1.01373351, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 1.6351494993021072, + "language_loss": 0.77568734, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79697168, + "num_input_tokens_seen": 352734705, + "step": 16343, + "time_per_iteration": 2.471416473388672 + }, + { + "auxiliary_loss_clip": 0.01068124, + "auxiliary_loss_mlp": 0.01028866, + "balance_loss_clip": 1.03378558, + "balance_loss_mlp": 1.01824439, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 1.5332183063624663, + "language_loss": 0.75307381, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.77404368, + "num_input_tokens_seen": 352756225, + "step": 16344, + "time_per_iteration": 2.5889952182769775 + }, + { + "auxiliary_loss_clip": 0.0107891, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.03263521, + "balance_loss_mlp": 1.02300894, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 2.193545450188143, + "language_loss": 0.66700649, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68815219, + "num_input_tokens_seen": 352776210, + "step": 16345, + "time_per_iteration": 2.5659103393554688 + }, + { + "auxiliary_loss_clip": 0.01086502, + "auxiliary_loss_mlp": 0.01025685, + "balance_loss_clip": 1.03282666, + "balance_loss_mlp": 1.01526034, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 1.3833265314782326, + "language_loss": 0.79633832, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81746024, + "num_input_tokens_seen": 352795455, + "step": 16346, + "time_per_iteration": 2.477888822555542 + }, + { + "auxiliary_loss_clip": 0.01092343, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.03462243, + "balance_loss_mlp": 1.02146149, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 1.781516647752353, + "language_loss": 0.74954772, + "learning_rate": 3.079269666552031e-09, + "loss": 0.77081203, + "num_input_tokens_seen": 352812895, + "step": 16347, + "time_per_iteration": 2.4978206157684326 + }, + { + "auxiliary_loss_clip": 0.01030509, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.02888858, + "balance_loss_mlp": 1.02155805, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 1.5979083025026812, + "language_loss": 0.66580534, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.68644089, + "num_input_tokens_seen": 352835470, + "step": 16348, + "time_per_iteration": 4.1834399700164795 + }, + { + "auxiliary_loss_clip": 0.01082557, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.0341785, + "balance_loss_mlp": 1.01834011, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 2.5149579025613242, + "language_loss": 0.68815982, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.70929348, + "num_input_tokens_seen": 352854295, + "step": 16349, + "time_per_iteration": 2.710031509399414 + }, + { + "auxiliary_loss_clip": 0.01077116, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.03406298, + "balance_loss_mlp": 1.01731205, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 2.250538215716194, + "language_loss": 0.75854766, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.77959967, + "num_input_tokens_seen": 352869695, + "step": 16350, + "time_per_iteration": 2.485588788986206 + }, + { + "auxiliary_loss_clip": 0.01073499, + "auxiliary_loss_mlp": 0.0103032, + "balance_loss_clip": 1.03257537, + "balance_loss_mlp": 1.01758242, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 1.8622186637268714, + "language_loss": 0.844971, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86600918, + "num_input_tokens_seen": 352887430, + "step": 16351, + "time_per_iteration": 2.560678482055664 + }, + { + "auxiliary_loss_clip": 0.01074877, + "auxiliary_loss_mlp": 0.01026463, + "balance_loss_clip": 1.03465486, + "balance_loss_mlp": 1.01430964, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 1.6736392372038043, + "language_loss": 0.69077665, + "learning_rate": 2.972199410170795e-09, + "loss": 0.71179008, + "num_input_tokens_seen": 352907555, + "step": 16352, + "time_per_iteration": 2.647226572036743 + }, + { + "auxiliary_loss_clip": 0.01090245, + "auxiliary_loss_mlp": 0.00782425, + "balance_loss_clip": 1.03410745, + "balance_loss_mlp": 1.00953543, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 1.5389774312952147, + "language_loss": 0.66314912, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68187582, + "num_input_tokens_seen": 352928670, + "step": 16353, + "time_per_iteration": 2.497777223587036 + }, + { + "auxiliary_loss_clip": 0.01077401, + "auxiliary_loss_mlp": 0.01029371, + "balance_loss_clip": 1.03254139, + "balance_loss_mlp": 1.0180645, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 1.5790553296222525, + "language_loss": 0.7454325, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.76650023, + "num_input_tokens_seen": 352948345, + "step": 16354, + "time_per_iteration": 2.5387771129608154 + }, + { + "auxiliary_loss_clip": 0.01092396, + "auxiliary_loss_mlp": 0.01027772, + "balance_loss_clip": 1.03464377, + "balance_loss_mlp": 1.01611936, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 1.984315361952274, + "language_loss": 0.77459216, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.79579383, + "num_input_tokens_seen": 352967250, + "step": 16355, + "time_per_iteration": 2.4766623973846436 + }, + { + "auxiliary_loss_clip": 0.01090504, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.03402686, + "balance_loss_mlp": 1.01564455, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 1.8362842461233266, + "language_loss": 0.73421192, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.75539023, + "num_input_tokens_seen": 352984725, + "step": 16356, + "time_per_iteration": 2.4811246395111084 + }, + { + "auxiliary_loss_clip": 0.01078762, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.03370929, + "balance_loss_mlp": 1.0188359, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 1.4845095104877466, + "language_loss": 0.75868577, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.77978635, + "num_input_tokens_seen": 353003480, + "step": 16357, + "time_per_iteration": 2.494948625564575 + }, + { + "auxiliary_loss_clip": 0.01081711, + "auxiliary_loss_mlp": 0.01025167, + "balance_loss_clip": 1.0344975, + "balance_loss_mlp": 1.01292968, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 2.020044451101331, + "language_loss": 0.80030859, + "learning_rate": 2.846214118442436e-09, + "loss": 0.82137734, + "num_input_tokens_seen": 353021425, + "step": 16358, + "time_per_iteration": 2.5254104137420654 + }, + { + "auxiliary_loss_clip": 0.01090765, + "auxiliary_loss_mlp": 0.01024899, + "balance_loss_clip": 1.03239131, + "balance_loss_mlp": 1.01312661, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 2.0339416150649803, + "language_loss": 0.6736474, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.69480407, + "num_input_tokens_seen": 353039870, + "step": 16359, + "time_per_iteration": 2.5015368461608887 + }, + { + "auxiliary_loss_clip": 0.01099383, + "auxiliary_loss_mlp": 0.01028815, + "balance_loss_clip": 1.03327489, + "balance_loss_mlp": 1.01771712, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 1.6223104007509754, + "language_loss": 0.6989218, + "learning_rate": 2.804824870920264e-09, + "loss": 0.72020382, + "num_input_tokens_seen": 353059750, + "step": 16360, + "time_per_iteration": 2.466033458709717 + }, + { + "auxiliary_loss_clip": 0.0109428, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.03518283, + "balance_loss_mlp": 1.02036476, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 1.6723506026939714, + "language_loss": 0.83711302, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.85838151, + "num_input_tokens_seen": 353079940, + "step": 16361, + "time_per_iteration": 2.5217785835266113 + }, + { + "auxiliary_loss_clip": 0.0110121, + "auxiliary_loss_mlp": 0.01025895, + "balance_loss_clip": 1.03327465, + "balance_loss_mlp": 1.01486206, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 1.6584618615378208, + "language_loss": 0.7601586, + "learning_rate": 2.76373855876022e-09, + "loss": 0.78142971, + "num_input_tokens_seen": 353099990, + "step": 16362, + "time_per_iteration": 2.501152276992798 + }, + { + "auxiliary_loss_clip": 0.01104087, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.03563762, + "balance_loss_mlp": 1.0210067, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 1.6251684484034803, + "language_loss": 0.71156061, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73293656, + "num_input_tokens_seen": 353118710, + "step": 16363, + "time_per_iteration": 2.4234185218811035 + }, + { + "auxiliary_loss_clip": 0.01076422, + "auxiliary_loss_mlp": 0.01026933, + "balance_loss_clip": 1.03210402, + "balance_loss_mlp": 1.01612687, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 1.7194852286541413, + "language_loss": 0.62849605, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.64952958, + "num_input_tokens_seen": 353136415, + "step": 16364, + "time_per_iteration": 2.5013372898101807 + }, + { + "auxiliary_loss_clip": 0.01060748, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.03485012, + "balance_loss_mlp": 1.01693249, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 1.5674456888406045, + "language_loss": 0.75232524, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77320647, + "num_input_tokens_seen": 353154650, + "step": 16365, + "time_per_iteration": 2.5627031326293945 + }, + { + "auxiliary_loss_clip": 0.01057922, + "auxiliary_loss_mlp": 0.0102688, + "balance_loss_clip": 1.0356195, + "balance_loss_mlp": 1.01433921, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 1.9394514797153193, + "language_loss": 0.75963855, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78048658, + "num_input_tokens_seen": 353174065, + "step": 16366, + "time_per_iteration": 2.591057777404785 + }, + { + "auxiliary_loss_clip": 0.01099674, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.03330803, + "balance_loss_mlp": 1.01476145, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 1.5616697336729695, + "language_loss": 0.77277398, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79402906, + "num_input_tokens_seen": 353193560, + "step": 16367, + "time_per_iteration": 2.4849090576171875 + }, + { + "auxiliary_loss_clip": 0.01081812, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.0391233, + "balance_loss_mlp": 1.01983082, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 1.4961212436302398, + "language_loss": 0.61511934, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63625747, + "num_input_tokens_seen": 353213525, + "step": 16368, + "time_per_iteration": 2.536449670791626 + }, + { + "auxiliary_loss_clip": 0.01087623, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.03261721, + "balance_loss_mlp": 1.02052271, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 1.450157705797853, + "language_loss": 0.6520294, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67321873, + "num_input_tokens_seen": 353234000, + "step": 16369, + "time_per_iteration": 2.498229742050171 + }, + { + "auxiliary_loss_clip": 0.01092913, + "auxiliary_loss_mlp": 0.00784922, + "balance_loss_clip": 1.0344553, + "balance_loss_mlp": 1.01225054, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 1.5193868640707344, + "language_loss": 0.68693888, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.70571721, + "num_input_tokens_seen": 353254940, + "step": 16370, + "time_per_iteration": 2.5134880542755127 + }, + { + "auxiliary_loss_clip": 0.01103148, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.03327441, + "balance_loss_mlp": 1.02161193, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 1.852319740814137, + "language_loss": 0.73651934, + "learning_rate": 2.582599145159792e-09, + "loss": 0.75789583, + "num_input_tokens_seen": 353272590, + "step": 16371, + "time_per_iteration": 2.409574508666992 + }, + { + "auxiliary_loss_clip": 0.01018938, + "auxiliary_loss_mlp": 0.01003733, + "balance_loss_clip": 1.00539875, + "balance_loss_mlp": 1.00253451, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.7772744382690722, + "language_loss": 0.65194851, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67217517, + "num_input_tokens_seen": 353334380, + "step": 16372, + "time_per_iteration": 3.096102476119995 + }, + { + "auxiliary_loss_clip": 0.01089777, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.03335178, + "balance_loss_mlp": 1.01701558, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 2.0511172688755193, + "language_loss": 0.70812571, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.72931433, + "num_input_tokens_seen": 353351640, + "step": 16373, + "time_per_iteration": 2.4361982345581055 + }, + { + "auxiliary_loss_clip": 0.01101584, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.03416502, + "balance_loss_mlp": 1.01792789, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 1.7620228928513137, + "language_loss": 0.81453788, + "learning_rate": 2.523582674173186e-09, + "loss": 0.8358475, + "num_input_tokens_seen": 353372555, + "step": 16374, + "time_per_iteration": 2.4855618476867676 + }, + { + "auxiliary_loss_clip": 0.01057231, + "auxiliary_loss_mlp": 0.01031133, + "balance_loss_clip": 1.03584445, + "balance_loss_mlp": 1.01954532, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 1.683189837763612, + "language_loss": 0.69345498, + "learning_rate": 2.504062005197927e-09, + "loss": 0.71433854, + "num_input_tokens_seen": 353391385, + "step": 16375, + "time_per_iteration": 2.590319871902466 + }, + { + "auxiliary_loss_clip": 0.01080034, + "auxiliary_loss_mlp": 0.01039239, + "balance_loss_clip": 1.03262091, + "balance_loss_mlp": 1.02516079, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 3.3127767278259697, + "language_loss": 0.8098439, + "learning_rate": 2.484617081468521e-09, + "loss": 0.83103657, + "num_input_tokens_seen": 353411630, + "step": 16376, + "time_per_iteration": 3.940930128097534 + }, + { + "auxiliary_loss_clip": 0.01100378, + "auxiliary_loss_mlp": 0.01034024, + "balance_loss_clip": 1.03388739, + "balance_loss_mlp": 1.02189422, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 1.4846253814027757, + "language_loss": 0.62360632, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64495039, + "num_input_tokens_seen": 353432895, + "step": 16377, + "time_per_iteration": 2.5082168579101562 + }, + { + "auxiliary_loss_clip": 0.01074973, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.03551626, + "balance_loss_mlp": 1.02177453, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 1.6249553629422337, + "language_loss": 0.73191273, + "learning_rate": 2.445954472695133e-09, + "loss": 0.7530008, + "num_input_tokens_seen": 353454195, + "step": 16378, + "time_per_iteration": 5.43019437789917 + }, + { + "auxiliary_loss_clip": 0.0110241, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.03422093, + "balance_loss_mlp": 1.0187757, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 2.0257380956088196, + "language_loss": 0.7090708, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73039353, + "num_input_tokens_seen": 353475125, + "step": 16379, + "time_per_iteration": 2.5088512897491455 + }, + { + "auxiliary_loss_clip": 0.01072074, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.0350734, + "balance_loss_mlp": 1.02147388, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 1.8139113836488048, + "language_loss": 0.68565172, + "learning_rate": 2.407594853716999e-09, + "loss": 0.70670813, + "num_input_tokens_seen": 353493265, + "step": 16380, + "time_per_iteration": 2.5210585594177246 + }, + { + "auxiliary_loss_clip": 0.01080479, + "auxiliary_loss_mlp": 0.01038241, + "balance_loss_clip": 1.03402984, + "balance_loss_mlp": 1.0260874, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 2.2562628902082933, + "language_loss": 0.79087758, + "learning_rate": 2.38852866722139e-09, + "loss": 0.81206477, + "num_input_tokens_seen": 353511650, + "step": 16381, + "time_per_iteration": 2.5016374588012695 + }, + { + "auxiliary_loss_clip": 0.01091402, + "auxiliary_loss_mlp": 0.01024539, + "balance_loss_clip": 1.03665292, + "balance_loss_mlp": 1.01279712, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 1.433644330853625, + "language_loss": 0.82463664, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84579605, + "num_input_tokens_seen": 353534035, + "step": 16382, + "time_per_iteration": 2.5678870677948 + }, + { + "auxiliary_loss_clip": 0.01076456, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.03094864, + "balance_loss_mlp": 1.01809955, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 1.9021568185716766, + "language_loss": 0.74119943, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.7622816, + "num_input_tokens_seen": 353549950, + "step": 16383, + "time_per_iteration": 2.5002753734588623 + }, + { + "auxiliary_loss_clip": 0.01057298, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.03427696, + "balance_loss_mlp": 1.01788068, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 1.7519994967218893, + "language_loss": 0.66058612, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.68145508, + "num_input_tokens_seen": 353573745, + "step": 16384, + "time_per_iteration": 2.701334238052368 + }, + { + "auxiliary_loss_clip": 0.01085528, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.03697693, + "balance_loss_mlp": 1.0213294, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 2.1267956981078076, + "language_loss": 0.69963419, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72083879, + "num_input_tokens_seen": 353595335, + "step": 16385, + "time_per_iteration": 2.6667003631591797 + }, + { + "auxiliary_loss_clip": 0.01086493, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.03866506, + "balance_loss_mlp": 1.01854157, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 1.9867278924035572, + "language_loss": 0.80723107, + "learning_rate": 2.294333993509978e-09, + "loss": 0.8284027, + "num_input_tokens_seen": 353614270, + "step": 16386, + "time_per_iteration": 2.5072569847106934 + }, + { + "auxiliary_loss_clip": 0.01074202, + "auxiliary_loss_mlp": 0.01034002, + "balance_loss_clip": 1.03266859, + "balance_loss_mlp": 1.02121043, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 1.8248963648197345, + "language_loss": 0.67944199, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.70052397, + "num_input_tokens_seen": 353634900, + "step": 16387, + "time_per_iteration": 3.976900339126587 + }, + { + "auxiliary_loss_clip": 0.01083975, + "auxiliary_loss_mlp": 0.00781174, + "balance_loss_clip": 1.03213966, + "balance_loss_mlp": 1.00729311, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 1.634834504392881, + "language_loss": 0.73968589, + "learning_rate": 2.257186391438237e-09, + "loss": 0.75833738, + "num_input_tokens_seen": 353652890, + "step": 16388, + "time_per_iteration": 2.474196434020996 + }, + { + "auxiliary_loss_clip": 0.01086049, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.03316593, + "balance_loss_mlp": 1.01669145, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 3.3218398652083034, + "language_loss": 0.81989151, + "learning_rate": 2.238726221962528e-09, + "loss": 0.84103322, + "num_input_tokens_seen": 353671295, + "step": 16389, + "time_per_iteration": 2.494032859802246 + }, + { + "auxiliary_loss_clip": 0.01081909, + "auxiliary_loss_mlp": 0.00782501, + "balance_loss_clip": 1.03306389, + "balance_loss_mlp": 1.00791585, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 2.4124576345985105, + "language_loss": 0.66838658, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.68703067, + "num_input_tokens_seen": 353690560, + "step": 16390, + "time_per_iteration": 2.52402663230896 + }, + { + "auxiliary_loss_clip": 0.01071954, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.03587222, + "balance_loss_mlp": 1.02354228, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 1.6702356700800203, + "language_loss": 0.76914823, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79022729, + "num_input_tokens_seen": 353710660, + "step": 16391, + "time_per_iteration": 2.6345012187957764 + }, + { + "auxiliary_loss_clip": 0.01061503, + "auxiliary_loss_mlp": 0.00781184, + "balance_loss_clip": 1.03257835, + "balance_loss_mlp": 1.00874054, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 2.1037926881890723, + "language_loss": 0.68038183, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.69880873, + "num_input_tokens_seen": 353730440, + "step": 16392, + "time_per_iteration": 2.547722816467285 + }, + { + "auxiliary_loss_clip": 0.01073078, + "auxiliary_loss_mlp": 0.01027473, + "balance_loss_clip": 1.03311801, + "balance_loss_mlp": 1.01446676, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 1.9330714039153296, + "language_loss": 0.56146127, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.58246678, + "num_input_tokens_seen": 353748360, + "step": 16393, + "time_per_iteration": 2.507110834121704 + }, + { + "auxiliary_loss_clip": 0.01070746, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.03306627, + "balance_loss_mlp": 1.01887119, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 2.675336354208727, + "language_loss": 0.78963113, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.81066322, + "num_input_tokens_seen": 353760880, + "step": 16394, + "time_per_iteration": 2.485213279724121 + }, + { + "auxiliary_loss_clip": 0.01092844, + "auxiliary_loss_mlp": 0.01033173, + "balance_loss_clip": 1.03411531, + "balance_loss_mlp": 1.02031624, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 1.4513323002817524, + "language_loss": 0.7601186, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78137875, + "num_input_tokens_seen": 353782255, + "step": 16395, + "time_per_iteration": 2.5158278942108154 + }, + { + "auxiliary_loss_clip": 0.01089809, + "auxiliary_loss_mlp": 0.01028077, + "balance_loss_clip": 1.0331707, + "balance_loss_mlp": 1.01638842, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 1.9734112237516896, + "language_loss": 0.75392056, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.77509952, + "num_input_tokens_seen": 353803580, + "step": 16396, + "time_per_iteration": 2.508225679397583 + }, + { + "auxiliary_loss_clip": 0.01065798, + "auxiliary_loss_mlp": 0.01028408, + "balance_loss_clip": 1.03277206, + "balance_loss_mlp": 1.01641536, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 1.4168023693299208, + "language_loss": 0.70872337, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.7296654, + "num_input_tokens_seen": 353824200, + "step": 16397, + "time_per_iteration": 2.5855350494384766 + }, + { + "auxiliary_loss_clip": 0.01076293, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.03337598, + "balance_loss_mlp": 1.01963139, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 1.6002587979710707, + "language_loss": 0.71314663, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.73421586, + "num_input_tokens_seen": 353843350, + "step": 16398, + "time_per_iteration": 2.5617291927337646 + }, + { + "auxiliary_loss_clip": 0.01067623, + "auxiliary_loss_mlp": 0.01027284, + "balance_loss_clip": 1.03315997, + "balance_loss_mlp": 1.01592302, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 1.40324360610605, + "language_loss": 0.73937106, + "learning_rate": 2.058291183208771e-09, + "loss": 0.76032019, + "num_input_tokens_seen": 353864520, + "step": 16399, + "time_per_iteration": 2.616198778152466 + }, + { + "auxiliary_loss_clip": 0.01103469, + "auxiliary_loss_mlp": 0.01028574, + "balance_loss_clip": 1.03404975, + "balance_loss_mlp": 1.016361, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 2.3886840162759366, + "language_loss": 0.57539302, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.59671342, + "num_input_tokens_seen": 353882240, + "step": 16400, + "time_per_iteration": 2.4415886402130127 + }, + { + "auxiliary_loss_clip": 0.0108292, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.03476167, + "balance_loss_mlp": 1.01675177, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 1.6160401332669558, + "language_loss": 0.80224597, + "learning_rate": 2.023113299582491e-09, + "loss": 0.82336766, + "num_input_tokens_seen": 353901590, + "step": 16401, + "time_per_iteration": 2.4968178272247314 + }, + { + "auxiliary_loss_clip": 0.01090132, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.03445792, + "balance_loss_mlp": 1.01872802, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 1.6925924031663175, + "language_loss": 0.78143024, + "learning_rate": 2.005638002662069e-09, + "loss": 0.80264419, + "num_input_tokens_seen": 353918785, + "step": 16402, + "time_per_iteration": 2.4407482147216797 + }, + { + "auxiliary_loss_clip": 0.01093603, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.03533483, + "balance_loss_mlp": 1.02076101, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 1.6282033437431807, + "language_loss": 0.70046568, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.72172511, + "num_input_tokens_seen": 353940390, + "step": 16403, + "time_per_iteration": 2.534506320953369 + }, + { + "auxiliary_loss_clip": 0.01086682, + "auxiliary_loss_mlp": 0.01030195, + "balance_loss_clip": 1.03111374, + "balance_loss_mlp": 1.01840568, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 1.7769141238024977, + "language_loss": 0.74695563, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76812446, + "num_input_tokens_seen": 353962180, + "step": 16404, + "time_per_iteration": 2.514890193939209 + }, + { + "auxiliary_loss_clip": 0.0109141, + "auxiliary_loss_mlp": 0.00782726, + "balance_loss_clip": 1.03336155, + "balance_loss_mlp": 1.00919676, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 1.8542403859502028, + "language_loss": 0.69935077, + "learning_rate": 1.953666699415768e-09, + "loss": 0.71809214, + "num_input_tokens_seen": 353984305, + "step": 16405, + "time_per_iteration": 2.582247018814087 + }, + { + "auxiliary_loss_clip": 0.01080886, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.03600121, + "balance_loss_mlp": 1.02144289, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 1.5922639662002922, + "language_loss": 0.69509959, + "learning_rate": 1.93649446302846e-09, + "loss": 0.7162348, + "num_input_tokens_seen": 354004495, + "step": 16406, + "time_per_iteration": 2.523158550262451 + }, + { + "auxiliary_loss_clip": 0.01044969, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.03525639, + "balance_loss_mlp": 1.01870251, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 3.098821577809842, + "language_loss": 0.74684882, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.76760685, + "num_input_tokens_seen": 354015985, + "step": 16407, + "time_per_iteration": 2.5548746585845947 + }, + { + "auxiliary_loss_clip": 0.01078357, + "auxiliary_loss_mlp": 0.01031324, + "balance_loss_clip": 1.03291905, + "balance_loss_mlp": 1.01950479, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 2.074242088498079, + "language_loss": 0.77349961, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79459643, + "num_input_tokens_seen": 354033260, + "step": 16408, + "time_per_iteration": 2.459052562713623 + }, + { + "auxiliary_loss_clip": 0.01095239, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.03534436, + "balance_loss_mlp": 1.01772952, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 1.9184932266194012, + "language_loss": 0.67686546, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.69812375, + "num_input_tokens_seen": 354052825, + "step": 16409, + "time_per_iteration": 2.5095601081848145 + }, + { + "auxiliary_loss_clip": 0.0101045, + "auxiliary_loss_mlp": 0.01002053, + "balance_loss_clip": 1.00696087, + "balance_loss_mlp": 1.0009141, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.8037381907236266, + "language_loss": 0.61103487, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.6311599, + "num_input_tokens_seen": 354113920, + "step": 16410, + "time_per_iteration": 3.1759021282196045 + }, + { + "auxiliary_loss_clip": 0.01092354, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.03428948, + "balance_loss_mlp": 1.02200079, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 2.8567072499514836, + "language_loss": 0.66023433, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.68149686, + "num_input_tokens_seen": 354134210, + "step": 16411, + "time_per_iteration": 2.5368194580078125 + }, + { + "auxiliary_loss_clip": 0.01026879, + "auxiliary_loss_mlp": 0.01000929, + "balance_loss_clip": 1.00418532, + "balance_loss_mlp": 0.99990427, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.7271356072175748, + "language_loss": 0.56254053, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58281863, + "num_input_tokens_seen": 354198010, + "step": 16412, + "time_per_iteration": 3.141047477722168 + }, + { + "auxiliary_loss_clip": 0.01074502, + "auxiliary_loss_mlp": 0.01034381, + "balance_loss_clip": 1.03443027, + "balance_loss_mlp": 1.0211904, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 1.6258252693910795, + "language_loss": 0.73027843, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75136727, + "num_input_tokens_seen": 354220000, + "step": 16413, + "time_per_iteration": 2.6072239875793457 + }, + { + "auxiliary_loss_clip": 0.01055138, + "auxiliary_loss_mlp": 0.0102544, + "balance_loss_clip": 1.03313839, + "balance_loss_mlp": 1.01405525, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 1.4201407581356418, + "language_loss": 0.7145226, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.73532832, + "num_input_tokens_seen": 354240910, + "step": 16414, + "time_per_iteration": 3.9915575981140137 + }, + { + "auxiliary_loss_clip": 0.01085222, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.03395319, + "balance_loss_mlp": 1.02381241, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 2.1447131044558514, + "language_loss": 0.70159596, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72280186, + "num_input_tokens_seen": 354259430, + "step": 16415, + "time_per_iteration": 2.4566256999969482 + }, + { + "auxiliary_loss_clip": 0.01061339, + "auxiliary_loss_mlp": 0.01032424, + "balance_loss_clip": 1.03128946, + "balance_loss_mlp": 1.02159345, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 1.38119265379021, + "language_loss": 0.75280213, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77373976, + "num_input_tokens_seen": 354279490, + "step": 16416, + "time_per_iteration": 3.953526258468628 + }, + { + "auxiliary_loss_clip": 0.01078228, + "auxiliary_loss_mlp": 0.01030542, + "balance_loss_clip": 1.03627419, + "balance_loss_mlp": 1.01887107, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 2.4414249029153616, + "language_loss": 0.70838392, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.72947156, + "num_input_tokens_seen": 354295080, + "step": 16417, + "time_per_iteration": 3.876779794692993 + }, + { + "auxiliary_loss_clip": 0.01082074, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.03642893, + "balance_loss_mlp": 1.02110934, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 1.437770589341119, + "language_loss": 0.70450616, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.72566265, + "num_input_tokens_seen": 354314610, + "step": 16418, + "time_per_iteration": 2.524843215942383 + }, + { + "auxiliary_loss_clip": 0.01027044, + "auxiliary_loss_mlp": 0.01002281, + "balance_loss_clip": 1.00436306, + "balance_loss_mlp": 1.00123787, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.6545908431017157, + "language_loss": 0.53676414, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55705738, + "num_input_tokens_seen": 354383115, + "step": 16419, + "time_per_iteration": 3.191831350326538 + }, + { + "auxiliary_loss_clip": 0.01081803, + "auxiliary_loss_mlp": 0.01036751, + "balance_loss_clip": 1.03214645, + "balance_loss_mlp": 1.02342939, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 1.7672420852928092, + "language_loss": 0.78010762, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80129308, + "num_input_tokens_seen": 354403115, + "step": 16420, + "time_per_iteration": 2.5313658714294434 + }, + { + "auxiliary_loss_clip": 0.01066181, + "auxiliary_loss_mlp": 0.01025997, + "balance_loss_clip": 1.03995895, + "balance_loss_mlp": 1.01401639, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 1.5788846792298823, + "language_loss": 0.70717388, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.72809565, + "num_input_tokens_seen": 354424520, + "step": 16421, + "time_per_iteration": 2.571572780609131 + }, + { + "auxiliary_loss_clip": 0.01094758, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.03429401, + "balance_loss_mlp": 1.01968956, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 2.2973214135850015, + "language_loss": 0.82188433, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.84315693, + "num_input_tokens_seen": 354444800, + "step": 16422, + "time_per_iteration": 2.501039981842041 + }, + { + "auxiliary_loss_clip": 0.01068591, + "auxiliary_loss_mlp": 0.01027578, + "balance_loss_clip": 1.03347266, + "balance_loss_mlp": 1.01578212, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 1.6345528640928064, + "language_loss": 0.85876733, + "learning_rate": 1.656159280223779e-09, + "loss": 0.87972903, + "num_input_tokens_seen": 354464590, + "step": 16423, + "time_per_iteration": 2.6315150260925293 + }, + { + "auxiliary_loss_clip": 0.01096694, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.03641129, + "balance_loss_mlp": 1.01765275, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 1.8712927419852974, + "language_loss": 0.70161891, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72288001, + "num_input_tokens_seen": 354484145, + "step": 16424, + "time_per_iteration": 2.4725449085235596 + }, + { + "auxiliary_loss_clip": 0.01093061, + "auxiliary_loss_mlp": 0.00781624, + "balance_loss_clip": 1.03293037, + "balance_loss_mlp": 1.00736153, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 2.0722133987134277, + "language_loss": 0.80861616, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.82736295, + "num_input_tokens_seen": 354502475, + "step": 16425, + "time_per_iteration": 2.5077102184295654 + }, + { + "auxiliary_loss_clip": 0.01052276, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.03168166, + "balance_loss_mlp": 1.02188623, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 2.041304649000619, + "language_loss": 0.79747468, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.81835091, + "num_input_tokens_seen": 354521855, + "step": 16426, + "time_per_iteration": 4.062847852706909 + }, + { + "auxiliary_loss_clip": 0.01093649, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.0364722, + "balance_loss_mlp": 1.01930618, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 1.8902320369145071, + "language_loss": 0.84601408, + "learning_rate": 1.593380599750338e-09, + "loss": 0.86725909, + "num_input_tokens_seen": 354539535, + "step": 16427, + "time_per_iteration": 2.4575417041778564 + }, + { + "auxiliary_loss_clip": 0.01102106, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.03535509, + "balance_loss_mlp": 1.01665664, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 1.7047268113470202, + "language_loss": 0.70424807, + "learning_rate": 1.577875377599458e-09, + "loss": 0.72555441, + "num_input_tokens_seen": 354557430, + "step": 16428, + "time_per_iteration": 2.4707367420196533 + }, + { + "auxiliary_loss_clip": 0.01063565, + "auxiliary_loss_mlp": 0.01030654, + "balance_loss_clip": 1.03202009, + "balance_loss_mlp": 1.01928723, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 2.0134167491860127, + "language_loss": 0.79842854, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.81937075, + "num_input_tokens_seen": 354574735, + "step": 16429, + "time_per_iteration": 2.5267794132232666 + }, + { + "auxiliary_loss_clip": 0.01101304, + "auxiliary_loss_mlp": 0.01027148, + "balance_loss_clip": 1.03391767, + "balance_loss_mlp": 1.0161984, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 1.6079748364419204, + "language_loss": 0.61917436, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.64045882, + "num_input_tokens_seen": 354597050, + "step": 16430, + "time_per_iteration": 2.621305465698242 + }, + { + "auxiliary_loss_clip": 0.01104749, + "auxiliary_loss_mlp": 0.01034604, + "balance_loss_clip": 1.03489745, + "balance_loss_mlp": 1.02259946, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 1.4672224697854215, + "language_loss": 0.73110342, + "learning_rate": 1.531814395687725e-09, + "loss": 0.75249696, + "num_input_tokens_seen": 354619095, + "step": 16431, + "time_per_iteration": 2.510655641555786 + }, + { + "auxiliary_loss_clip": 0.01103465, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.0355587, + "balance_loss_mlp": 1.02062201, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 2.107735415237377, + "language_loss": 0.80264997, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.8240099, + "num_input_tokens_seen": 354633790, + "step": 16432, + "time_per_iteration": 2.4183809757232666 + }, + { + "auxiliary_loss_clip": 0.01088627, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.03255725, + "balance_loss_mlp": 1.01978707, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 1.6614794882308088, + "language_loss": 0.8026787, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.8238709, + "num_input_tokens_seen": 354653180, + "step": 16433, + "time_per_iteration": 2.4745142459869385 + }, + { + "auxiliary_loss_clip": 0.01100664, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.03491962, + "balance_loss_mlp": 1.02053952, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 2.0661015334208215, + "language_loss": 0.64878559, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.67011911, + "num_input_tokens_seen": 354669900, + "step": 16434, + "time_per_iteration": 2.4818968772888184 + }, + { + "auxiliary_loss_clip": 0.01093181, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.03257823, + "balance_loss_mlp": 1.01912332, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 1.6739883779034581, + "language_loss": 0.69321609, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.714463, + "num_input_tokens_seen": 354693165, + "step": 16435, + "time_per_iteration": 2.56654953956604 + }, + { + "auxiliary_loss_clip": 0.01048099, + "auxiliary_loss_mlp": 0.01031582, + "balance_loss_clip": 1.03687155, + "balance_loss_mlp": 1.01953554, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 1.668878350758069, + "language_loss": 0.75496042, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77575719, + "num_input_tokens_seen": 354711915, + "step": 16436, + "time_per_iteration": 2.574218988418579 + }, + { + "auxiliary_loss_clip": 0.01073984, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.03479457, + "balance_loss_mlp": 1.01909304, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 2.007869853503989, + "language_loss": 0.74078333, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.76184154, + "num_input_tokens_seen": 354729135, + "step": 16437, + "time_per_iteration": 2.519456148147583 + }, + { + "auxiliary_loss_clip": 0.01066343, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.03367865, + "balance_loss_mlp": 1.02069676, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 1.4946558106446606, + "language_loss": 0.60048234, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62146372, + "num_input_tokens_seen": 354752530, + "step": 16438, + "time_per_iteration": 2.622162103652954 + }, + { + "auxiliary_loss_clip": 0.01079029, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.03391981, + "balance_loss_mlp": 1.01745319, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 1.7937514205834721, + "language_loss": 0.71949887, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.74059057, + "num_input_tokens_seen": 354771135, + "step": 16439, + "time_per_iteration": 2.498781442642212 + }, + { + "auxiliary_loss_clip": 0.01089392, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.03437948, + "balance_loss_mlp": 1.01712906, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 1.5916403060389754, + "language_loss": 0.5970341, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.61821556, + "num_input_tokens_seen": 354791800, + "step": 16440, + "time_per_iteration": 2.5497617721557617 + }, + { + "auxiliary_loss_clip": 0.01104111, + "auxiliary_loss_mlp": 0.01030249, + "balance_loss_clip": 1.03359556, + "balance_loss_mlp": 1.01817322, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 2.864120877332571, + "language_loss": 0.75976336, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.78110695, + "num_input_tokens_seen": 354809200, + "step": 16441, + "time_per_iteration": 2.3990588188171387 + }, + { + "auxiliary_loss_clip": 0.01080512, + "auxiliary_loss_mlp": 0.01028939, + "balance_loss_clip": 1.03390932, + "balance_loss_mlp": 1.01694655, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 1.8996918805491922, + "language_loss": 0.67818093, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.69927537, + "num_input_tokens_seen": 354829945, + "step": 16442, + "time_per_iteration": 2.67311429977417 + }, + { + "auxiliary_loss_clip": 0.01090283, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.03409612, + "balance_loss_mlp": 1.01913428, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 3.054310592024688, + "language_loss": 0.74559879, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.76681066, + "num_input_tokens_seen": 354845055, + "step": 16443, + "time_per_iteration": 2.4313974380493164 + }, + { + "auxiliary_loss_clip": 0.0108166, + "auxiliary_loss_mlp": 0.01029813, + "balance_loss_clip": 1.0329411, + "balance_loss_mlp": 1.01715279, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 1.7798276026043987, + "language_loss": 0.73694032, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75805509, + "num_input_tokens_seen": 354864680, + "step": 16444, + "time_per_iteration": 2.5240495204925537 + }, + { + "auxiliary_loss_clip": 0.01055922, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.03533292, + "balance_loss_mlp": 1.0260396, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 1.791040356119224, + "language_loss": 0.69161534, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71256441, + "num_input_tokens_seen": 354885685, + "step": 16445, + "time_per_iteration": 2.600954532623291 + }, + { + "auxiliary_loss_clip": 0.01096087, + "auxiliary_loss_mlp": 0.01025653, + "balance_loss_clip": 1.03660905, + "balance_loss_mlp": 1.01332068, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 2.581257845665943, + "language_loss": 0.60242319, + "learning_rate": 1.311740377491155e-09, + "loss": 0.62364066, + "num_input_tokens_seen": 354901505, + "step": 16446, + "time_per_iteration": 2.453902244567871 + }, + { + "auxiliary_loss_clip": 0.01082656, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.03458631, + "balance_loss_mlp": 1.02286458, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 2.0343570679751624, + "language_loss": 0.7076813, + "learning_rate": 1.297675079582783e-09, + "loss": 0.72884834, + "num_input_tokens_seen": 354920060, + "step": 16447, + "time_per_iteration": 2.4692814350128174 + }, + { + "auxiliary_loss_clip": 0.01101531, + "auxiliary_loss_mlp": 0.00780957, + "balance_loss_clip": 1.0340097, + "balance_loss_mlp": 1.00732303, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 1.821247505641219, + "language_loss": 0.83643174, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.85525668, + "num_input_tokens_seen": 354938690, + "step": 16448, + "time_per_iteration": 2.473247766494751 + }, + { + "auxiliary_loss_clip": 0.01087492, + "auxiliary_loss_mlp": 0.01024023, + "balance_loss_clip": 1.03307891, + "balance_loss_mlp": 1.01362181, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 1.4575548840814965, + "language_loss": 0.70259541, + "learning_rate": 1.26977185727406e-09, + "loss": 0.72371054, + "num_input_tokens_seen": 354956955, + "step": 16449, + "time_per_iteration": 2.450620174407959 + }, + { + "auxiliary_loss_clip": 0.01094052, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.03400874, + "balance_loss_mlp": 1.01708043, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 2.375336117168634, + "language_loss": 0.73670459, + "learning_rate": 1.25593393393153e-09, + "loss": 0.75793123, + "num_input_tokens_seen": 354976800, + "step": 16450, + "time_per_iteration": 2.580023765563965 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.03259814, + "balance_loss_mlp": 1.01712859, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 1.7957007395828775, + "language_loss": 0.79604566, + "learning_rate": 1.242171803164549e-09, + "loss": 0.81737363, + "num_input_tokens_seen": 354996625, + "step": 16451, + "time_per_iteration": 2.426565408706665 + }, + { + "auxiliary_loss_clip": 0.01067631, + "auxiliary_loss_mlp": 0.01034186, + "balance_loss_clip": 1.03445745, + "balance_loss_mlp": 1.02187157, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 1.9149674334374505, + "language_loss": 0.69962287, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.72064102, + "num_input_tokens_seen": 355014535, + "step": 16452, + "time_per_iteration": 2.570059299468994 + }, + { + "auxiliary_loss_clip": 0.01099722, + "auxiliary_loss_mlp": 0.01026481, + "balance_loss_clip": 1.0346179, + "balance_loss_mlp": 1.0158, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 1.6596035561829157, + "language_loss": 0.7416023, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.76286435, + "num_input_tokens_seen": 355033280, + "step": 16453, + "time_per_iteration": 3.856184244155884 + }, + { + "auxiliary_loss_clip": 0.01062608, + "auxiliary_loss_mlp": 0.01034895, + "balance_loss_clip": 1.03394961, + "balance_loss_mlp": 1.02345061, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 2.2520938459452813, + "language_loss": 0.69297361, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.71394867, + "num_input_tokens_seen": 355053320, + "step": 16454, + "time_per_iteration": 2.613281488418579 + }, + { + "auxiliary_loss_clip": 0.01073451, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.03346038, + "balance_loss_mlp": 1.01857126, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 1.8425586890104222, + "language_loss": 0.7602663, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.7813037, + "num_input_tokens_seen": 355070230, + "step": 16455, + "time_per_iteration": 3.9261350631713867 + }, + { + "auxiliary_loss_clip": 0.01073605, + "auxiliary_loss_mlp": 0.01023763, + "balance_loss_clip": 1.03423619, + "balance_loss_mlp": 1.01203835, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 1.6544613205331633, + "language_loss": 0.65659112, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.6775648, + "num_input_tokens_seen": 355090125, + "step": 16456, + "time_per_iteration": 3.901376247406006 + }, + { + "auxiliary_loss_clip": 0.01094281, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.03545141, + "balance_loss_mlp": 1.01838207, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 1.7338063533888701, + "language_loss": 0.74136579, + "learning_rate": 1.161190691666203e-09, + "loss": 0.76261109, + "num_input_tokens_seen": 355107890, + "step": 16457, + "time_per_iteration": 2.4384751319885254 + }, + { + "auxiliary_loss_clip": 0.01103453, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.03525651, + "balance_loss_mlp": 1.0151701, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 2.057235615271389, + "language_loss": 0.68741465, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.70871639, + "num_input_tokens_seen": 355126340, + "step": 16458, + "time_per_iteration": 2.5709078311920166 + }, + { + "auxiliary_loss_clip": 0.01088879, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.03372788, + "balance_loss_mlp": 1.01671994, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 1.730903718665965, + "language_loss": 0.79195976, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81313455, + "num_input_tokens_seen": 355144025, + "step": 16459, + "time_per_iteration": 2.448833465576172 + }, + { + "auxiliary_loss_clip": 0.01078823, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.03500676, + "balance_loss_mlp": 1.02249014, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 2.7415807171000153, + "language_loss": 0.70486844, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.72599959, + "num_input_tokens_seen": 355163125, + "step": 16460, + "time_per_iteration": 2.5391249656677246 + }, + { + "auxiliary_loss_clip": 0.01082678, + "auxiliary_loss_mlp": 0.01026408, + "balance_loss_clip": 1.03270626, + "balance_loss_mlp": 1.01418924, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 1.526694438059555, + "language_loss": 0.87473035, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.89582121, + "num_input_tokens_seen": 355184060, + "step": 16461, + "time_per_iteration": 2.5636961460113525 + }, + { + "auxiliary_loss_clip": 0.01091645, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.03419363, + "balance_loss_mlp": 1.01858699, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 1.649883411388797, + "language_loss": 0.62967896, + "learning_rate": 1.09579082189315e-09, + "loss": 0.65090823, + "num_input_tokens_seen": 355204505, + "step": 16462, + "time_per_iteration": 2.5204575061798096 + }, + { + "auxiliary_loss_clip": 0.01095541, + "auxiliary_loss_mlp": 0.01028486, + "balance_loss_clip": 1.03709817, + "balance_loss_mlp": 1.01698864, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 1.7394982597073374, + "language_loss": 0.72770381, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.7489441, + "num_input_tokens_seen": 355223055, + "step": 16463, + "time_per_iteration": 2.4783084392547607 + }, + { + "auxiliary_loss_clip": 0.01092839, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.03454018, + "balance_loss_mlp": 1.01351178, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 2.066418675120834, + "language_loss": 0.70160204, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.72278893, + "num_input_tokens_seen": 355242000, + "step": 16464, + "time_per_iteration": 2.490875482559204 + }, + { + "auxiliary_loss_clip": 0.01074852, + "auxiliary_loss_mlp": 0.01027915, + "balance_loss_clip": 1.03482592, + "balance_loss_mlp": 1.0158149, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 1.9677920931170145, + "language_loss": 0.73314351, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75417113, + "num_input_tokens_seen": 355260175, + "step": 16465, + "time_per_iteration": 3.9032723903656006 + }, + { + "auxiliary_loss_clip": 0.01100489, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.03339791, + "balance_loss_mlp": 1.02070463, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 1.8661807326664825, + "language_loss": 0.86538041, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.88669825, + "num_input_tokens_seen": 355281930, + "step": 16466, + "time_per_iteration": 2.4685120582580566 + }, + { + "auxiliary_loss_clip": 0.01073942, + "auxiliary_loss_mlp": 0.01024184, + "balance_loss_clip": 1.03491151, + "balance_loss_mlp": 1.01271653, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 1.7073609407303227, + "language_loss": 0.71649468, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.73747593, + "num_input_tokens_seen": 355301555, + "step": 16467, + "time_per_iteration": 2.5478591918945312 + }, + { + "auxiliary_loss_clip": 0.0107539, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.03218079, + "balance_loss_mlp": 1.02037072, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 1.42687668494041, + "language_loss": 0.65095133, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67203516, + "num_input_tokens_seen": 355324925, + "step": 16468, + "time_per_iteration": 2.589926242828369 + }, + { + "auxiliary_loss_clip": 0.01075894, + "auxiliary_loss_mlp": 0.01031882, + "balance_loss_clip": 1.03496504, + "balance_loss_mlp": 1.02000296, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 1.892468697759678, + "language_loss": 0.61983585, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.64091361, + "num_input_tokens_seen": 355343875, + "step": 16469, + "time_per_iteration": 2.5955936908721924 + }, + { + "auxiliary_loss_clip": 0.01069636, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.03397727, + "balance_loss_mlp": 1.0170573, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 2.722461169098648, + "language_loss": 0.70679772, + "learning_rate": 9.950925847685976e-10, + "loss": 0.72778875, + "num_input_tokens_seen": 355358835, + "step": 16470, + "time_per_iteration": 2.496119260787964 + }, + { + "auxiliary_loss_clip": 0.01016609, + "auxiliary_loss_mlp": 0.01002717, + "balance_loss_clip": 1.00432324, + "balance_loss_mlp": 1.00163865, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6652069418245136, + "language_loss": 0.55515206, + "learning_rate": 9.828464112755509e-10, + "loss": 0.5753454, + "num_input_tokens_seen": 355431225, + "step": 16471, + "time_per_iteration": 3.2695038318634033 + }, + { + "auxiliary_loss_clip": 0.01082922, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.03538394, + "balance_loss_mlp": 1.01997435, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 3.1709504476574777, + "language_loss": 0.83570486, + "learning_rate": 9.706760407131032e-10, + "loss": 0.8568542, + "num_input_tokens_seen": 355448250, + "step": 16472, + "time_per_iteration": 2.498241662979126 + }, + { + "auxiliary_loss_clip": 0.01091973, + "auxiliary_loss_mlp": 0.01024389, + "balance_loss_clip": 1.03490067, + "balance_loss_mlp": 1.01333272, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 1.9630346826471112, + "language_loss": 0.85555786, + "learning_rate": 9.585814735431075e-10, + "loss": 0.8767215, + "num_input_tokens_seen": 355467040, + "step": 16473, + "time_per_iteration": 2.4860990047454834 + }, + { + "auxiliary_loss_clip": 0.01101117, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.03344619, + "balance_loss_mlp": 1.0189395, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 1.6239831631023984, + "language_loss": 0.84421873, + "learning_rate": 9.465627102240859e-10, + "loss": 0.86552697, + "num_input_tokens_seen": 355487825, + "step": 16474, + "time_per_iteration": 2.4751710891723633 + }, + { + "auxiliary_loss_clip": 0.01076373, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.03068423, + "balance_loss_mlp": 1.02379394, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 1.7447494717227168, + "language_loss": 0.76442587, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78554344, + "num_input_tokens_seen": 355507445, + "step": 16475, + "time_per_iteration": 2.5232841968536377 + }, + { + "auxiliary_loss_clip": 0.01061262, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.03061485, + "balance_loss_mlp": 1.02052748, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 1.6349725429082336, + "language_loss": 0.75704241, + "learning_rate": 9.227525969588423e-10, + "loss": 0.77799183, + "num_input_tokens_seen": 355527205, + "step": 16476, + "time_per_iteration": 2.5665366649627686 + }, + { + "auxiliary_loss_clip": 0.01095728, + "auxiliary_loss_mlp": 0.00785509, + "balance_loss_clip": 1.0341146, + "balance_loss_mlp": 1.01081657, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 1.9783925973808456, + "language_loss": 0.67739749, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69620991, + "num_input_tokens_seen": 355544740, + "step": 16477, + "time_per_iteration": 2.501646041870117 + }, + { + "auxiliary_loss_clip": 0.01087058, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.03516757, + "balance_loss_mlp": 1.01829123, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 2.231345553149873, + "language_loss": 0.71855807, + "learning_rate": 8.992457045289282e-10, + "loss": 0.73973799, + "num_input_tokens_seen": 355564385, + "step": 16478, + "time_per_iteration": 2.5374014377593994 + }, + { + "auxiliary_loss_clip": 0.01105373, + "auxiliary_loss_mlp": 0.01037362, + "balance_loss_clip": 1.03615808, + "balance_loss_mlp": 1.02439249, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 2.068658948736616, + "language_loss": 0.81030357, + "learning_rate": 8.876059672433545e-10, + "loss": 0.83173096, + "num_input_tokens_seen": 355579260, + "step": 16479, + "time_per_iteration": 2.4339823722839355 + }, + { + "auxiliary_loss_clip": 0.01094579, + "auxiliary_loss_mlp": 0.01033638, + "balance_loss_clip": 1.03484523, + "balance_loss_mlp": 1.02206838, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 1.467783419177516, + "language_loss": 0.66411066, + "learning_rate": 8.760420364999355e-10, + "loss": 0.6853928, + "num_input_tokens_seen": 355599790, + "step": 16480, + "time_per_iteration": 2.5163028240203857 + }, + { + "auxiliary_loss_clip": 0.01090598, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.03384936, + "balance_loss_mlp": 1.01770854, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 2.654187102925779, + "language_loss": 0.72217774, + "learning_rate": 8.645539127374313e-10, + "loss": 0.74338186, + "num_input_tokens_seen": 355620925, + "step": 16481, + "time_per_iteration": 2.598353385925293 + }, + { + "auxiliary_loss_clip": 0.01089912, + "auxiliary_loss_mlp": 0.01022218, + "balance_loss_clip": 1.03334725, + "balance_loss_mlp": 1.01077998, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 1.7698561356912261, + "language_loss": 0.77788526, + "learning_rate": 8.531415963912713e-10, + "loss": 0.79900658, + "num_input_tokens_seen": 355639165, + "step": 16482, + "time_per_iteration": 2.447971820831299 + }, + { + "auxiliary_loss_clip": 0.01094194, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.03377986, + "balance_loss_mlp": 1.01663601, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 2.017782454029079, + "language_loss": 0.75476611, + "learning_rate": 8.418050878944427e-10, + "loss": 0.77599216, + "num_input_tokens_seen": 355657320, + "step": 16483, + "time_per_iteration": 2.467625141143799 + }, + { + "auxiliary_loss_clip": 0.0101852, + "auxiliary_loss_mlp": 0.01000424, + "balance_loss_clip": 1.00456882, + "balance_loss_mlp": 0.99923819, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.6723156978369148, + "language_loss": 0.53661764, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55680716, + "num_input_tokens_seen": 355726370, + "step": 16484, + "time_per_iteration": 3.2017130851745605 + }, + { + "auxiliary_loss_clip": 0.01097532, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.03313732, + "balance_loss_mlp": 1.01847577, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 1.7676341789103007, + "language_loss": 0.8181836, + "learning_rate": 8.19359496165184e-10, + "loss": 0.83945942, + "num_input_tokens_seen": 355745840, + "step": 16485, + "time_per_iteration": 2.4401137828826904 + }, + { + "auxiliary_loss_clip": 0.01061836, + "auxiliary_loss_mlp": 0.01034429, + "balance_loss_clip": 1.03164911, + "balance_loss_mlp": 1.02154899, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 1.7141761542571903, + "language_loss": 0.81553108, + "learning_rate": 8.082504137836288e-10, + "loss": 0.83649373, + "num_input_tokens_seen": 355763385, + "step": 16486, + "time_per_iteration": 2.507490634918213 + }, + { + "auxiliary_loss_clip": 0.01094298, + "auxiliary_loss_mlp": 0.01029825, + "balance_loss_clip": 1.03479803, + "balance_loss_mlp": 1.01808274, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 1.4689591252091678, + "language_loss": 0.66243786, + "learning_rate": 7.972171409538209e-10, + "loss": 0.6836791, + "num_input_tokens_seen": 355786075, + "step": 16487, + "time_per_iteration": 2.6409401893615723 + }, + { + "auxiliary_loss_clip": 0.01087912, + "auxiliary_loss_mlp": 0.00779666, + "balance_loss_clip": 1.03254092, + "balance_loss_mlp": 1.00645256, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 1.5416857604176755, + "language_loss": 0.76516861, + "learning_rate": 7.862596780936481e-10, + "loss": 0.78384441, + "num_input_tokens_seen": 355806295, + "step": 16488, + "time_per_iteration": 2.5030152797698975 + }, + { + "auxiliary_loss_clip": 0.01075645, + "auxiliary_loss_mlp": 0.01026566, + "balance_loss_clip": 1.03402138, + "balance_loss_mlp": 1.01412606, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 4.078182394540024, + "language_loss": 0.68831784, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70933998, + "num_input_tokens_seen": 355825730, + "step": 16489, + "time_per_iteration": 2.595344066619873 + }, + { + "auxiliary_loss_clip": 0.00996913, + "auxiliary_loss_mlp": 0.01003394, + "balance_loss_clip": 1.00610209, + "balance_loss_mlp": 1.00230968, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 0.6088265247852145, + "language_loss": 0.52593273, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54593581, + "num_input_tokens_seen": 355891545, + "step": 16490, + "time_per_iteration": 3.247739315032959 + }, + { + "auxiliary_loss_clip": 0.0107658, + "auxiliary_loss_mlp": 0.0103737, + "balance_loss_clip": 1.03413177, + "balance_loss_mlp": 1.0234046, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 1.6517410641845605, + "language_loss": 0.75405085, + "learning_rate": 7.538421534734052e-10, + "loss": 0.77519035, + "num_input_tokens_seen": 355909920, + "step": 16491, + "time_per_iteration": 3.958491086959839 + }, + { + "auxiliary_loss_clip": 0.0106208, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.03633142, + "balance_loss_mlp": 1.02004218, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 2.062646050614676, + "language_loss": 0.70512152, + "learning_rate": 7.431879346191383e-10, + "loss": 0.72606719, + "num_input_tokens_seen": 355923130, + "step": 16492, + "time_per_iteration": 2.603111982345581 + }, + { + "auxiliary_loss_clip": 0.01068069, + "auxiliary_loss_mlp": 0.01031266, + "balance_loss_clip": 1.03259706, + "balance_loss_mlp": 1.01820624, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 2.0247022237389647, + "language_loss": 0.68460029, + "learning_rate": 7.326095277837563e-10, + "loss": 0.70559371, + "num_input_tokens_seen": 355941960, + "step": 16493, + "time_per_iteration": 2.545336961746216 + }, + { + "auxiliary_loss_clip": 0.01077955, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.0350616, + "balance_loss_mlp": 1.02237678, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 1.8309470418312739, + "language_loss": 0.71162868, + "learning_rate": 7.221069333678276e-10, + "loss": 0.7327534, + "num_input_tokens_seen": 355961640, + "step": 16494, + "time_per_iteration": 4.030745983123779 + }, + { + "auxiliary_loss_clip": 0.01093387, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.03472149, + "balance_loss_mlp": 1.01912546, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 2.365164181629142, + "language_loss": 0.67867768, + "learning_rate": 7.116801517701443e-10, + "loss": 0.69993126, + "num_input_tokens_seen": 355977980, + "step": 16495, + "time_per_iteration": 2.469764232635498 + }, + { + "auxiliary_loss_clip": 0.01010026, + "auxiliary_loss_mlp": 0.01000564, + "balance_loss_clip": 1.0067091, + "balance_loss_mlp": 0.99949133, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.7238204988516688, + "language_loss": 0.53510422, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55521011, + "num_input_tokens_seen": 356042900, + "step": 16496, + "time_per_iteration": 3.2301788330078125 + }, + { + "auxiliary_loss_clip": 0.01079426, + "auxiliary_loss_mlp": 0.00785684, + "balance_loss_clip": 1.03372121, + "balance_loss_mlp": 1.00908518, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 14.174156197875808, + "language_loss": 0.7158879, + "learning_rate": 6.91054028607585e-10, + "loss": 0.73453903, + "num_input_tokens_seen": 356063000, + "step": 16497, + "time_per_iteration": 2.599478006362915 + }, + { + "auxiliary_loss_clip": 0.01074501, + "auxiliary_loss_mlp": 0.01029548, + "balance_loss_clip": 1.03391826, + "balance_loss_mlp": 1.01661968, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 2.245758768760357, + "language_loss": 0.82269663, + "learning_rate": 6.808546878249721e-10, + "loss": 0.84373713, + "num_input_tokens_seen": 356078130, + "step": 16498, + "time_per_iteration": 2.5504136085510254 + }, + { + "auxiliary_loss_clip": 0.01069694, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.03642023, + "balance_loss_mlp": 1.02506423, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 1.6163941954597265, + "language_loss": 0.68231177, + "learning_rate": 6.707311614246869e-10, + "loss": 0.70338452, + "num_input_tokens_seen": 356101655, + "step": 16499, + "time_per_iteration": 2.646113395690918 + }, + { + "auxiliary_loss_clip": 0.01105652, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.03607833, + "balance_loss_mlp": 1.01501524, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 2.894901545538992, + "language_loss": 0.82274354, + "learning_rate": 6.606834497904223e-10, + "loss": 0.84406674, + "num_input_tokens_seen": 356121425, + "step": 16500, + "time_per_iteration": 2.456714391708374 + }, + { + "auxiliary_loss_clip": 0.0108401, + "auxiliary_loss_mlp": 0.01030583, + "balance_loss_clip": 1.0353694, + "balance_loss_mlp": 1.01810753, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 1.8549351730428476, + "language_loss": 0.82064319, + "learning_rate": 6.507115533036511e-10, + "loss": 0.84178913, + "num_input_tokens_seen": 356140710, + "step": 16501, + "time_per_iteration": 2.5550851821899414 + }, + { + "auxiliary_loss_clip": 0.0109538, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.03569913, + "balance_loss_mlp": 1.01625454, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 1.868956153883237, + "language_loss": 0.76868397, + "learning_rate": 6.408154723420711e-10, + "loss": 0.78992343, + "num_input_tokens_seen": 356159835, + "step": 16502, + "time_per_iteration": 2.483881711959839 + }, + { + "auxiliary_loss_clip": 0.01078233, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.03368258, + "balance_loss_mlp": 1.01760495, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 2.083895387085456, + "language_loss": 0.71414047, + "learning_rate": 6.309952072811597e-10, + "loss": 0.73523188, + "num_input_tokens_seen": 356177555, + "step": 16503, + "time_per_iteration": 2.4898693561553955 + }, + { + "auxiliary_loss_clip": 0.01020338, + "auxiliary_loss_mlp": 0.01010749, + "balance_loss_clip": 1.0078516, + "balance_loss_mlp": 1.00944924, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.6523752280088663, + "language_loss": 0.55111074, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57142162, + "num_input_tokens_seen": 356244975, + "step": 16504, + "time_per_iteration": 4.587971448898315 + }, + { + "auxiliary_loss_clip": 0.01070465, + "auxiliary_loss_mlp": 0.01025453, + "balance_loss_clip": 1.03311455, + "balance_loss_mlp": 1.01460552, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 1.809670402358836, + "language_loss": 0.69274271, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71370184, + "num_input_tokens_seen": 356262605, + "step": 16505, + "time_per_iteration": 2.549145460128784 + }, + { + "auxiliary_loss_clip": 0.01067792, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.03285122, + "balance_loss_mlp": 1.01542306, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 2.369216998845462, + "language_loss": 0.6553821, + "learning_rate": 6.019893112119146e-10, + "loss": 0.6763463, + "num_input_tokens_seen": 356278935, + "step": 16506, + "time_per_iteration": 2.562612771987915 + }, + { + "auxiliary_loss_clip": 0.01040049, + "auxiliary_loss_mlp": 0.0102977, + "balance_loss_clip": 1.03287995, + "balance_loss_mlp": 1.01713991, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 1.7830816531924956, + "language_loss": 0.63135743, + "learning_rate": 5.924723134487219e-10, + "loss": 0.65205562, + "num_input_tokens_seen": 356295675, + "step": 16507, + "time_per_iteration": 2.58758282661438 + }, + { + "auxiliary_loss_clip": 0.0110324, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.03425431, + "balance_loss_mlp": 1.0182265, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 2.2522685186945486, + "language_loss": 0.72900707, + "learning_rate": 5.830311334193983e-10, + "loss": 0.75033969, + "num_input_tokens_seen": 356312885, + "step": 16508, + "time_per_iteration": 2.427483081817627 + }, + { + "auxiliary_loss_clip": 0.01103195, + "auxiliary_loss_mlp": 0.01028799, + "balance_loss_clip": 1.03418183, + "balance_loss_mlp": 1.0165205, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 1.521504066835369, + "language_loss": 0.70677674, + "learning_rate": 5.736657714818793e-10, + "loss": 0.72809666, + "num_input_tokens_seen": 356334070, + "step": 16509, + "time_per_iteration": 2.4735260009765625 + }, + { + "auxiliary_loss_clip": 0.01092442, + "auxiliary_loss_mlp": 0.01033809, + "balance_loss_clip": 1.03302646, + "balance_loss_mlp": 1.02187645, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 1.5847121218277973, + "language_loss": 0.68680686, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70806938, + "num_input_tokens_seen": 356359410, + "step": 16510, + "time_per_iteration": 2.8422091007232666 + }, + { + "auxiliary_loss_clip": 0.01070211, + "auxiliary_loss_mlp": 0.01030973, + "balance_loss_clip": 1.03422379, + "balance_loss_mlp": 1.01877785, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 2.1300847986677023, + "language_loss": 0.81129414, + "learning_rate": 5.551625032997886e-10, + "loss": 0.83230603, + "num_input_tokens_seen": 356378345, + "step": 16511, + "time_per_iteration": 2.5496416091918945 + }, + { + "auxiliary_loss_clip": 0.01059309, + "auxiliary_loss_mlp": 0.01033809, + "balance_loss_clip": 1.03138828, + "balance_loss_mlp": 1.02213871, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 1.8714666594477027, + "language_loss": 0.91973662, + "learning_rate": 5.460245977570998e-10, + "loss": 0.94066775, + "num_input_tokens_seen": 356397345, + "step": 16512, + "time_per_iteration": 2.5928525924682617 + }, + { + "auxiliary_loss_clip": 0.01000766, + "auxiliary_loss_mlp": 0.01001226, + "balance_loss_clip": 1.00819206, + "balance_loss_mlp": 1.00024819, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.7298586358038401, + "language_loss": 0.55166966, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57168961, + "num_input_tokens_seen": 356459160, + "step": 16513, + "time_per_iteration": 3.2705729007720947 + }, + { + "auxiliary_loss_clip": 0.01074362, + "auxiliary_loss_mlp": 0.0102818, + "balance_loss_clip": 1.03301585, + "balance_loss_mlp": 1.01598454, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 1.3707993044343254, + "language_loss": 0.65080255, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67182797, + "num_input_tokens_seen": 356486405, + "step": 16514, + "time_per_iteration": 2.8496253490448 + }, + { + "auxiliary_loss_clip": 0.01073355, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.03300953, + "balance_loss_mlp": 1.01508462, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 2.0759117057834064, + "language_loss": 0.73188823, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75290459, + "num_input_tokens_seen": 356502905, + "step": 16515, + "time_per_iteration": 2.5331003665924072 + }, + { + "auxiliary_loss_clip": 0.01061912, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.03355086, + "balance_loss_mlp": 1.02390456, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 1.4095818371979523, + "language_loss": 0.77319306, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79418206, + "num_input_tokens_seen": 356523830, + "step": 16516, + "time_per_iteration": 2.596958875656128 + }, + { + "auxiliary_loss_clip": 0.01070182, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.03244185, + "balance_loss_mlp": 1.01669943, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 1.3491478864742905, + "language_loss": 0.77997667, + "learning_rate": 5.014723692997602e-10, + "loss": 0.80095881, + "num_input_tokens_seen": 356543965, + "step": 16517, + "time_per_iteration": 2.6000726222991943 + }, + { + "auxiliary_loss_clip": 0.01091684, + "auxiliary_loss_mlp": 0.01038061, + "balance_loss_clip": 1.03625059, + "balance_loss_mlp": 1.02417874, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 2.293092785455623, + "language_loss": 0.66967887, + "learning_rate": 4.927893858248655e-10, + "loss": 0.69097626, + "num_input_tokens_seen": 356561530, + "step": 16518, + "time_per_iteration": 2.4797003269195557 + }, + { + "auxiliary_loss_clip": 0.01012093, + "auxiliary_loss_mlp": 0.00998223, + "balance_loss_clip": 1.01300466, + "balance_loss_mlp": 0.99704885, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.7268922857060319, + "language_loss": 0.53449887, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55460203, + "num_input_tokens_seen": 356616845, + "step": 16519, + "time_per_iteration": 3.0249719619750977 + }, + { + "auxiliary_loss_clip": 0.01060592, + "auxiliary_loss_mlp": 0.01038487, + "balance_loss_clip": 1.03191447, + "balance_loss_mlp": 1.02571964, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 1.6261714503595328, + "language_loss": 0.6011889, + "learning_rate": 4.756508837426842e-10, + "loss": 0.62217969, + "num_input_tokens_seen": 356633560, + "step": 16520, + "time_per_iteration": 2.618563413619995 + }, + { + "auxiliary_loss_clip": 0.0108181, + "auxiliary_loss_mlp": 0.01031372, + "balance_loss_clip": 1.0350579, + "balance_loss_mlp": 1.01928449, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 1.658699932415915, + "language_loss": 0.61963224, + "learning_rate": 4.671953657853223e-10, + "loss": 0.64076406, + "num_input_tokens_seen": 356657600, + "step": 16521, + "time_per_iteration": 2.6613929271698 + }, + { + "auxiliary_loss_clip": 0.01081402, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.03622448, + "balance_loss_mlp": 1.01891065, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 2.943643603396264, + "language_loss": 0.74303418, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.76416349, + "num_input_tokens_seen": 356675880, + "step": 16522, + "time_per_iteration": 2.5161147117614746 + }, + { + "auxiliary_loss_clip": 0.01069167, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.03369236, + "balance_loss_mlp": 1.01903653, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 1.597211862440945, + "language_loss": 0.73218155, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.75317913, + "num_input_tokens_seen": 356696000, + "step": 16523, + "time_per_iteration": 2.5879104137420654 + }, + { + "auxiliary_loss_clip": 0.01080731, + "auxiliary_loss_mlp": 0.00782354, + "balance_loss_clip": 1.03275061, + "balance_loss_mlp": 1.00671422, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 1.54468569373802, + "language_loss": 0.71182275, + "learning_rate": 4.422837480875241e-10, + "loss": 0.73045349, + "num_input_tokens_seen": 356716845, + "step": 16524, + "time_per_iteration": 2.5271873474121094 + }, + { + "auxiliary_loss_clip": 0.01070159, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.03340364, + "balance_loss_mlp": 1.01978469, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 1.8006544215756555, + "language_loss": 0.79282951, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81384099, + "num_input_tokens_seen": 356732100, + "step": 16525, + "time_per_iteration": 2.547050714492798 + }, + { + "auxiliary_loss_clip": 0.01067077, + "auxiliary_loss_mlp": 0.01025851, + "balance_loss_clip": 1.03583694, + "balance_loss_mlp": 1.01452017, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 2.0926658357511636, + "language_loss": 0.74911249, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.77004176, + "num_input_tokens_seen": 356751480, + "step": 16526, + "time_per_iteration": 2.5579802989959717 + }, + { + "auxiliary_loss_clip": 0.01099194, + "auxiliary_loss_mlp": 0.00781718, + "balance_loss_clip": 1.03287685, + "balance_loss_mlp": 1.0079385, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 1.4241634092991735, + "language_loss": 0.72501278, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74382192, + "num_input_tokens_seen": 356772650, + "step": 16527, + "time_per_iteration": 2.519413948059082 + }, + { + "auxiliary_loss_clip": 0.01082212, + "auxiliary_loss_mlp": 0.01027084, + "balance_loss_clip": 1.03394794, + "balance_loss_mlp": 1.0149188, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 2.2200401480973113, + "language_loss": 0.75956744, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78066039, + "num_input_tokens_seen": 356788510, + "step": 16528, + "time_per_iteration": 2.4676499366760254 + }, + { + "auxiliary_loss_clip": 0.01084365, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.0332973, + "balance_loss_mlp": 1.01506555, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 2.329084625622962, + "language_loss": 0.68282348, + "learning_rate": 4.022808578922898e-10, + "loss": 0.70394933, + "num_input_tokens_seen": 356809115, + "step": 16529, + "time_per_iteration": 2.5335917472839355 + }, + { + "auxiliary_loss_clip": 0.01097568, + "auxiliary_loss_mlp": 0.0103535, + "balance_loss_clip": 1.03628123, + "balance_loss_mlp": 1.02142, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 2.5015567234408076, + "language_loss": 0.65255225, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.67388147, + "num_input_tokens_seen": 356826410, + "step": 16530, + "time_per_iteration": 3.881117582321167 + }, + { + "auxiliary_loss_clip": 0.01088059, + "auxiliary_loss_mlp": 0.01029329, + "balance_loss_clip": 1.03336143, + "balance_loss_mlp": 1.01787865, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 3.051281854558218, + "language_loss": 0.71272612, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.73390001, + "num_input_tokens_seen": 356844990, + "step": 16531, + "time_per_iteration": 2.457932472229004 + }, + { + "auxiliary_loss_clip": 0.01089692, + "auxiliary_loss_mlp": 0.01030315, + "balance_loss_clip": 1.03405285, + "balance_loss_mlp": 1.0176487, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 5.058186284074813, + "language_loss": 0.74242675, + "learning_rate": 3.791890207045512e-10, + "loss": 0.76362675, + "num_input_tokens_seen": 356866530, + "step": 16532, + "time_per_iteration": 3.9516167640686035 + }, + { + "auxiliary_loss_clip": 0.01047322, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.03156555, + "balance_loss_mlp": 1.01765585, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 1.5860605229242386, + "language_loss": 0.70599902, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.72675788, + "num_input_tokens_seen": 356884660, + "step": 16533, + "time_per_iteration": 3.941800355911255 + }, + { + "auxiliary_loss_clip": 0.01092941, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.03686011, + "balance_loss_mlp": 1.01622629, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 1.8745646908565194, + "language_loss": 0.84079063, + "learning_rate": 3.641735912007782e-10, + "loss": 0.86200446, + "num_input_tokens_seen": 356900895, + "step": 16534, + "time_per_iteration": 2.450599193572998 + }, + { + "auxiliary_loss_clip": 0.01062996, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.03182793, + "balance_loss_mlp": 1.01826739, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 1.3393551761619618, + "language_loss": 0.65768456, + "learning_rate": 3.567796158934211e-10, + "loss": 0.67860925, + "num_input_tokens_seen": 356920985, + "step": 16535, + "time_per_iteration": 2.6040236949920654 + }, + { + "auxiliary_loss_clip": 0.01063872, + "auxiliary_loss_mlp": 0.01026454, + "balance_loss_clip": 1.03557253, + "balance_loss_mlp": 1.01566505, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 1.5860370388134188, + "language_loss": 0.64901984, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.66992313, + "num_input_tokens_seen": 356939800, + "step": 16536, + "time_per_iteration": 2.5361204147338867 + }, + { + "auxiliary_loss_clip": 0.01065204, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.03262329, + "balance_loss_mlp": 1.02441382, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 1.7937160985399248, + "language_loss": 0.78449738, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.80553389, + "num_input_tokens_seen": 356957780, + "step": 16537, + "time_per_iteration": 2.530407667160034 + }, + { + "auxiliary_loss_clip": 0.01098663, + "auxiliary_loss_mlp": 0.01032481, + "balance_loss_clip": 1.03510857, + "balance_loss_mlp": 1.01936817, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 1.4517666942224632, + "language_loss": 0.68918395, + "learning_rate": 3.35052651107004e-10, + "loss": 0.71049547, + "num_input_tokens_seen": 356979185, + "step": 16538, + "time_per_iteration": 2.508413553237915 + }, + { + "auxiliary_loss_clip": 0.01061838, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.03091431, + "balance_loss_mlp": 1.02065182, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 1.9884840077714971, + "language_loss": 0.74848974, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.76943558, + "num_input_tokens_seen": 356997735, + "step": 16539, + "time_per_iteration": 2.573502540588379 + }, + { + "auxiliary_loss_clip": 0.01054238, + "auxiliary_loss_mlp": 0.01031955, + "balance_loss_clip": 1.03353834, + "balance_loss_mlp": 1.01947427, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 2.061185529639008, + "language_loss": 0.70490974, + "learning_rate": 3.209471449341361e-10, + "loss": 0.72577167, + "num_input_tokens_seen": 357015660, + "step": 16540, + "time_per_iteration": 2.5730717182159424 + }, + { + "auxiliary_loss_clip": 0.01088427, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.03369987, + "balance_loss_mlp": 1.01658022, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 1.9090727514975214, + "language_loss": 0.75384074, + "learning_rate": 3.140081337600353e-10, + "loss": 0.77499747, + "num_input_tokens_seen": 357034800, + "step": 16541, + "time_per_iteration": 2.503122329711914 + }, + { + "auxiliary_loss_clip": 0.01082008, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.03300536, + "balance_loss_mlp": 1.02086413, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 1.7875042268377968, + "language_loss": 0.76624858, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78739613, + "num_input_tokens_seen": 357053785, + "step": 16542, + "time_per_iteration": 3.8983395099639893 + }, + { + "auxiliary_loss_clip": 0.01093371, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.03423238, + "balance_loss_mlp": 1.01635194, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 2.1037959053843123, + "language_loss": 0.74649221, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.76771343, + "num_input_tokens_seen": 357072025, + "step": 16543, + "time_per_iteration": 2.466060161590576 + }, + { + "auxiliary_loss_clip": 0.0108971, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.03335607, + "balance_loss_mlp": 1.01916051, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 2.314513467839334, + "language_loss": 0.81457132, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.83579195, + "num_input_tokens_seen": 357086960, + "step": 16544, + "time_per_iteration": 2.47878360748291 + }, + { + "auxiliary_loss_clip": 0.0110262, + "auxiliary_loss_mlp": 0.01027475, + "balance_loss_clip": 1.03392172, + "balance_loss_mlp": 1.01545882, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 2.004681638815712, + "language_loss": 0.78613293, + "learning_rate": 2.870103745831187e-10, + "loss": 0.80743384, + "num_input_tokens_seen": 357105095, + "step": 16545, + "time_per_iteration": 2.4189581871032715 + }, + { + "auxiliary_loss_clip": 0.01074368, + "auxiliary_loss_mlp": 0.01029436, + "balance_loss_clip": 1.03380406, + "balance_loss_mlp": 1.01706195, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 1.73062405663195, + "language_loss": 0.72542286, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74646086, + "num_input_tokens_seen": 357125065, + "step": 16546, + "time_per_iteration": 2.596069812774658 + }, + { + "auxiliary_loss_clip": 0.01089321, + "auxiliary_loss_mlp": 0.01034236, + "balance_loss_clip": 1.03297257, + "balance_loss_mlp": 1.02285123, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 2.961944051125757, + "language_loss": 0.77560139, + "learning_rate": 2.739664698798716e-10, + "loss": 0.79683697, + "num_input_tokens_seen": 357141600, + "step": 16547, + "time_per_iteration": 2.457759141921997 + }, + { + "auxiliary_loss_clip": 0.01080764, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.03227794, + "balance_loss_mlp": 1.01768351, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 2.0610085199600543, + "language_loss": 0.69844037, + "learning_rate": 2.67558262122769e-10, + "loss": 0.71953958, + "num_input_tokens_seen": 357157880, + "step": 16548, + "time_per_iteration": 2.5126731395721436 + }, + { + "auxiliary_loss_clip": 0.01090726, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.03514898, + "balance_loss_mlp": 1.02087736, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 3.3184660994876185, + "language_loss": 0.75579756, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.77703398, + "num_input_tokens_seen": 357176705, + "step": 16549, + "time_per_iteration": 2.440439462661743 + }, + { + "auxiliary_loss_clip": 0.01075755, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.03515255, + "balance_loss_mlp": 1.02101612, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 1.7179432323231127, + "language_loss": 0.7461971, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.76729202, + "num_input_tokens_seen": 357197630, + "step": 16550, + "time_per_iteration": 2.6116130352020264 + }, + { + "auxiliary_loss_clip": 0.01053531, + "auxiliary_loss_mlp": 0.00782083, + "balance_loss_clip": 1.03260791, + "balance_loss_mlp": 1.00646257, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 1.418495948830359, + "language_loss": 0.77835631, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.79671252, + "num_input_tokens_seen": 357215445, + "step": 16551, + "time_per_iteration": 2.5817551612854004 + }, + { + "auxiliary_loss_clip": 0.01085435, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.03299546, + "balance_loss_mlp": 1.02007842, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 1.392059953026195, + "language_loss": 0.667068, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68822658, + "num_input_tokens_seen": 357234285, + "step": 16552, + "time_per_iteration": 2.4755401611328125 + }, + { + "auxiliary_loss_clip": 0.01102879, + "auxiliary_loss_mlp": 0.01029258, + "balance_loss_clip": 1.03377008, + "balance_loss_mlp": 1.01717639, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 1.3966898960990746, + "language_loss": 0.81700343, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83832479, + "num_input_tokens_seen": 357257565, + "step": 16553, + "time_per_iteration": 2.5164520740509033 + }, + { + "auxiliary_loss_clip": 0.01016764, + "auxiliary_loss_mlp": 0.0100186, + "balance_loss_clip": 1.00576913, + "balance_loss_mlp": 1.00079942, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.7205071741812054, + "language_loss": 0.5732578, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59344411, + "num_input_tokens_seen": 357320205, + "step": 16554, + "time_per_iteration": 3.1861236095428467 + }, + { + "auxiliary_loss_clip": 0.01096351, + "auxiliary_loss_mlp": 0.01033499, + "balance_loss_clip": 1.036394, + "balance_loss_mlp": 1.02195406, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 1.5499778392395942, + "language_loss": 0.769647, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79094553, + "num_input_tokens_seen": 357340695, + "step": 16555, + "time_per_iteration": 2.4751126766204834 + }, + { + "auxiliary_loss_clip": 0.0107133, + "auxiliary_loss_mlp": 0.01031311, + "balance_loss_clip": 1.03114152, + "balance_loss_mlp": 1.01893139, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 2.524991524125413, + "language_loss": 0.86389595, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88492233, + "num_input_tokens_seen": 357357505, + "step": 16556, + "time_per_iteration": 2.564124345779419 + }, + { + "auxiliary_loss_clip": 0.01058307, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.03381395, + "balance_loss_mlp": 1.01840448, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 1.6172949342302496, + "language_loss": 0.7314136, + "learning_rate": 2.132967729762125e-10, + "loss": 0.75229818, + "num_input_tokens_seen": 357375395, + "step": 16557, + "time_per_iteration": 2.5364925861358643 + }, + { + "auxiliary_loss_clip": 0.01091269, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.03422976, + "balance_loss_mlp": 1.02107239, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 2.1456111343934228, + "language_loss": 0.76454818, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.78578424, + "num_input_tokens_seen": 357397375, + "step": 16558, + "time_per_iteration": 2.57104229927063 + }, + { + "auxiliary_loss_clip": 0.01078405, + "auxiliary_loss_mlp": 0.0102996, + "balance_loss_clip": 1.03388822, + "balance_loss_mlp": 1.01768112, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 2.4592762101287247, + "language_loss": 0.63552135, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.65660501, + "num_input_tokens_seen": 357418880, + "step": 16559, + "time_per_iteration": 2.565115213394165 + }, + { + "auxiliary_loss_clip": 0.01089435, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.0334903, + "balance_loss_mlp": 1.01592791, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 2.195594746129208, + "language_loss": 0.74445486, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76562965, + "num_input_tokens_seen": 357438310, + "step": 16560, + "time_per_iteration": 2.4962832927703857 + }, + { + "auxiliary_loss_clip": 0.01045919, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.0310446, + "balance_loss_mlp": 1.01669574, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 1.7128798437871364, + "language_loss": 0.79205585, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.81279701, + "num_input_tokens_seen": 357457155, + "step": 16561, + "time_per_iteration": 2.5994818210601807 + }, + { + "auxiliary_loss_clip": 0.01100625, + "auxiliary_loss_mlp": 0.01028825, + "balance_loss_clip": 1.03529871, + "balance_loss_mlp": 1.01794124, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 2.8547904590763937, + "language_loss": 0.65297949, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.67427403, + "num_input_tokens_seen": 357468060, + "step": 16562, + "time_per_iteration": 2.412872552871704 + }, + { + "auxiliary_loss_clip": 0.01079203, + "auxiliary_loss_mlp": 0.00783862, + "balance_loss_clip": 1.03540051, + "balance_loss_mlp": 1.01069927, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 1.7741888614636785, + "language_loss": 0.64010078, + "learning_rate": 1.805348815528962e-10, + "loss": 0.65873146, + "num_input_tokens_seen": 357489665, + "step": 16563, + "time_per_iteration": 2.573263168334961 + }, + { + "auxiliary_loss_clip": 0.01079189, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.03349233, + "balance_loss_mlp": 1.01777864, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 2.959805552145584, + "language_loss": 0.64607549, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.66716611, + "num_input_tokens_seen": 357511975, + "step": 16564, + "time_per_iteration": 2.5864317417144775 + }, + { + "auxiliary_loss_clip": 0.01079291, + "auxiliary_loss_mlp": 0.00780005, + "balance_loss_clip": 1.03417516, + "balance_loss_mlp": 1.00511599, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 1.9013417943629443, + "language_loss": 0.74135298, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.75994593, + "num_input_tokens_seen": 357529345, + "step": 16565, + "time_per_iteration": 2.4879424571990967 + }, + { + "auxiliary_loss_clip": 0.01082495, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.03330183, + "balance_loss_mlp": 1.01956248, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 1.7813955003044877, + "language_loss": 0.79297155, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81410652, + "num_input_tokens_seen": 357547615, + "step": 16566, + "time_per_iteration": 2.5112364292144775 + }, + { + "auxiliary_loss_clip": 0.010525, + "auxiliary_loss_mlp": 0.00782149, + "balance_loss_clip": 1.03336251, + "balance_loss_mlp": 1.00947046, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 1.6769048601194516, + "language_loss": 0.70935625, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.72770274, + "num_input_tokens_seen": 357567380, + "step": 16567, + "time_per_iteration": 2.550163507461548 + }, + { + "auxiliary_loss_clip": 0.0109169, + "auxiliary_loss_mlp": 0.01032214, + "balance_loss_clip": 1.03335238, + "balance_loss_mlp": 1.01939321, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 7.120991661128482, + "language_loss": 0.79103339, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.81227249, + "num_input_tokens_seen": 357586435, + "step": 16568, + "time_per_iteration": 3.93766188621521 + }, + { + "auxiliary_loss_clip": 0.01088482, + "auxiliary_loss_mlp": 0.01028856, + "balance_loss_clip": 1.03448188, + "balance_loss_mlp": 1.01785326, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 1.6485121911690461, + "language_loss": 0.81838489, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.83955824, + "num_input_tokens_seen": 357604720, + "step": 16569, + "time_per_iteration": 2.534306287765503 + }, + { + "auxiliary_loss_clip": 0.01064782, + "auxiliary_loss_mlp": 0.0078453, + "balance_loss_clip": 1.03409052, + "balance_loss_mlp": 1.01198483, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 2.2886239590763857, + "language_loss": 0.706568, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72506112, + "num_input_tokens_seen": 357622345, + "step": 16570, + "time_per_iteration": 2.5477631092071533 + }, + { + "auxiliary_loss_clip": 0.01074543, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.03543782, + "balance_loss_mlp": 1.0157125, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 1.58145761227336, + "language_loss": 0.7495079, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.770531, + "num_input_tokens_seen": 357642710, + "step": 16571, + "time_per_iteration": 4.002007007598877 + }, + { + "auxiliary_loss_clip": 0.01082029, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.03439295, + "balance_loss_mlp": 1.02017212, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 1.8398003542360977, + "language_loss": 0.79586112, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.81700349, + "num_input_tokens_seen": 357659870, + "step": 16572, + "time_per_iteration": 3.8611907958984375 + }, + { + "auxiliary_loss_clip": 0.0107944, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.03503346, + "balance_loss_mlp": 1.01791906, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 1.7734871998858, + "language_loss": 0.70656145, + "learning_rate": 1.3199841727074e-10, + "loss": 0.72765148, + "num_input_tokens_seen": 357677075, + "step": 16573, + "time_per_iteration": 2.532543659210205 + }, + { + "auxiliary_loss_clip": 0.01081867, + "auxiliary_loss_mlp": 0.01031466, + "balance_loss_clip": 1.03610647, + "balance_loss_mlp": 1.01914573, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 2.2907500380003283, + "language_loss": 0.63440758, + "learning_rate": 1.275618614968721e-10, + "loss": 0.65554094, + "num_input_tokens_seen": 357696715, + "step": 16574, + "time_per_iteration": 2.631331443786621 + }, + { + "auxiliary_loss_clip": 0.01075413, + "auxiliary_loss_mlp": 0.01032026, + "balance_loss_clip": 1.03899539, + "balance_loss_mlp": 1.01932991, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 2.6100508764097876, + "language_loss": 0.76249325, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.78356767, + "num_input_tokens_seen": 357712345, + "step": 16575, + "time_per_iteration": 2.526296615600586 + }, + { + "auxiliary_loss_clip": 0.01081358, + "auxiliary_loss_mlp": 0.01030029, + "balance_loss_clip": 1.03677154, + "balance_loss_mlp": 1.017959, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 4.312242747907826, + "language_loss": 0.70189822, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72301209, + "num_input_tokens_seen": 357731815, + "step": 16576, + "time_per_iteration": 2.534085512161255 + }, + { + "auxiliary_loss_clip": 0.01089795, + "auxiliary_loss_mlp": 0.01025129, + "balance_loss_clip": 1.03415096, + "balance_loss_mlp": 1.01354218, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 1.537633629094008, + "language_loss": 0.71900058, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.7401498, + "num_input_tokens_seen": 357751640, + "step": 16577, + "time_per_iteration": 2.481809139251709 + }, + { + "auxiliary_loss_clip": 0.01078124, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.03529477, + "balance_loss_mlp": 1.02018034, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 1.8987269292847335, + "language_loss": 0.78216684, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.80326593, + "num_input_tokens_seen": 357769850, + "step": 16578, + "time_per_iteration": 2.5189197063446045 + }, + { + "auxiliary_loss_clip": 0.0106085, + "auxiliary_loss_mlp": 0.00781058, + "balance_loss_clip": 1.03338981, + "balance_loss_mlp": 1.00855494, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 1.6878641958822334, + "language_loss": 0.76113296, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.77955198, + "num_input_tokens_seen": 357789550, + "step": 16579, + "time_per_iteration": 2.5710489749908447 + }, + { + "auxiliary_loss_clip": 0.01082621, + "auxiliary_loss_mlp": 0.01032902, + "balance_loss_clip": 1.03849137, + "balance_loss_mlp": 1.01943779, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 2.2363890727953932, + "language_loss": 0.69582492, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.7169801, + "num_input_tokens_seen": 357809525, + "step": 16580, + "time_per_iteration": 2.6679892539978027 + }, + { + "auxiliary_loss_clip": 0.01052405, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.03338408, + "balance_loss_mlp": 1.0185858, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 2.033260115044349, + "language_loss": 0.80241179, + "learning_rate": 9.862937031113184e-11, + "loss": 0.82324159, + "num_input_tokens_seen": 357829795, + "step": 16581, + "time_per_iteration": 3.9894614219665527 + }, + { + "auxiliary_loss_clip": 0.01074135, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.03491533, + "balance_loss_mlp": 1.01647687, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 1.5279830601619302, + "language_loss": 0.80223626, + "learning_rate": 9.479950191249031e-11, + "loss": 0.82324946, + "num_input_tokens_seen": 357851655, + "step": 16582, + "time_per_iteration": 2.5572047233581543 + }, + { + "auxiliary_loss_clip": 0.01088881, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.03316355, + "balance_loss_mlp": 1.01856446, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 1.777530068098933, + "language_loss": 0.60386693, + "learning_rate": 9.104547011951069e-11, + "loss": 0.6250543, + "num_input_tokens_seen": 357871205, + "step": 16583, + "time_per_iteration": 2.4967753887176514 + }, + { + "auxiliary_loss_clip": 0.0108263, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.03444052, + "balance_loss_mlp": 1.02072275, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 1.9618293167763976, + "language_loss": 0.77941829, + "learning_rate": 8.736727507452357e-11, + "loss": 0.8005721, + "num_input_tokens_seen": 357892145, + "step": 16584, + "time_per_iteration": 2.5463454723358154 + }, + { + "auxiliary_loss_clip": 0.01076472, + "auxiliary_loss_mlp": 0.01029499, + "balance_loss_clip": 1.03234041, + "balance_loss_mlp": 1.01886511, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 1.4204972284084216, + "language_loss": 0.69466901, + "learning_rate": 8.376491691697297e-11, + "loss": 0.7157287, + "num_input_tokens_seen": 357911205, + "step": 16585, + "time_per_iteration": 2.534080743789673 + }, + { + "auxiliary_loss_clip": 0.01102221, + "auxiliary_loss_mlp": 0.01029068, + "balance_loss_clip": 1.03537989, + "balance_loss_mlp": 1.01704609, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 2.383714171437012, + "language_loss": 0.81093621, + "learning_rate": 8.023839578363834e-11, + "loss": 0.8322491, + "num_input_tokens_seen": 357928190, + "step": 16586, + "time_per_iteration": 2.4236743450164795 + }, + { + "auxiliary_loss_clip": 0.01079071, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.0318315, + "balance_loss_mlp": 1.02478445, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 1.5830064606276297, + "language_loss": 0.77810055, + "learning_rate": 7.678771180796851e-11, + "loss": 0.79925478, + "num_input_tokens_seen": 357946985, + "step": 16587, + "time_per_iteration": 2.5617570877075195 + }, + { + "auxiliary_loss_clip": 0.01084192, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.03561044, + "balance_loss_mlp": 1.02535653, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 1.815911400651242, + "language_loss": 0.72710073, + "learning_rate": 7.341286512074773e-11, + "loss": 0.74831837, + "num_input_tokens_seen": 357966720, + "step": 16588, + "time_per_iteration": 2.511484146118164 + }, + { + "auxiliary_loss_clip": 0.01107976, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.03565741, + "balance_loss_mlp": 1.01623869, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 2.829169957408843, + "language_loss": 0.8245039, + "learning_rate": 7.011385585031781e-11, + "loss": 0.84586591, + "num_input_tokens_seen": 357981375, + "step": 16589, + "time_per_iteration": 2.412475824356079 + }, + { + "auxiliary_loss_clip": 0.01096494, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.0344826, + "balance_loss_mlp": 1.02194691, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 2.091142001783348, + "language_loss": 0.70083052, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72214973, + "num_input_tokens_seen": 358000290, + "step": 16590, + "time_per_iteration": 2.462782382965088 + }, + { + "auxiliary_loss_clip": 0.01081637, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.03407991, + "balance_loss_mlp": 1.01751196, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 1.6490247537033744, + "language_loss": 0.63715148, + "learning_rate": 6.374335005676634e-11, + "loss": 0.65826708, + "num_input_tokens_seen": 358022075, + "step": 16591, + "time_per_iteration": 2.6072616577148438 + }, + { + "auxiliary_loss_clip": 0.01078902, + "auxiliary_loss_mlp": 0.01025659, + "balance_loss_clip": 1.0321368, + "balance_loss_mlp": 1.01401782, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 1.6677858209369945, + "language_loss": 0.73246253, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75350809, + "num_input_tokens_seen": 358043940, + "step": 16592, + "time_per_iteration": 2.6222312450408936 + }, + { + "auxiliary_loss_clip": 0.01080952, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.03397059, + "balance_loss_mlp": 1.01597047, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 1.4458137758187946, + "language_loss": 0.84853339, + "learning_rate": 5.767619539343016e-11, + "loss": 0.8696233, + "num_input_tokens_seen": 358062720, + "step": 16593, + "time_per_iteration": 2.5073728561401367 + }, + { + "auxiliary_loss_clip": 0.0109977, + "auxiliary_loss_mlp": 0.00781107, + "balance_loss_clip": 1.03421879, + "balance_loss_mlp": 1.00807738, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 1.754549658240388, + "language_loss": 0.69579291, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71460164, + "num_input_tokens_seen": 358081560, + "step": 16594, + "time_per_iteration": 2.4424185752868652 + }, + { + "auxiliary_loss_clip": 0.01061401, + "auxiliary_loss_mlp": 0.01023738, + "balance_loss_clip": 1.03600323, + "balance_loss_mlp": 1.01198959, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 2.0454728323364892, + "language_loss": 0.72896576, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.74981713, + "num_input_tokens_seen": 358099065, + "step": 16595, + "time_per_iteration": 2.5930068492889404 + }, + { + "auxiliary_loss_clip": 0.01014952, + "auxiliary_loss_mlp": 0.01001225, + "balance_loss_clip": 1.00462914, + "balance_loss_mlp": 1.00018239, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.7901317989714431, + "language_loss": 0.60366356, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62382543, + "num_input_tokens_seen": 358156095, + "step": 16596, + "time_per_iteration": 2.956843376159668 + }, + { + "auxiliary_loss_clip": 0.01092647, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.03520465, + "balance_loss_mlp": 1.01663947, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 1.685240371981805, + "language_loss": 0.77660549, + "learning_rate": 4.645194309227385e-11, + "loss": 0.79781437, + "num_input_tokens_seen": 358175230, + "step": 16597, + "time_per_iteration": 2.4770374298095703 + }, + { + "auxiliary_loss_clip": 0.01091168, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.03223419, + "balance_loss_mlp": 1.01866913, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 1.859973030885209, + "language_loss": 0.82016158, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84138143, + "num_input_tokens_seen": 358197075, + "step": 16598, + "time_per_iteration": 2.5209484100341797 + }, + { + "auxiliary_loss_clip": 0.01084817, + "auxiliary_loss_mlp": 0.0103986, + "balance_loss_clip": 1.03510094, + "balance_loss_mlp": 1.02671766, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 1.868157746287422, + "language_loss": 0.64362121, + "learning_rate": 4.129484715709175e-11, + "loss": 0.664868, + "num_input_tokens_seen": 358215925, + "step": 16599, + "time_per_iteration": 2.5198020935058594 + }, + { + "auxiliary_loss_clip": 0.01008736, + "auxiliary_loss_mlp": 0.01003056, + "balance_loss_clip": 1.00636697, + "balance_loss_mlp": 1.00203097, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.8532045371886507, + "language_loss": 0.62396759, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64408553, + "num_input_tokens_seen": 358269035, + "step": 16600, + "time_per_iteration": 3.0389177799224854 + }, + { + "auxiliary_loss_clip": 0.01080121, + "auxiliary_loss_mlp": 0.0102983, + "balance_loss_clip": 1.03420472, + "balance_loss_mlp": 1.01914859, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 1.677989481534099, + "language_loss": 0.78458488, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80568433, + "num_input_tokens_seen": 358287680, + "step": 16601, + "time_per_iteration": 2.506166934967041 + }, + { + "auxiliary_loss_clip": 0.01073387, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.03446424, + "balance_loss_mlp": 1.01780832, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 2.118776507525557, + "language_loss": 0.82653213, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84756255, + "num_input_tokens_seen": 358304080, + "step": 16602, + "time_per_iteration": 2.527155637741089 + }, + { + "auxiliary_loss_clip": 0.010639, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.03700805, + "balance_loss_mlp": 1.01988018, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 1.887301513479427, + "language_loss": 0.62746936, + "learning_rate": 3.189071962883538e-11, + "loss": 0.64842331, + "num_input_tokens_seen": 358323670, + "step": 16603, + "time_per_iteration": 2.581286907196045 + }, + { + "auxiliary_loss_clip": 0.01078496, + "auxiliary_loss_mlp": 0.01027924, + "balance_loss_clip": 1.03238809, + "balance_loss_mlp": 1.01584232, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 1.9026016785368751, + "language_loss": 0.71108395, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73214817, + "num_input_tokens_seen": 358341980, + "step": 16604, + "time_per_iteration": 2.519912004470825 + }, + { + "auxiliary_loss_clip": 0.01102374, + "auxiliary_loss_mlp": 0.01025152, + "balance_loss_clip": 1.03414023, + "balance_loss_mlp": 1.01288533, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 1.5924326000237703, + "language_loss": 0.64325321, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66452843, + "num_input_tokens_seen": 358360400, + "step": 16605, + "time_per_iteration": 2.4258415699005127 + }, + { + "auxiliary_loss_clip": 0.01068581, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.03297162, + "balance_loss_mlp": 1.01751721, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 2.944242900860713, + "language_loss": 0.71610641, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73708451, + "num_input_tokens_seen": 358378990, + "step": 16606, + "time_per_iteration": 3.9524030685424805 + }, + { + "auxiliary_loss_clip": 0.01092263, + "auxiliary_loss_mlp": 0.0078122, + "balance_loss_clip": 1.03476965, + "balance_loss_mlp": 1.00628614, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 1.9420234398598772, + "language_loss": 0.82043028, + "learning_rate": 2.370001590090709e-11, + "loss": 0.83916509, + "num_input_tokens_seen": 358395970, + "step": 16607, + "time_per_iteration": 2.4806463718414307 + }, + { + "auxiliary_loss_clip": 0.01069867, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.03145027, + "balance_loss_mlp": 1.01887059, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 1.5691797383449366, + "language_loss": 0.66682851, + "learning_rate": 2.184193803622669e-11, + "loss": 0.68783975, + "num_input_tokens_seen": 358417355, + "step": 16608, + "time_per_iteration": 2.6165785789489746 + }, + { + "auxiliary_loss_clip": 0.0106344, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.03399205, + "balance_loss_mlp": 1.01802182, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 5.488218012838019, + "language_loss": 0.80622649, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.82715923, + "num_input_tokens_seen": 358434345, + "step": 16609, + "time_per_iteration": 3.931772470474243 + }, + { + "auxiliary_loss_clip": 0.01082386, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.03205347, + "balance_loss_mlp": 1.02084982, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 1.4826627704400046, + "language_loss": 0.6297071, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.65086067, + "num_input_tokens_seen": 358452870, + "step": 16610, + "time_per_iteration": 2.495527744293213 + }, + { + "auxiliary_loss_clip": 0.01091798, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.03398561, + "balance_loss_mlp": 1.02233171, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 10.670550101187457, + "language_loss": 0.67923367, + "learning_rate": 1.672274094288717e-11, + "loss": 0.70048726, + "num_input_tokens_seen": 358472210, + "step": 16611, + "time_per_iteration": 3.8829832077026367 + }, + { + "auxiliary_loss_clip": 0.01060002, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.03283393, + "balance_loss_mlp": 1.02104402, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 1.4055315080165434, + "language_loss": 0.69653618, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71747673, + "num_input_tokens_seen": 358493840, + "step": 16612, + "time_per_iteration": 2.612689971923828 + }, + { + "auxiliary_loss_clip": 0.01076906, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.03413808, + "balance_loss_mlp": 1.01826143, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 1.6063592277641376, + "language_loss": 0.73835993, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.7594198, + "num_input_tokens_seen": 358515060, + "step": 16613, + "time_per_iteration": 2.633131742477417 + }, + { + "auxiliary_loss_clip": 0.01067017, + "auxiliary_loss_mlp": 0.00785509, + "balance_loss_clip": 1.03286195, + "balance_loss_mlp": 1.00858021, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 2.0165002279492663, + "language_loss": 0.73550928, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.75403452, + "num_input_tokens_seen": 358528200, + "step": 16614, + "time_per_iteration": 2.484316825866699 + }, + { + "auxiliary_loss_clip": 0.01089366, + "auxiliary_loss_mlp": 0.01030511, + "balance_loss_clip": 1.03418529, + "balance_loss_mlp": 1.01929975, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 1.705809737280282, + "language_loss": 0.72609651, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.74729532, + "num_input_tokens_seen": 358548360, + "step": 16615, + "time_per_iteration": 2.486840009689331 + }, + { + "auxiliary_loss_clip": 0.01106865, + "auxiliary_loss_mlp": 0.00782832, + "balance_loss_clip": 1.03658831, + "balance_loss_mlp": 1.00906563, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 3.9348376290084155, + "language_loss": 0.77619541, + "learning_rate": 9.70753783247069e-12, + "loss": 0.7950924, + "num_input_tokens_seen": 358566270, + "step": 16616, + "time_per_iteration": 2.404435873031616 + }, + { + "auxiliary_loss_clip": 0.01084962, + "auxiliary_loss_mlp": 0.01028862, + "balance_loss_clip": 1.0363549, + "balance_loss_mlp": 1.01669073, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 1.8949845001714494, + "language_loss": 0.8258456, + "learning_rate": 8.532016508855378e-12, + "loss": 0.84698385, + "num_input_tokens_seen": 358584710, + "step": 16617, + "time_per_iteration": 2.5276079177856445 + }, + { + "auxiliary_loss_clip": 0.01081967, + "auxiliary_loss_mlp": 0.01027101, + "balance_loss_clip": 1.03425992, + "balance_loss_mlp": 1.01563859, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 1.4393801220047557, + "language_loss": 0.7856729, + "learning_rate": 7.43233506206309e-12, + "loss": 0.80676353, + "num_input_tokens_seen": 358606750, + "step": 16618, + "time_per_iteration": 2.5549628734588623 + }, + { + "auxiliary_loss_clip": 0.01101043, + "auxiliary_loss_mlp": 0.01027993, + "balance_loss_clip": 1.03330278, + "balance_loss_mlp": 1.01644111, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 1.6424965107360727, + "language_loss": 0.74615735, + "learning_rate": 6.408493534060255e-12, + "loss": 0.76744771, + "num_input_tokens_seen": 358624675, + "step": 16619, + "time_per_iteration": 2.4647164344787598 + }, + { + "auxiliary_loss_clip": 0.0108853, + "auxiliary_loss_mlp": 0.01027258, + "balance_loss_clip": 1.03240013, + "balance_loss_mlp": 1.01657128, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 2.006624435304398, + "language_loss": 0.86377752, + "learning_rate": 5.460491963260594e-12, + "loss": 0.88493544, + "num_input_tokens_seen": 358640715, + "step": 16620, + "time_per_iteration": 3.923424243927002 + }, + { + "auxiliary_loss_clip": 0.01064765, + "auxiliary_loss_mlp": 0.01025792, + "balance_loss_clip": 1.03134763, + "balance_loss_mlp": 1.01450253, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 1.9258330442567073, + "language_loss": 0.72729498, + "learning_rate": 4.58833038607942e-12, + "loss": 0.74820054, + "num_input_tokens_seen": 358659630, + "step": 16621, + "time_per_iteration": 2.545844316482544 + }, + { + "auxiliary_loss_clip": 0.00998604, + "auxiliary_loss_mlp": 0.01003661, + "balance_loss_clip": 1.00620592, + "balance_loss_mlp": 1.00262344, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.7362386165268472, + "language_loss": 0.56500053, + "learning_rate": 3.79200883515729e-12, + "loss": 0.58502316, + "num_input_tokens_seen": 358727840, + "step": 16622, + "time_per_iteration": 3.341364860534668 + }, + { + "auxiliary_loss_clip": 0.01063653, + "auxiliary_loss_mlp": 0.01027204, + "balance_loss_clip": 1.03293681, + "balance_loss_mlp": 1.01508057, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 3.353668276695799, + "language_loss": 0.71405113, + "learning_rate": 3.071527340914315e-12, + "loss": 0.73495972, + "num_input_tokens_seen": 358744125, + "step": 16623, + "time_per_iteration": 2.506336212158203 + }, + { + "auxiliary_loss_clip": 0.01066469, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.03515387, + "balance_loss_mlp": 1.01628137, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 2.5589776149256536, + "language_loss": 0.748088, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.76903892, + "num_input_tokens_seen": 358761420, + "step": 16624, + "time_per_iteration": 2.5339014530181885 + }, + { + "auxiliary_loss_clip": 0.01074052, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.03382421, + "balance_loss_mlp": 1.01859593, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 1.6565297346606276, + "language_loss": 0.74126279, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.7623136, + "num_input_tokens_seen": 358782600, + "step": 16625, + "time_per_iteration": 2.592473268508911 + }, + { + "auxiliary_loss_clip": 0.01089533, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.03379273, + "balance_loss_mlp": 1.02163446, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 2.4106335339446354, + "language_loss": 0.76861835, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.78984356, + "num_input_tokens_seen": 358801220, + "step": 16626, + "time_per_iteration": 2.497978687286377 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.03587604, + "balance_loss_mlp": 1.02104425, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 1.834097594039187, + "language_loss": 0.82148588, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84283066, + "num_input_tokens_seen": 358819190, + "step": 16627, + "time_per_iteration": 2.485482931137085 + }, + { + "auxiliary_loss_clip": 0.0109828, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.03706753, + "balance_loss_mlp": 1.02246821, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 2.1317555430853203, + "language_loss": 0.70762205, + "learning_rate": 6.067215747584952e-13, + "loss": 0.72896087, + "num_input_tokens_seen": 358839850, + "step": 16628, + "time_per_iteration": 2.5652377605438232 + }, + { + "auxiliary_loss_clip": 0.0109184, + "auxiliary_loss_mlp": 0.01025957, + "balance_loss_clip": 1.03195941, + "balance_loss_mlp": 1.01410818, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 1.3729360688507088, + "language_loss": 0.7546463, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77582425, + "num_input_tokens_seen": 358859805, + "step": 16629, + "time_per_iteration": 2.479379415512085 + }, + { + "auxiliary_loss_clip": 0.01085618, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.03558874, + "balance_loss_mlp": 1.02126551, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 1.6201642931700957, + "language_loss": 0.60554153, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.6267339, + "num_input_tokens_seen": 358877900, + "step": 16630, + "time_per_iteration": 2.511185646057129 + }, + { + "auxiliary_loss_clip": 0.01060604, + "auxiliary_loss_mlp": 0.01028799, + "balance_loss_clip": 1.03428459, + "balance_loss_mlp": 1.01668119, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 1.963784734566383, + "language_loss": 0.60584134, + "learning_rate": 3.792010017100722e-14, + "loss": 0.62673533, + "num_input_tokens_seen": 358897285, + "step": 16631, + "time_per_iteration": 2.5895564556121826 + }, + { + "auxiliary_loss_clip": 0.01050255, + "auxiliary_loss_mlp": 0.007821, + "balance_loss_clip": 1.03462207, + "balance_loss_mlp": 1.00652003, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 1.809182672960919, + "language_loss": 0.72743851, + "learning_rate": 0.0, + "loss": 0.74576211, + "num_input_tokens_seen": 358911570, + "step": 16632, + "time_per_iteration": 2.5463998317718506 + }, + { + "epoch": 0.999969938373666, + "num_input_tokens_seen": 358911570, + "step": 16632, + "total_flos": 1.3992169073237033e+18, + "train_loss": 0.7693321485898424, + "train_runtime": 46762.2544, + "train_samples_per_second": 14.227, + "train_steps_per_second": 0.356 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992169073237033e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}